git.rot13.org Git - MARC-Fast/blob - Fast.pm

   1
   2 package MARC::Fast;
   3 use strict;
   4 use Carp;
   5 use Data::Dumper;
   6
   7 BEGIN {
   8         use Exporter ();
   9         use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  10         $VERSION     = 0.02;
  11         @ISA         = qw (Exporter);
  12         #Give a hoot don't pollute, do not export more than needed by default
  13         @EXPORT      = qw ();
  14         @EXPORT_OK   = qw ();
  15         %EXPORT_TAGS = ();
  16 }
  17
  18 =head1 NAME
  19
  20 MARC::Fast - Very fast implementation of MARC database reader
  21
  22 =head1 SYNOPSIS
  23
  24   use MARC::Fast;
  25
  26
  27 =head1 DESCRIPTION
  28
  29 This is very fast alternative to C<MARC> and C<MARC::Record> modules.
  30
  31 It's is also very sutable for random access to MARC records (as opposed to
  32 sequential one).
  33
  34 =head1 METHODS
  35
  36 =head2 new
  37
  38 Read MARC database
  39
  40   my $marc = new MARC::Fast(
  41         marcdb => 'unimarc.iso',
  42         quiet => 0,
  43         debug => 0,
  44         assert => 0,
  45   );
  46
  47 =cut
  48
  49 ################################################## subroutine header end ##
  50
  51
  52 sub new {
  53         my $class = shift;
  54         my $self = {@_};
  55         bless ($self, $class);
  56
  57         croak "need marcdb parametar" unless ($self->{marcdb});
  58
  59         print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
  60
  61         open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
  62         binmode($self->{fh});
  63
  64         $self->{count} = 0;
  65
  66         while (! eof($self->{fh})) {
  67                 $self->{count}++;
  68
  69                 # save record position
  70                 push @{$self->{fh_offset}}, tell($self->{fh});
  71
  72                 my $leader;
  73                 my $len = read($self->{fh}, $leader, 24);
  74
  75                 if ($len < 24) {
  76                         carp "short read of leader, aborting\n";
  77                         last;
  78                 }
  79
  80                 # Byte        Name
  81                 # ----        ----
  82                 # 0-4         Record Length
  83                 # 5           Status (n=new, c=corrected and d=deleted)
  84                 # 6           Type of Record (a=printed material)
  85                 # 7           Bibliographic Level (m=monograph)
  86                 # 8-9         Blanks
  87                 # 10          Indictator count (2 for monographs)
  88                 # 11          Subfield code count (2 - 0x1F+subfield code itself)
  89                 # 12-16       Base address of data
  90                 # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
  91                 #               3=sublevel 3)
  92                 # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
  93                 #               n=record is in non-ISBD format, i=record is in
  94                 #               an incomplete ISBD format)
  95                 # 19          Blank
  96                 # 20          Length of length field in directory (always 4 in UNIMARC)
  97                 # 21          Length of Starting Character Position in directory (always
  98                 #               5 in UNIMARC)
  99                 # 22          Length of implementation defined portion in directory (always
 100                 #               0 in UNIMARC)
 101                 # 23          Blank
 102                 #
 103                 #           |0   45  89  |12 16|1n 450 |
 104                 #           |xxxxxnam  22(.....)   45 <---
 105
 106                 print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
 107
 108                 # store leader for later
 109                 push @{$self->{leaders}}, $leader;
 110
 111                 # skip to next record
 112                 my $o = substr($leader,0,5);
 113                 if ($o > 24) {
 114                         seek($self->{fh},$o-24,1) if ($o);
 115                 } else {
 116                         last;
 117                 }
 118
 119         }
 120
 121         return $self;
 122 }
 123
 124 =head2 count
 125
 126 Return number of records in database
 127
 128   print $marc->count;
 129
 130 =cut
 131
 132 sub count {
 133         my $self = shift;
 134         return $self->{count};
 135 }
 136
 137 =head2 fetch
 138
 139 Fetch record from database
 140
 141   my $hash = $marc->fetch(42);
 142
 143 =cut
 144
 145 sub fetch {
 146         my $self = shift;
 147
 148         my $rec_nr = shift || return;
 149
 150         my $leader = $self->{leaders}->[$rec_nr - 1];
 151         unless ($leader) {
 152                 carp "can't find record $rec_nr";
 153                 return;
 154         };
 155         my $offset = $self->{fh_offset}->[$rec_nr - 1];
 156         unless (defined($offset)) {
 157                 carp "can't find offset for record $rec_nr";
 158                 return;
 159         };
 160
 161         my $reclen = substr($leader,0,5);
 162         my $base_addr = substr($leader,12,5);
 163
 164         print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
 165
 166         my $skip = 0;
 167
 168         print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
 169
 170         if ( ! seek($self->{fh}, $offset+24, 0) ) {
 171                 carp "can't seek to $offset: $!";
 172                 return;
 173         }
 174
 175         print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
 176
 177         my $directory;
 178         if( ! read($self->{fh},$directory,$base_addr-24) ) {
 179                 carp "can't read directory: $!";
 180                 $skip = 1;
 181         } else {
 182                 print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
 183         }
 184
 185         print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
 186
 187         my $fields;
 188         if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
 189                 carp "can't read fields: $!";
 190                 $skip = 1;
 191         } else {
 192                 print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
 193         }
 194
 195         my $row;
 196
 197         while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
 198                 my ($tag,$len,$addr) = ($1,$2,$3);
 199
 200                 if (($addr+$len) > length($fields)) {
 201                         print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
 202                         $skip = 1;
 203                         next;
 204                 }
 205
 206                 # take field
 207                 my $f = substr($fields,$addr,$len);
 208                 print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
 209
 210                 push @{ $row->{$tag} }, $f;
 211
 212                 my $del = substr($fields,$addr+$len-1,1);
 213
 214                 # check field delimiters...
 215                 if ($self->{assert} && $del ne chr(30)) {
 216                         print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
 217                         $skip = 1;
 218                         next;
 219                 }
 220
 221                 if ($self->{assert} && length($f) < 2) {
 222                         print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
 223                         next;
 224                 }
 225
 226         }
 227
 228         return $row;
 229 }
 230
 231
 232 =head2 to_hash
 233
 234 Read record with specified MFN and convert it to hash
 235
 236   my $hash = $marc->to_hash($mfn);
 237
 238 It has ability to convert characters (using C<hash_filter>) from MARC
 239 database before creating structures enabling character re-mapping or quick
 240 fix-up of data.
 241
 242 This function returns hash which is like this:
 243
 244   '200' => [
 245              {
 246                'i1' => '1',
 247                'i2' => ' '
 248                'a' => 'Goa',
 249                'f' => 'Valdo D\'Arienzo',
 250                'e' => 'tipografie e tipografi nel XVI secolo',
 251              }
 252            ],
 253
 254 This method will also create additional field C<000> with MFN.
 255
 256 =cut
 257
 258 sub to_hash {
 259         my $self = shift;
 260
 261         my $mfn = shift || confess "need mfn!";
 262
 263         # init record to include MFN as field 000
 264         my $rec = { '000' => [ $mfn ] };
 265
 266         my $row = $self->fetch($mfn) || return;
 267
 268         foreach my $k (keys %{$row}) {
 269                 foreach my $l (@{$row->{$k}}) {
 270
 271                         # remove end marker
 272                         $l =~ s/\x1E$//;
 273
 274                         # filter output
 275                         $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
 276
 277                         my $val;
 278
 279                         # has identifiers?
 280                         ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
 281
 282                         # has subfields?
 283                         if ($l =~ m/\x1F/) {
 284                                 foreach my $t (split(/\x1F/,$l)) {
 285                                         next if (! $t);
 286                                         $val->{substr($t,0,1)} = substr($t,1);
 287                                 }
 288                         } else {
 289                                 $val = $l;
 290                         }
 291
 292                         push @{$rec->{$k}}, $val;
 293                 }
 294         }
 295
 296         return $rec;
 297 }
 298
 299
 300 1;
 301 __END__
 302
 303 =head1 BUGS
 304
 305
 306
 307 =head1 SUPPORT
 308
 309
 310
 311 =head1 AUTHOR
 312
 313         Dobrica Pavlinusic
 314         CPAN ID: DPAVLIN
 315         dpavlin@rot13.org
 316         http://www.rot13.org/~dpavlin/
 317
 318 =head1 COPYRIGHT
 319
 320 This program is free software; you can redistribute
 321 it and/or modify it under the same terms as Perl itself.
 322
 323 The full text of the license can be found in the
 324 LICENSE file included with this module.
 325
 326
 327 =head1 SEE ALSO
 328
 329 perl(1).
 330
 331 =cut