git.rot13.org Git - MARC-Fast/blob - Fast.pm

   1 package MARC::Fast;
   2
   3 use strict;
   4 use Carp;
   5 use Data::Dumper;
   6
   # Package metadata, initialised at compile time. The module has a purely
   # object-oriented interface, so nothing is exported.
   BEGIN {
           use Exporter ();
           # "our" declares the package variables directly; it supersedes the
           # obsolete "use vars" pragma.
           our $VERSION     = '0.05';
           our @ISA         = qw (Exporter);
           # Give a hoot, don't pollute: do not export more than needed by default
           our @EXPORT      = qw ();
           our @EXPORT_OK   = qw ();
           our %EXPORT_TAGS = ();
   }
  17
  18 =head1 NAME
  19
  20 MARC::Fast - Very fast implementation of MARC database reader
  21
  22 =head1 SYNOPSIS
  23
  24   use MARC::Fast;
  25
  26
  27 =head1 DESCRIPTION
  28
  29 This is a very fast alternative to the C<MARC> and C<MARC::Record> modules.
  30
  31 It is also very suitable for random access to MARC records (as opposed to
  32 sequential access).
  33
  34 =head1 METHODS
  35
  36 =head2 new
  37
  38 Read MARC database
  39
  40   my $marc = new MARC::Fast(
  41         marcdb => 'unimarc.iso',
  42         quiet => 0,
  43         debug => 0,
  44         assert => 0,
  45         hash_filter => sub {
  46                 my ($t, $tag) = @_;     # field content, field tag
  47                 $t =~ s/foo/bar/;
  48                 return $t;
  49         },
  50   );
  51
  52 =cut
  53
  54 ################################################## subroutine header end ##
  55
  56
  # Constructor: open the MARC file named by "marcdb" and index its records.
  #
  # Arguments are key/value pairs which are stored verbatim in the object.
  # This sub itself reads:
  #   marcdb - path of the MARC file (required)
  #   debug  - when true, progress messages are printed to STDERR
  #
  # The file is walked once, reading only the 24-byte leader of each record
  # and seeking over the rest. Afterwards the object holds:
  #   fh        - read handle on the file (binmode)
  #   count     - number of indexed records
  #   fh_offset - array ref with the byte offset of every indexed record
  #   leaders   - array ref with the raw leader of every indexed record
  #
  # Croaks when "marcdb" is missing or the file can't be opened. Carps and
  # stops indexing when fewer than 24 bytes are left where a leader should
  # start; such trailing bytes are not counted as a record.
  #
  # Returns the blessed object.
  sub new {
          my $class = shift;
          my $self = {@_};
          bless ($self, $class);

          croak "need marcdb parametar" unless ($self->{marcdb});

          print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

          # three-argument open: the file name is never interpreted as a mode
          # or as a command to pipe from
          open(my $fh, '<', $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
          binmode($fh);
          $self->{fh} = $fh;

          $self->{count} = 0;
          $self->{fh_offset} ||= [];
          $self->{leaders}   ||= [];

          while (! eof($fh)) {

                  # remember record position, but store it only once the
                  # leader has really been read
                  my $offset = tell($fh);

                  my $leader;
                  my $len = read($fh, $leader, 24);

                  if (! defined($len) || $len < 24) {
                          carp "short read of leader, aborting\n";
                          last;
                  }

                  # Byte        Name
                  # ----        ----
                  # 0-4         Record Length
                  # 5           Status (n=new, c=corrected and d=deleted)
                  # 6           Type of Record (a=printed material)
                  # 7           Bibliographic Level (m=monograph)
                  # 8-9         Blanks
                  # 10          Indicator count (2 for monographs)
                  # 11          Subfield code count (2 - 0x1F+subfield code itself)
                  # 12-16       Base address of data
                  # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                  #               3=sublevel 3)
                  # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                  #               n=record is in non-ISBD format, i=record is in
                  #               an incomplete ISBD format)
                  # 19          Blank
                  # 20          Length of length field in directory (always 4 in UNIMARC)
                  # 21          Length of Starting Character Position in directory (always
                  #               5 in UNIMARC)
                  # 22          Length of implementation defined portion in directory (always
                  #               0 in UNIMARC)
                  # 23          Blank
                  #
                  #           |0   45  89  |12 16|1n 450 |
                  #           |xxxxxnam  22(.....)   45 <---

                  # count, offset and leader are recorded together so the
                  # three always stay in step
                  $self->{count}++;
                  push @{$self->{fh_offset}}, $offset;
                  push @{$self->{leaders}}, $leader;

                  print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});

                  # skip to next record; a record length which doesn't even
                  # cover the leader ends the scan
                  my $o = substr($leader,0,5);
                  if ($o > 24) {
                          seek($fh,$o-24,1);
                  } else {
                          last;
                  }

          }

          return $self;
  }
 128
 129 =head2 count
 130
 131 Return number of records in database
 132
 133   print $marc->count;
 134
 135 =cut
 136
 # Accessor for the record counter kept in the object's "count" slot.
 sub count {
         my ($self) = @_;
         return $self->{count};
 }
 141
 142 =head2 fetch
 143
 144 Fetch record from database
 145
 146   my $hash = $marc->fetch(42);
 147
 148 =cut
 149
 # Read one record from the file and cut it into raw fields.
 #
 # Takes the 1-based record number. The leader and byte offset stored in
 # $self->{leaders} and $self->{fh_offset} locate the record; the directory
 # and the field data are then read from $self->{fh}, and every field which
 # the directory describes is sliced out of the field data.
 #
 # Object options used: debug (trace to STDERR), quiet (suppress WARNING
 # lines), assert (check the terminator and minimum length of each field).
 #
 # Returns a hash ref which maps each three-digit tag to an array ref of raw
 # field strings (indicators, subfield delimiters and the field terminator
 # are left in place). Returns nothing when no record number is given, when
 # the record or its offset is unknown, when the lengths in the leader are
 # inconsistent or when the seek fails. Returns undef when the directory or
 # the fields can't be read. When a directory entry turns out to be wrong,
 # parsing stops and whatever was collected up to then is returned.
 sub fetch {
         my $self = shift;

         my $rec_nr = shift || return;

         my $leader = $self->{leaders}->[$rec_nr - 1];
         unless ($leader) {
                 carp "can't find record $rec_nr";
                 return;
         };
         my $offset = $self->{fh_offset}->[$rec_nr - 1];
         unless (defined($offset)) {
                 carp "can't find offset for record $rec_nr";
                 return;
         };

         my $reclen = substr($leader,0,5);
         my $base_addr = substr($leader,12,5);

         # the directory fills the space between the leader and the base
         # address; field data runs from the base address to the record end
         my $dir_len = $base_addr - 24;
         my $fields_len = $reclen - $base_addr;

         print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$dir_len,"]\n" if ($self->{debug});

         # read() dies with "Negative length" when given a length below zero,
         # so a corrupt leader is reported here instead of killing the caller
         if ($dir_len < 0 || $fields_len < 0) {
                 carp "invalid lengths in leader of record $rec_nr";
                 return;
         }

         my $skip = 0;

         print STDERR "# seeking to $offset + 24\n" if ($self->{debug});

         if ( ! seek($self->{fh}, $offset+24, 0) ) {
                 carp "can't seek to $offset: $!";
                 return;
         }

         print STDERR "# reading ",$dir_len," bytes of dictionary\n" if ($self->{debug});

         my $directory;
         if( ! read($self->{fh},$directory,$dir_len) ) {
                 carp "can't read directory: $!";
                 $skip = 1;
         } else {
                 print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
         }

         print STDERR "# reading ",$fields_len," bytes of fields\n" if ($self->{debug});

         my $fields;
         if( ! read($self->{fh},$fields,$fields_len) ) {
                 carp "can't read fields: $!";
                 $skip = 1;
         } else {
                 print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
         }

         my $row;

         # each directory entry is 3 digits of tag, 4 digits of field length
         # and 5 digits of start position; entries are consumed one by one
         while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
                 my ($tag,$len,$addr) = ($1,$2,$3);

                 if (($addr+$len) > length($fields)) {
                         print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
                         $skip = 1;
                         next;
                 }

                 # take field
                 my $f = substr($fields,$addr,$len);
                 print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});

                 push @{ $row->{$tag} }, $f;

                 my $del = substr($fields,$addr+$len-1,1);

                 # check field delimiters...
                 # (the field is already stored at this point; setting $skip
                 # only stops the remaining directory entries from being read)
                 if ($self->{assert} && $del ne chr(30)) {
                         print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
                         $skip = 1;
                         next;
                 }

                 if ($self->{assert} && length($f) < 2) {
                         print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
                         next;
                 }

         }

         return $row;
 }
 235
 236
 237 =head2 to_hash
 238
 239 Read record with specified MFN and convert it to hash
 240
 241   my $hash = $marc->to_hash($mfn);
 242
 243 It can convert characters from the MARC database (using C<hash_filter>)
 244 before creating the structures, which enables character re-mapping or a
 245 quick fix-up of data.
 246
 247 This function returns hash which is like this:
 248
 249   '200' => [
 250              {
 251                'i1' => '1',
 252                'i2' => ' ',
 253                'a' => 'Goa',
 254                'f' => 'Valdo D\'Arienzo',
 255                'e' => 'tipografie e tipografi nel XVI secolo',
 256              }
 257            ],
 258
 259 This method will also create additional field C<000> with MFN.
 260
 261 =cut
 262
 # Fetch record $mfn and convert it into a nested structure.
 #
 # Takes the record number (required; confesses without it). The raw fields
 # come from $self->fetch($mfn); when that yields nothing, nothing is
 # returned.
 #
 # Returns a hash ref keyed by field tag. Every value is an array ref with
 # one element per field occurrence: a plain string for fields without
 # subfields, otherwise a hash ref with the keys "i1" and "i2" (indicators,
 # when recognised) and one key per subfield code. Key "000" holds the
 # record number itself.
 #
 # If the object has a "hash_filter" code ref, it is called with the field
 # content (field terminator already removed) and the field tag; its return
 # value replaces the field content.
 sub to_hash {
         my $self = shift;

         my $mfn = shift || confess "need mfn!";

         # init record to include MFN as field 000
         my $rec = { '000' => [ $mfn ] };

         my $row = $self->fetch($mfn) || return;

         foreach my $tag (keys %{$row}) {
                 foreach my $l (@{$row->{$tag}}) {

                         # remove end marker
                         $l =~ s/\x1E$//;

                         # filter output
                         $l = $self->{'hash_filter'}->($l, $tag) if ($self->{'hash_filter'});

                         my $val;

                         # has identifiers?
                         # NOTE(review): only 0, 1, blank and '#' are accepted as
                         # indicator values here; other values leave the indicators
                         # in front of the first subfield - confirm this is wanted
                         ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);

                         # has subfields?
                         if ($l =~ m/\x1F/) {
                                 foreach my $t (split(/\x1F/,$l)) {
                                         # skip only the empty piece in front of the first
                                         # delimiter; a piece consisting of "0" is real data
                                         next if ($t eq '');
                                         my $code = substr($t,0,1);
                                         # repeatable subfields. When we hit first one,
                                         # store CURRENT (up to that) in first repetition
                                         # of this record. Then, new record with same
                                         # identifiers will be created.
                                         # exists() rather than truth: an earlier value of
                                         # "0" or "" must count as already present too
                                         if (exists $val->{$code}) {
                                                 push @{$rec->{$tag}}, $val;
                                                 $val = {
                                                         i1 => $val->{i1},
                                                         i2 => $val->{i2},
                                                 };
                                         }
                                         $val->{$code} = substr($t,1);
                                 }
                         } else {
                                 $val = $l;
                         }

                         push @{$rec->{$tag}}, $val;
                 }
         }

         return $rec;
 }
 315
 316 =head2 to_ascii
 317
 318   print $marc->to_ascii( 42 );
 319
 320 =cut
 321
 # Render record $mfn as text, one line per tag in sorted tag order.
 #
 # Takes the record number (required; confesses without it) and gets the raw
 # fields from $self->fetch($mfn); nothing is returned when that yields
 # nothing. All occurrences of a tag are concatenated, the field terminator
 # at the very end is dropped and every subfield delimiter is shown as '$'.
 # Each line has the form "<tag>\t<content>\n".
 sub to_ascii {
         my ($self, $mfn) = @_;
         confess "need mfn" unless $mfn;

         my $row = $self->fetch($mfn);
         return unless $row;

         my $out;

         for my $tag (sort keys %$row) {
                 my $text = join('', @{ $row->{$tag} });
                 $text =~ s/\x1e$//;
                 $text =~ s/\x1f/\$/g;
                 $out .= join("\t", $tag, $text) . "\n";
         }

         return $out;
 }
 339
 340 1;
 341 __END__
 342
 343 =head1 AUTHOR
 344
 345         Dobrica Pavlinusic
 346         CPAN ID: DPAVLIN
 347         dpavlin@rot13.org
 348         http://www.rot13.org/~dpavlin/
 349
 350 =head1 COPYRIGHT
 351
 352 This program is free software; you can redistribute
 353 it and/or modify it under the same terms as Perl itself.
 354
 355 The full text of the license can be found in the
 356 LICENSE file included with this module.
 357
 358
 359 =head1 SEE ALSO
 360
 361 L<Biblio::Isis>, perl(1).
 362
 363 =cut