git.rot13.org Git - MARC-Fast/blob - Fast.pm

   1 package MARC::Fast;
   2
   3 use strict;
   4 use Carp;
   5 use Data::Dumper;
   6
   7 BEGIN {
   8         use Exporter ();
   9         use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  10         $VERSION     = 0.09;
  11         @ISA         = qw (Exporter);
  12         #Give a hoot don't pollute, do not export more than needed by default
  13         @EXPORT      = qw ();
  14         @EXPORT_OK   = qw ();
  15         %EXPORT_TAGS = ();
  16 }
  17
  18 =head1 NAME
  19
  20 MARC::Fast - Very fast implementation of MARC database reader
  21
  22 =head1 SYNOPSIS
  23
  24   use MARC::Fast;
  25
  26   my $marc = new MARC::Fast(
  27         marcdb => 'unimarc.iso',
  28   );
  29
  30   foreach my $mfn ( 1 .. $marc->count ) {
  31         print $marc->to_ascii( $mfn );
  32   }
  33
  34 For longer example with command line options look at L<scripts/dump_fastmarc.pl>
  35
  36 =head1 DESCRIPTION
  37
  38 This is very fast alternative to C<MARC> and C<MARC::Record> modules.
  39
  40 It's is also very subtable for random access to MARC records (as opposed to
  41 sequential one).
  42
  43 =head1 METHODS
  44
  45 =head2 new
  46
  47 Read MARC database
  48
  49   my $marc = new MARC::Fast(
  50         marcdb => 'unimarc.iso',
  51         quiet => 0,
  52         debug => 0,
  53         assert => 0,
  54         hash_filter => sub {
  55                 my ($t, $record_number) = @_;
  56                 $t =~ s/foo/bar/;
  57                 return $t;
  58         },
  59   );
  60
  61 =cut
  62
  63 ################################################## subroutine header end ##
  64
  65
  66 sub new {
  67         my $class = shift;
  68         my $self = {@_};
  69         bless ($self, $class);
  70
  71         croak "need marcdb parametar" unless ($self->{marcdb});
  72
  73         print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
  74
  75         open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
  76         binmode($self->{fh});
  77
  78         $self->{count} = 0;
  79
  80         while (! eof($self->{fh})) {
  81                 $self->{count}++;
  82
  83                 # save record position
  84                 push @{$self->{fh_offset}}, tell($self->{fh});
  85
  86                 my $leader;
  87                 my $len = read($self->{fh}, $leader, 24);
  88
  89                 if ($len < 24) {
  90                         carp "short read of leader, aborting\n";
  91                         last;
  92                 }
  93
  94                 # Byte        Name
  95                 # ----        ----
  96                 # 0-4         Record Length
  97                 # 5           Status (n=new, c=corrected and d=deleted)
  98                 # 6           Type of Record (a=printed material)
  99                 # 7           Bibliographic Level (m=monograph)
 100                 # 8-9         Blanks
 101                 # 10          Indictator count (2 for monographs)
 102                 # 11          Subfield code count (2 - 0x1F+subfield code itself)
 103                 # 12-16       Base address of data
 104                 # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
 105                 #               3=sublevel 3)
 106                 # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
 107                 #               n=record is in non-ISBD format, i=record is in
 108                 #               an incomplete ISBD format)
 109                 # 19          Blank
 110                 # 20          Length of length field in directory (always 4 in UNIMARC)
 111                 # 21          Length of Starting Character Position in directory (always
 112                 #               5 in UNIMARC)
 113                 # 22          Length of implementation defined portion in directory (always
 114                 #               0 in UNIMARC)
 115                 # 23          Blank
 116                 #
 117                 #           |0   45  89  |12 16|1n 450 |
 118                 #           |xxxxxnam  22(.....)   45 <---
 119
 120                 print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
 121
 122                 # store leader for later
 123                 push @{$self->{leader}}, $leader;
 124
 125                 # skip to next record
 126                 my $o = substr($leader,0,5);
 127                 if ($o > 24) {
 128                         seek($self->{fh},$o-24,1) if ($o);
 129                 } else {
 130                         last;
 131                 }
 132
 133         }
 134
 135         return $self;
 136 }
 137
 138 =head2 count
 139
 140 Return number of records in database
 141
 142   print $marc->count;
 143
 144 =cut
 145
 146 sub count {
 147         my $self = shift;
 148         return $self->{count};
 149 }
 150
 151 =head2 fetch
 152
 153 Fetch record from database
 154
 155   my $hash = $marc->fetch(42);
 156
 157 First record number is C<1>
 158
 159 =cut
 160
 161 sub fetch {
 162         my $self = shift;
 163
 164         my $rec_nr = shift;
 165
 166         if ( ! $rec_nr ) {
 167                 $self->{last_leader} = undef;
 168                 return;
 169         }
 170
 171         my $leader = $self->{leader}->[$rec_nr - 1];
 172         $self->{last_leader} = $leader;
 173         unless ($leader) {
 174                 carp "can't find record $rec_nr";
 175                 return;
 176         };
 177         my $offset = $self->{fh_offset}->[$rec_nr - 1];
 178         unless (defined($offset)) {
 179                 carp "can't find offset for record $rec_nr";
 180                 return;
 181         };
 182
 183         my $reclen = substr($leader,0,5);
 184         my $base_addr = substr($leader,12,5);
 185
 186         print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
 187
 188         my $skip = 0;
 189
 190         print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
 191
 192         if ( ! seek($self->{fh}, $offset+24, 0) ) {
 193                 carp "can't seek to $offset: $!";
 194                 return;
 195         }
 196
 197         print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
 198
 199         my $directory;
 200         if( ! read($self->{fh},$directory,$base_addr-24) ) {
 201                 carp "can't read directory: $!";
 202                 $skip = 1;
 203         } else {
 204                 print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
 205         }
 206
 207         print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
 208
 209         my $fields;
 210         if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
 211                 carp "can't read fields: $!";
 212                 $skip = 1;
 213         } else {
 214                 print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
 215         }
 216
 217         my $row;
 218
 219         while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
 220                 my ($tag,$len,$addr) = ($1,$2,$3);
 221
 222                 if (($addr+$len) > length($fields)) {
 223                         print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
 224                         $skip = 1;
 225                         next;
 226                 }
 227
 228                 # take field
 229                 my $f = substr($fields,$addr,$len);
 230                 print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
 231
 232                 push @{ $row->{$tag} }, $f;
 233
 234                 my $del = substr($fields,$addr+$len-1,1);
 235
 236                 # check field delimiters...
 237                 if ($self->{assert} && $del ne chr(30)) {
 238                         print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
 239                         $skip = 1;
 240                         next;
 241                 }
 242
 243                 if ($self->{assert} && length($f) < 2) {
 244                         print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
 245                         next;
 246                 }
 247
 248         }
 249
 250         return $row;
 251 }
 252
 253
 254 =head2 last_leader
 255
 256 Returns leader of last record L<fetch>ed
 257
 258   print $marc->last_leader;
 259
 260 Added in version 0.08 of this module, so if you need it use:
 261
 262   use MARC::Fast 0.08;
 263
 264 to be sure that it's supported.
 265
 266 =cut
 267
 268 sub last_leader {
 269         my $self = shift;
 270         return $self->{last_leader};
 271 }
 272
 273
 274 =head2 to_hash
 275
 276 Read record with specified MFN and convert it to hash
 277
 278   my $hash = $marc->to_hash( $mfn, include_subfields => 1, );
 279
 280 It has ability to convert characters (using C<hash_filter>) from MARC
 281 database before creating structures enabling character re-mapping or quick
 282 fix-up of data.
 283
 284 This function returns hash which is like this:
 285
 286   '200' => [
 287              {
 288                'i1' => '1',
 289                'i2' => ' '
 290                'a' => 'Goa',
 291                'f' => 'Valdo D\'Arienzo',
 292                'e' => 'tipografie e tipografi nel XVI secolo',
 293              }
 294            ],
 295
 296 This method will also create additional field C<000> with MFN.
 297
 298 =cut
 299
 300 sub to_hash {
 301         my $self = shift;
 302
 303         my $mfn = shift || confess "need mfn!";
 304
 305         my $args = {@_};
 306
 307         # init record to include MFN as field 000
 308         my $rec = { '000' => [ $mfn ] };
 309
 310         my $row = $self->fetch($mfn) || return;
 311
 312         foreach my $rec_nr (keys %{$row}) {
 313                 foreach my $l (@{$row->{$rec_nr}}) {
 314
 315                         # remove end marker
 316                         $l =~ s/\x1E$//;
 317
 318                         # filter output
 319                         $l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
 320
 321                         my $val;
 322
 323                         # has identifiers?
 324                         ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
 325
 326                         my $sf_usage;
 327                         my @subfields;
 328
 329                         # has subfields?
 330                         if ($l =~ m/\x1F/) {
 331                                 foreach my $t (split(/\x1F/,$l)) {
 332                                         next if (! $t);
 333                                         my $f = substr($t,0,1);
 334
 335                                         push @subfields, ( $f, $sf_usage->{$f}++ || 0 );
 336
 337                                         # repeatable subfiled -- convert it to array
 338                                         if ($val->{$f}) {
 339                                                 if ( $sf_usage->{$f} == 2 ) {
 340                                                         $val->{$f} = [ $val->{$f}, $val ];
 341                                                 } else {
 342                                                         push @{$val->{$f}}, $val;
 343                                                 }
 344                                         }
 345                                         $val->{substr($t,0,1)} = substr($t,1);
 346                                 }
 347                                 $val->{subfields} = [ @subfields ] if $args->{include_subfields};
 348                         } else {
 349                                 $val = $l;
 350                         }
 351
 352                         push @{$rec->{$rec_nr}}, $val;
 353                 }
 354         }
 355
 356         return $rec;
 357 }
 358
 359 =head2 to_ascii
 360
 361   print $marc->to_ascii( 42 );
 362
 363 =cut
 364
 365 sub to_ascii {
 366         my $self = shift;
 367
 368         my $mfn = shift || confess "need mfn";
 369         my $row = $self->fetch($mfn) || return;
 370
 371         my $out;
 372
 373         foreach my $f (sort keys %{$row}) {
 374                 my $dump = join('', @{ $row->{$f} });
 375                 $dump =~ s/\x1e$//;
 376                 $dump =~ s/\x1f/\$/g;
 377                 $out .= "$f\t$dump\n";
 378         }
 379
 380         return $out;
 381 }
 382
 383 1;
 384 __END__
 385
 386 =head1 AUTHOR
 387
 388         Dobrica Pavlinusic
 389         CPAN ID: DPAVLIN
 390         dpavlin@rot13.org
 391         http://www.rot13.org/~dpavlin/
 392
 393 =head1 COPYRIGHT
 394
 395 This program is free software; you can redistribute
 396 it and/or modify it under the same terms as Perl itself.
 397
 398 The full text of the license can be found in the
 399 LICENSE file included with this module.
 400
 401
 402 =head1 SEE ALSO
 403
 404 L<Biblio::Isis>, perl(1).
 405
 406 =cut