git.rot13.org Git - MARC-Fast/blob - lib/MARC/Fast.pm

   1 package MARC::Fast;
   2
   3 use strict;
   4 use Carp;
   5 use Data::Dump qw/dump/;
   6
   7 BEGIN {
   8         use Exporter ();
   9         use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  10         $VERSION     = 0.12;
  11         @ISA         = qw (Exporter);
  12         #Give a hoot don't pollute, do not export more than needed by default
  13         @EXPORT      = qw ();
  14         @EXPORT_OK   = qw ();
  15         %EXPORT_TAGS = ();
  16 }
  17
  18 =head1 NAME
  19
  20 MARC::Fast - Very fast implementation of MARC database reader
  21
  22 =head1 SYNOPSIS
  23
  24   use MARC::Fast;
  25
  26   my $marc = new MARC::Fast(
  27         marcdb => 'unimarc.iso',
  28   );
  29
  30   foreach my $mfn ( 1 .. $marc->count ) {
  31         print $marc->to_ascii( $mfn );
  32   }
  33
  34 For longer example with command line options look at L<scripts/dump_fastmarc.pl>
  35
  36 =head1 DESCRIPTION
  37
  38 This is very fast alternative to C<MARC> and C<MARC::Record> modules.
  39
  40 It's is also very subtable for random access to MARC records (as opposed to
  41 sequential one).
  42
  43 =head1 METHODS
  44
  45 =head2 new
  46
  47 Read MARC database
  48
  49   my $marc = new MARC::Fast(
  50         marcdb => 'unimarc.iso',
  51         quiet => 0,
  52         debug => 0,
  53         assert => 0,
  54         hash_filter => sub {
  55                 my ($t, $record_number) = @_;
  56                 $t =~ s/foo/bar/;
  57                 return $t;
  58         },
  59   );
  60
  61 =cut
  62
  63 ################################################## subroutine header end ##
  64
  65
  66 sub new {
  67         my $class = shift;
  68         my $self = {@_};
  69         bless ($self, $class);
  70
  71         croak "need marcdb parametar" unless ($self->{marcdb});
  72
  73         print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
  74
  75         open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
  76         binmode($self->{fh});
  77
  78         $self->{count} = 0;
  79
  80         while (! eof($self->{fh})) {
  81                 $self->{count}++;
  82
  83                 # save record position
  84                 push @{$self->{fh_offset}}, tell($self->{fh});
  85
  86                 my $leader;
  87                 my $len = read($self->{fh}, $leader, 24);
  88
  89                 if ($len < 24) {
  90                         warn "short read of leader, aborting\n";
  91                         $self->{count}--;
  92                         last;
  93                 }
  94
  95                 # Byte        Name
  96                 # ----        ----
  97                 # 0-4         Record Length
  98                 # 5           Status (n=new, c=corrected and d=deleted)
  99                 # 6           Type of Record (a=printed material)
 100                 # 7           Bibliographic Level (m=monograph)
 101                 # 8-9         Blanks
 102                 # 10          Indictator count (2 for monographs)
 103                 # 11          Subfield code count (2 - 0x1F+subfield code itself)
 104                 # 12-16       Base address of data
 105                 # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
 106                 #               3=sublevel 3)
 107                 # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
 108                 #               n=record is in non-ISBD format, i=record is in
 109                 #               an incomplete ISBD format)
 110                 # 19          Blank
 111                 # 20          Length of length field in directory (always 4 in UNIMARC)
 112                 # 21          Length of Starting Character Position in directory (always
 113                 #               5 in UNIMARC)
 114                 # 22          Length of implementation defined portion in directory (always
 115                 #               0 in UNIMARC)
 116                 # 23          Blank
 117                 #
 118                 #           |0   45  89  |12 16|1n 450 |
 119                 #           |xxxxxnam  22(.....)   45 <---
 120
 121                 print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
 122
 123                 # store leader for later
 124                 push @{$self->{leader}}, $leader;
 125
 126                 # skip to next record
 127                 my $o = substr($leader,0,5);
 128                 warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;
 129                 if ($o > 24) {
 130                         seek($self->{fh},$o-24,1) if ($o);
 131                 } else {
 132                         last;
 133                 }
 134
 135         }
 136
 137         return $self;
 138 }
 139
 140 =head2 count
 141
 142 Return number of records in database
 143
 144   print $marc->count;
 145
 146 =cut
 147
 148 sub count {
 149         my $self = shift;
 150         return $self->{count};
 151 }
 152
 153 =head2 fetch
 154
 155 Fetch record from database
 156
 157   my $hash = $marc->fetch(42);
 158
 159 First record number is C<1>
 160
 161 =cut
 162
 163 sub fetch {
 164         my $self = shift;
 165
 166         my $rec_nr = shift;
 167
 168         if ( ! $rec_nr ) {
 169                 $self->{last_leader} = undef;
 170                 return;
 171         }
 172
 173         my $leader = $self->{leader}->[$rec_nr - 1];
 174         $self->{last_leader} = $leader;
 175         unless ($leader) {
 176                 carp "can't find record $rec_nr";
 177                 return;
 178         };
 179         my $offset = $self->{fh_offset}->[$rec_nr - 1];
 180         unless (defined($offset)) {
 181                 carp "can't find offset for record $rec_nr";
 182                 return;
 183         };
 184
 185         my $reclen = substr($leader,0,5);
 186         my $base_addr = substr($leader,12,5);
 187
 188         print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
 189
 190         my $skip = 0;
 191
 192         print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
 193
 194         if ( ! seek($self->{fh}, $offset+24, 0) ) {
 195                 carp "can't seek to $offset: $!";
 196                 return;
 197         }
 198
 199         print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
 200
 201         my $directory;
 202         if( ! read($self->{fh},$directory,$base_addr-24) ) {
 203                 carp "can't read directory: $!";
 204                 $skip = 1;
 205         } else {
 206                 print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
 207         }
 208
 209         print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
 210
 211         my $fields;
 212         if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
 213                 carp "can't read fields: $!";
 214                 $skip = 1;
 215         } else {
 216                 print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
 217         }
 218
 219         my $row;
 220
 221         while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
 222                 my ($tag,$len,$addr) = ($1,$2,$3);
 223
 224                 if (($addr+$len) > length($fields)) {
 225                         print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
 226                         $skip = 1;
 227                         next;
 228                 }
 229
 230                 # take field
 231                 my $f = substr($fields,$addr,$len);
 232                 print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
 233
 234                 push @{ $row->{$tag} }, $f;
 235
 236                 my $del = substr($fields,$addr+$len-1,1);
 237
 238                 # check field delimiters...
 239                 if ($self->{assert} && $del ne chr(30)) {
 240                         print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
 241                         $skip = 1;
 242                         next;
 243                 }
 244
 245                 if ($self->{assert} && length($f) < 2) {
 246                         print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
 247                         next;
 248                 }
 249
 250         }
 251
 252         return $row;
 253 }
 254
 255
 256 =head2 last_leader
 257
 258 Returns leader of last record L<fetch>ed
 259
 260   print $marc->last_leader;
 261
 262 Added in version 0.08 of this module, so if you need it use:
 263
 264   use MARC::Fast 0.08;
 265
 266 to be sure that it's supported.
 267
 268 =cut
 269
 270 sub last_leader {
 271         my $self = shift;
 272         return $self->{last_leader};
 273 }
 274
 275
 276 =head2 to_hash
 277
 278 Read record with specified MFN and convert it to hash
 279
 280   my $hash = $marc->to_hash( $mfn, include_subfields => 1,
 281         hash_filter => sub { my ($l,$tag) = @_; return $l; }
 282   );
 283
 284 It has ability to convert characters (using C<hash_filter>) from MARC
 285 database before creating structures enabling character re-mapping or quick
 286 fix-up of data. If you specified C<hash_filter> both in C<new> and C<to_hash>
 287 only the one from C<to_hash> will be used.
 288
 289 This function returns hash which is like this:
 290
 291   '200' => [
 292              {
 293                'i1' => '1',
 294                'i2' => ' '
 295                'a' => 'Goa',
 296                'f' => 'Valdo D\'Arienzo',
 297                'e' => 'tipografie e tipografi nel XVI secolo',
 298              }
 299            ],
 300
 301 This method will also create additional field C<000> with MFN.
 302
 303 =cut
 304
 305 sub to_hash {
 306         my $self = shift;
 307
 308         my $mfn = shift || confess "need mfn!";
 309
 310         my $args = {@_};
 311         my $filter_coderef = $args->{'hash_filter'} || $self->{'hash_filter'};
 312
 313         # init record to include MFN as field 000
 314         my $rec = { '000' => [ $mfn ] };
 315
 316         my $row = $self->fetch($mfn) || return;
 317
 318         foreach my $tag (keys %{$row}) {
 319                 foreach my $l (@{$row->{$tag}}) {
 320
 321                         # remove end marker
 322                         $l =~ s/\x1E$//;
 323
 324                         # filter output
 325                         $l = $filter_coderef->($l, $tag) if $filter_coderef;
 326
 327                         my $val;
 328
 329                         # has identifiers?
 330                         ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([0-9 #])([0-9 #])\x1F/\x1F/);
 331
 332                         my $sf_usage;
 333                         my @subfields;
 334
 335                         # has subfields?
 336                         if ($l =~ m/\x1F/) {
 337                                 foreach my $t (split(/\x1F/,$l)) {
 338                                         next if (! $t);
 339                                         my $f = substr($t,0,1);
 340                                         my $v = substr($t,1);
 341
 342                                         push @subfields, ( $f, $sf_usage->{$f}++ || 0 );
 343
 344                                         # repeatable subfiled -- convert it to array
 345                                         if ( defined $val->{$f} ) {
 346                                                 if ( ref($val->{$f}) ne 'ARRAY' ) {
 347                                                         $val->{$f} = [ $val->{$f}, $v ];
 348                                                 } else {
 349                                                         push @{$val->{$f}}, $v;
 350                                                 }
 351                                         } else {
 352                                                 $val->{$f} = $v;
 353                                         }
 354                                 }
 355                                 $val->{subfields} = [ @subfields ] if $args->{include_subfields};
 356                         } else {
 357                                 $val = $l;
 358                         }
 359
 360                         push @{$rec->{$tag}}, $val;
 361                 }
 362         }
 363
 364         return $rec;
 365 }
 366
 367 =head2 to_ascii
 368
 369   print $marc->to_ascii( 42 );
 370
 371 =cut
 372
 373 sub to_ascii {
 374         my $self = shift;
 375
 376         my $mfn = shift || confess "need mfn";
 377         my $row = $self->fetch($mfn) || return;
 378
 379         my $out;
 380
 381         foreach my $f (sort keys %{$row}) {
 382                 my $dump = join('', @{ $row->{$f} });
 383                 $dump =~ s/\x1e$//;
 384                 $dump =~ s/\x1f/\$/g;
 385                 $out .= "$f\t$dump\n";
 386         }
 387
 388         return $out;
 389 }
 390
 391 1;
 392 __END__
 393
 394 =head1 UTF-8 ENCODING
 395
 396 This module does nothing with encoding. But, since MARC format is byte
 397 oriented even when using UTF-8 which has variable number of bytes for each
 398 character, file is opened in binary mode.
 399
 400 As a result, all scalars recturned to perl don't have utf-8 flag. Solution is
 401 to use C<hash_filter> and L<Encode> to decode utf-8 encoding like this:
 402
 403   use Encode;
 404
 405   my $marc = new MARC::Fast(
 406         marcdb => 'utf8.marc',
 407         hash_filter => sub {
 408                 Encode::decode( 'utf-8', $_[0] );
 409         },
 410   );
 411
 412 This will affect C<to_hash>, but C<fetch> will still return binary representation
 413 since it doesn't support C<hash_filter>.
 414
 415 =head1 AUTHOR
 416
 417         Dobrica Pavlinusic
 418         CPAN ID: DPAVLIN
 419         dpavlin@rot13.org
 420         http://www.rot13.org/~dpavlin/
 421
 422 =head1 COPYRIGHT
 423
 424 This program is free software; you can redistribute
 425 it and/or modify it under the same terms as Perl itself.
 426
 427 The full text of the license can be found in the
 428 LICENSE file included with this module.
 429
 430
 431 =head1 SEE ALSO
 432
 433 L<Biblio::Isis>, perl(1).
 434
 435 =cut