git.rot13.org Git - MARC-Fast/blob - Fast.pm

   1 package MARC::Fast;
   2
   3 use strict;
   4 use Carp;
   5 use Data::Dump qw/dump/;
   6
   7 BEGIN {
   8         use Exporter ();
   9         use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  10         $VERSION     = 0.09;
  11         @ISA         = qw (Exporter);
  12         #Give a hoot don't pollute, do not export more than needed by default
  13         @EXPORT      = qw ();
  14         @EXPORT_OK   = qw ();
  15         %EXPORT_TAGS = ();
  16 }
  17
  18 =head1 NAME
  19
  20 MARC::Fast - Very fast implementation of MARC database reader
  21
  22 =head1 SYNOPSIS
  23
  24   use MARC::Fast;
  25
  26   my $marc = new MARC::Fast(
  27         marcdb => 'unimarc.iso',
  28   );
  29
  30   foreach my $mfn ( 1 .. $marc->count ) {
  31         print $marc->to_ascii( $mfn );
  32   }
  33
  34 For longer example with command line options look at L<scripts/dump_fastmarc.pl>
  35
  36 =head1 DESCRIPTION
  37
  38 This is very fast alternative to C<MARC> and C<MARC::Record> modules.
  39
  40 It's is also very subtable for random access to MARC records (as opposed to
  41 sequential one).
  42
  43 =head1 METHODS
  44
  45 =head2 new
  46
  47 Read MARC database
  48
  49   my $marc = new MARC::Fast(
  50         marcdb => 'unimarc.iso',
  51         quiet => 0,
  52         debug => 0,
  53         assert => 0,
  54         hash_filter => sub {
  55                 my ($t, $record_number) = @_;
  56                 $t =~ s/foo/bar/;
  57                 return $t;
  58         },
  59   );
  60
  61 =cut
  62
  63 ################################################## subroutine header end ##
  64
  65
  66 sub new {
  67         my $class = shift;
  68         my $self = {@_};
  69         bless ($self, $class);
  70
  71         croak "need marcdb parametar" unless ($self->{marcdb});
  72
  73         print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
  74
  75         open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
  76         binmode($self->{fh});
  77
  78         $self->{count} = 0;
  79
  80         while (! eof($self->{fh})) {
  81                 $self->{count}++;
  82
  83                 # save record position
  84                 push @{$self->{fh_offset}}, tell($self->{fh});
  85
  86                 my $leader;
  87                 my $len = read($self->{fh}, $leader, 24);
  88
  89                 if ($len < 24) {
  90                         carp "short read of leader, aborting\n";
  91                         last;
  92                 }
  93
  94                 # Byte        Name
  95                 # ----        ----
  96                 # 0-4         Record Length
  97                 # 5           Status (n=new, c=corrected and d=deleted)
  98                 # 6           Type of Record (a=printed material)
  99                 # 7           Bibliographic Level (m=monograph)
 100                 # 8-9         Blanks
 101                 # 10          Indictator count (2 for monographs)
 102                 # 11          Subfield code count (2 - 0x1F+subfield code itself)
 103                 # 12-16       Base address of data
 104                 # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
 105                 #               3=sublevel 3)
 106                 # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
 107                 #               n=record is in non-ISBD format, i=record is in
 108                 #               an incomplete ISBD format)
 109                 # 19          Blank
 110                 # 20          Length of length field in directory (always 4 in UNIMARC)
 111                 # 21          Length of Starting Character Position in directory (always
 112                 #               5 in UNIMARC)
 113                 # 22          Length of implementation defined portion in directory (always
 114                 #               0 in UNIMARC)
 115                 # 23          Blank
 116                 #
 117                 #           |0   45  89  |12 16|1n 450 |
 118                 #           |xxxxxnam  22(.....)   45 <---
 119
 120                 print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
 121
 122                 # store leader for later
 123                 push @{$self->{leader}}, $leader;
 124
 125                 # skip to next record
 126                 my $o = substr($leader,0,5);
 127                 warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;
 128                 if ($o > 24) {
 129                         seek($self->{fh},$o-24,1) if ($o);
 130                 } else {
 131                         last;
 132                 }
 133
 134         }
 135
 136         return $self;
 137 }
 138
 139 =head2 count
 140
 141 Return number of records in database
 142
 143   print $marc->count;
 144
 145 =cut
 146
 147 sub count {
 148         my $self = shift;
 149         return $self->{count};
 150 }
 151
 152 =head2 fetch
 153
 154 Fetch record from database
 155
 156   my $hash = $marc->fetch(42);
 157
 158 First record number is C<1>
 159
 160 =cut
 161
 162 sub fetch {
 163         my $self = shift;
 164
 165         my $rec_nr = shift;
 166
 167         if ( ! $rec_nr ) {
 168                 $self->{last_leader} = undef;
 169                 return;
 170         }
 171
 172         my $leader = $self->{leader}->[$rec_nr - 1];
 173         $self->{last_leader} = $leader;
 174         unless ($leader) {
 175                 carp "can't find record $rec_nr";
 176                 return;
 177         };
 178         my $offset = $self->{fh_offset}->[$rec_nr - 1];
 179         unless (defined($offset)) {
 180                 carp "can't find offset for record $rec_nr";
 181                 return;
 182         };
 183
 184         my $reclen = substr($leader,0,5);
 185         my $base_addr = substr($leader,12,5);
 186
 187         print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
 188
 189         my $skip = 0;
 190
 191         print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
 192
 193         if ( ! seek($self->{fh}, $offset+24, 0) ) {
 194                 carp "can't seek to $offset: $!";
 195                 return;
 196         }
 197
 198         print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
 199
 200         my $directory;
 201         if( ! read($self->{fh},$directory,$base_addr-24) ) {
 202                 carp "can't read directory: $!";
 203                 $skip = 1;
 204         } else {
 205                 print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
 206         }
 207
 208         print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
 209
 210         my $fields;
 211         if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
 212                 carp "can't read fields: $!";
 213                 $skip = 1;
 214         } else {
 215                 print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
 216         }
 217
 218         my $row;
 219
 220         while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
 221                 my ($tag,$len,$addr) = ($1,$2,$3);
 222
 223                 if (($addr+$len) > length($fields)) {
 224                         print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
 225                         $skip = 1;
 226                         next;
 227                 }
 228
 229                 # take field
 230                 my $f = substr($fields,$addr,$len);
 231                 print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
 232
 233                 push @{ $row->{$tag} }, $f;
 234
 235                 my $del = substr($fields,$addr+$len-1,1);
 236
 237                 # check field delimiters...
 238                 if ($self->{assert} && $del ne chr(30)) {
 239                         print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
 240                         $skip = 1;
 241                         next;
 242                 }
 243
 244                 if ($self->{assert} && length($f) < 2) {
 245                         print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
 246                         next;
 247                 }
 248
 249         }
 250
 251         return $row;
 252 }
 253
 254
 255 =head2 last_leader
 256
 257 Returns leader of last record L<fetch>ed
 258
 259   print $marc->last_leader;
 260
 261 Added in version 0.08 of this module, so if you need it use:
 262
 263   use MARC::Fast 0.08;
 264
 265 to be sure that it's supported.
 266
 267 =cut
 268
 269 sub last_leader {
 270         my $self = shift;
 271         return $self->{last_leader};
 272 }
 273
 274
 275 =head2 to_hash
 276
 277 Read record with specified MFN and convert it to hash
 278
 279   my $hash = $marc->to_hash( $mfn, include_subfields => 1, );
 280
 281 It has ability to convert characters (using C<hash_filter>) from MARC
 282 database before creating structures enabling character re-mapping or quick
 283 fix-up of data.
 284
 285 This function returns hash which is like this:
 286
 287   '200' => [
 288              {
 289                'i1' => '1',
 290                'i2' => ' '
 291                'a' => 'Goa',
 292                'f' => 'Valdo D\'Arienzo',
 293                'e' => 'tipografie e tipografi nel XVI secolo',
 294              }
 295            ],
 296
 297 This method will also create additional field C<000> with MFN.
 298
 299 =cut
 300
 301 sub to_hash {
 302         my $self = shift;
 303
 304         my $mfn = shift || confess "need mfn!";
 305
 306         my $args = {@_};
 307
 308         # init record to include MFN as field 000
 309         my $rec = { '000' => [ $mfn ] };
 310
 311         my $row = $self->fetch($mfn) || return;
 312
 313         foreach my $rec_nr (keys %{$row}) {
 314                 foreach my $l (@{$row->{$rec_nr}}) {
 315
 316                         # remove end marker
 317                         $l =~ s/\x1E$//;
 318
 319                         # filter output
 320                         $l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
 321
 322                         my $val;
 323
 324                         # has identifiers?
 325                         ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
 326
 327                         my $sf_usage;
 328                         my @subfields;
 329
 330                         # has subfields?
 331                         if ($l =~ m/\x1F/) {
 332                                 foreach my $t (split(/\x1F/,$l)) {
 333                                         next if (! $t);
 334                                         my $f = substr($t,0,1);
 335
 336                                         push @subfields, ( $f, $sf_usage->{$f}++ || 0 );
 337
 338                                         # repeatable subfiled -- convert it to array
 339                                         if ($val->{$f}) {
 340                                                 if ( ref($val->{$f}) ne 'ARRAY' ) {
 341                                                         $val->{$f} = [ $val->{$f}, $val ];
 342                                                 } else {
 343                                                         push @{$val->{$f}}, $val;
 344                                                 }
 345                                         }
 346                                         $val->{substr($t,0,1)} = substr($t,1);
 347                                 }
 348                                 $val->{subfields} = [ @subfields ] if $args->{include_subfields};
 349                         } else {
 350                                 $val = $l;
 351                         }
 352
 353                         push @{$rec->{$rec_nr}}, $val;
 354                 }
 355         }
 356
 357         return $rec;
 358 }
 359
 360 =head2 to_ascii
 361
 362   print $marc->to_ascii( 42 );
 363
 364 =cut
 365
 366 sub to_ascii {
 367         my $self = shift;
 368
 369         my $mfn = shift || confess "need mfn";
 370         my $row = $self->fetch($mfn) || return;
 371
 372         my $out;
 373
 374         foreach my $f (sort keys %{$row}) {
 375                 my $dump = join('', @{ $row->{$f} });
 376                 $dump =~ s/\x1e$//;
 377                 $dump =~ s/\x1f/\$/g;
 378                 $out .= "$f\t$dump\n";
 379         }
 380
 381         return $out;
 382 }
 383
 384 1;
 385 __END__
 386
 387 =head1 AUTHOR
 388
 389         Dobrica Pavlinusic
 390         CPAN ID: DPAVLIN
 391         dpavlin@rot13.org
 392         http://www.rot13.org/~dpavlin/
 393
 394 =head1 COPYRIGHT
 395
 396 This program is free software; you can redistribute
 397 it and/or modify it under the same terms as Perl itself.
 398
 399 The full text of the license can be found in the
 400 LICENSE file included with this module.
 401
 402
 403 =head1 SEE ALSO
 404
 405 L<Biblio::Isis>, perl(1).
 406
 407 =cut