git.rot13.org Git - MARC-Fast/blob - Fast.pm

   1
   2 package MARC::Fast;
   3 use strict;
   4 use Carp;
   5 use Data::Dumper;
   6
   7 BEGIN {
   8         use Exporter ();
   9         use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  10         $VERSION     = 0.02;
  11         @ISA         = qw (Exporter);
  12         #Give a hoot don't pollute, do not export more than needed by default
  13         @EXPORT      = qw ();
  14         @EXPORT_OK   = qw ();
  15         %EXPORT_TAGS = ();
  16 }
  17
  18 =head1 NAME
  19
  20 MARC::Fast - Very fast implementation of MARC database reader
  21
  22 =head1 SYNOPSIS
  23
  24   use MARC::Fast;
  25
  26
  27 =head1 DESCRIPTION
  28
  29 This is very fast alternative to C<MARC> and C<MARC::Record> modules.
  30
  31 It's is also very sutable for random access to MARC records (as opposed to
  32 sequential one).
  33
  34 =head1 METHODS
  35
  36 =head2 new
  37
  38 Read MARC database
  39
  40   my $marc = new MARC::Fast(
  41         marcdb => 'unimarc.iso',
  42         quiet => 0,
  43         debug => 0,
  44         assert => 0,
  45   );
  46
  47 =cut
  48
  49 ################################################## subroutine header end ##
  50
  51
  52 sub new {
  53         my $class = shift;
  54         my $self = {@_};
  55         bless ($self, $class);
  56
  57         croak "need marcdb parametar" unless ($self->{marcdb});
  58
  59         print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
  60
  61         open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
  62
  63         $self->{count} = 0;
  64
  65         while (! eof($self->{fh})) {
  66                 $self->{count}++;
  67
  68                 # save record position
  69                 push @{$self->{fh_offset}}, tell($self->{fh});
  70
  71                 my $leader;
  72                 read($self->{fh}, $leader, 24);
  73
  74                 # Byte        Name
  75                 # ----        ----
  76                 # 0-4         Record Length
  77                 # 5           Status (n=new, c=corrected and d=deleted)
  78                 # 6           Type of Record (a=printed material)
  79                 # 7           Bibliographic Level (m=monograph)
  80                 # 8-9         Blanks
  81                 # 10          Indictator count (2 for monographs)
  82                 # 11          Subfield code count (2 - 0x1F+subfield code itself)
  83                 # 12-16       Base address of data
  84                 # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
  85                 #               3=sublevel 3)
  86                 # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
  87                 #               n=record is in non-ISBD format, i=record is in
  88                 #               an incomplete ISBD format)
  89                 # 19          Blank
  90                 # 20          Length of length field in directory (always 4 in UNIMARC)
  91                 # 21          Length of Starting Character Position in directory (always
  92                 #               5 in UNIMARC)
  93                 # 22          Length of implementation defined portion in directory (always
  94                 #               0 in UNIMARC)
  95                 # 23          Blank
  96                 #
  97                 #           |0   45  89  |12 16|1n 450 |
  98                 #           |xxxxxnam  22(.....)   45 <---
  99
 100                 print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
 101
 102                 # store leader for later
 103                 push @{$self->{leaders}}, $leader;
 104
 105                 # skip to next record
 106                 seek($self->{fh},substr($leader,0,5)-24,1);
 107
 108         }
 109
 110         return $self;
 111 }
 112
 113 =head2 count
 114
 115 Return number of records in database
 116
 117   print $marc->count;
 118
 119 =cut
 120
 121 sub count {
 122         my $self = shift;
 123         return $self->{count};
 124 }
 125
 126 =head2 fetch
 127
 128 Fetch record from database
 129
 130   my $hash = $marc->fetch(42);
 131
 132 =cut
 133
 134 sub fetch {
 135         my $self = shift;
 136
 137         my $rec_nr = shift || return;
 138
 139         my $leader = $self->{leaders}->[$rec_nr - 1];
 140         unless ($leader) {
 141                 carp "can't find record $rec_nr";
 142                 return;
 143         };
 144         my $offset = $self->{fh_offset}->[$rec_nr - 1];
 145         unless (defined($offset)) {
 146                 carp "can't find offset for record $rec_nr";
 147                 return;
 148         };
 149
 150         my $reclen = substr($leader,0,5);
 151         my $base_addr = substr($leader,12,5);
 152
 153         print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
 154
 155         my $skip = 0;
 156
 157         print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
 158
 159         if ( ! seek($self->{fh}, $offset+24, 0) ) {
 160                 carp "can't seek to $offset: $!";
 161                 return;
 162         }
 163
 164         print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
 165
 166         my $directory;
 167         if( ! read($self->{fh},$directory,$base_addr-24) ) {
 168                 carp "can't read directory: $!";
 169                 $skip = 1;
 170         } else {
 171                 print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
 172         }
 173
 174         print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
 175
 176         my $fields;
 177         if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
 178                 carp "can't read fields: $!";
 179                 $skip = 1;
 180         } else {
 181                 print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
 182         }
 183
 184         my $row;
 185
 186         while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
 187                 my ($tag,$len,$addr) = ($1,$2,$3);
 188
 189                 if (($addr+$len) > length($fields)) {
 190                         print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
 191                         $skip = 1;
 192                         next;
 193                 }
 194
 195                 # take field
 196                 my $f = substr($fields,$addr,$len);
 197                 print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
 198
 199                 if ($row->{$tag}) {
 200                         $row->{$tag} .= $f;
 201                 } else {
 202                         $row->{$tag} = $f;
 203                 }
 204
 205                 my $del = substr($fields,$addr+$len-1,1);
 206
 207                 # check field delimiters...
 208                 if ($self->{assert} && $del ne chr(30)) {
 209                         print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
 210                         $skip = 1;
 211                         next;
 212                 }
 213
 214                 if ($self->{assert} && length($f) < 2) {
 215                         print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
 216                         next;
 217                 }
 218
 219         }
 220
 221         return $row;
 222 }
 223
 224 1;
 225 __END__
 226
 227 =head1 BUGS
 228
 229
 230
 231 =head1 SUPPORT
 232
 233
 234
 235 =head1 AUTHOR
 236
 237         Dobrica Pavlinusic
 238         CPAN ID: DPAVLIN
 239         dpavlin@rot13.org
 240         http://www.rot13.org/~dpavlin/
 241
 242 =head1 COPYRIGHT
 243
 244 This program is free software; you can redistribute
 245 it and/or modify it under the same terms as Perl itself.
 246
 247 The full text of the license can be found in the
 248 LICENSE file included with this module.
 249
 250
 251 =head1 SEE ALSO
 252
 253 perl(1).
 254
 255 =cut