git.rot13.org Git - Biblio-Isis/blob - lib/Biblio/Isis.pm

   1 package Biblio::Isis;
   2 use strict;
   3
   4 use Carp;
   5 use File::Glob qw(:globally :nocase);
   6
   7 use Data::Dumper;
   8
   9 BEGIN {
  10         use Exporter ();
  11         use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  12         $VERSION     = 0.13;
  13         @ISA         = qw (Exporter);
  14         #Give a hoot don't pollute, do not export more than needed by default
  15         @EXPORT      = qw ();
  16         @EXPORT_OK   = qw ();
  17         %EXPORT_TAGS = ();
  18
  19 }
  20
  21 =head1 NAME
  22
  23 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
  24
  25 =head1 SYNOPSIS
  26
  27   use Biblio::Isis;
  28
  29   my $isis = new Biblio::Isis(
  30         isisdb => './cds/cds',
  31   );
  32
  33   for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
  34         print $isis->to_ascii($mfn),"\n";
  35   }
  36
  37 =head1 DESCRIPTION
  38
  39 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
  40 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
  41 seems to depriciate it's old C<XS> bindings for perl.
  42
  43 It can create hash values from data in ISIS database (using C<to_hash>),
  44 ASCII dump (using C<to_ascii>) or just hash with field names and packed
  45 values (like C<^asomething^belse>).
  46
  47 Unique feature of this module is ability to C<include_deleted> records.
  48 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
  49 fields which are zero sized will be filled with random junk from memory).
  50
  51 It also has support for identifiers (only if ISIS database is created by
  52 IsisMarc), see C<to_hash>.
  53
  54 This module will always be slower than OpenIsis module which use C
  55 library. However, since it's written in perl, it's platform independent (so
  56 you don't need C compiler), and can be easily modified. I hope that it
  57 creates data structures which are easier to use than ones created by
  58 OpenIsis, so reduced time in other parts of the code should compensate for
  59 slower performance of this module (speed of reading ISIS database is
  60 rarely an issue).
  61
  62 =head1 METHODS
  63
  64 =cut
  65
  66 #  my $ORDN;            # Nodes Order
  67 #  my $ORDF;            # Leafs Order
  68 #  my $N;               # Number of Memory buffers for nodes
  69 #  my $K;               # Number of buffers for first level index
  70 #  my $LIV;             # Current number of Index Levels
  71 #  my $POSRX;           # Pointer to Root Record in N0x
  72 #  my $NMAXPOS;         # Next Available position in N0x
  73 #  my $FMAXPOS;         # Next available position in L0x
  74 #  my $ABNORMAL;        # Formal BTree normality indicator
  75
  76 #
  77 # some binary reads
  78 #
  79
  80 =head2 new
  81
  82 Open ISIS database
  83
  84  my $isis = new Biblio::Isis(
  85         isisdb => './cds/cds',
  86         read_fdt => 1,
  87         include_deleted => 1,
  88         hash_filter => sub {
  89                 my $v = shift;
  90                 $v =~ s#foo#bar#g;
  91         },
  92         debug => 1,
  93  );
  94
  95 Options are described below:
  96
  97 =over 5
  98
  99 =item isisdb
 100
 101 This is full or relative path to ISIS database files which include
 102 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
 103 C<read_fdt> option) files.
 104
 105 In this example it uses C<./cds/cds.MST> and related files.
 106
 107 =item read_fdt
 108
 109 Boolean flag to specify if field definition table should be read. It's off
 110 by default.
 111
 112 =item include_deleted
 113
 114 Don't skip logically deleted records in ISIS.
 115
 116 =item hash_filter
 117
 118 Filter code ref which will be used before data is converted to hash.
 119
 120 =item debug
 121
 122 Dump a B<lot> of debugging output.
 123
 124 =back
 125
 126 =cut
 127
 128 sub new {
 129         my $class = shift;
 130         my $self = {};
 131         bless($self, $class);
 132
 133         croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
 134
 135         foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
 136                 $self->{$v} = {@_}->{$v};
 137         }
 138
 139         my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
 140
 141         foreach my $f (@isis_files) {
 142                 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
 143                 $self->{lc($ext)."_file"} = $f;
 144         }
 145
 146         my @must_exist = qw(mst xrf);
 147         push @must_exist, "fdt" if ($self->{read_fdt});
 148
 149         foreach my $ext (@must_exist) {
 150                 unless ($self->{$ext."_file"}) {
 151                         carp "missing ",uc($ext)," file in ",$self->{isisdb};
 152                         return;
 153                 }
 154         }
 155
 156         print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
 157
 158         # if you want to read .FDT file use read_fdt argument when creating class!
 159         if ($self->{read_fdt} && -e $self->{fdt_file}) {
 160
 161                 # read the $db.FDT file for tags
 162                 my $fieldzone=0;
 163
 164                 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
 165                 binmode($fileFDT);
 166
 167                 while (<$fileFDT>) {
 168                         chomp;
 169                         if ($fieldzone) {
 170                                 my $name=substr($_,0,30);
 171                                 my $tag=substr($_,50,3);
 172
 173                                 $name =~ s/\s+$//;
 174                                 $tag =~ s/\s+$//;
 175
 176                                 $self->{'TagName'}->{$tag}=$name;
 177                         }
 178
 179                         if (/^\*\*\*/) {
 180                                 $fieldzone=1;
 181                         }
 182                 }
 183
 184                 close($fileFDT);
 185         }
 186
 187         # Get the Maximum MFN from $db.MST
 188
 189         open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
 190         binmode($self->{'fileMST'});
 191
 192         # MST format:   (* = 32 bit signed)
 193         # CTLMFN*       always 0
 194         # NXTMFN*       MFN to be assigned to the next record created
 195         # NXTMFB*       last block allocated to master file
 196         # NXTMFP        offset to next available position in last block
 197         # MFTYPE        always 0 for user db file (1 for system)
 198         seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
 199
 200         my $buff;
 201
 202         read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
 203         $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
 204
 205         print STDERR Dumper($self),"\n" if ($self->{debug});
 206
 207         # open files for later
 208         open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
 209         binmode($self->{'fileXRF'});
 210
 211         $self ? return $self : return undef;
 212 }
 213
 214 =head2 count
 215
 216 Return number of records in database
 217
 218   print $isis->count;
 219
 220 =cut
 221
 222 sub count {
 223         my $self = shift;
 224         return $self->{'NXTMFN'} - 1;
 225 }
 226
 227 =head2 fetch
 228
 229 Read record with selected MFN
 230
 231   my $rec = $isis->fetch(55);
 232
 233 Returns hash with keys which are field names and values are unpacked values
 234 for that field like this:
 235
 236   $rec = {
 237     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
 238     '990' => [ '2140', '88', 'HAY' ],
 239   };
 240
 241 =cut
 242
 243 sub fetch {
 244         my $self = shift;
 245
 246         my $mfn = shift || croak "fetch needs MFN as argument!";
 247
 248         # is mfn allready in memory?
 249         my $old_mfn = $self->{'current_mfn'} || -1;
 250         return $self->{record} if ($mfn == $old_mfn);
 251
 252         print STDERR "## fetch: $mfn\n" if ($self->{debug});
 253
 254         # XXX check this?
 255         my $mfnpos=($mfn+int(($mfn-1)/127))*4;
 256
 257         print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
 258         seek($self->{'fileXRF'},$mfnpos,0);
 259
 260         my $buff;
 261
 262         # delete old record
 263         delete $self->{record};
 264
 265         # read XRFMFB abd XRFMFP
 266         read($self->{'fileXRF'}, $buff, 4);
 267         my $pointer=unpack("V",$buff);
 268         if (! $pointer) {
 269                 if ($self->{include_deleted}) {
 270                         return;
 271                 } else {
 272                         warn "pointer for MFN $mfn is null\n";
 273                         return;
 274                 }
 275         }
 276
 277         # check for logically deleted record
 278         if ($pointer & 0x80000000) {
 279                 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
 280                 $self->{deleted} = $mfn;
 281
 282                 return unless $self->{include_deleted};
 283
 284                 # abs
 285                 $pointer = ($pointer ^ 0xffffffff) + 1;
 286         }
 287
 288         my $XRFMFB = int($pointer/2048);
 289         my $XRFMFP = $pointer - ($XRFMFB*2048);
 290
 291         # (XRFMFB - 1) * 512 + XRFMFP
 292         # why do i have to do XRFMFP % 1024 ?
 293
 294         my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
 295
 296         print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
 297
 298         # Get Record Information
 299
 300         seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
 301
 302         read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
 303         my $value=unpack("V",$buff);
 304
 305         print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
 306
 307         if ($value!=$mfn) {
 308                 if ($value == 0) {
 309                         print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
 310                         $self->{deleted} = $mfn;
 311                         return;
 312                 }
 313
 314                 carp "Error: MFN ".$mfn." not found in MST file, found $value";
 315                 return;
 316         }
 317
 318         read($self->{'fileMST'}, $buff, 14);
 319
 320         my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
 321
 322         print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
 323
 324         warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
 325
 326         warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
 327
 328         # Get Directory Format
 329
 330         my @FieldPOS;
 331         my @FieldLEN;
 332         my @FieldTAG;
 333
 334         read($self->{'fileMST'}, $buff, 6 * $NVF);
 335
 336         my $rec_len = 0;
 337
 338         for (my $i = 0 ; $i < $NVF ; $i++) {
 339
 340                 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
 341
 342                 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
 343
 344                 # The TAG does not exists in .FDT so we set it to 0.
 345                 #
 346                 # XXX This is removed from perl version; .FDT file is updated manually, so
 347                 # you will often have fields in .MST file which aren't in .FDT. On the other
 348                 # hand, IsisMarc doesn't use .FDT files at all!
 349
 350                 #if (! $self->{TagName}->{$TAG}) {
 351                 #       $TAG=0;
 352                 #}
 353
 354                 push @FieldTAG,$TAG;
 355                 push @FieldPOS,$POS;
 356                 push @FieldLEN,$LEN;
 357
 358                 $rec_len += $LEN;
 359         }
 360
 361         # Get Variable Fields
 362
 363         read($self->{'fileMST'},$buff,$rec_len);
 364
 365         print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
 366
 367         for (my $i = 0 ; $i < $NVF ; $i++) {
 368                 # skip zero-sized fields
 369                 next if ($FieldLEN[$i] == 0);
 370
 371                 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
 372         }
 373
 374         $self->{'current_mfn'} = $mfn;
 375
 376         print STDERR Dumper($self),"\n" if ($self->{debug});
 377
 378         return $self->{'record'};
 379 }
 380
 381 =head2 to_ascii
 382
 383 Returns ASCII output of record with specified MFN
 384
 385   print $isis->to_ascii(42);
 386
 387 This outputs something like this:
 388
 389   210   ^aNew York^cNew York University press^dcop. 1988
 390   990   2140
 391   990   88
 392   990   HAY
 393
 394 If C<read_fdt> is specified when calling C<new> it will display field names
 395 from C<.FDT> file instead of numeric tags.
 396
 397 =cut
 398
 399 sub to_ascii {
 400         my $self = shift;
 401
 402         my $mfn = shift || croak "need MFN";
 403
 404         my $rec = $self->fetch($mfn) || return;
 405
 406         my $out = "0\t$mfn";
 407
 408         foreach my $f (sort keys %{$rec}) {
 409                 my $fn = $self->tag_name($f);
 410                 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
 411         }
 412
 413         $out .= "\n";
 414
 415         return $out;
 416 }
 417
 418 =head2 to_hash
 419
 420 Read record with specified MFN and convert it to hash
 421
 422   my $hash = $isis->to_hash($mfn);
 423
 424 It has ability to convert characters (using C<hash_filter>) from ISIS
 425 database before creating structures enabling character re-mapping or quick
 426 fix-up of data.
 427
 428 This function returns hash which is like this:
 429
 430   $hash = {
 431     '210' => [
 432                {
 433                  'c' => 'New York University press',
 434                  'a' => 'New York',
 435                  'd' => 'cop. 1988'
 436                }
 437              ],
 438     '990' => [
 439                '2140',
 440                '88',
 441                'HAY'
 442              ],
 443   };
 444
 445 You can later use that hash to produce any output from ISIS data.
 446
 447 If database is created using IsisMarc, it will also have to special fields
 448 which will be used for identifiers, C<i1> and C<i2> like this:
 449
 450   '200' => [
 451              {
 452                'i1' => '1',
 453                'i2' => ' '
 454                'a' => 'Goa',
 455                'f' => 'Valdo D\'Arienzo',
 456                'e' => 'tipografie e tipografi nel XVI secolo',
 457              }
 458            ],
 459
 460 This method will also create additional field C<000> with MFN.
 461
 462 =cut
 463
 464 sub to_hash {
 465         my $self = shift;
 466
 467         my $mfn = shift || confess "need mfn!";
 468
 469         # init record to include MFN as field 000
 470         my $rec = { '000' => [ $mfn ] };
 471
 472         my $row = $self->fetch($mfn) || return;
 473
 474         foreach my $k (keys %{$row}) {
 475                 foreach my $l (@{$row->{$k}}) {
 476
 477                         # filter output
 478                         if ($self->{'hash_filter'}) {
 479                                 $l = $self->{'hash_filter'}->($l);
 480                                 next unless defined($l);
 481                         }
 482
 483                         my $val;
 484
 485                         # has identifiers?
 486                         ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
 487
 488                         # has subfields?
 489                         if ($l =~ m/\^/) {
 490                                 foreach my $t (split(/\^/,$l)) {
 491                                         next if (! $t);
 492                                         $val->{substr($t,0,1)} = substr($t,1);
 493                                 }
 494                         } else {
 495                                 $val = $l;
 496                         }
 497
 498                         push @{$rec->{$k}}, $val;
 499                 }
 500         }
 501
 502         return $rec;
 503 }
 504
 505 =head2 tag_name
 506
 507 Return name of selected tag
 508
 509  print $isis->tag_name('200');
 510
 511 =cut
 512
 513 sub tag_name {
 514         my $self = shift;
 515         my $tag = shift || return;
 516         return $self->{'TagName'}->{$tag} || $tag;
 517 }
 518
 519
 520 =head2 read_cnt
 521
 522 Read content of C<.CNT> file and return hash containing it.
 523
 524   print Dumper($isis->read_cnt);
 525
 526 This function is not used by module (C<.CNT> files are not required for this
 527 module to work), but it can be useful to examine your index (while debugging
 528 for example).
 529
 530 =cut
 531
 532 sub read_cnt  {
 533         my $self = shift;
 534
 535         croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
 536
 537         # Get the index information from $db.CNT
 538
 539         open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
 540         binmode($fileCNT);
 541
 542         my $buff;
 543
 544         read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
 545         $self->unpack_cnt($buff);
 546
 547         read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
 548         $self->unpack_cnt($buff);
 549
 550         close($fileCNT);
 551
 552         return $self->{cnt};
 553 }
 554
 555 =head2 unpack_cnt
 556
 557 Unpack one of two 26 bytes fixed length record in C<.CNT> file.
 558
 559 Here is definition of record:
 560
 561  off key        description                             size
 562   0: IDTYPE     BTree type                              s
 563   2: ORDN       Nodes Order                             s
 564   4: ORDF       Leafs Order                             s
 565   6: N          Number of Memory buffers for nodes      s
 566   8: K          Number of buffers for first level index s
 567  10: LIV        Current number of Index Levels          s
 568  12: POSRX      Pointer to Root Record in N0x           l
 569  16: NMAXPOS    Next Available position in N0x          l
 570  20: FMAXPOS    Next available position in L0x          l
 571  24: ABNORMAL   Formal BTree normality indicator        s
 572  length: 26 bytes
 573
 574 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
 575
 576 =cut
 577
 578 sub unpack_cnt {
 579         my $self = shift;
 580
 581         my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
 582
 583         my $buff = shift || return;
 584         my @arr = unpack("vvvvvvVVVv", $buff);
 585
 586         print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
 587
 588         my $IDTYPE = shift @arr;
 589         foreach (@flds) {
 590                 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
 591         }
 592 }
 593
 594 1;
 595
 596 =head1 BUGS
 597
 598 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
 599 some variations in input databases which has been tested with this module.
 600 When I was in doubt, I assumed that OpenIsis's implementation was right
 601 (except for obvious bugs).
 602
 603 However, every effort has been made to test this module with as much
 604 databases (and programs that create them) as possible.
 605
 606 I would be very greatful for success or failure reports about usage of this
 607 module with databases from programs other than WinIsis and IsisMarc. I had
 608 tested this against ouput of one C<isis.dll>-based application, but I don't
 609 know any details about it's version.
 610
 611 =head1 AUTHOR
 612
 613         Dobrica Pavlinusic
 614         CPAN ID: DPAVLIN
 615         dpavlin@rot13.org
 616         http://www.rot13.org/~dpavlin/
 617
 618 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
 619 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
 620
 621 =head1 COPYRIGHT
 622
 623 This program is free software; you can redistribute
 624 it and/or modify it under the same terms as Perl itself.
 625
 626 The full text of the license can be found in the
 627 LICENSE file included with this module.
 628
 629
 630 =head1 SEE ALSO
 631
 632 OpenIsis web site L<http://www.openisis.org>
 633
 634 perl4lib site L<http://perl4lib.perl.org>
 635