git.rot13.org Git - Biblio-Isis/blob - lib/Biblio/Isis.pm

   1 package Biblio::Isis;
   2 use strict;
   3
   4 use Carp;
   5 use File::Glob qw(:globally :nocase);
   6
   7 BEGIN {
   8         use Exporter ();
   9         use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  10         $VERSION     = 0.20;
  11         @ISA         = qw (Exporter);
  12         #Give a hoot don't pollute, do not export more than needed by default
  13         @EXPORT      = qw ();
  14         @EXPORT_OK   = qw ();
  15         %EXPORT_TAGS = ();
  16
  17 }
  18
  19 =head1 NAME
  20
  21 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
  22
  23 =head1 SYNOPSIS
  24
  25   use Biblio::Isis;
  26
  27   my $isis = new Biblio::Isis(
  28         isisdb => './cds/cds',
  29   );
  30
  31   for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
  32         print $isis->to_ascii($mfn),"\n";
  33   }
  34
  35 =head1 DESCRIPTION
  36
  37 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
  38 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
  39 seems to depriciate it's old C<XS> bindings for perl.
  40
  41 It can create hash values from data in ISIS database (using C<to_hash>),
  42 ASCII dump (using C<to_ascii>) or just hash with field names and packed
  43 values (like C<^asomething^belse>).
  44
  45 Unique feature of this module is ability to C<include_deleted> records.
  46 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
  47 fields which are zero sized will be filled with random junk from memory).
  48
  49 It also has support for identifiers (only if ISIS database is created by
  50 IsisMarc), see C<to_hash>.
  51
  52 This module will always be slower than OpenIsis module which use C
  53 library. However, since it's written in perl, it's platform independent (so
  54 you don't need C compiler), and can be easily modified. I hope that it
  55 creates data structures which are easier to use than ones created by
  56 OpenIsis, so reduced time in other parts of the code should compensate for
  57 slower performance of this module (speed of reading ISIS database is
  58 rarely an issue).
  59
  60 =head1 METHODS
  61
  62 =cut
  63
  64 #  my $ORDN;            # Nodes Order
  65 #  my $ORDF;            # Leafs Order
  66 #  my $N;               # Number of Memory buffers for nodes
  67 #  my $K;               # Number of buffers for first level index
  68 #  my $LIV;             # Current number of Index Levels
  69 #  my $POSRX;           # Pointer to Root Record in N0x
  70 #  my $NMAXPOS;         # Next Available position in N0x
  71 #  my $FMAXPOS;         # Next available position in L0x
  72 #  my $ABNORMAL;        # Formal BTree normality indicator
  73
  74 #
  75 # some binary reads
  76 #
  77
  78 =head2 new
  79
  80 Open ISIS database
  81
  82  my $isis = new Biblio::Isis(
  83         isisdb => './cds/cds',
  84         read_fdt => 1,
  85         include_deleted => 1,
  86         hash_filter => sub {
  87                 my $v = shift;
  88                 $v =~ s#foo#bar#g;
  89         },
  90         debug => 1,
  91  );
  92
  93 Options are described below:
  94
  95 =over 5
  96
  97 =item isisdb
  98
  99 This is full or relative path to ISIS database files which include
 100 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
 101 C<read_fdt> option) files.
 102
 103 In this example it uses C<./cds/cds.MST> and related files.
 104
 105 =item read_fdt
 106
 107 Boolean flag to specify if field definition table should be read. It's off
 108 by default.
 109
 110 =item include_deleted
 111
 112 Don't skip logically deleted records in ISIS.
 113
 114 =item hash_filter
 115
 116 Filter code ref which will be used before data is converted to hash.
 117
 118 =item debug
 119
 120 Dump a B<lot> of debugging output even at level 1. For even more increase level.
 121
 122 =back
 123
 124 =cut
 125
 126 sub new {
 127         my $class = shift;
 128         my $self = {};
 129         bless($self, $class);
 130
 131         croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
 132
 133         foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
 134                 $self->{$v} = {@_}->{$v};
 135         }
 136
 137         my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
 138
 139         foreach my $f (@isis_files) {
 140                 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
 141                 $self->{lc($ext)."_file"} = $f;
 142         }
 143
 144         my @must_exist = qw(mst xrf);
 145         push @must_exist, "fdt" if ($self->{read_fdt});
 146
 147         foreach my $ext (@must_exist) {
 148                 unless ($self->{$ext."_file"}) {
 149                         carp "missing ",uc($ext)," file in ",$self->{isisdb};
 150                         return;
 151                 }
 152         }
 153
 154         if ($self->{debug}) {
 155                 print STDERR "## using files: ",join(" ",@isis_files),"\n";
 156                 eval "use Data::Dump";
 157
 158                 if (! $@) {
 159                         *Dumper = *Data::Dump::dump;
 160                 } else {
 161                         use Data::Dumper;
 162                 }
 163         }
 164
 165         # if you want to read .FDT file use read_fdt argument when creating class!
 166         if ($self->{read_fdt} && -e $self->{fdt_file}) {
 167
 168                 # read the $db.FDT file for tags
 169                 my $fieldzone=0;
 170
 171                 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
 172                 binmode($fileFDT);
 173
 174                 while (<$fileFDT>) {
 175                         chomp;
 176                         if ($fieldzone) {
 177                                 my $name=substr($_,0,30);
 178                                 my $tag=substr($_,50,3);
 179
 180                                 $name =~ s/\s+$//;
 181                                 $tag =~ s/\s+$//;
 182
 183                                 $self->{'TagName'}->{$tag}=$name;
 184                         }
 185
 186                         if (/^\*\*\*/) {
 187                                 $fieldzone=1;
 188                         }
 189                 }
 190
 191                 close($fileFDT);
 192         }
 193
 194         # Get the Maximum MFN from $db.MST
 195
 196         open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
 197         binmode($self->{'fileMST'});
 198
 199         # MST format:   (* = 32 bit signed)
 200         # CTLMFN*       always 0
 201         # NXTMFN*       MFN to be assigned to the next record created
 202         # NXTMFB*       last block allocated to master file
 203         # NXTMFP        offset to next available position in last block
 204         # MFTYPE        always 0 for user db file (1 for system)
 205         seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
 206
 207         my $buff;
 208
 209         read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
 210         $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
 211
 212         print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
 213
 214         # open files for later
 215         open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
 216         binmode($self->{'fileXRF'});
 217
 218         $self ? return $self : return undef;
 219 }
 220
 221 =head2 count
 222
 223 Return number of records in database
 224
 225   print $isis->count;
 226
 227 =cut
 228
 229 sub count {
 230         my $self = shift;
 231         return $self->{'NXTMFN'} - 1;
 232 }
 233
 234 =head2 fetch
 235
 236 Read record with selected MFN
 237
 238   my $rec = $isis->fetch(55);
 239
 240 Returns hash with keys which are field names and values are unpacked values
 241 for that field like this:
 242
 243   $rec = {
 244     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
 245     '990' => [ '2140', '88', 'HAY' ],
 246   };
 247
 248 =cut
 249
 250 sub fetch {
 251         my $self = shift;
 252
 253         my $mfn = shift || croak "fetch needs MFN as argument!";
 254
 255         # is mfn allready in memory?
 256         my $old_mfn = $self->{'current_mfn'} || -1;
 257         return $self->{record} if ($mfn == $old_mfn);
 258
 259         print STDERR "## fetch: $mfn\n" if ($self->{debug});
 260
 261         # XXX check this?
 262         my $mfnpos=($mfn+int(($mfn-1)/127))*4;
 263
 264         print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
 265         seek($self->{'fileXRF'},$mfnpos,0);
 266
 267         my $buff;
 268
 269         # delete old record
 270         delete $self->{record};
 271
 272         # read XRFMFB abd XRFMFP
 273         read($self->{'fileXRF'}, $buff, 4);
 274         my $pointer=unpack("V",$buff);
 275         if (! $pointer) {
 276                 if ($self->{include_deleted}) {
 277                         return;
 278                 } else {
 279                         warn "pointer for MFN $mfn is null\n";
 280                         return;
 281                 }
 282         }
 283
 284         # check for logically deleted record
 285         if ($pointer & 0x80000000) {
 286                 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
 287                 $self->{deleted} = $mfn;
 288
 289                 return unless $self->{include_deleted};
 290
 291                 # abs
 292                 $pointer = ($pointer ^ 0xffffffff) + 1;
 293         }
 294
 295         my $XRFMFB = int($pointer/2048);
 296         my $XRFMFP = $pointer - ($XRFMFB*2048);
 297
 298         # (XRFMFB - 1) * 512 + XRFMFP
 299         # why do i have to do XRFMFP % 1024 ?
 300
 301         my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
 302
 303         print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
 304
 305         # Get Record Information
 306
 307         seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
 308
 309         read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
 310         my $value=unpack("V",$buff);
 311
 312         print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
 313
 314         if ($value!=$mfn) {
 315                 if ($value == 0) {
 316                         print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
 317                         $self->{deleted} = $mfn;
 318                         return;
 319                 }
 320
 321                 carp "Error: MFN ".$mfn." not found in MST file, found $value";
 322                 return;
 323         }
 324
 325         read($self->{'fileMST'}, $buff, 14);
 326
 327         my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
 328
 329         print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
 330
 331         warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
 332
 333         warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
 334
 335         # Get Directory Format
 336
 337         my @FieldPOS;
 338         my @FieldLEN;
 339         my @FieldTAG;
 340
 341         read($self->{'fileMST'}, $buff, 6 * $NVF);
 342
 343         my $rec_len = 0;
 344
 345         for (my $i = 0 ; $i < $NVF ; $i++) {
 346
 347                 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
 348
 349                 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
 350
 351                 # The TAG does not exists in .FDT so we set it to 0.
 352                 #
 353                 # XXX This is removed from perl version; .FDT file is updated manually, so
 354                 # you will often have fields in .MST file which aren't in .FDT. On the other
 355                 # hand, IsisMarc doesn't use .FDT files at all!
 356
 357                 #if (! $self->{TagName}->{$TAG}) {
 358                 #       $TAG=0;
 359                 #}
 360
 361                 push @FieldTAG,$TAG;
 362                 push @FieldPOS,$POS;
 363                 push @FieldLEN,$LEN;
 364
 365                 $rec_len += $LEN;
 366         }
 367
 368         # Get Variable Fields
 369
 370         read($self->{'fileMST'},$buff,$rec_len);
 371
 372         print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
 373
 374         for (my $i = 0 ; $i < $NVF ; $i++) {
 375                 # skip zero-sized fields
 376                 next if ($FieldLEN[$i] == 0);
 377
 378                 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
 379         }
 380
 381         $self->{'current_mfn'} = $mfn;
 382
 383         print STDERR Dumper($self),"\n" if ($self->{debug});
 384
 385         return $self->{'record'};
 386 }
 387
 388 =head2 mfn
 389
 390 Returns current MFN position
 391
 392   my $mfn = $isis->mfn;
 393
 394 =cut
 395
 396 # This function should be simple return $self->{current_mfn},
 397 # but if new is called with _hack_mfn it becomes setter.
 398 # It's useful in tests when setting $isis->{record} directly
 399
 400 sub mfn {
 401         my $self = shift;
 402         return $self->{current_mfn};
 403 };
 404
 405
 406 =head2 to_ascii
 407
 408 Returns ASCII output of record with specified MFN
 409
 410   print $isis->to_ascii(42);
 411
 412 This outputs something like this:
 413
 414   210   ^aNew York^cNew York University press^dcop. 1988
 415   990   2140
 416   990   88
 417   990   HAY
 418
 419 If C<read_fdt> is specified when calling C<new> it will display field names
 420 from C<.FDT> file instead of numeric tags.
 421
 422 =cut
 423
 424 sub to_ascii {
 425         my $self = shift;
 426
 427         my $mfn = shift || croak "need MFN";
 428
 429         my $rec = $self->fetch($mfn) || return;
 430
 431         my $out = "0\t$mfn";
 432
 433         foreach my $f (sort keys %{$rec}) {
 434                 my $fn = $self->tag_name($f);
 435                 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
 436         }
 437
 438         $out .= "\n";
 439
 440         return $out;
 441 }
 442
 443 =head2 to_hash
 444
 445 Read record with specified MFN and convert it to hash
 446
 447   my $hash = $isis->to_hash($mfn);
 448
 449 It has ability to convert characters (using C<hash_filter>) from ISIS
 450 database before creating structures enabling character re-mapping or quick
 451 fix-up of data.
 452
 453 This function returns hash which is like this:
 454
 455   $hash = {
 456     '210' => [
 457                {
 458                  'c' => 'New York University press',
 459                  'a' => 'New York',
 460                  'd' => 'cop. 1988'
 461                }
 462              ],
 463     '990' => [
 464                '2140',
 465                '88',
 466                'HAY'
 467              ],
 468   };
 469
 470 You can later use that hash to produce any output from ISIS data.
 471
 472 If database is created using IsisMarc, it will also have to special fields
 473 which will be used for identifiers, C<i1> and C<i2> like this:
 474
 475   '200' => [
 476              {
 477                'i1' => '1',
 478                'i2' => ' '
 479                'a' => 'Goa',
 480                'f' => 'Valdo D\'Arienzo',
 481                'e' => 'tipografie e tipografi nel XVI secolo',
 482              }
 483            ],
 484
 485 In case there are repeatable subfields in record, this will create
 486 following structure:
 487
 488   '900' => [ {
 489         'a' => [ 'foo', 'bar', 'baz' ],
 490   }]
 491
 492 This method will also create additional field C<000> with MFN.
 493
 494 =cut
 495
 496 sub to_hash {
 497         my $self = shift;
 498
 499         my $mfn = shift || confess "need mfn!";
 500
 501         # init record to include MFN as field 000
 502         my $rec = { '000' => [ $mfn ] };
 503
 504         my $row = $self->fetch($mfn) || return;
 505
 506         foreach my $k (keys %{$row}) {
 507                 foreach my $l (@{$row->{$k}}) {
 508
 509                         # filter output
 510                         if ($self->{'hash_filter'}) {
 511                                 $l = $self->{'hash_filter'}->($l);
 512                                 next unless defined($l);
 513                         }
 514
 515                         my $val;
 516
 517                         # has identifiers?
 518                         ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
 519
 520                         # has subfields?
 521                         if ($l =~ m/\^/) {
 522                                 foreach my $t (split(/\^/,$l)) {
 523                                         next if (! $t);
 524                                         my ($sf,$v) = (substr($t,0,1), substr($t,1));
 525                                         # FIXME make this option !
 526                                         next unless ($v);
 527 #                                       warn "### $k^$sf:$v",$/ if ($self->{debug} > 1);
 528
 529                                         # FIXME array return optional, by default unroll to ' ; '
 530                                         if (ref( $val->{$sf} ) eq 'ARRAY') {
 531
 532                                                 push @{ $val->{$sf} }, $v;
 533                                         } elsif (defined( $val->{$sf} )) {
 534                                                 # convert scalar field to array
 535                                                 $val->{$sf} = [ $val->{$sf}, $v ];
 536                                         } else {
 537                                                 $val->{$sf} = $v;
 538                                         }
 539                                 }
 540                         } else {
 541                                 $val = $l;
 542                         }
 543
 544                         push @{$rec->{$k}}, $val;
 545                 }
 546         }
 547
 548         return $rec;
 549 }
 550
 551 =head2 tag_name
 552
 553 Return name of selected tag
 554
 555  print $isis->tag_name('200');
 556
 557 =cut
 558
 559 sub tag_name {
 560         my $self = shift;
 561         my $tag = shift || return;
 562         return $self->{'TagName'}->{$tag} || $tag;
 563 }
 564
 565
 566 =head2 read_cnt
 567
 568 Read content of C<.CNT> file and return hash containing it.
 569
 570   print Dumper($isis->read_cnt);
 571
 572 This function is not used by module (C<.CNT> files are not required for this
 573 module to work), but it can be useful to examine your index (while debugging
 574 for example).
 575
 576 =cut
 577
 578 sub read_cnt  {
 579         my $self = shift;
 580
 581         croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
 582
 583         # Get the index information from $db.CNT
 584
 585         open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
 586         binmode($fileCNT);
 587
 588         my $buff;
 589
 590         read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
 591         $self->unpack_cnt($buff);
 592
 593         read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
 594         $self->unpack_cnt($buff);
 595
 596         close($fileCNT);
 597
 598         return $self->{cnt};
 599 }
 600
 601 =head2 unpack_cnt
 602
 603 Unpack one of two 26 bytes fixed length record in C<.CNT> file.
 604
 605 Here is definition of record:
 606
 607  off key        description                             size
 608   0: IDTYPE     BTree type                              s
 609   2: ORDN       Nodes Order                             s
 610   4: ORDF       Leafs Order                             s
 611   6: N          Number of Memory buffers for nodes      s
 612   8: K          Number of buffers for first level index s
 613  10: LIV        Current number of Index Levels          s
 614  12: POSRX      Pointer to Root Record in N0x           l
 615  16: NMAXPOS    Next Available position in N0x          l
 616  20: FMAXPOS    Next available position in L0x          l
 617  24: ABNORMAL   Formal BTree normality indicator        s
 618  length: 26 bytes
 619
 620 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
 621
 622 =cut
 623
 624 sub unpack_cnt {
 625         my $self = shift;
 626
 627         my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
 628
 629         my $buff = shift || return;
 630         my @arr = unpack("vvvvvvVVVv", $buff);
 631
 632         print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
 633
 634         my $IDTYPE = shift @arr;
 635         foreach (@flds) {
 636                 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
 637         }
 638 }
 639
 640 1;
 641
 642 =head1 BUGS
 643
 644 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
 645 some variations in input databases which has been tested with this module.
 646 When I was in doubt, I assumed that OpenIsis's implementation was right
 647 (except for obvious bugs).
 648
 649 However, every effort has been made to test this module with as much
 650 databases (and programs that create them) as possible.
 651
 652 I would be very greatful for success or failure reports about usage of this
 653 module with databases from programs other than WinIsis and IsisMarc. I had
 654 tested this against ouput of one C<isis.dll>-based application, but I don't
 655 know any details about it's version.
 656
 657 =head1 VERSIONS
 658
 659 You can find version dependencies documented here
 660
 661 =over 8
 662
 663 =item 0.20
 664
 665 Added C<< $isis->mfn >> and support for repeatable subfields
 666
 667 =back
 668
 669 =head1 AUTHOR
 670
 671         Dobrica Pavlinusic
 672         CPAN ID: DPAVLIN
 673         dpavlin@rot13.org
 674         http://www.rot13.org/~dpavlin/
 675
 676 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
 677 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
 678
 679 =head1 COPYRIGHT
 680
 681 This program is free software; you can redistribute
 682 it and/or modify it under the same terms as Perl itself.
 683
 684 The full text of the license can be found in the
 685 LICENSE file included with this module.
 686
 687
 688 =head1 SEE ALSO
 689
 690 OpenIsis web site L<http://www.openisis.org>
 691
 692 perl4lib site L<http://perl4lib.perl.org>
 693