5 use File::Glob qw(:globally :nocase);
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 #Give a hoot don't pollute, do not export more than needed by default
21 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
27 my $isis = new Biblio::Isis(
28 isisdb => './cds/cds',
31 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
32 print $isis->to_ascii($mfn),"\n";
37 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
39 seems to depriciate it's old C<XS> bindings for perl.
41 It can create hash values from data in ISIS database (using C<to_hash>),
42 ASCII dump (using C<to_ascii>) or just hash with field names and packed
43 values (like C<^asomething^belse>).
45 Unique feature of this module is ability to C<include_deleted> records.
46 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
47 fields which are zero sized will be filled with random junk from memory).
49 It also has support for identifiers (only if ISIS database is created by
50 IsisMarc), see C<to_hash>.
52 This module will always be slower than OpenIsis module which use C
53 library. However, since it's written in perl, it's platform independent (so
54 you don't need C compiler), and can be easily modified. I hope that it
55 creates data structures which are easier to use than ones created by
56 OpenIsis, so reduced time in other parts of the code should compensate for
57 slower performance of this module (speed of reading ISIS database is
64 # my $ORDN; # Nodes Order
65 # my $ORDF; # Leafs Order
66 # my $N; # Number of Memory buffers for nodes
67 # my $K; # Number of buffers for first level index
68 # my $LIV; # Current number of Index Levels
69 # my $POSRX; # Pointer to Root Record in N0x
70 # my $NMAXPOS; # Next Available position in N0x
71 # my $FMAXPOS; # Next available position in L0x
72 # my $ABNORMAL; # Formal BTree normality indicator
82 my $isis = new Biblio::Isis(
83 isisdb => './cds/cds',
91 join_subfields_with => ' ; ',
97 Options are described below:
103 This is full or relative path to ISIS database files which include
104 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
105 C<read_fdt> option) files.
107 In this example it uses C<./cds/cds.MST> and related files.
111 Boolean flag to specify if field definition table should be read. It's off
114 =item include_deleted
116 Don't skip logically deleted records in ISIS.
120 Filter code ref which will be used before data is converted to hash.
124 Dump a B<lot> of debugging output even at level 1. For even more increase level.
126 =item join_subfields_with
128 Define delimiter which will be used to join repeatable subfields. This
129 option is included to support lagacy application written against version
130 older than 0.21 of this module. By default, it disabled. See L</to_hash>.
134 Define (any number) of regexpes to apply at field values before they are
135 splitted into subfield. This is great place to split subfields in input to
136 mulitple subfields if needed or rename subfields.
145 bless($self, $class);
147 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
149 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
150 $self->{$v} = {@_}->{$v};
153 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
155 foreach my $f (@isis_files) {
156 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
157 $self->{lc($ext)."_file"} = $f;
160 my @must_exist = qw(mst xrf);
161 push @must_exist, "fdt" if ($self->{read_fdt});
163 foreach my $ext (@must_exist) {
164 unless ($self->{$ext."_file"}) {
165 carp "missing ",uc($ext)," file in ",$self->{isisdb};
170 if ($self->{debug}) {
171 print STDERR "## using files: ",join(" ",@isis_files),"\n";
172 eval "use Data::Dump";
175 *Dumper = *Data::Dump::dump;
181 # if you want to read .FDT file use read_fdt argument when creating class!
182 if ($self->{read_fdt} && -e $self->{fdt_file}) {
184 # read the $db.FDT file for tags
187 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
193 my $name=substr($_,0,30);
194 my $tag=substr($_,50,3);
199 $self->{'TagName'}->{$tag}=$name;
210 # Get the Maximum MFN from $db.MST
212 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
213 binmode($self->{'fileMST'});
215 # MST format: (* = 32 bit signed)
217 # NXTMFN* MFN to be assigned to the next record created
218 # NXTMFB* last block allocated to master file
219 # NXTMFP offset to next available position in last block
220 # MFTYPE always 0 for user db file (1 for system)
221 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
225 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
226 $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
228 print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
230 # open files for later
231 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
232 binmode($self->{'fileXRF'});
234 $self ? return $self : return undef;
239 Return number of records in database
247 return $self->{'NXTMFN'} - 1;
252 Read record with selected MFN
254 my $rec = $isis->fetch(55);
256 Returns hash with keys which are field names and values are unpacked values
257 for that field like this:
260 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
261 '990' => [ '2140', '88', 'HAY' ],
269 my $mfn = shift || croak "fetch needs MFN as argument!";
271 # is mfn allready in memory?
272 my $old_mfn = $self->{'current_mfn'} || -1;
273 return $self->{record} if ($mfn == $old_mfn);
275 print STDERR "## fetch: $mfn\n" if ($self->{debug});
278 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
280 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
281 seek($self->{'fileXRF'},$mfnpos,0);
286 delete $self->{record};
288 # read XRFMFB abd XRFMFP
289 read($self->{'fileXRF'}, $buff, 4);
290 my $pointer=unpack("V",$buff);
292 if ($self->{include_deleted}) {
295 warn "pointer for MFN $mfn is null\n";
300 # check for logically deleted record
301 if ($pointer & 0x80000000) {
302 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
303 $self->{deleted} = $mfn;
305 return unless $self->{include_deleted};
308 $pointer = ($pointer ^ 0xffffffff) + 1;
311 my $XRFMFB = int($pointer/2048);
312 my $XRFMFP = $pointer - ($XRFMFB*2048);
314 # (XRFMFB - 1) * 512 + XRFMFP
315 # why do i have to do XRFMFP % 1024 ?
317 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
319 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
321 # Get Record Information
323 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
325 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
326 my $value=unpack("V",$buff);
328 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
332 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
333 $self->{deleted} = $mfn;
337 carp "Error: MFN ".$mfn." not found in MST file, found $value";
341 read($self->{'fileMST'}, $buff, 14);
343 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
345 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
347 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
349 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
351 # Get Directory Format
357 read($self->{'fileMST'}, $buff, 6 * $NVF);
361 for (my $i = 0 ; $i < $NVF ; $i++) {
363 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
365 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
367 # The TAG does not exists in .FDT so we set it to 0.
369 # XXX This is removed from perl version; .FDT file is updated manually, so
370 # you will often have fields in .MST file which aren't in .FDT. On the other
371 # hand, IsisMarc doesn't use .FDT files at all!
373 #if (! $self->{TagName}->{$TAG}) {
384 # Get Variable Fields
386 read($self->{'fileMST'},$buff,$rec_len);
388 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
390 for (my $i = 0 ; $i < $NVF ; $i++) {
391 # skip zero-sized fields
392 next if ($FieldLEN[$i] == 0);
394 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
397 $self->{'current_mfn'} = $mfn;
399 print STDERR Dumper($self),"\n" if ($self->{debug});
401 return $self->{'record'};
406 Returns current MFN position
408 my $mfn = $isis->mfn;
412 # This function should be simple return $self->{current_mfn},
413 # but if new is called with _hack_mfn it becomes setter.
414 # It's useful in tests when setting $isis->{record} directly
418 return $self->{current_mfn};
424 Returns ASCII output of record with specified MFN
426 print $isis->to_ascii(42);
428 This outputs something like this:
430 210 ^aNew York^cNew York University press^dcop. 1988
435 If C<read_fdt> is specified when calling C<new> it will display field names
436 from C<.FDT> file instead of numeric tags.
443 my $mfn = shift || croak "need MFN";
445 my $rec = $self->fetch($mfn) || return;
449 foreach my $f (sort keys %{$rec}) {
450 my $fn = $self->tag_name($f);
451 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
461 Read record with specified MFN and convert it to hash
463 my $hash = $isis->to_hash($mfn);
465 It has ability to convert characters (using C<hash_filter>) from ISIS
466 database before creating structures enabling character re-mapping or quick
469 This function returns hash which is like this:
474 'c' => 'New York University press',
486 You can later use that hash to produce any output from ISIS data.
488 If database is created using IsisMarc, it will also have to special fields
489 which will be used for identifiers, C<i1> and C<i2> like this:
496 'f' => 'Valdo D\'Arienzo',
497 'e' => 'tipografie e tipografi nel XVI secolo',
501 In case there are repeatable subfields in record, this will create
505 'a' => [ 'foo', 'bar', 'baz' ],
508 Or in more complex example of
510 902 ^aa1^aa2^aa3^bb1^aa4^bb2^cc1^aa5
515 { a => ["a1", "a2", "a3", "a4", "a5"], b => ["b1", "b2"], c => "c1" },
518 This behaviour can be changed using C<join_subfields_with> option to L</new>,
519 in which case C<to_hash> will always create single value for each subfield.
520 This will change result to:
524 This method will also create additional field C<000> with MFN.
526 There is also more elaborative way to call C<to_hash> like this:
528 my $hash = $isis->to_hash({
530 include_subfields => 1,
532 's/something/else/g',
536 Each option controll creation of hash:
542 Specify MFN number of record
544 =item include_subfields
546 This option will create additional key in hash called C<subfields> which will
547 have original record subfield order and index to that subfield like this:
550 a => ["a1", "a2", "a3", "a4", "a5"],
553 subfields => ["a", 0, "a", 1, "a", 2, "b", 0, "a", 3, "b", 1, "c", 0, "a", 4],
556 =item join_subfields_with
558 Define delimiter which will be used to join repeatable subfields. You can
559 specify option here instead in L</new> if you want to have per-record control.
563 Override C<regexpes> specified in L</new>.
573 my $mfn = shift || confess "need mfn!";
576 if (ref($mfn) eq 'HASH') {
578 $mfn = $arg->{mfn} || confess "need mfn in arguments";
581 $arg->{regexpes} ||= $self->{regexpes};
583 confess "regexps must be HASH" if ($arg->{regexps} && ref($arg->{regexps}) ne 'HASH');
585 # init record to include MFN as field 000
586 my $rec = { '000' => [ $mfn ] };
588 my $row = $self->fetch($mfn) || return;
590 my $j_rs = $arg->{join_subfields_with};
591 $j_rs = $self->{join_subfields_with} unless(defined($j_rs));
592 my $i_sf = $arg->{include_subfields};
594 foreach my $f_nr (keys %{$row}) {
595 foreach my $l (@{$row->{$f_nr}}) {
598 if ($self->{'hash_filter'}) {
599 $l = $self->{'hash_filter'}->($l);
600 next unless defined($l);
604 if ($arg->{regexps} && defined($arg->{regexps}->{$f_nr})) {
605 confess "regexps->{$f_nr} must be ARRAY" if (ref($arg->{regexps}->{$f_nr}) ne 'ARRAY');
607 foreach my $r (@{ $arg->{regexps}->{$f_nr} }) {
608 while ( eval '$l =~ ' . $r ) { $c++ };
610 warn "## field $f_nr triggered $c regexpes\n" if ($c && $self->{debug});
614 my $r_sf; # repeatable subfields in this record
617 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
621 foreach my $t (split(/\^/,$l)) {
623 my ($sf,$v) = (substr($t,0,1), substr($t,1));
624 # XXX this might be option, but why?
626 # warn "### $f_nr^$sf:$v",$/ if ($self->{debug} > 1);
628 if (ref( $val->{$sf} ) eq 'ARRAY') {
630 push @{ $val->{$sf} }, $v;
632 # record repeatable subfield it it's offset
633 push @{ $val->{subfields} }, ( $sf, $#{ $val->{$sf} } ) if (! $j_rs && $i_sf);
636 } elsif (defined( $val->{$sf} )) {
638 # convert scalar field to array
639 $val->{$sf} = [ $val->{$sf}, $v ];
641 push @{ $val->{subfields} }, ( $sf, 1 ) if (! $j_rs && $i_sf);
646 push @{ $val->{subfields} }, ( $sf, 0 ) if ($i_sf);
655 $val->{$_} = join($j_rs, @{ $val->{$_} });
659 push @{$rec->{$f_nr}}, $val;
668 Return name of selected tag
670 print $isis->tag_name('200');
676 my $tag = shift || return;
677 return $self->{'TagName'}->{$tag} || $tag;
683 Read content of C<.CNT> file and return hash containing it.
685 print Dumper($isis->read_cnt);
687 This function is not used by module (C<.CNT> files are not required for this
688 module to work), but it can be useful to examine your index (while debugging
696 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
698 # Get the index information from $db.CNT
700 open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
705 read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
706 $self->unpack_cnt($buff);
708 read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
709 $self->unpack_cnt($buff);
718 Unpack one of two 26 bytes fixed length record in C<.CNT> file.
720 Here is definition of record:
722 off key description size
723 0: IDTYPE BTree type s
724 2: ORDN Nodes Order s
725 4: ORDF Leafs Order s
726 6: N Number of Memory buffers for nodes s
727 8: K Number of buffers for first level index s
728 10: LIV Current number of Index Levels s
729 12: POSRX Pointer to Root Record in N0x l
730 16: NMAXPOS Next Available position in N0x l
731 20: FMAXPOS Next available position in L0x l
732 24: ABNORMAL Formal BTree normality indicator s
735 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
742 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
744 my $buff = shift || return;
745 my @arr = unpack("vvvvvvVVVv", $buff);
747 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
749 my $IDTYPE = shift @arr;
751 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
759 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
760 some variations in input databases which has been tested with this module.
761 When I was in doubt, I assumed that OpenIsis's implementation was right
762 (except for obvious bugs).
764 However, every effort has been made to test this module with as much
765 databases (and programs that create them) as possible.
767 I would be very greatful for success or failure reports about usage of this
768 module with databases from programs other than WinIsis and IsisMarc. I had
769 tested this against ouput of one C<isis.dll>-based application, but I don't
770 know any details about it's version.
774 As this is young module, new features are added in subsequent version. It's
775 a good idea to specify version when using this module like this:
777 use Biblio::Isis 0.21
779 Below is list of changes in specific version of module (so you can target
780 older versions if you really have to):
786 Added C<join_subfields_with> to L</new> and L</to_hash>.
788 Added C<include_subfields> to L</to_hash>.
792 Added C<< $isis->mfn >>, support for repeatable subfields and
793 C<< $isis->to_hash({ mfn => 42, ... }) >> calling convention
802 http://www.rot13.org/~dpavlin/
804 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
805 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
809 This program is free software; you can redistribute
810 it and/or modify it under the same terms as Perl itself.
812 The full text of the license can be found in the
813 LICENSE file included with this module.
818 L<Biblio::Isis::Manual> for CDS/ISIS manual appendix F, G and H which describe file format
820 OpenIsis web site L<http://www.openisis.org>
822 perl4lib site L<http://perl4lib.perl.org>