5 use File::Glob qw(:globally :nocase);
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 #Give a hoot don't pollute, do not export more than needed by default
21 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
27 my $isis = new Biblio::Isis(
28 isisdb => './cds/cds',
31 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
32 print $isis->to_ascii($mfn),"\n";
37 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
39 seems to depriciate it's old C<XS> bindings for perl.
41 It can create hash values from data in ISIS database (using C<to_hash>),
42 ASCII dump (using C<to_ascii>) or just hash with field names and packed
43 values (like C<^asomething^belse>).
45 Unique feature of this module is ability to C<include_deleted> records.
46 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
47 fields which are zero sized will be filled with random junk from memory).
49 It also has support for identifiers (only if ISIS database is created by
50 IsisMarc), see C<to_hash>.
52 This module will always be slower than OpenIsis module which use C
53 library. However, since it's written in perl, it's platform independent (so
54 you don't need C compiler), and can be easily modified. I hope that it
55 creates data structures which are easier to use than ones created by
56 OpenIsis, so reduced time in other parts of the code should compensate for
57 slower performance of this module (speed of reading ISIS database is
64 # my $ORDN; # Nodes Order
65 # my $ORDF; # Leafs Order
66 # my $N; # Number of Memory buffers for nodes
67 # my $K; # Number of buffers for first level index
68 # my $LIV; # Current number of Index Levels
69 # my $POSRX; # Pointer to Root Record in N0x
70 # my $NMAXPOS; # Next Available position in N0x
71 # my $FMAXPOS; # Next available position in L0x
72 # my $ABNORMAL; # Formal BTree normality indicator
82 my $isis = new Biblio::Isis(
83 isisdb => './cds/cds',
87 my ($v,$field_number) = @_;
91 join_subfields_with => ' ; ',
94 Options are described below:
100 This is full or relative path to ISIS database files which include
101 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
102 C<read_fdt> option) files.
104 In this example it uses C<./cds/cds.MST> and related files.
108 Boolean flag to specify if field definition table should be read. It's off
111 =item include_deleted
113 Don't skip logically deleted records in ISIS.
117 Filter code ref which will be used before data is converted to hash. It will
118 receive two arguments, whole line from current field (in C<< $_[0] >>) and
119 field number (in C<< $_[1] >>).
123 Dump a B<lot> of debugging output even at level 1. For even more increase level.
125 =item join_subfields_with
127 Define delimiter which will be used to join repeatable subfields. This
128 option is included to support lagacy application written against version
129 older than 0.21 of this module. By default, it disabled. See L</to_hash>.
131 =item ignore_empty_subfields
133 Remove all empty subfields while reading from ISIS file.
142 bless($self, $class);
144 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
146 foreach my $v (qw{isisdb debug include_deleted hash_filter join_subfields_with ignore_empty_subfields}) {
147 $self->{$v} = {@_}->{$v} if defined({@_}->{$v});
150 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
152 foreach my $f (@isis_files) {
153 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
154 $self->{lc($ext)."_file"} = $f;
157 my @must_exist = qw(mst xrf);
158 push @must_exist, "fdt" if ($self->{read_fdt});
160 foreach my $ext (@must_exist) {
161 unless ($self->{$ext."_file"}) {
162 carp "missing ",uc($ext)," file in ",$self->{isisdb};
167 if ($self->{debug}) {
168 print STDERR "## using files: ",join(" ",@isis_files),"\n";
169 eval "use Data::Dump";
172 *Dumper = *Data::Dump::dump;
178 # if you want to read .FDT file use read_fdt argument when creating class!
179 if ($self->{read_fdt} && -e $self->{fdt_file}) {
181 # read the $db.FDT file for tags
184 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
190 my $name=substr($_,0,30);
191 my $tag=substr($_,50,3);
196 $self->{'TagName'}->{$tag}=$name;
207 # Get the Maximum MFN from $db.MST
209 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
210 binmode($self->{'fileMST'});
212 # MST format: (* = 32 bit signed)
214 # NXTMFN* MFN to be assigned to the next record created
215 # NXTMFB* last block allocated to master file
216 # NXTMFP offset to next available position in last block
217 # MFTYPE always 0 for user db file (1 for system)
218 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
222 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
223 $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
225 print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
227 # open files for later
228 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
229 binmode($self->{'fileXRF'});
231 $self ? return $self : return undef;
236 Return number of records in database
244 return $self->{'NXTMFN'} - 1;
249 Read record with selected MFN
251 my $rec = $isis->fetch(55);
253 Returns hash with keys which are field names and values are unpacked values
254 for that field like this:
257 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
258 '990' => [ '2140', '88', 'HAY' ],
266 my $mfn = shift || croak "fetch needs MFN as argument!";
268 # is mfn allready in memory?
269 my $old_mfn = $self->{'current_mfn'} || -1;
270 return $self->{record} if ($mfn == $old_mfn);
272 print STDERR "## fetch: $mfn\n" if ($self->{debug});
275 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
277 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
278 seek($self->{'fileXRF'},$mfnpos,0);
283 delete $self->{record};
285 # read XRFMFB abd XRFMFP
286 read($self->{'fileXRF'}, $buff, 4);
287 my $pointer=unpack("V",$buff);
289 if ($self->{include_deleted}) {
292 warn "pointer for MFN $mfn is null\n";
297 # check for logically deleted record
298 if ($pointer & 0x80000000) {
299 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
300 $self->{deleted} = $mfn;
302 return unless $self->{include_deleted};
305 $pointer = ($pointer ^ 0xffffffff) + 1;
308 my $XRFMFB = int($pointer/2048);
309 my $XRFMFP = $pointer - ($XRFMFB*2048);
311 # (XRFMFB - 1) * 512 + XRFMFP
312 # why do i have to do XRFMFP % 1024 ?
314 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
316 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
318 # Get Record Information
320 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
322 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
323 my $value=unpack("V",$buff);
325 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
329 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
330 $self->{deleted} = $mfn;
334 carp "Error: MFN ".$mfn." not found in MST file, found $value";
338 read($self->{'fileMST'}, $buff, 14);
340 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
342 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
344 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
346 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
348 # Get Directory Format
354 read($self->{'fileMST'}, $buff, 6 * $NVF);
358 for (my $i = 0 ; $i < $NVF ; $i++) {
360 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
362 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
364 # The TAG does not exists in .FDT so we set it to 0.
366 # XXX This is removed from perl version; .FDT file is updated manually, so
367 # you will often have fields in .MST file which aren't in .FDT. On the other
368 # hand, IsisMarc doesn't use .FDT files at all!
370 #if (! $self->{TagName}->{$TAG}) {
381 # Get Variable Fields
383 read($self->{'fileMST'},$buff,$rec_len);
385 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
387 for (my $i = 0 ; $i < $NVF ; $i++) {
388 # skip zero-sized fields
389 next if ($FieldLEN[$i] == 0);
391 my $v = substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
393 if ( $self->{ignore_empty_subfields} ) {
394 $v =~ s/(\^\w)+(\^\w)/$2/g;
395 $v =~ s/\^\w$//; # last on line?
399 push @{$self->{record}->{$FieldTAG[$i]}}, $v;
402 $self->{'current_mfn'} = $mfn;
404 print STDERR Dumper($self),"\n" if ($self->{debug});
406 return $self->{'record'};
411 Returns current MFN position
413 my $mfn = $isis->mfn;
417 # This function should be simple return $self->{current_mfn},
418 # but if new is called with _hack_mfn it becomes setter.
419 # It's useful in tests when setting $isis->{record} directly
423 return $self->{current_mfn};
429 Returns ASCII output of record with specified MFN
431 print $isis->to_ascii(42);
433 This outputs something like this:
435 210 ^aNew York^cNew York University press^dcop. 1988
440 If C<read_fdt> is specified when calling C<new> it will display field names
441 from C<.FDT> file instead of numeric tags.
448 my $mfn = shift || croak "need MFN";
450 my $rec = $self->fetch($mfn) || return;
454 foreach my $f (sort keys %{$rec}) {
455 my $fn = $self->tag_name($f);
456 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
466 Read record with specified MFN and convert it to hash
468 my $hash = $isis->to_hash($mfn);
470 It has ability to convert characters (using C<hash_filter>) from ISIS
471 database before creating structures enabling character re-mapping or quick
474 This function returns hash which is like this:
479 'c' => 'New York University press',
491 You can later use that hash to produce any output from ISIS data.
493 If database is created using IsisMarc, it will also have to special fields
494 which will be used for identifiers, C<i1> and C<i2> like this:
501 'f' => 'Valdo D\'Arienzo',
502 'e' => 'tipografie e tipografi nel XVI secolo',
506 In case there are repeatable subfields in record, this will create
510 'a' => [ 'foo', 'bar', 'baz' ],
513 Or in more complex example of
515 902 ^aa1^aa2^aa3^bb1^aa4^bb2^cc1^aa5
520 { a => ["a1", "a2", "a3", "a4", "a5"], b => ["b1", "b2"], c => "c1" },
523 This behaviour can be changed using C<join_subfields_with> option to L</new>,
524 in which case C<to_hash> will always create single value for each subfield.
525 This will change result to:
529 This method will also create additional field C<000> with MFN.
531 There is also more elaborative way to call C<to_hash> like this:
533 my $hash = $isis->to_hash({
535 include_subfields => 1,
538 Each option controll creation of hash:
544 Specify MFN number of record
546 =item include_subfields
548 This option will create additional key in hash called C<subfields> which will
549 have original record subfield order and index to that subfield like this:
552 a => ["a1", "a2", "a3", "a4", "a5"],
555 subfields => ["a", 0, "a", 1, "a", 2, "b", 0, "a", 3, "b", 1, "c", 0, "a", 4],
558 =item join_subfields_with
560 Define delimiter which will be used to join repeatable subfields. You can
561 specify option here instead in L</new> if you want to have per-record control.
565 You can override C<hash_filter> defined in L</new> using this option.
575 my $mfn = shift || confess "need mfn!";
578 my $hash_filter = $self->{hash_filter};
580 if (ref($mfn) eq 'HASH') {
582 $mfn = $arg->{mfn} || confess "need mfn in arguments";
583 $hash_filter = $arg->{hash_filter} if ($arg->{hash_filter});
586 # init record to include MFN as field 000
587 my $rec = { '000' => [ $mfn ] };
589 my $row = $self->fetch($mfn) || return;
591 my $j_rs = $arg->{join_subfields_with} || $self->{join_subfields_with};
592 $j_rs = $self->{join_subfields_with} unless(defined($j_rs));
593 my $i_sf = $arg->{include_subfields};
595 foreach my $f_nr (keys %{$row}) {
596 foreach my $l (@{$row->{$f_nr}}) {
599 $l = $hash_filter->($l, $f_nr) if ($hash_filter);
600 next unless defined($l);
603 my $r_sf; # repeatable subfields in this record
606 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
610 foreach my $t (split(/\^/,$l)) {
612 my ($sf,$v) = (substr($t,0,1), substr($t,1));
613 # XXX this might be option, but why?
614 next unless (defined($v) && $v ne '');
615 # warn "### $f_nr^$sf:$v",$/ if ($self->{debug} > 1);
617 if (ref( $val->{$sf} ) eq 'ARRAY') {
619 push @{ $val->{$sf} }, $v;
621 # record repeatable subfield it it's offset
622 push @{ $val->{subfields} }, ( $sf, $#{ $val->{$sf} } ) if (! $j_rs && $i_sf);
625 } elsif (defined( $val->{$sf} )) {
627 # convert scalar field to array
628 $val->{$sf} = [ $val->{$sf}, $v ];
630 push @{ $val->{subfields} }, ( $sf, 1 ) if (! $j_rs && $i_sf);
635 push @{ $val->{subfields} }, ( $sf, 0 ) if ($i_sf);
644 $val->{$_} = join($j_rs, @{ $val->{$_} });
648 push @{$rec->{$f_nr}}, $val;
657 Return name of selected tag
659 print $isis->tag_name('200');
665 my $tag = shift || return;
666 return $self->{'TagName'}->{$tag} || $tag;
672 Read content of C<.CNT> file and return hash containing it.
674 print Dumper($isis->read_cnt);
676 This function is not used by module (C<.CNT> files are not required for this
677 module to work), but it can be useful to examine your index (while debugging
685 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
687 # Get the index information from $db.CNT
689 open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
694 read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
695 $self->unpack_cnt($buff);
697 read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
698 $self->unpack_cnt($buff);
707 Unpack one of two 26 bytes fixed length record in C<.CNT> file.
709 Here is definition of record:
711 off key description size
712 0: IDTYPE BTree type s
713 2: ORDN Nodes Order s
714 4: ORDF Leafs Order s
715 6: N Number of Memory buffers for nodes s
716 8: K Number of buffers for first level index s
717 10: LIV Current number of Index Levels s
718 12: POSRX Pointer to Root Record in N0x l
719 16: NMAXPOS Next Available position in N0x l
720 20: FMAXPOS Next available position in L0x l
721 24: ABNORMAL Formal BTree normality indicator s
724 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
731 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
733 my $buff = shift || return;
734 my @arr = unpack("vvvvvvVVVv", $buff);
736 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
738 my $IDTYPE = shift @arr;
740 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
748 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
749 some variations in input databases which has been tested with this module.
750 When I was in doubt, I assumed that OpenIsis's implementation was right
751 (except for obvious bugs).
753 However, every effort has been made to test this module with as much
754 databases (and programs that create them) as possible.
756 I would be very greatful for success or failure reports about usage of this
757 module with databases from programs other than WinIsis and IsisMarc. I had
758 tested this against ouput of one C<isis.dll>-based application, but I don't
759 know any details about it's version.
763 As this is young module, new features are added in subsequent version. It's
764 a good idea to specify version when using this module like this:
766 use Biblio::Isis 0.23
768 Below is list of changes in specific version of module (so you can target
769 older versions if you really have to):
775 Added C<ignore_empty_subfields>
779 Added C<hash_filter> to L</to_hash>
781 Fixed bug with documented C<join_subfields_with> in L</new> which wasn't
786 Added field number when calling C<hash_filter>
790 Added C<join_subfields_with> to L</new> and L</to_hash>.
792 Added C<include_subfields> to L</to_hash>.
796 Added C<< $isis->mfn >>, support for repeatable subfields and
797 C<< $isis->to_hash({ mfn => 42, ... }) >> calling convention
806 http://www.rot13.org/~dpavlin/
808 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
809 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
813 This program is free software; you can redistribute
814 it and/or modify it under the same terms as Perl itself.
816 The full text of the license can be found in the
817 LICENSE file included with this module.
822 L<Biblio::Isis::Manual> for CDS/ISIS manual appendix F, G and H which describe file format
824 OpenIsis web site L<http://www.openisis.org>
826 perl4lib site L<http://perl4lib.perl.org>