5 use File::Glob qw(:globally :nocase);
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 #Give a hoot don't pollute, do not export more than needed by default
21 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
27 my $isis = new Biblio::Isis(
28 isisdb => './cds/cds',
31 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
32 print $isis->to_ascii($mfn),"\n";
37 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
39 seems to deprecate its old C<XS> bindings for perl.
41 It can create hash values from data in ISIS database (using C<to_hash>),
42 ASCII dump (using C<to_ascii>) or just hash with field names and packed
43 values (like C<^asomething^belse>).
45 Unique feature of this module is ability to C<include_deleted> records.
46 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
47 fields which are zero sized will be filled with random junk from memory).
49 It also has support for identifiers (only if ISIS database is created by
50 IsisMarc), see C<to_hash>.
52 This module will always be slower than OpenIsis module which uses C
53 library. However, since it's written in perl, it's platform independent (so
54 you don't need C compiler), and can be easily modified. I hope that it
55 creates data structures which are easier to use than ones created by
56 OpenIsis, so reduced time in other parts of the code should compensate for
57 slower performance of this module (speed of reading ISIS database is
64 # my $ORDN; # Nodes Order
65 # my $ORDF; # Leafs Order
66 # my $N; # Number of Memory buffers for nodes
67 # my $K; # Number of buffers for first level index
68 # my $LIV; # Current number of Index Levels
69 # my $POSRX; # Pointer to Root Record in N0x
70 # my $NMAXPOS; # Next Available position in N0x
71 # my $FMAXPOS; # Next available position in L0x
72 # my $ABNORMAL; # Formal BTree normality indicator
82 my $isis = new Biblio::Isis(
83 isisdb => './cds/cds',
87 my ($v,$field_number) = @_;
91 join_subfields_with => ' ; ',
94 Options are described below:
100 This is full or relative path to ISIS database files which include
101 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
102 C<read_fdt> option) files.
104 In this example it uses C<./cds/cds.MST> and related files.
108 Boolean flag to specify if field definition table should be read. It's off
111 =item include_deleted
113 Don't skip logically deleted records in ISIS.
117 Filter code ref which will be used before data is converted to hash. It will
118 receive two arguments, whole line from current field (in C<< $_[0] >>) and
119 field number (in C<< $_[1] >>).
123 Dump a B<lot> of debugging output even at level 1. For even more, increase the level.
125 =item join_subfields_with
127 Define delimiter which will be used to join repeatable subfields. This
128 option is included to support legacy applications written against versions
129 older than 0.21 of this module. By default, it is disabled. See L</to_hash>.
138 bless($self, $class);
140 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
142 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
143 $self->{$v} = {@_}->{$v};
146 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
148 foreach my $f (@isis_files) {
149 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
150 $self->{lc($ext)."_file"} = $f;
153 my @must_exist = qw(mst xrf);
154 push @must_exist, "fdt" if ($self->{read_fdt});
156 foreach my $ext (@must_exist) {
157 unless ($self->{$ext."_file"}) {
158 carp "missing ",uc($ext)," file in ",$self->{isisdb};
163 if ($self->{debug}) {
164 print STDERR "## using files: ",join(" ",@isis_files),"\n";
165 eval "use Data::Dump";
168 *Dumper = *Data::Dump::dump;
174 # if you want to read .FDT file use read_fdt argument when creating class!
175 if ($self->{read_fdt} && -e $self->{fdt_file}) {
177 # read the $db.FDT file for tags
180 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
186 my $name=substr($_,0,30);
187 my $tag=substr($_,50,3);
192 $self->{'TagName'}->{$tag}=$name;
203 # Get the Maximum MFN from $db.MST
205 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
206 binmode($self->{'fileMST'});
208 # MST format: (* = 32 bit signed)
210 # NXTMFN* MFN to be assigned to the next record created
211 # NXTMFB* last block allocated to master file
212 # NXTMFP offset to next available position in last block
213 # MFTYPE always 0 for user db file (1 for system)
214 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
218 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
219 $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
221 print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
223 # open files for later
224 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
225 binmode($self->{'fileXRF'});
227 $self ? return $self : return undef;
232 Return number of records in database
240 return $self->{'NXTMFN'} - 1;
245 Read record with selected MFN
247 my $rec = $isis->fetch(55);
249 Returns hash with keys which are field names and values are unpacked values
250 for that field like this:
253 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
254 '990' => [ '2140', '88', 'HAY' ],
262 my $mfn = shift || croak "fetch needs MFN as argument!";
264 # is mfn already in memory?
265 my $old_mfn = $self->{'current_mfn'} || -1;
266 return $self->{record} if ($mfn == $old_mfn);
268 print STDERR "## fetch: $mfn\n" if ($self->{debug});
271 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
273 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
274 seek($self->{'fileXRF'},$mfnpos,0);
279 delete $self->{record};
281 # read XRFMFB and XRFMFP
282 read($self->{'fileXRF'}, $buff, 4);
283 my $pointer=unpack("V",$buff);
285 if ($self->{include_deleted}) {
288 warn "pointer for MFN $mfn is null\n";
293 # check for logically deleted record
294 if ($pointer & 0x80000000) {
295 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
296 $self->{deleted} = $mfn;
298 return unless $self->{include_deleted};
301 $pointer = ($pointer ^ 0xffffffff) + 1;
304 my $XRFMFB = int($pointer/2048);
305 my $XRFMFP = $pointer - ($XRFMFB*2048);
307 # (XRFMFB - 1) * 512 + XRFMFP
308 # why do i have to do XRFMFP % 512 ?
310 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
312 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
314 # Get Record Information
316 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
318 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
319 my $value=unpack("V",$buff);
321 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
325 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
326 $self->{deleted} = $mfn;
330 carp "Error: MFN ".$mfn." not found in MST file, found $value";
334 read($self->{'fileMST'}, $buff, 14);
336 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
338 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
340 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
342 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
344 # Get Directory Format
350 read($self->{'fileMST'}, $buff, 6 * $NVF);
354 for (my $i = 0 ; $i < $NVF ; $i++) {
356 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
358 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
360 # The TAG does not exist in .FDT so we set it to 0.
362 # XXX This is removed from perl version; .FDT file is updated manually, so
363 # you will often have fields in .MST file which aren't in .FDT. On the other
364 # hand, IsisMarc doesn't use .FDT files at all!
366 #if (! $self->{TagName}->{$TAG}) {
377 # Get Variable Fields
379 read($self->{'fileMST'},$buff,$rec_len);
381 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
383 for (my $i = 0 ; $i < $NVF ; $i++) {
384 # skip zero-sized fields
385 next if ($FieldLEN[$i] == 0);
387 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
390 $self->{'current_mfn'} = $mfn;
392 print STDERR Dumper($self),"\n" if ($self->{debug});
394 return $self->{'record'};
399 Returns current MFN position
401 my $mfn = $isis->mfn;
405 # This function should be simple return $self->{current_mfn},
406 # but if new is called with _hack_mfn it becomes setter.
407 # It's useful in tests when setting $isis->{record} directly
411 return $self->{current_mfn};
417 Returns ASCII output of record with specified MFN
419 print $isis->to_ascii(42);
421 This outputs something like this:
423 210 ^aNew York^cNew York University press^dcop. 1988
428 If C<read_fdt> is specified when calling C<new> it will display field names
429 from C<.FDT> file instead of numeric tags.
436 my $mfn = shift || croak "need MFN";
438 my $rec = $self->fetch($mfn) || return;
442 foreach my $f (sort keys %{$rec}) {
443 my $fn = $self->tag_name($f);
444 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
454 Read record with specified MFN and convert it to hash
456 my $hash = $isis->to_hash($mfn);
458 It has ability to convert characters (using C<hash_filter>) from ISIS
459 database before creating structures enabling character re-mapping or quick
462 This function returns hash which is like this:
467 'c' => 'New York University press',
479 You can later use that hash to produce any output from ISIS data.
481 If database is created using IsisMarc, it will also have two special fields
482 which will be used for identifiers, C<i1> and C<i2> like this:
489 'f' => 'Valdo D\'Arienzo',
490 'e' => 'tipografie e tipografi nel XVI secolo',
494 In case there are repeatable subfields in record, this will create
498 'a' => [ 'foo', 'bar', 'baz' ],
501 Or in more complex example of
503 902 ^aa1^aa2^aa3^bb1^aa4^bb2^cc1^aa5
508 { a => ["a1", "a2", "a3", "a4", "a5"], b => ["b1", "b2"], c => "c1" },
511 This behaviour can be changed using C<join_subfields_with> option to L</new>,
512 in which case C<to_hash> will always create single value for each subfield.
513 This will change result to:
517 This method will also create additional field C<000> with MFN.
519 There is also a more elaborate way to call C<to_hash>, like this:
521 my $hash = $isis->to_hash({
523 include_subfields => 1,
526 Each option controls creation of the hash:
532 Specify MFN number of record
534 =item include_subfields
536 This option will create additional key in hash called C<subfields> which will
537 have original record subfield order and index to that subfield like this:
540 a => ["a1", "a2", "a3", "a4", "a5"],
543 subfields => ["a", 0, "a", 1, "a", 2, "b", 0, "a", 3, "b", 1, "c", 0, "a", 4],
546 =item join_subfields_with
548 Define delimiter which will be used to join repeatable subfields. You can
549 specify option here instead of in L</new> if you want to have per-record control.
553 You can override C<hash_filter> defined in L</new> using this option.
563 my $mfn = shift || confess "need mfn!";
566 my $hash_filter = $self->{hash_filter};
568 if (ref($mfn) eq 'HASH') {
570 $mfn = $arg->{mfn} || confess "need mfn in arguments";
571 $hash_filter = $arg->{hash_filter} if ($arg->{hash_filter});
574 # init record to include MFN as field 000
575 my $rec = { '000' => [ $mfn ] };
577 my $row = $self->fetch($mfn) || return;
579 my $j_rs = $arg->{join_subfields_with};
580 $j_rs = $self->{join_subfields_with} unless(defined($j_rs));
581 my $i_sf = $arg->{include_subfields};
583 foreach my $f_nr (keys %{$row}) {
584 foreach my $l (@{$row->{$f_nr}}) {
587 $l = $hash_filter->($l, $f_nr) if ($hash_filter);
588 next unless defined($l);
591 my $r_sf; # repeatable subfields in this record
594 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
598 foreach my $t (split(/\^/,$l)) {
600 my ($sf,$v) = (substr($t,0,1), substr($t,1));
601 # XXX this might be option, but why?
602 next unless (defined($v) && $v ne '');
603 # warn "### $f_nr^$sf:$v",$/ if ($self->{debug} > 1);
605 if (ref( $val->{$sf} ) eq 'ARRAY') {
607 push @{ $val->{$sf} }, $v;
609 # record repeatable subfield and its offset
610 push @{ $val->{subfields} }, ( $sf, $#{ $val->{$sf} } ) if (! $j_rs && $i_sf);
613 } elsif (defined( $val->{$sf} )) {
615 # convert scalar field to array
616 $val->{$sf} = [ $val->{$sf}, $v ];
618 push @{ $val->{subfields} }, ( $sf, 1 ) if (! $j_rs && $i_sf);
623 push @{ $val->{subfields} }, ( $sf, 0 ) if ($i_sf);
632 $val->{$_} = join($j_rs, @{ $val->{$_} });
636 push @{$rec->{$f_nr}}, $val;
645 Return name of selected tag
647 print $isis->tag_name('200');
653 my $tag = shift || return;
654 return $self->{'TagName'}->{$tag} || $tag;
660 Read content of C<.CNT> file and return hash containing it.
662 print Dumper($isis->read_cnt);
664 This function is not used by module (C<.CNT> files are not required for this
665 module to work), but it can be useful to examine your index (while debugging
673 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
675 # Get the index information from $db.CNT
677 open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
682 read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
683 $self->unpack_cnt($buff);
685 read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
686 $self->unpack_cnt($buff);
695 Unpack one of two 26 bytes fixed length record in C<.CNT> file.
697 Here is definition of record:
699 off key description size
700 0: IDTYPE BTree type s
701 2: ORDN Nodes Order s
702 4: ORDF Leafs Order s
703 6: N Number of Memory buffers for nodes s
704 8: K Number of buffers for first level index s
705 10: LIV Current number of Index Levels s
706 12: POSRX Pointer to Root Record in N0x l
707 16: NMAXPOS Next Available position in N0x l
708 20: FMAXPOS Next available position in L0x l
709 24: ABNORMAL Formal BTree normality indicator s
712 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
719 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
721 my $buff = shift || return;
722 my @arr = unpack("vvvvvvVVVv", $buff);
724 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
726 my $IDTYPE = shift @arr;
728 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
736 Some parts of CDS/ISIS documentation are not detailed enough to explain
737 some variations in input databases which have been tested with this module.
738 When I was in doubt, I assumed that OpenIsis's implementation was right
739 (except for obvious bugs).
741 However, every effort has been made to test this module with as many
742 databases (and programs that create them) as possible.
744 I would be very grateful for success or failure reports about usage of this
745 module with databases from programs other than WinIsis and IsisMarc. I had
746 tested this against output of one C<isis.dll>-based application, but I don't
747 know any details about its version.
751 As this is a young module, new features are added in subsequent versions. It's
752 a good idea to specify version when using this module like this:
754 use Biblio::Isis 0.23
756 Below is list of changes in specific version of module (so you can target
757 older versions if you really have to):
763 Added C<hash_filter> to L</to_hash>
767 Added field number when calling C<hash_filter>
771 Added C<join_subfields_with> to L</new> and L</to_hash>.
773 Added C<include_subfields> to L</to_hash>.
777 Added C<< $isis->mfn >>, support for repeatable subfields and
778 C<< $isis->to_hash({ mfn => 42, ... }) >> calling convention
787 http://www.rot13.org/~dpavlin/
789 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
790 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
794 This program is free software; you can redistribute
795 it and/or modify it under the same terms as Perl itself.
797 The full text of the license can be found in the
798 LICENSE file included with this module.
803 L<Biblio::Isis::Manual> for CDS/ISIS manual appendix F, G and H which describe file format
805 OpenIsis web site L<http://www.openisis.org>
807 perl4lib site L<http://perl4lib.perl.org>