5 use File::Glob qw(:globally :nocase);
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 #Give a hoot don't pollute, do not export more than needed by default
21 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
27 my $isis = new Biblio::Isis(
28 isisdb => './cds/cds',
31 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
32 print $isis->to_ascii($mfn),"\n";
37 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
39 seems to depriciate it's old C<XS> bindings for perl.
41 It can create hash values from data in ISIS database (using C<to_hash>),
42 ASCII dump (using C<to_ascii>) or just hash with field names and packed
43 values (like C<^asomething^belse>).
45 Unique feature of this module is ability to C<include_deleted> records.
46 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
47 fields which are zero sized will be filled with random junk from memory).
49 It also has support for identifiers (only if ISIS database is created by
50 IsisMarc), see C<to_hash>.
52 This module will always be slower than OpenIsis module which use C
53 library. However, since it's written in perl, it's platform independent (so
54 you don't need C compiler), and can be easily modified. I hope that it
55 creates data structures which are easier to use than ones created by
56 OpenIsis, so reduced time in other parts of the code should compensate for
57 slower performance of this module (speed of reading ISIS database is
64 # my $ORDN; # Nodes Order
65 # my $ORDF; # Leafs Order
66 # my $N; # Number of Memory buffers for nodes
67 # my $K; # Number of buffers for first level index
68 # my $LIV; # Current number of Index Levels
69 # my $POSRX; # Pointer to Root Record in N0x
70 # my $NMAXPOS; # Next Available position in N0x
71 # my $FMAXPOS; # Next available position in L0x
72 # my $ABNORMAL; # Formal BTree normality indicator
82 my $isis = new Biblio::Isis(
83 isisdb => './cds/cds',
91 join_subfields_with => ' ; ',
94 Options are described below:
100 This is full or relative path to ISIS database files which include
101 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
102 C<read_fdt> option) files.
104 In this example it uses C<./cds/cds.MST> and related files.
108 Boolean flag to specify if field definition table should be read. It's off
111 =item include_deleted
113 Don't skip logically deleted records in ISIS.
117 Filter code ref which will be used before data is converted to hash.
121 Dump a B<lot> of debugging output even at level 1. For even more increase level.
123 =item join_subfields_with
125 Define delimiter which will be used to join repeatable subfields. This
126 option is included to support lagacy application written against version
127 older than 0.21 of this module. By default, it disabled. See L</to_hash>.
136 bless($self, $class);
138 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
140 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
141 $self->{$v} = {@_}->{$v};
144 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
146 foreach my $f (@isis_files) {
147 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
148 $self->{lc($ext)."_file"} = $f;
151 my @must_exist = qw(mst xrf);
152 push @must_exist, "fdt" if ($self->{read_fdt});
154 foreach my $ext (@must_exist) {
155 unless ($self->{$ext."_file"}) {
156 carp "missing ",uc($ext)," file in ",$self->{isisdb};
161 if ($self->{debug}) {
162 print STDERR "## using files: ",join(" ",@isis_files),"\n";
163 eval "use Data::Dump";
166 *Dumper = *Data::Dump::dump;
172 # if you want to read .FDT file use read_fdt argument when creating class!
173 if ($self->{read_fdt} && -e $self->{fdt_file}) {
175 # read the $db.FDT file for tags
178 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
184 my $name=substr($_,0,30);
185 my $tag=substr($_,50,3);
190 $self->{'TagName'}->{$tag}=$name;
201 # Get the Maximum MFN from $db.MST
203 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
204 binmode($self->{'fileMST'});
206 # MST format: (* = 32 bit signed)
208 # NXTMFN* MFN to be assigned to the next record created
209 # NXTMFB* last block allocated to master file
210 # NXTMFP offset to next available position in last block
211 # MFTYPE always 0 for user db file (1 for system)
212 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
216 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
217 $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
219 print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
221 # open files for later
222 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
223 binmode($self->{'fileXRF'});
225 $self ? return $self : return undef;
230 Return number of records in database
238 return $self->{'NXTMFN'} - 1;
243 Read record with selected MFN
245 my $rec = $isis->fetch(55);
247 Returns hash with keys which are field names and values are unpacked values
248 for that field like this:
251 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
252 '990' => [ '2140', '88', 'HAY' ],
260 my $mfn = shift || croak "fetch needs MFN as argument!";
262 # is mfn allready in memory?
263 my $old_mfn = $self->{'current_mfn'} || -1;
264 return $self->{record} if ($mfn == $old_mfn);
266 print STDERR "## fetch: $mfn\n" if ($self->{debug});
269 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
271 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
272 seek($self->{'fileXRF'},$mfnpos,0);
277 delete $self->{record};
279 # read XRFMFB abd XRFMFP
280 read($self->{'fileXRF'}, $buff, 4);
281 my $pointer=unpack("V",$buff);
283 if ($self->{include_deleted}) {
286 warn "pointer for MFN $mfn is null\n";
291 # check for logically deleted record
292 if ($pointer & 0x80000000) {
293 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
294 $self->{deleted} = $mfn;
296 return unless $self->{include_deleted};
299 $pointer = ($pointer ^ 0xffffffff) + 1;
302 my $XRFMFB = int($pointer/2048);
303 my $XRFMFP = $pointer - ($XRFMFB*2048);
305 # (XRFMFB - 1) * 512 + XRFMFP
306 # why do i have to do XRFMFP % 1024 ?
308 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
310 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
312 # Get Record Information
314 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
316 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
317 my $value=unpack("V",$buff);
319 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
323 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
324 $self->{deleted} = $mfn;
328 carp "Error: MFN ".$mfn." not found in MST file, found $value";
332 read($self->{'fileMST'}, $buff, 14);
334 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
336 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
338 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
340 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
342 # Get Directory Format
348 read($self->{'fileMST'}, $buff, 6 * $NVF);
352 for (my $i = 0 ; $i < $NVF ; $i++) {
354 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
356 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
358 # The TAG does not exists in .FDT so we set it to 0.
360 # XXX This is removed from perl version; .FDT file is updated manually, so
361 # you will often have fields in .MST file which aren't in .FDT. On the other
362 # hand, IsisMarc doesn't use .FDT files at all!
364 #if (! $self->{TagName}->{$TAG}) {
375 # Get Variable Fields
377 read($self->{'fileMST'},$buff,$rec_len);
379 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
381 for (my $i = 0 ; $i < $NVF ; $i++) {
382 # skip zero-sized fields
383 next if ($FieldLEN[$i] == 0);
385 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
388 $self->{'current_mfn'} = $mfn;
390 print STDERR Dumper($self),"\n" if ($self->{debug});
392 return $self->{'record'};
397 Returns current MFN position
399 my $mfn = $isis->mfn;
403 # This function should be simple return $self->{current_mfn},
404 # but if new is called with _hack_mfn it becomes setter.
405 # It's useful in tests when setting $isis->{record} directly
409 return $self->{current_mfn};
415 Returns ASCII output of record with specified MFN
417 print $isis->to_ascii(42);
419 This outputs something like this:
421 210 ^aNew York^cNew York University press^dcop. 1988
426 If C<read_fdt> is specified when calling C<new> it will display field names
427 from C<.FDT> file instead of numeric tags.
434 my $mfn = shift || croak "need MFN";
436 my $rec = $self->fetch($mfn) || return;
440 foreach my $f (sort keys %{$rec}) {
441 my $fn = $self->tag_name($f);
442 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
452 Read record with specified MFN and convert it to hash
454 my $hash = $isis->to_hash($mfn);
456 It has ability to convert characters (using C<hash_filter>) from ISIS
457 database before creating structures enabling character re-mapping or quick
460 This function returns hash which is like this:
465 'c' => 'New York University press',
477 You can later use that hash to produce any output from ISIS data.
479 If database is created using IsisMarc, it will also have to special fields
480 which will be used for identifiers, C<i1> and C<i2> like this:
487 'f' => 'Valdo D\'Arienzo',
488 'e' => 'tipografie e tipografi nel XVI secolo',
492 In case there are repeatable subfields in record, this will create
496 'a' => [ 'foo', 'bar', 'baz' ],
499 Or in more complex example of
501 902 ^aa1^aa2^aa3^bb1^aa4^bb2^cc1^aa5
506 { a => ["a1", "a2", "a3", "a4", "a5"], b => ["b1", "b2"], c => "c1" },
509 This behaviour can be changed using C<join_subfields_with> option to L</new>,
510 in which case C<to_hash> will always create single value for each subfield.
511 This will change result to:
515 This method will also create additional field C<000> with MFN.
517 There is also more elaborative way to call C<to_hash> like this:
519 my $hash = $isis->to_hash({
521 include_subfields => 1,
524 Each option controll creation of hash:
530 Specify MFN number of record
532 =item include_subfields
534 This option will create additional key in hash called C<subfields> which will
535 have original record subfield order and index to that subfield like this:
538 a => ["a1", "a2", "a3", "a4", "a5"],
541 subfields => ["a", 0, "a", 1, "a", 2, "b", 0, "a", 3, "b", 1, "c", 0, "a", 4],
544 =item join_subfields_with
546 Define delimiter which will be used to join repeatable subfields. You can
547 specify option here instead in L</new> if you want to have per-record controll.
557 my $mfn = shift || confess "need mfn!";
560 if (ref($mfn) eq 'HASH') {
562 $mfn = $arg->{mfn} || confess "need mfn in arguments";
565 # init record to include MFN as field 000
566 my $rec = { '000' => [ $mfn ] };
568 my $row = $self->fetch($mfn) || return;
570 my $j_rs = $arg->{join_repeatable_subfields};
571 $j_rs = $self->{join_repeatable_subfields} unless(defined($j_rs));
572 my $i_sf = $arg->{include_subfields};
574 foreach my $f_nr (keys %{$row}) {
575 foreach my $l (@{$row->{$f_nr}}) {
578 if ($self->{'hash_filter'}) {
579 $l = $self->{'hash_filter'}->($l);
580 next unless defined($l);
584 my $r_sf; # repeatable subfields in this record
587 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
591 foreach my $t (split(/\^/,$l)) {
593 my ($sf,$v) = (substr($t,0,1), substr($t,1));
594 # XXX this might be option, but why?
596 # warn "### $f_nr^$sf:$v",$/ if ($self->{debug} > 1);
598 if (ref( $val->{$sf} ) eq 'ARRAY') {
600 push @{ $val->{$sf} }, $v;
602 # record repeatable subfield it it's offset
603 push @{ $val->{subfields} }, ( $sf, $#{ $val->{$sf} } ) if (! $j_rs && $i_sf);
606 } elsif (defined( $val->{$sf} )) {
608 # convert scalar field to array
609 $val->{$sf} = [ $val->{$sf}, $v ];
611 push @{ $val->{subfields} }, ( $sf, 1 ) if (! $j_rs && $i_sf);
616 push @{ $val->{subfields} }, ( $sf, 0 ) if ($i_sf);
625 $val->{$_} = join($j_rs, @{ $val->{$_} });
629 push @{$rec->{$f_nr}}, $val;
638 Return name of selected tag
640 print $isis->tag_name('200');
646 my $tag = shift || return;
647 return $self->{'TagName'}->{$tag} || $tag;
653 Read content of C<.CNT> file and return hash containing it.
655 print Dumper($isis->read_cnt);
657 This function is not used by module (C<.CNT> files are not required for this
658 module to work), but it can be useful to examine your index (while debugging
666 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
668 # Get the index information from $db.CNT
670 open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
675 read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
676 $self->unpack_cnt($buff);
678 read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
679 $self->unpack_cnt($buff);
688 Unpack one of two 26 bytes fixed length record in C<.CNT> file.
690 Here is definition of record:
692 off key description size
693 0: IDTYPE BTree type s
694 2: ORDN Nodes Order s
695 4: ORDF Leafs Order s
696 6: N Number of Memory buffers for nodes s
697 8: K Number of buffers for first level index s
698 10: LIV Current number of Index Levels s
699 12: POSRX Pointer to Root Record in N0x l
700 16: NMAXPOS Next Available position in N0x l
701 20: FMAXPOS Next available position in L0x l
702 24: ABNORMAL Formal BTree normality indicator s
705 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
712 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
714 my $buff = shift || return;
715 my @arr = unpack("vvvvvvVVVv", $buff);
717 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
719 my $IDTYPE = shift @arr;
721 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
729 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
730 some variations in input databases which has been tested with this module.
731 When I was in doubt, I assumed that OpenIsis's implementation was right
732 (except for obvious bugs).
734 However, every effort has been made to test this module with as much
735 databases (and programs that create them) as possible.
737 I would be very greatful for success or failure reports about usage of this
738 module with databases from programs other than WinIsis and IsisMarc. I had
739 tested this against ouput of one C<isis.dll>-based application, but I don't
740 know any details about it's version.
744 As this is young module, new features are added in subsequent version. It's
745 a good idea to specify version when using this module like this:
747 use Biblio::Isis 0.21
749 Below is list of changes in specific version of module (so you can target
750 older versions if you really have to):
756 Added C<join_subfields_with> to L</new> and L</to_hash>.
758 Added C<include_subfields> to L</to_hash>.
762 Added C<< $isis->mfn >>, support for repeatable subfields and
763 C<< $isis->to_hash({ mfn => 42, ... }) >> calling convention
772 http://www.rot13.org/~dpavlin/
774 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
775 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
779 This program is free software; you can redistribute
780 it and/or modify it under the same terms as Perl itself.
782 The full text of the license can be found in the
783 LICENSE file included with this module.
788 OpenIsis web site L<http://www.openisis.org>
790 perl4lib site L<http://perl4lib.perl.org>