5 use File::Glob qw(:globally :nocase);
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 #Give a hoot don't pollute, do not export more than needed by default
21 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
27 my $isis = new Biblio::Isis(
28 isisdb => './cds/cds',
31 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
32 print $isis->to_ascii($mfn),"\n";
37 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
39 seems to deprecate its old C<XS> bindings for perl.
41 It can create hash values from data in ISIS database (using C<to_hash>),
42 ASCII dump (using C<to_ascii>) or just hash with field names and packed
43 values (like C<^asomething^belse>).
45 Unique feature of this module is ability to C<include_deleted> records.
46 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
47 fields which are zero sized will be filled with random junk from memory).
49 It also has support for identifiers (only if ISIS database is created by
50 IsisMarc), see C<to_hash>.
52 This module will always be slower than OpenIsis module which uses C
53 library. However, since it's written in perl, it's platform independent (so
54 you don't need C compiler), and can be easily modified. I hope that it
55 creates data structures which are easier to use than ones created by
56 OpenIsis, so reduced time in other parts of the code should compensate for
57 slower performance of this module (speed of reading ISIS database is
64 # my $ORDN; # Nodes Order
65 # my $ORDF; # Leafs Order
66 # my $N; # Number of Memory buffers for nodes
67 # my $K; # Number of buffers for first level index
68 # my $LIV; # Current number of Index Levels
69 # my $POSRX; # Pointer to Root Record in N0x
70 # my $NMAXPOS; # Next Available position in N0x
71 # my $FMAXPOS; # Next available position in L0x
72 # my $ABNORMAL; # Formal BTree normality indicator
82 my $isis = new Biblio::Isis(
83 isisdb => './cds/cds',
87 my ($v,$field_number) = @_;
91 join_subfields_with => ' ; ',
94 Options are described below:
100 This is full or relative path to ISIS database files which include
101 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
102 C<read_fdt> option) files.
104 In this example it uses C<./cds/cds.MST> and related files.
108 Boolean flag to specify if field definition table should be read. It's off
111 =item include_deleted
113 Don't skip logically deleted records in ISIS.
117 Filter code ref which will be used before data is converted to hash. It will
118 receive two arguments, whole line from current field (in C<< $_[0] >>) and
119 field number (in C<< $_[1] >>).
123 Dump a B<lot> of debugging output even at level 1. For even more increase level.
125 =item join_subfields_with
127 Define delimiter which will be used to join repeatable subfields. This
128 option is included to support legacy applications written against versions
129 older than 0.21 of this module. By default, it is disabled. See L</to_hash>.
138 bless($self, $class);
140 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
142 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
143 $self->{$v} = {@_}->{$v};
146 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
148 foreach my $f (@isis_files) {
149 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
150 $self->{lc($ext)."_file"} = $f;
153 my @must_exist = qw(mst xrf);
154 push @must_exist, "fdt" if ($self->{read_fdt});
156 foreach my $ext (@must_exist) {
157 unless ($self->{$ext."_file"}) {
158 carp "missing ",uc($ext)," file in ",$self->{isisdb};
163 if ($self->{debug}) {
164 print STDERR "## using files: ",join(" ",@isis_files),"\n";
165 eval "use Data::Dump";
168 *Dumper = *Data::Dump::dump;
174 # if you want to read .FDT file use read_fdt argument when creating class!
175 if ($self->{read_fdt} && -e $self->{fdt_file}) {
177 # read the $db.FDT file for tags
180 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
186 my $name=substr($_,0,30);
187 my $tag=substr($_,50,3);
192 $self->{'TagName'}->{$tag}=$name;
203 # Get the Maximum MFN from $db.MST
205 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
206 binmode($self->{'fileMST'});
208 # MST format: (* = 32 bit signed)
210 # NXTMFN* MFN to be assigned to the next record created
211 # NXTMFB* last block allocated to master file
212 # NXTMFP offset to next available position in last block
213 # MFTYPE always 0 for user db file (1 for system)
214 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
218 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
219 $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
221 print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
223 # open files for later
224 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
225 binmode($self->{'fileXRF'});
227 $self ? return $self : return undef;
232 Return number of records in database
240 return $self->{'NXTMFN'} - 1;
245 Read record with selected MFN
247 my $rec = $isis->fetch(55);
249 Returns hash with keys which are field names and values are unpacked values
250 for that field like this:
253 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
254 '990' => [ '2140', '88', 'HAY' ],
262 my $mfn = shift || croak "fetch needs MFN as argument!";
264 # is mfn already in memory?
265 my $old_mfn = $self->{'current_mfn'} || -1;
266 return $self->{record} if ($mfn == $old_mfn);
268 print STDERR "## fetch: $mfn\n" if ($self->{debug});
271 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
273 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
274 seek($self->{'fileXRF'},$mfnpos,0);
279 delete $self->{record};
281 # read XRFMFB and XRFMFP
282 read($self->{'fileXRF'}, $buff, 4);
283 my $pointer=unpack("V",$buff);
285 if ($self->{include_deleted}) {
288 warn "pointer for MFN $mfn is null\n";
293 # check for logically deleted record
294 if ($pointer & 0x80000000) {
295 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
296 $self->{deleted} = $mfn;
298 return unless $self->{include_deleted};
301 $pointer = ($pointer ^ 0xffffffff) + 1;
304 my $XRFMFB = int($pointer/2048);
305 my $XRFMFP = $pointer - ($XRFMFB*2048);
307 # (XRFMFB - 1) * 512 + XRFMFP
308 # why do i have to do XRFMFP % 1024 ?
310 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
312 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
314 # Get Record Information
316 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
318 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
319 my $value=unpack("V",$buff);
321 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
325 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
326 $self->{deleted} = $mfn;
330 carp "Error: MFN ".$mfn." not found in MST file, found $value";
334 read($self->{'fileMST'}, $buff, 14);
336 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
338 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
340 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
342 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
344 # Get Directory Format
350 read($self->{'fileMST'}, $buff, 6 * $NVF);
354 for (my $i = 0 ; $i < $NVF ; $i++) {
356 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
358 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
360 # The TAG does not exist in .FDT so we set it to 0.
362 # XXX This is removed from perl version; .FDT file is updated manually, so
363 # you will often have fields in .MST file which aren't in .FDT. On the other
364 # hand, IsisMarc doesn't use .FDT files at all!
366 #if (! $self->{TagName}->{$TAG}) {
377 # Get Variable Fields
379 read($self->{'fileMST'},$buff,$rec_len);
381 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
383 for (my $i = 0 ; $i < $NVF ; $i++) {
384 # skip zero-sized fields
385 next if ($FieldLEN[$i] == 0);
387 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
390 $self->{'current_mfn'} = $mfn;
392 print STDERR Dumper($self),"\n" if ($self->{debug});
394 return $self->{'record'};
399 Returns current MFN position
401 my $mfn = $isis->mfn;
405 # This function should be simple return $self->{current_mfn},
406 # but if new is called with _hack_mfn it becomes setter.
407 # It's useful in tests when setting $isis->{record} directly
411 return $self->{current_mfn};
417 Returns ASCII output of record with specified MFN
419 print $isis->to_ascii(42);
421 This outputs something like this:
423 210 ^aNew York^cNew York University press^dcop. 1988
428 If C<read_fdt> is specified when calling C<new> it will display field names
429 from C<.FDT> file instead of numeric tags.
436 my $mfn = shift || croak "need MFN";
438 my $rec = $self->fetch($mfn) || return;
442 foreach my $f (sort keys %{$rec}) {
443 my $fn = $self->tag_name($f);
444 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
454 Read record with specified MFN and convert it to hash
456 my $hash = $isis->to_hash($mfn);
458 It has ability to convert characters (using C<hash_filter>) from ISIS
459 database before creating structures enabling character re-mapping or quick
462 This function returns hash which is like this:
467 'c' => 'New York University press',
479 You can later use that hash to produce any output from ISIS data.
481 If database is created using IsisMarc, it will also have two special fields
482 which will be used for identifiers, C<i1> and C<i2> like this:
489 'f' => 'Valdo D\'Arienzo',
490 'e' => 'tipografie e tipografi nel XVI secolo',
494 In case there are repeatable subfields in record, this will create
498 'a' => [ 'foo', 'bar', 'baz' ],
501 Or in more complex example of
503 902 ^aa1^aa2^aa3^bb1^aa4^bb2^cc1^aa5
508 { a => ["a1", "a2", "a3", "a4", "a5"], b => ["b1", "b2"], c => "c1" },
511 This behaviour can be changed using C<join_subfields_with> option to L</new>,
512 in which case C<to_hash> will always create single value for each subfield.
513 This will change result to:
517 This method will also create additional field C<000> with MFN.
519 There is also a more elaborate way to call C<to_hash> like this:
521 my $hash = $isis->to_hash({
523 include_subfields => 1,
526 Each option controls creation of the hash:
532 Specify MFN number of record
534 =item include_subfields
536 This option will create additional key in hash called C<subfields> which will
537 have original record subfield order and index to that subfield like this:
540 a => ["a1", "a2", "a3", "a4", "a5"],
543 subfields => ["a", 0, "a", 1, "a", 2, "b", 0, "a", 3, "b", 1, "c", 0, "a", 4],
546 =item join_subfields_with
548 Define delimiter which will be used to join repeatable subfields. You can
549 specify option here instead of in L</new> if you want to have per-record control.
559 my $mfn = shift || confess "need mfn!";
562 if (ref($mfn) eq 'HASH') {
564 $mfn = $arg->{mfn} || confess "need mfn in arguments";
567 # init record to include MFN as field 000
568 my $rec = { '000' => [ $mfn ] };
570 my $row = $self->fetch($mfn) || return;
572 my $j_rs = $arg->{join_subfields_with};
573 $j_rs = $self->{join_subfields_with} unless(defined($j_rs));
574 my $i_sf = $arg->{include_subfields};
576 foreach my $f_nr (keys %{$row}) {
577 foreach my $l (@{$row->{$f_nr}}) {
580 if ($self->{'hash_filter'}) {
581 $l = $self->{'hash_filter'}->($l, $f_nr);
582 next unless defined($l);
586 my $r_sf; # repeatable subfields in this record
589 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
593 foreach my $t (split(/\^/,$l)) {
595 my ($sf,$v) = (substr($t,0,1), substr($t,1));
596 # XXX this might be option, but why?
598 # warn "### $f_nr^$sf:$v",$/ if ($self->{debug} > 1);
600 if (ref( $val->{$sf} ) eq 'ARRAY') {
602 push @{ $val->{$sf} }, $v;
604 # record repeatable subfield and its offset
605 push @{ $val->{subfields} }, ( $sf, $#{ $val->{$sf} } ) if (! $j_rs && $i_sf);
608 } elsif (defined( $val->{$sf} )) {
610 # convert scalar field to array
611 $val->{$sf} = [ $val->{$sf}, $v ];
613 push @{ $val->{subfields} }, ( $sf, 1 ) if (! $j_rs && $i_sf);
618 push @{ $val->{subfields} }, ( $sf, 0 ) if ($i_sf);
627 $val->{$_} = join($j_rs, @{ $val->{$_} });
631 push @{$rec->{$f_nr}}, $val;
640 Return name of selected tag
642 print $isis->tag_name('200');
648 my $tag = shift || return;
649 return $self->{'TagName'}->{$tag} || $tag;
655 Read content of C<.CNT> file and return hash containing it.
657 print Dumper($isis->read_cnt);
659 This function is not used by module (C<.CNT> files are not required for this
660 module to work), but it can be useful to examine your index (while debugging
668 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
670 # Get the index information from $db.CNT
672 open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
677 read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
678 $self->unpack_cnt($buff);
680 read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
681 $self->unpack_cnt($buff);
690 Unpack one of two 26-byte fixed-length records in C<.CNT> file.
692 Here is definition of record:
694 off key description size
695 0: IDTYPE BTree type s
696 2: ORDN Nodes Order s
697 4: ORDF Leafs Order s
698 6: N Number of Memory buffers for nodes s
699 8: K Number of buffers for first level index s
700 10: LIV Current number of Index Levels s
701 12: POSRX Pointer to Root Record in N0x l
702 16: NMAXPOS Next Available position in N0x l
703 20: FMAXPOS Next available position in L0x l
704 24: ABNORMAL Formal BTree normality indicator s
707 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
714 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
716 my $buff = shift || return;
717 my @arr = unpack("vvvvvvVVVv", $buff);
719 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
721 my $IDTYPE = shift @arr;
723 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
731 Some parts of CDS/ISIS documentation are not detailed enough to explain
732 some variations in input databases which have been tested with this module.
733 When I was in doubt, I assumed that OpenIsis's implementation was right
734 (except for obvious bugs).
736 However, every effort has been made to test this module with as many
737 databases (and programs that create them) as possible.
739 I would be very grateful for success or failure reports about usage of this
740 module with databases from programs other than WinIsis and IsisMarc. I had
741 tested this against output of one C<isis.dll>-based application, but I don't
742 know any details about its version.
746 As this is a young module, new features are added in subsequent versions. It's
747 a good idea to specify version when using this module like this:
749 use Biblio::Isis 0.21
751 Below is list of changes in specific version of module (so you can target
752 older versions if you really have to):
758 Added field number when calling C<hash_filter>
762 Added C<join_subfields_with> to L</new> and L</to_hash>.
764 Added C<include_subfields> to L</to_hash>.
768 Added C<< $isis->mfn >>, support for repeatable subfields and
769 C<< $isis->to_hash({ mfn => 42, ... }) >> calling convention
778 http://www.rot13.org/~dpavlin/
780 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
781 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
785 This program is free software; you can redistribute
786 it and/or modify it under the same terms as Perl itself.
788 The full text of the license can be found in the
789 LICENSE file included with this module.
794 L<Biblio::Isis::Manual> for CDS/ISIS manual appendix F, G and H which describe file format
796 OpenIsis web site L<http://www.openisis.org>
798 perl4lib site L<http://perl4lib.perl.org>