5 use File::Glob qw(:globally :nocase);
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 #Give a hoot don't pollute, do not export more than needed by default
21 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
27 my $isis = new Biblio::Isis(
28 isisdb => './cds/cds',
31 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
32 print $isis->to_ascii($mfn),"\n";
37 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
39 seems to depriciate it's old C<XS> bindings for perl.
41 It can create hash values from data in ISIS database (using C<to_hash>),
42 ASCII dump (using C<to_ascii>) or just hash with field names and packed
43 values (like C<^asomething^belse>).
45 Unique feature of this module is ability to C<include_deleted> records.
46 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
47 fields which are zero sized will be filled with random junk from memory).
49 It also has support for identifiers (only if ISIS database is created by
50 IsisMarc), see C<to_hash>.
52 This module will always be slower than OpenIsis module which use C
53 library. However, since it's written in perl, it's platform independent (so
54 you don't need C compiler), and can be easily modified. I hope that it
55 creates data structures which are easier to use than ones created by
56 OpenIsis, so reduced time in other parts of the code should compensate for
57 slower performance of this module (speed of reading ISIS database is
64 # my $ORDN; # Nodes Order
65 # my $ORDF; # Leafs Order
66 # my $N; # Number of Memory buffers for nodes
67 # my $K; # Number of buffers for first level index
68 # my $LIV; # Current number of Index Levels
69 # my $POSRX; # Pointer to Root Record in N0x
70 # my $NMAXPOS; # Next Available position in N0x
71 # my $FMAXPOS; # Next available position in L0x
72 # my $ABNORMAL; # Formal BTree normality indicator
82 my $isis = new Biblio::Isis(
83 isisdb => './cds/cds',
93 Options are described below:
99 This is full or relative path to ISIS database files which include
100 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
101 C<read_fdt> option) files.
103 In this example it uses C<./cds/cds.MST> and related files.
107 Boolean flag to specify if field definition table should be read. It's off
110 =item include_deleted
112 Don't skip logically deleted records in ISIS.
116 Filter code ref which will be used before data is converted to hash.
120 Dump a B<lot> of debugging output even at level 1. For even more increase level.
129 bless($self, $class);
131 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
133 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
134 $self->{$v} = {@_}->{$v};
137 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
139 foreach my $f (@isis_files) {
140 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
141 $self->{lc($ext)."_file"} = $f;
144 my @must_exist = qw(mst xrf);
145 push @must_exist, "fdt" if ($self->{read_fdt});
147 foreach my $ext (@must_exist) {
148 unless ($self->{$ext."_file"}) {
149 carp "missing ",uc($ext)," file in ",$self->{isisdb};
154 if ($self->{debug}) {
155 print STDERR "## using files: ",join(" ",@isis_files),"\n";
156 eval "use Data::Dump";
159 *Dumper = *Data::Dump::dump;
165 # if you want to read .FDT file use read_fdt argument when creating class!
166 if ($self->{read_fdt} && -e $self->{fdt_file}) {
168 # read the $db.FDT file for tags
171 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
177 my $name=substr($_,0,30);
178 my $tag=substr($_,50,3);
183 $self->{'TagName'}->{$tag}=$name;
194 # Get the Maximum MFN from $db.MST
196 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
197 binmode($self->{'fileMST'});
199 # MST format: (* = 32 bit signed)
201 # NXTMFN* MFN to be assigned to the next record created
202 # NXTMFB* last block allocated to master file
203 # NXTMFP offset to next available position in last block
204 # MFTYPE always 0 for user db file (1 for system)
205 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
209 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
210 $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
212 print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
214 # open files for later
215 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
216 binmode($self->{'fileXRF'});
218 $self ? return $self : return undef;
223 Return number of records in database
231 return $self->{'NXTMFN'} - 1;
236 Read record with selected MFN
238 my $rec = $isis->fetch(55);
240 Returns hash with keys which are field names and values are unpacked values
241 for that field like this:
244 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
245 '990' => [ '2140', '88', 'HAY' ],
253 my $mfn = shift || croak "fetch needs MFN as argument!";
255 # is mfn allready in memory?
256 my $old_mfn = $self->{'current_mfn'} || -1;
257 return $self->{record} if ($mfn == $old_mfn);
259 print STDERR "## fetch: $mfn\n" if ($self->{debug});
262 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
264 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
265 seek($self->{'fileXRF'},$mfnpos,0);
270 delete $self->{record};
272 # read XRFMFB abd XRFMFP
273 read($self->{'fileXRF'}, $buff, 4);
274 my $pointer=unpack("V",$buff);
276 if ($self->{include_deleted}) {
279 warn "pointer for MFN $mfn is null\n";
284 # check for logically deleted record
285 if ($pointer & 0x80000000) {
286 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
287 $self->{deleted} = $mfn;
289 return unless $self->{include_deleted};
292 $pointer = ($pointer ^ 0xffffffff) + 1;
295 my $XRFMFB = int($pointer/2048);
296 my $XRFMFP = $pointer - ($XRFMFB*2048);
298 # (XRFMFB - 1) * 512 + XRFMFP
299 # why do i have to do XRFMFP % 1024 ?
301 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
303 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
305 # Get Record Information
307 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
309 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
310 my $value=unpack("V",$buff);
312 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
316 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
317 $self->{deleted} = $mfn;
321 carp "Error: MFN ".$mfn." not found in MST file, found $value";
325 read($self->{'fileMST'}, $buff, 14);
327 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
329 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
331 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
333 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
335 # Get Directory Format
341 read($self->{'fileMST'}, $buff, 6 * $NVF);
345 for (my $i = 0 ; $i < $NVF ; $i++) {
347 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
349 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
351 # The TAG does not exists in .FDT so we set it to 0.
353 # XXX This is removed from perl version; .FDT file is updated manually, so
354 # you will often have fields in .MST file which aren't in .FDT. On the other
355 # hand, IsisMarc doesn't use .FDT files at all!
357 #if (! $self->{TagName}->{$TAG}) {
368 # Get Variable Fields
370 read($self->{'fileMST'},$buff,$rec_len);
372 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
374 for (my $i = 0 ; $i < $NVF ; $i++) {
375 # skip zero-sized fields
376 next if ($FieldLEN[$i] == 0);
378 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
381 $self->{'current_mfn'} = $mfn;
383 print STDERR Dumper($self),"\n" if ($self->{debug});
385 return $self->{'record'};
390 Returns current MFN position
392 my $mfn = $isis->mfn;
396 # This function should be simple return $self->{current_mfn},
397 # but if new is called with _hack_mfn it becomes setter.
398 # It's useful in tests when setting $isis->{record} directly
402 return $self->{current_mfn};
408 Returns ASCII output of record with specified MFN
410 print $isis->to_ascii(42);
412 This outputs something like this:
414 210 ^aNew York^cNew York University press^dcop. 1988
419 If C<read_fdt> is specified when calling C<new> it will display field names
420 from C<.FDT> file instead of numeric tags.
427 my $mfn = shift || croak "need MFN";
429 my $rec = $self->fetch($mfn) || return;
433 foreach my $f (sort keys %{$rec}) {
434 my $fn = $self->tag_name($f);
435 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
445 Read record with specified MFN and convert it to hash
447 my $hash = $isis->to_hash($mfn);
449 It has ability to convert characters (using C<hash_filter>) from ISIS
450 database before creating structures enabling character re-mapping or quick
453 This function returns hash which is like this:
458 'c' => 'New York University press',
470 You can later use that hash to produce any output from ISIS data.
472 If database is created using IsisMarc, it will also have to special fields
473 which will be used for identifiers, C<i1> and C<i2> like this:
480 'f' => 'Valdo D\'Arienzo',
481 'e' => 'tipografie e tipografi nel XVI secolo',
485 In case there are repeatable subfields in record, this will create
489 'a' => [ 'foo', 'bar', 'baz' ],
492 This method will also create additional field C<000> with MFN.
494 There is also more elaborative way to call C<to_hash> like this:
496 my $hash = $isis->to_hash({
498 include_empty_subfields => 1,
507 my $mfn = shift || confess "need mfn!";
510 if (ref($mfn) eq 'HASH') {
512 $mfn = $arg->{mfn} || confess "need mfn in arguments";
515 # init record to include MFN as field 000
516 my $rec = { '000' => [ $mfn ] };
518 my $row = $self->fetch($mfn) || return;
520 foreach my $k (keys %{$row}) {
521 foreach my $l (@{$row->{$k}}) {
524 if ($self->{'hash_filter'}) {
525 $l = $self->{'hash_filter'}->($l);
526 next unless defined($l);
532 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
536 foreach my $t (split(/\^/,$l)) {
538 my ($sf,$v) = (substr($t,0,1), substr($t,1));
539 # FIXME make this option !
541 # warn "### $k^$sf:$v",$/ if ($self->{debug} > 1);
543 # FIXME array return optional, by default unroll to ' ; '
544 if (ref( $val->{$sf} ) eq 'ARRAY') {
546 push @{ $val->{$sf} }, $v;
547 } elsif (defined( $val->{$sf} )) {
548 # convert scalar field to array
549 $val->{$sf} = [ $val->{$sf}, $v ];
558 push @{$rec->{$k}}, $val;
567 Return name of selected tag
569 print $isis->tag_name('200');
575 my $tag = shift || return;
576 return $self->{'TagName'}->{$tag} || $tag;
582 Read content of C<.CNT> file and return hash containing it.
584 print Dumper($isis->read_cnt);
586 This function is not used by module (C<.CNT> files are not required for this
587 module to work), but it can be useful to examine your index (while debugging
595 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
597 # Get the index information from $db.CNT
599 open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
604 read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
605 $self->unpack_cnt($buff);
607 read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
608 $self->unpack_cnt($buff);
617 Unpack one of two 26 bytes fixed length record in C<.CNT> file.
619 Here is definition of record:
621 off key description size
622 0: IDTYPE BTree type s
623 2: ORDN Nodes Order s
624 4: ORDF Leafs Order s
625 6: N Number of Memory buffers for nodes s
626 8: K Number of buffers for first level index s
627 10: LIV Current number of Index Levels s
628 12: POSRX Pointer to Root Record in N0x l
629 16: NMAXPOS Next Available position in N0x l
630 20: FMAXPOS Next available position in L0x l
631 24: ABNORMAL Formal BTree normality indicator s
634 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
641 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
643 my $buff = shift || return;
644 my @arr = unpack("vvvvvvVVVv", $buff);
646 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
648 my $IDTYPE = shift @arr;
650 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
658 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
659 some variations in input databases which has been tested with this module.
660 When I was in doubt, I assumed that OpenIsis's implementation was right
661 (except for obvious bugs).
663 However, every effort has been made to test this module with as much
664 databases (and programs that create them) as possible.
666 I would be very greatful for success or failure reports about usage of this
667 module with databases from programs other than WinIsis and IsisMarc. I had
668 tested this against ouput of one C<isis.dll>-based application, but I don't
669 know any details about it's version.
673 You can find version dependencies documented here
679 Added C<< $isis->mfn >>, support for repeatable subfields and
680 C<< $isis->to_hash({ mfn => 42, ... }) >> calling convention
689 http://www.rot13.org/~dpavlin/
691 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
692 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
696 This program is free software; you can redistribute
697 it and/or modify it under the same terms as Perl itself.
699 The full text of the license can be found in the
700 LICENSE file included with this module.
705 OpenIsis web site L<http://www.openisis.org>
707 perl4lib site L<http://perl4lib.perl.org>