package IsisDB; use strict; use Carp; use File::Glob qw(:globally :nocase); use Data::Dumper; BEGIN { use Exporter (); use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); $VERSION = 0.07; @ISA = qw (Exporter); #Give a hoot don't pollute, do not export more than needed by default @EXPORT = qw (); @EXPORT_OK = qw (); %EXPORT_TAGS = (); } =head1 NAME IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database =head1 SYNOPSIS use IsisDB; my $isis = new IsisDB( isisdb => './cds/cds', ); for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) { print $isis->to_ascii($mfn),"\n"; } =head1 DESCRIPTION This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or IsisMarc. It can be used as perl-only alternative to OpenIsis module. It can create hash values from data in ISIS database (using C), ASCII dump (using C) or just hash with field names and packed values (like C<^asomething^belse>). Unique feature of this module is ability to C records. It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so fields which are zero sized will be filled with random junk from memory). It also has support for identifiers (only if ISIS database is created by IsisMarc), see C. This will module will always be slower than OpenIsis module which use C library. However, since it's written in perl, it's platform independent (so you don't need C compiler), and can be easily modified. I hope that it creates data structures which are easier to use than ones created by OpenIsis, so reduced time in other parts of the code should compensate for slower performance of this module (speed of reading ISIS database is rarely an issue). =head1 METHODS =cut # my $ORDN; # Nodes Order # my $ORDF; # Leafs Order # my $N; # Number of Memory buffers for nodes # my $K; # Number of buffers for first level index # my $LIV; # Current number of Index Levels # my $POSRX; # Pointer to Root Record in N0x # my $NMAXPOS; # Next Available position in N0x # my $FMAXPOS; # Next available position in L0x # my $ABNORMAL; # Formal BTree normality indicator # # some binary reads # =head2 new Open ISIS database my $isis = new IsisDB( isisdb => './cds/cds', read_fdt => 1, include_deleted => 1, hash_filter => sub { my $v = shift; $v =~ s#foo#bar#g; }, debug => 1, ); Options are described below: =over 5 =item isisdb This is full or relative path to ISIS database files which include common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using C option) files. In this example it uses C<./cds/cds.MST> and related files. =item read_fdt Boolean flag to specify if field definition table should be read. It's off by default. =item include_deleted Don't skip logically deleted records in ISIS. =item hash_filter Filter code ref which will be used before data is converted to hash. =item debug Dump a B of debugging output. =back It will also set C<$isis-E{'maxmfn'}> which is maximum MFN stored in database. =cut sub new { my $class = shift; my $self = {}; bless($self, $class); croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb}); foreach my $v (qw{isisdb debug include_deleted hash_filter}) { $self->{$v} = {@_}->{$v}; } my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*")); foreach my $f (@isis_files) { my $ext = $1 if ($f =~ m/\.(\w\w\w)$/); $self->{lc($ext)."_file"} = $f; } my @must_exist = qw(mst xrf); push @must_exist, "fdt" if ($self->{read_fdt}); foreach my $ext (@must_exist) { croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"}); } print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug}); # if you want to read .FDT file use read_fdt argument when creating class! if ($self->{read_fdt} && -e $self->{fdt_file}) { # read the $db.FDT file for tags my $fieldzone=0; open(fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!"; while () { chomp; if ($fieldzone) { my $name=substr($_,0,30); my $tag=substr($_,50,3); $name =~ s/\s+$//; $tag =~ s/\s+$//; $self->{'TagName'}->{$tag}=$name; } if (/^\*\*\*/) { $fieldzone=1; } } close(fileFDT); } # Get the Maximum MFN from $db.MST open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!"; # MST format: (* = 32 bit signed) # CTLMFN* always 0 # NXTMFN* MFN to be assigned to the next record created # NXTMFB* last block allocated to master file # NXTMFP offset to next available position in last block # MFTYPE always 0 for user db file (1 for system) seek($self->{'fileMST'},4,0); my $buff; read($self->{'fileMST'}, $buff, 4); $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero"; # save maximum MFN $self->{'maxmfn'} = $self->{'NXTMFN'} - 1; print STDERR Dumper($self),"\n" if ($self->{debug}); # open files for later open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!"; $self ? return $self : return undef; } =head2 read_cnt This function is not really used by module, but can be useful to find info about your index (if debugging it for example). print Dumper($isis->read_cnt); =cut sub read_cnt { my $self = shift; croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file}); # Get the index information from $db.CNT open(fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!"; # There is two 26 Bytes fixed lenght records # 0: IDTYPE BTree type 16 # 2: ORDN Nodes Order 16 # 4: ORDF Leafs Order 16 # 6: N Number of Memory buffers for nodes 16 # 8: K Number of buffers for first level index 16 # 10: LIV Current number of Index Levels 16 # 12: POSRX* Pointer to Root Record in N0x 32 # 16: NMAXPOS* Next Available position in N0x 32 # 20: FMAXPOS* Next available position in L0x 32 # 24: ABNORMAL Formal BTree normality indicator 16 # length: 26 bytes sub unpack_cnt { my $self = shift; my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL); my $buff = shift || return; my @arr = unpack("ssssssllls", $buff); print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'}); my $IDTYPE = shift @arr; foreach (@flds) { $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr); } } my $buff; read(fileCNT, $buff, 26); $self->unpack_cnt($buff); read(fileCNT, $buff, 26); $self->unpack_cnt($buff); close(fileCNT); return $self->{cnt}; } =head2 fetch Read record with selected MFN my $rec = $isis->fetch(55); Returns hash with keys which are field names and values are unpacked values for that field like this: $rec = { '210' => [ '^aNew York^cNew York University press^dcop. 1988' ], '990' => [ '2140', '88', 'HAY' ], }; =cut sub fetch { my $self = shift; my $mfn = shift || croak "fetch needs MFN as argument!"; # is mfn allready in memory? my $old_mfn = $self->{'current_mfn'} || -1; return if ($mfn == $old_mfn); print STDERR "## fetch: $mfn\n" if ($self->{debug}); # XXX check this? my $mfnpos=($mfn+int(($mfn-1)/127))*4; print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug}); seek($self->{'fileXRF'},$mfnpos,0); my $buff; # read XRFMFB abd XRFMFP read($self->{'fileXRF'}, $buff, 4); my $pointer=unpack("l",$buff) || carp "pointer is null"; my $XRFMFB = int($pointer/2048); my $XRFMFP = $pointer - ($XRFMFB*2048); # (XRFMFB - 1) * 512 + XRFMFP # why do i have to do XRFMFP % 1024 ? my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 1024); print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'}); # Get Record Information seek($self->{'fileMST'},$blk_off,0); read($self->{'fileMST'}, $buff, 4); my $value=unpack("l",$buff); print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug}); if ($value!=$mfn) { carp "Error: MFN ".$mfn." not found in MST(".$value.")"; #return; # XXX deleted record? } # $MFRL=$self->Read16($fileMST); # $MFBWB=$self->Read32($fileMST); # $MFBWP=$self->Read16($fileMST); # $BASE=$self->Read16($fileMST); # $NVF=$self->Read16($fileMST); # $STATUS=$self->Read16($fileMST); read($self->{'fileMST'}, $buff, 14); my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff); print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug}); # delete old record delete $self->{record}; ## FIXME this is a bug if (! $self->{'include_deleted'} && $MFRL < 0) { print "## logically deleted record $mfn, skipping...\n" if ($self->{debug}); return; } warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF); # Get Directory Format my @FieldPOS; my @FieldLEN; my @FieldTAG; read($self->{'fileMST'}, $buff, 6 * $NVF); my $rec_len = 0; for (my $i = 0 ; $i < $NVF ; $i++) { # $TAG=$self->Read16($fileMST); # $POS=$self->Read16($fileMST); # $LEN=$self->Read16($fileMST); my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6)); print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug}); # The TAG does not exists in .FDT so we set it to 0. # # XXX This is removed from perl version; .FDT file is updated manually, so # you will often have fields in .MST file which aren't in .FDT. On the other # hand, IsisMarc doesn't use .FDT files at all! #if (! $self->{TagName}->{$TAG}) { # $TAG=0; #} push @FieldTAG,$TAG; push @FieldPOS,$POS; push @FieldLEN,$LEN; $rec_len += $LEN; } # Get Variable Fields read($self->{'fileMST'},$buff,$rec_len); print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug}); for (my $i = 0 ; $i < $NVF ; $i++) { # skip zero-sized fields next if ($FieldLEN[$i] == 0); push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]); } $self->{'current_mfn'} = $mfn; print Dumper($self),"\n" if ($self->{debug}); return $self->{'record'}; } =head2 to_ascii Dump ASCII output of record with specified MFN print $isis->to_ascii(42); It outputs something like this: 210 ^aNew York^cNew York University press^dcop. 1988 990 2140 990 88 990 HAY If C is specified when calling C it will display field names from C<.FDT> file instead of numeric tags. =cut sub to_ascii { my $self = shift; my $mfn = shift || croak "need MFN"; my $rec = $self->fetch($mfn); my $out = "0\t$mfn"; foreach my $f (sort keys %{$rec}) { my $fn = $self->tag_name($f); $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}}); } $out .= "\n"; return $out; } =head2 to_hash Read record with specified MFN and convert it to hash my $hash = $isis->to_hash($mfn); It has ability to convert characters (using C from ISIS database before creating structures enabling character re-mapping or quick fix-up of data. This function returns hash which is like this: $hash = { '210' => [ { 'c' => 'New York University press', 'a' => 'New York', 'd' => 'cop. 1988' } ], '990' => [ '2140', '88', 'HAY' ], }; You can later use that hash to produce any output from ISIS data. If database is created using IsisMarc, it will also have to special fields which will be used for identifiers, C and C like this: '200' => [ { 'i1' => '1', 'i2' => ' ' 'a' => 'Goa', 'f' => 'Valdo D\'Arienzo', 'e' => 'tipografie e tipografi nel XVI secolo', } ], This method will also create additional field C<000> with MFN. =cut sub to_hash { my $self = shift; my $mfn = shift || confess "need mfn!"; # init record to include MFN as field 000 my $rec = { '000' => [ $mfn ] }; my $row = $self->fetch($mfn); foreach my $k (keys %{$row}) { foreach my $l (@{$row->{$k}}) { # filter output $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'}); my $val; # has identifiers? ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])//); # has subfields? if ($l =~ m/\^/) { foreach my $t (split(/\^/,$l)) { next if (! $t); $val->{substr($t,0,1)} = substr($t,1); } } else { $val = $l; } push @{$rec->{$k}}, $val; } } return $rec; } =head2 tag_name Return name of selected tag print $isis->tag_name('200'); =cut sub tag_name { my $self = shift; my $tag = shift || return; return $self->{'TagName'}->{$tag} || $tag; } 1; =head1 BUGS This module has been very lightly tested. Use with caution and report bugs. =head1 AUTHOR Dobrica Pavlinusic CPAN ID: DPAVLIN dpavlin@rot13.org http://www.rot13.org/~dpavlin/ This module is based heavily on code from C library to read ISIS files V0.1.1 written in php and (c) 2000 Franck Martin and released under LGPL. =head1 COPYRIGHT This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. The full text of the license can be found in the LICENSE file included with this module. =head1 SEE ALSO OpenIsis web site L perl4lib site L