fix CROASCII (B1.002:1982) filter
[webpac] / tools / isis2marc.pl
index 1b7d2a1..90049f1 100755 (executable)
 #!/usr/bin/perl -w
 
-# this utility will convert some (or all, defined by regex)
-# fields into marc file from one or more CDS/ISIS files
+# This utility will convert some (or all, depending on the definition in
+# the XML configuration file) fields and subfields, with remapping, into a
+# MARC file from one or more CDS/ISIS files
 #
 # 2004-02-23 Dobrica Pavlinusic <dpavlin@rot13.org>
 #
-# if ISIS databases are named same as directories in which they
+#
+# Run without parameters for usage instructions, or run without parameters
+# and redirect STDOUT to a file to create an example configuration file like
+# this:
+#
+# ./isis2marc.pl > config.xml
+#
+# If you want to create unique records, you need to define one or more
+# fields as key (which will be used to produce just one record for one
+# key)
+#
+# Keys are global for one run of script (that means for all ISIS databases
+# used in one run), but you can write arbitrary values (as opposed to field
+# names) inside key tag to produce unique key. For example,
+#
+# <key>author</key>
+# <key>700$a</key>
+#
+# WARNING: When using the <key> tag you can enter a field with a subfield
+# (in format 700$a), just a field name (for fields which don't have subfields,
+# like 005) or a literal value. Fields which don't exist in that record
+# will be skipped, and if the key is empty no output record will be produced.
+#
+# So, the best way to produce just a few records in output is to specify as key
+# a field which doesn't exist at all in the ISIS database, or one literal value!!
+#
+#
+# If ISIS databases are named same as directories in which they
 # reside, you can specify just directories (so that shell globbing works)
 # like this:
 # 
-# ./isis2marc.pl all.marc /mnt2/*/LIBRI
+# ./isis2marc.pl config.xml all.marc /mnt2/*/LIBRI
+#
 
 use strict;
 use OpenIsis;
 use MARC;
+use XML::Simple;
 use Data::Dumper;
 
-# to select all fields use something like /./
-my $field_filter = '^700$';
+if ($#ARGV < 2) {
+       print STDERR "Usage: $0 config.xml marc_file.iso isis_db [isis_db ...|isis_dir]\n";
+       print STDERR <<'_END_OF_USAGE_';
+
+       isis_db can be path to directory (if ISIS database is called
+       same as database) which will make shell globing work
+       or full path to ISIS database (without any extension)
+
+       Example configuration file will be dumped to standard output
+       after this, so you can just re-direct output of this script
+       to produce config file like this:
+
+       $ ./isis2marc.pl > config.xml
+
+_END_OF_USAGE_
+
+       print <<'_END_OF_CONFIG_';
+
+<?xml version="1.0" encoding="ISO-8859-2"?>
+<!-- template configuration file -->
+<mapping>
+       <record>
+               <key>700$a</key>
+               <key>700$b</key>
+               <field tag="700">
+                       <indicator1>0</indicator1>
+                       <indicator2>#</indicator2>
+                       <subfield id="a">700$a</subfield>
+                       <subfield id="b">700$b</subfield>
+               </field>
+               <field tag="009">
+                       <nosubfield>900</nosubfield>
+               </field>
+       </record>
+
+</mapping>
+
+_END_OF_CONFIG_
+
+       exit 1;
+}
+
+my $xml = new XML::Simple();
 
-my $marc_file = shift @ARGV || die "Usage: $0 [MARC file] [ISIS db]...";
+my $config_file = shift @ARGV || die "no config file?";
+
+my $config = $xml->XMLin($config_file,
+       KeyAttr => { subfield => 'id' },
+       ForceArray => [ 'record', 'field', 'subfield', 'nosubfield' ],
+       ContentKey => '-content',
+       ) || die "can't open configuration file '$config_file': $!";
+
+my $marc_file = shift @ARGV || die "no marc file?";
 
 my $marc=MARC->new;
 
@@ -32,6 +111,7 @@ select(STDOUT); $|=1;
 my %stored;
 my $total = 0;
 
+
 foreach my $db_file (@ARGV) {
 
        print "reading '$db_file'";
@@ -55,43 +135,114 @@ foreach my $db_file (@ARGV) {
        for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
                print "." if ($mfn % $step == 0);
                my $row = OpenIsis::read( $db, $mfn );
-               foreach my $fld (keys %{$row}) {
-                       next if ($fld !~ m/$field_filter/);
 
-                       my @values;
-                       my $num;
+               # unroll this field to in-memory structure data
+               my %data;
 
-                       foreach my $sf (@{$row->{$fld}}) {
+               # delete mfn from $row because it's literal value and
+               # not array, so rest of code would croak
+               delete($row->{mfn});
 
-                               $stored{$sf}++;
+               foreach my $fld (keys %{$row}) {
 
-                               next if ($stored{$sf} > 1);
+                       foreach my $rec_data (@{$row->{$fld}}) {
 
-                               my %v;
-                               while ($sf =~ s/\^(\w)([^\^]+)//) {
-                                       $v{$1} = $2;
+                               while ($rec_data =~ s/\^(\w)([^\^]+)//) {
+                                       $data{$fld.'$'.$1} = $2;
 
                                        # delete last subfield delimiter
-                                       $sf = "" if ($sf =~ /\^\w*$/);
+                                       $rec_data = "" if ($rec_data =~ /(\^\w*$|\^\w\s*$)/);
                                }
-                               if (%v) {
-                                       push @values, %v;
-                               } elsif ($sf && $sf !~ /^(\^\w)*\s*$/) {
-                                       # regex above remove empty subfields
-                                       push @values, $sf;
+
+                               # record data still exist? it's field without
+                               # subfields, then...
+                               if ($rec_data) {
+                                       $data{$fld} = $rec_data;
                                }
+                       }
+               }
 
+               # now, create output MARC record(s)
+       
+               foreach my $cfg_rec (@{$config->{record}}) {
+
+                       # do we have unique key?
+                       my $key;
+                       foreach (@{$cfg_rec->{key}}) {
+                               if ($data{$_}) {
+                                       $key .= $data{$_};
+                               } elsif (! m/^\d{3,4}(\$\w)*$/) {
+                                       $key .= $_;
+                               } else {
+                                       $key .= "";
+                               }
+                       }
+
+                       next if ($key && $stored{$key} || $key eq "");
+
+                       $stored{$key}++ if ($key);
+
+
+                       # this will be new record (if needed)
+                       my $num;
+
+                       # with one or more fields
+                       foreach my $cfg_fld (@{$cfg_rec->{field}}) {
+
+                               my $new_fld = $cfg_fld->{tag};
+
+                               #
+                               # first create fields without subfields
+                               #
+
+                               # with one or more subfields
+                               foreach my $f (@{$cfg_fld->{nosubfield}}) {
+                                       next if (! $data{$f});
+
+                                       if (! $num) {
+                                               $num=$marc->createrecord();
+                                               $new++;
+                                       }
+                                       my $i1 = $cfg_fld->{indicator1} || ' ';
+                                       my $i2 = $cfg_fld->{indicator2} || ' ';
+                                       $marc->addfield({record=>$num,
+                                               field=>$new_fld,
+                                               i1=>$i1,
+                                               i2=>$i2,
+                                               value=>$data{$f}
+                                       });
+                               }
+
+                               #
+                               # then create fields with subfields
+                               #
+
+                               # this will hold subfield values
+                               my @values;
+
+                               # with one or more subfields
+                               foreach my $new_sf (keys %{$cfg_fld->{subfield}}) {
+                                       # field$subfield
+                                       my $f = $cfg_fld->{subfield}->{$new_sf};
+                                       if ($data{$f}) {
+                                               push @values, $new_sf;
+                                               push @values, $data{$f};
+                                       }
+                               }
                                next if (! @values);
 
                                if (! $num) {
                                        $num=$marc->createrecord();
                                        $new++;
                                }
+                               my $i1 = $cfg_fld->{indicator1} || ' ';
+                               my $i2 = $cfg_fld->{indicator2} || ' ';
                                $marc->addfield({record=>$num,
-                                       field=>$fld,
-                                       i1=>" ", i2=>" ",
-                                       value=>\@values});
-
+                                       field=>$new_fld,
+                                       i1=>$i1,
+                                       i2=>$i2,
+                                       value=>\@values}
+                               );
                        }
 
                }