bugfix: check Isis database error in correct place
[webpac] / tools / mods2unimarc.pl
index 310f70b..03892b4 100755 (executable)
@@ -6,7 +6,7 @@ mods2marc.pl - convert MODS XML back to MARC (ISO2709)
 
 =head1 SYNOPSIS
 
-mods2marc.pl mods.xml export.marc
+mods2marc.pl export.marc mods.xml [mods2.xml ... ]
 
 =head1 DESCRIPTION
 
@@ -24,9 +24,15 @@ of fields which are in Croatian).
 
 Feel free to hack this script and convert it to your own needs.
 
-=head1 WARNING
+=head1 CAVEAT
 
-This script is in state of flux.
+This script will parse imput XML twice: once with C<XML::Twig> and
+then each entry with C<XML::Simple> to produce in-memory structure.
+That's because I wanted to keep node selection logical (and perl-like).
+
+If you don't like it, you can rewrite this script to use XPATH. I tried
+and failed (it seems that MODS is too complicated for my limited knowledge
+of XPATH).
 
 =cut
 
@@ -38,42 +44,50 @@ use Text::Iconv;
 
 use Data::Dumper;
 
-my $xml_file = "/data/tehnika/fer/all.xml";
-$xml_file = "/data/tehnika/fer/modsFER_1.xml";
-my $marc_file = "fer.marc";
+my $marc_file = shift @ARGV || die "$0: need MARC export file";
+die "$0: need at least one MODS XML file" if (! @ARGV);
 
 $|=1;
 my $nr = 0;
 
 my $marc = MARC->new;
 
+my $ENCODING = 'ISO-8859-2';
+$ENCODING = 'windows-1250';
+
 my $twig=XML::Twig->new(
-       twig_roots => { 'mods' => \&item },
-       output_encoding => 'iso-8859-2',
+       twig_roots => { 'mods' => \&mods },
+       output_encoding => 'UTF8',
 );
 
-my $utf2iso = Text::Iconv->new("UTF-8", "ISO-8859-2");
+my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);
 
-$twig->parsefile($xml_file);
-$twig->purge;
+foreach my $xml_file (@ARGV) {
+       print "$xml_file: ";
+       $twig->parsefile($xml_file);
+       $twig->purge;
+       print "$nr\n";
+}
+
+print "Saving MARC file...\n";
 
 $marc->output({file=>"> $marc_file",'format'=>"usmarc"});
 
-sub item {
+sub mods {
        my( $t, $elt)= @_;
 
        my $xml=$elt->xml_string;
-       my $ref = XMLin("<xml>".$xml."</xml>"
+       my $ref = XMLin('<xml>'.$xml.'</xml>'
                ForceArray => [
                        'name',
                        'classification',
                        'topic',
-                       'udc',
+                       'relatedItem',
+                       'partNumber',
                ],
                KeyAttr => {
                        'namePart' => 'type',
                        'identifier' => 'type',
-                       'classification' => 'authority',
                        'namePart' => 'type',
                        'role' => 'type',
                },
@@ -88,49 +102,95 @@ sub item {
        my $m_cache;
 
        sub marc_add {
-               my $m_cache = \shift;
+               my $m_cache = \shift || die "need m_cache";
                my $fld = shift || die "need field!";
-               my $sf = shift || '';
+               my $sf = shift;
+               my $data = shift || return;
+
+#print "add: $fld",($sf ? "^".$sf : ''),": $data\n";
+
+               if ($sf) {
+                       push @{$$m_cache->{tmp}->{$fld}}, $sf;
+               }
+               push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
+       }
+
+       sub marc_rep {
+               my $m_cache = \shift || die "need m_cache";
+               foreach my $fld (@_) {
+#print "marc_rep: $fld\n";     
+                       push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
+                       delete $$m_cache->{tmp}->{$fld};
+               }
+       }
+
+       sub marc_single {
+               my $m_cache = \shift || die "need m_cache";
+               foreach my $fld (@_) {
+#print "marc_single: $fld\n";  
 
-               return if (! @_);
+                       die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});
 
-               my @a;
-               foreach (@_) {
-                       next if (! $_);
-                       push @a,$sf if ($sf);
-#                      push @a,$utf2iso->convert($_) || $_;
-                       push @a,$_;
+                       $$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
+                       delete $$m_cache->{tmp}->{$fld};
                }
+       }
 
-               return if (! @a);
+       sub marc_add_rep {
+               my $m_cache = \shift || die "need m_cache";
+               my $fld = shift || die "need field!";
+               my $sf = shift;
+               my $data = shift || return;
 
-#              print "storing $fld: ",join("|",@a),"\n";
+               marc_add($$m_cache,$fld,$sf,$data);
+               marc_rep($$m_cache,$fld);
+       }
 
-               push @{$$m_cache->{$fld}}, @a;
+       sub marc_add_single {
+               my $m_cache = \shift || die "need m_cache";
+               my $fld = shift || die "need field!";
+               my $sf = shift;
+               my $data = shift || return;
 
+               marc_add($$m_cache,$fld,$sf,$data);
+               marc_single($$m_cache,$fld);
        }
 
        my $journal = 0;
+       # Journals start with c- in our MODS
        $journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);
-       
-       marc_add($m_cache,'610','a',@{$ref->{subject}->{topic}});
 
-       my $fld = '700';
+       foreach my $t (@{$ref->{subject}->{topic}}) {
+               marc_add($m_cache,'610','a', $t);
+               marc_rep($m_cache,'610');
+       }
+
+       my $fld_700 = '700';
+       my $fld_710 = '710';
 
        foreach my $name (@{$ref->{name}}) {
                my $role = $name->{role}->{roleTerm}->{content};
                next if (! $role);
                if ($role eq "author") {
-                       marc_add($m_cache,$fld,'a',$name->{namePart}->{family});
-                       marc_add($m_cache,$fld,'b',$name->{namePart}->{given});
-                       marc_add($m_cache,$fld,'4',$role);
+                       marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
+                       marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
+                       marc_add($m_cache,$fld_700,'4',$role);
+
+                       marc_rep($m_cache,$fld_700);
 
                        # first author goes in 700, others in 701
-                       $fld = '701';
+                       $fld_700 = '701';
                } elsif ($role eq "editor" or $role eq "illustrator") {
                        marc_add($m_cache,'702','a',$name->{namePart}->{family});
                        marc_add($m_cache,'702','b',$name->{namePart}->{given});
                        marc_add($m_cache,'702','4',$role);
+                       marc_rep($m_cache,'702');
+               } elsif ($role eq "corporate") {
+                       marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
+                       $fld_710 = '711';
+               } elsif ($role eq "conference") {
+                       marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
+                       $fld_710 = '711';
                } else {
                        die "FATAL: don't know how to map role '$role'" if ($role);
                }
@@ -141,23 +201,25 @@ sub item {
        if ($note) {
                foreach my $n (split(/\s*;\s+/, $note)) {
                        if ($n =~ s/bibliogr:\s+//i) {
-                               marc_add($m_cache,'320','a',"Bibliografija: $n");
+                               marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
                        } elsif ($n =~ s/ilustr:\s+//i) {
                                marc_add($m_cache,'215','c', $n);
                        } else {
-                               marc_add($m_cache,'320','a',$n);
+                               marc_add_rep($m_cache,'320','a',$n);
                        }
                }
        }
-                       
+
 
        my $type = $ref->{identifier}->{type};
 
        if ($type) {
                if ($type eq "isbn") {
-                       marc_add($m_cache,'010','a',$ref->{identifier}->{content});
+                       marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
                } elsif ($type eq "issn") {
-                       marc_add($m_cache,'011','a',$ref->{identifier}->{content});
+                       marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
+               } elsif ($type eq "uri") {
+                       marc_add_rep($m_cache,'856','u',$ref->{identifier}->{content});
                } else {
                        die "unknown identifier type $type";
                }
@@ -170,7 +232,7 @@ sub item {
                        if ($t =~ m/([^:]+):\s+(.+)$/) {
                                $tmp->{$1} = $2;
                        } else {
-                               die "can't parse $t";
+                               print STDERR "can't parse '$t' in ",Dumper($phy_desc);
                        }
                }
                my $data = $tmp->{pagin};
@@ -181,27 +243,53 @@ sub item {
                marc_add($m_cache,'215','a', $data) if ($data);
                marc_add($m_cache,'215','d', $tmp->{visina});
        }
+       marc_rep($m_cache,'215');
 
-       marc_add($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier});
+       my $mfn = $ref->{recordInfo}->{recordIdentifier};
+       $mfn =~ s/[^0-9]//g;
+       marc_add_single($m_cache,'001',undef,$mfn);
 
        marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
        marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
+       marc_single($m_cache,'200');
 
-       marc_add($m_cache,'675','a',$ref->{classification}->{udc});
+       foreach my $c (@{$ref->{classification}}) {
+               if ($c->{'authority'} eq "udc") {
+                       marc_add_rep($m_cache,'675','a', $c->{'content'});
+               }
+       }
 
-       my $related = $ref->{relatedItem}->{type};
-       if ($related) {
-               if ($related eq "series") {
-                       marc_add($m_cache,'225','a',$ref->{relatedItem}->{titleInfo}->{title});
-                       marc_add($m_cache,'999','a',$ref->{relatedItem}->{titleInfo}->{partNumber});
-               } elsif ($related eq "preceding") {
-                       marc_add($m_cache,'430','a',$ref->{relatedItem}->{titleInfo}->{title});
-               } else {
-                       die "can't parse related item type $related" if ($related);
+       foreach my $ri (@{$ref->{relatedItem}}) {
+               my $related = $ri->{type};
+               if ($related) {
+                       if ($related eq "series") {
+                               marc_add_rep($m_cache,'225','a',$ri->{titleInfo}->{title});
+                               foreach my $pn (@{$ri->{titleInfo}->{partNumber}}) {
+                                       if ($journal) {
+                                               marc_add_rep($m_cache,'999','a',$pn);
+                                       } else {
+                                               marc_add_rep($m_cache,'225','v',$pn);
+                                       }
+                               }
+                       } elsif ($related eq "preceding") {
+                               marc_add($m_cache,'520','a',$ri->{titleInfo}->{title});
+                               if ($ri->{identifier}) {
+                                       if ($ri->{identifier}->{type} eq "issn") {
+                                               marc_add($m_cache,'520','x',$ri->{identifier}->{content});
+                                       } else {
+                                               die "can't store identifier type $type";
+                                       }
+                               }
+                               marc_rep($m_cache,'520');
+                       } else {
+                               die "can't parse related item type $related" if ($related);
+                       }
                }
        }
 
-       marc_add($m_cache,'205','a',$ref->{originInfo}->{edition});
+       marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});
+
+       marc_add($m_cache,'210','a',$ref->{originInfo}->{place});
 
        my $publisher = $ref->{originInfo}->{publisher};
        if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) {
@@ -211,25 +299,56 @@ sub item {
                marc_add($m_cache,'210','c', $publisher);
        }
 
-       marc_add($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
+       marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});
 
-       marc_add($m_cache,'210','a',$ref->{originInfo}->{place});
+       marc_single($m_cache,'210');
 
-       marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});
+       marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
 
        $nr++;
        print "$nr " if ($nr % 100 == 0);
 
        # dump record
-       my $m=$marc->createrecord();
-       foreach my $fld (keys %{$m_cache}) {
-#              print "$fld: ",join(" * ",@{$m_cache->{$fld}}),"\n";
+       my $bib_level = "m";
+       $bib_level = "s" if ($journal);
+       my $m=$marc->createrecord({leader=>"00000na".$bib_level."  2200000 a 4500"});
+
+       foreach my $fld (keys %{$m_cache->{array}}) {
+               foreach my $arr (@{$m_cache->{array}->{$fld}}) {
+#print "array = ",Dumper($arr);
+                       my ($i1,$i2);
+                       # do we have indicators?
+                       if ($fld =~ m/^(.+)\t(.)(.)$/) {
+                               $fld = $1;
+                               ($i1,$i2) = ($2,$3);
+                       }
+                       $marc->addfield({record=>$m,
+                               field=>$fld,
+                               i1=>$i1,
+                               i2=>$i2,
+                               value=>$arr
+                       });
+               }
+       }
+
+       foreach my $fld (keys %{$m_cache->{single}}) {
+#print "single = ",Dumper($m_cache->{single}->{$fld});
+               my ($i1,$i2);
+               # do we have indicators?
+               if ($fld =~ m/^(.+)\t(.)(.)$/) {
+                       $fld = $1;
+                       ($i1,$i2) = ($2,$3);
+               }
                $marc->addfield({record=>$m,
                        field=>$fld,
-                       value=>\@{$m_cache->{$fld}}
+                       i1=>$i1,
+                       i2=>$i2,
+                       value=>$m_cache->{single}->{$fld}
                });
        }
 
+       $m_cache = {};
+
        $t->purge;           # frees the memory
 }