Feel free to hack this script and convert it to your own needs.
-=head1 WARNING
+=head1 CAVEAT
-This script is in state of flux.
+This script will parse imput XML twice: once with C<XML::Twig> and
+then each entry with C<XML::Simple> to produce in-memory structure.
+That's because I wanted to keep node selection logical (and perl-like).
+
+If you don't like it, you can rewrite this script to use XPATH. I tried
+and failed (it seems that MODS is too complicated for my limited knowledge
+of XPATH).
=cut
my $xml_file = "/data/tehnika/fer/all.xml";
$xml_file = "/data/tehnika/fer/modsFER_1.xml";
+$xml_file = "/data/tehnika/fer/mods-small.xml";
my $marc_file = "fer.marc";
$|=1;
my $marc = MARC->new;
+my $ENCODING = 'ISO-8859-2';
+
my $twig=XML::Twig->new(
- twig_roots => { 'mods' => \&item },
- output_encoding => 'iso-8859-2',
+ twig_roots => { 'mods' => \&mods },
+ output_encoding => 'UTF8',
);
-my $utf2iso = Text::Iconv->new("UTF-8", "ISO-8859-2");
+my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);
+print "$xml_file: ";
$twig->parsefile($xml_file);
$twig->purge;
+print "$nr\nSaving MARC file...\n";
$marc->output({file=>"> $marc_file",'format'=>"usmarc"});
-sub item {
+sub mods {
my( $t, $elt)= @_;
my $xml=$elt->xml_string;
- my $ref = XMLin("<xml>".$xml."</xml>",
+ my $ref = XMLin('<xml>'.$xml.'</xml>',
ForceArray => [
'name',
'classification',
'topic',
- 'udc',
],
KeyAttr => {
'namePart' => 'type',
'identifier' => 'type',
- 'classification' => 'authority',
'namePart' => 'type',
'role' => 'type',
},
my $m_cache;
sub marc_add {
- my $m_cache = \shift;
+ my $m_cache = \shift || die "need m_cache";
my $fld = shift || die "need field!";
- my $sf = shift || '';
+ my $sf = shift;
+ my $data = shift || return;
- return if (! @_);
+#print "add: $fld",($sf ? "^".$sf : ''),": $data\n";
- my @a;
- foreach (@_) {
- next if (! $_);
- push @a,$sf if ($sf);
-# push @a,$utf2iso->convert($_) || $_;
- push @a,$_;
+ if ($sf) {
+ push @{$$m_cache->{tmp}->{$fld}}, $sf;
}
+ push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
+ }
- return if (! @a);
+ sub marc_rep {
+ my $m_cache = \shift || die "need m_cache";
+ foreach my $fld (@_) {
+#print "marc_rep: $fld\n";
+ push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
+ delete $$m_cache->{tmp}->{$fld};
+ }
+ }
-# print "storing $fld: ",join("|",@a),"\n";
+ sub marc_single {
+ my $m_cache = \shift || die "need m_cache";
+ foreach my $fld (@_) {
+#print "marc_single: $fld\n";
- push @{$$m_cache->{$fld}}, @a;
+ die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});
+ $$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
+ delete $$m_cache->{tmp}->{$fld};
+ }
+ }
+
+ sub marc_add_rep {
+ my $m_cache = \shift || die "need m_cache";
+ my $fld = shift || die "need field!";
+ my $sf = shift;
+ my $data = shift || return;
+
+ marc_add($$m_cache,$fld,$sf,$data);
+ marc_rep($$m_cache,$fld);
+ }
+
+ sub marc_add_single {
+ my $m_cache = \shift || die "need m_cache";
+ my $fld = shift || die "need field!";
+ my $sf = shift;
+ my $data = shift || return;
+
+ marc_add($$m_cache,$fld,$sf,$data);
+ marc_single($$m_cache,$fld);
}
my $journal = 0;
+ # Journals start with c- in our MODS
$journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);
-
- marc_add($m_cache,'610','a',@{$ref->{subject}->{topic}});
- my $fld = '700';
+ foreach my $t (@{$ref->{subject}->{topic}}) {
+ marc_add($m_cache,'610','a', $t);
+ marc_rep($m_cache,'610');
+ }
+
+ my $fld_700 = '700';
+ my $fld_710 = '710';
foreach my $name (@{$ref->{name}}) {
my $role = $name->{role}->{roleTerm}->{content};
next if (! $role);
if ($role eq "author") {
- marc_add($m_cache,$fld,'a',$name->{namePart}->{family});
- marc_add($m_cache,$fld,'b',$name->{namePart}->{given});
- marc_add($m_cache,$fld,'4',$role);
+ marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
+ marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
+ marc_add($m_cache,$fld_700,'4',$role);
+
+ marc_rep($m_cache,$fld_700);
# first author goes in 700, others in 701
- $fld = '701';
+ $fld_700 = '701';
} elsif ($role eq "editor" or $role eq "illustrator") {
marc_add($m_cache,'702','a',$name->{namePart}->{family});
marc_add($m_cache,'702','b',$name->{namePart}->{given});
marc_add($m_cache,'702','4',$role);
+ marc_rep($m_cache,'702');
+ } elsif ($role eq "corporate") {
+ marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
+ $fld_710 = '711';
+ } elsif ($role eq "conference") {
+ marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
+ $fld_710 = '711';
} else {
die "FATAL: don't know how to map role '$role'" if ($role);
}
if ($note) {
foreach my $n (split(/\s*;\s+/, $note)) {
if ($n =~ s/bibliogr:\s+//i) {
- marc_add($m_cache,'320','a',"Bibliografija: $n");
+ marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
} elsif ($n =~ s/ilustr:\s+//i) {
marc_add($m_cache,'215','c', $n);
} else {
- marc_add($m_cache,'320','a',$n);
+ marc_add_rep($m_cache,'320','a',$n);
}
}
}
-
+
my $type = $ref->{identifier}->{type};
if ($type) {
if ($type eq "isbn") {
- marc_add($m_cache,'010','a',$ref->{identifier}->{content});
+ marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
} elsif ($type eq "issn") {
- marc_add($m_cache,'011','a',$ref->{identifier}->{content});
+ marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
} else {
die "unknown identifier type $type";
}
marc_add($m_cache,'215','a', $data) if ($data);
marc_add($m_cache,'215','d', $tmp->{visina});
}
+ marc_rep($m_cache,'215');
- marc_add($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier});
+ marc_add_single($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier});
marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
+ marc_single($m_cache,'200');
- marc_add($m_cache,'675','a',$ref->{classification}->{udc});
+ foreach my $c (@{$ref->{classification}}) {
+ if ($c->{'authority'} eq "udc") {
+ marc_add_rep($m_cache,'675','a', $c->{'content'});
+ }
+ }
my $related = $ref->{relatedItem}->{type};
if ($related) {
if ($related eq "series") {
marc_add($m_cache,'225','a',$ref->{relatedItem}->{titleInfo}->{title});
marc_add($m_cache,'999','a',$ref->{relatedItem}->{titleInfo}->{partNumber});
+ marc_rep($m_cache,'225','999');
} elsif ($related eq "preceding") {
- marc_add($m_cache,'430','a',$ref->{relatedItem}->{titleInfo}->{title});
+ marc_add_rep($m_cache,'430','a',$ref->{relatedItem}->{titleInfo}->{title});
} else {
die "can't parse related item type $related" if ($related);
}
}
- marc_add($m_cache,'205','a',$ref->{originInfo}->{edition});
+ marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});
my $publisher = $ref->{originInfo}->{publisher};
if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) {
marc_add($m_cache,'210','c', $publisher);
}
- marc_add($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
-
marc_add($m_cache,'210','a',$ref->{originInfo}->{place});
-
marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});
+ marc_single($m_cache,'210');
+
+ marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
+
$nr++;
print "$nr " if ($nr % 100 == 0);
# dump record
- my $m=$marc->createrecord();
- foreach my $fld (keys %{$m_cache}) {
-# print "$fld: ",join(" * ",@{$m_cache->{$fld}}),"\n";
+ my $m=$marc->createrecord({leader=>"00000nam 2200000 a 4500"});
+
+ foreach my $fld (keys %{$m_cache->{array}}) {
+ foreach my $arr (@{$m_cache->{array}->{$fld}}) {
+#print "array = ",Dumper($arr);
+ my ($i1,$i2);
+ # do we have indicators?
+ if ($fld =~ m/^(.+)\t(.)(.)$/) {
+ $fld = $1;
+ ($i1,$i2) = ($2,$3);
+ }
+ $marc->addfield({record=>$m,
+ field=>$fld,
+ i1=>$i1,
+ i2=>$i2,
+ value=>$arr
+ });
+ }
+ }
+
+ foreach my $fld (keys %{$m_cache->{single}}) {
+#print "single = ",Dumper($m_cache->{single}->{$fld});
+ my ($i1,$i2);
+ # do we have indicators?
+ if ($fld =~ m/^(.+)\t(.)(.)$/) {
+ $fld = $1;
+ ($i1,$i2) = ($2,$3);
+ }
$marc->addfield({record=>$m,
field=>$fld,
- value=>\@{$m_cache->{$fld}}
+ i1=>$i1,
+ i2=>$i2,
+ value=>$m_cache->{single}->{$fld}
});
}
+ $m_cache = {};
+
$t->purge; # frees the memory
}