All-wrong-and-ugly solution to using exact match variables which are
[webpac] / tools / mods2unimarc.pl
1 #!/usr/bin/perl -w
2
3 =head1 NAME
4
5 mods2marc.pl - convert MODS XML back to MARC (ISO2709)
6
7 =head1 SYNOPSIS
8
9 mods2marc.pl export.marc mods.xml [mods2.xml ... ]
10
11 =head1 DESCRIPTION
12
13 This script will convert MODS format
14 L<http://www.loc.gov/standards/mods/>
15 back to MARC (ISO2709) format.
16
17 Since conversion back to MARC is not simple, lot of things are hard-coded
18 in this script.
19
20 This script B<is somewhat specific> to MODS export from
21 Faculty of Electrical Engineering and Computing
22 so you might want to edit it (among other thing, it includes a lot
23 of fields which are in Croatian).
24
25 Feel free to hack this script and convert it to your own needs.
26
27 =head1 CAVEAT
28
29 This script will parse imput XML twice: once with C<XML::Twig> and
30 then each entry with C<XML::Simple> to produce in-memory structure.
31 That's because I wanted to keep node selection logical (and perl-like).
32
33 If you don't like it, you can rewrite this script to use XPATH. I tried
34 and failed (it seems that MODS is too complicated for my limited knowledge
35 of XPATH).
36
37 =cut
38
39 use strict;
40 use XML::Twig;
41 use XML::Simple;
42 use MARC;
43 use Text::Iconv;
44
45 use Data::Dumper;
46
47 my $marc_file = shift @ARGV || die "$0: need MARC export file";
48 die "$0: need at least one MODS XML file" if (! @ARGV);
49
50 $|=1;
51 my $nr = 0;
52
53 my $marc = MARC->new;
54
55 my $ENCODING = 'ISO-8859-2';
56 $ENCODING = 'windows-1250';
57
58 my $twig=XML::Twig->new(
59         twig_roots => { 'mods' => \&mods },
60         output_encoding => 'UTF8',
61 );
62
63 my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);
64
65 foreach my $xml_file (@ARGV) {
66         print "$xml_file: ";
67         $twig->parsefile($xml_file);
68         $twig->purge;
69         print "$nr\n";
70 }
71
72 print "Saving MARC file...\n";
73
74 $marc->output({file=>"> $marc_file",'format'=>"usmarc"});
75
76 sub mods {
77         my( $t, $elt)= @_;
78
79         my $xml=$elt->xml_string;
80         my $ref = XMLin('<xml>'.$xml.'</xml>', 
81                 ForceArray => [
82                         'name',
83                         'classification',
84                         'topic',
85                         'relatedItem',
86                         'partNumber',
87                 ],
88                 KeyAttr => {
89                         'namePart' => 'type',
90                         'identifier' => 'type',
91                         'namePart' => 'type',
92                         'role' => 'type',
93                 },
94                 GroupTags => {
95                         'place' => 'placeTerm',
96                         'physicalDescription' => 'extent',
97                         'roleTerm' => 'content',
98                 },
99                 ContentKey => '-content',
100         );
101
102         my $m_cache;
103
104         sub marc_add {
105                 my $m_cache = \shift || die "need m_cache";
106                 my $fld = shift || die "need field!";
107                 my $sf = shift;
108                 my $data = shift || return;
109
110 #print "add: $fld",($sf ? "^".$sf : ''),": $data\n";
111
112                 if ($sf) {
113                         push @{$$m_cache->{tmp}->{$fld}}, $sf;
114                 }
115                 push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
116         }
117
118         sub marc_rep {
119                 my $m_cache = \shift || die "need m_cache";
120                 foreach my $fld (@_) {
121 #print "marc_rep: $fld\n";      
122                         push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
123                         delete $$m_cache->{tmp}->{$fld};
124                 }
125         }
126
127         sub marc_single {
128                 my $m_cache = \shift || die "need m_cache";
129                 foreach my $fld (@_) {
130 #print "marc_single: $fld\n";   
131
132                         die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});
133
134                         $$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
135                         delete $$m_cache->{tmp}->{$fld};
136                 }
137         }
138
139         sub marc_add_rep {
140                 my $m_cache = \shift || die "need m_cache";
141                 my $fld = shift || die "need field!";
142                 my $sf = shift;
143                 my $data = shift || return;
144
145                 marc_add($$m_cache,$fld,$sf,$data);
146                 marc_rep($$m_cache,$fld);
147         }
148
149         sub marc_add_single {
150                 my $m_cache = \shift || die "need m_cache";
151                 my $fld = shift || die "need field!";
152                 my $sf = shift;
153                 my $data = shift || return;
154
155                 marc_add($$m_cache,$fld,$sf,$data);
156                 marc_single($$m_cache,$fld);
157         }
158
159         my $journal = 0;
160         # Journals start with c- in our MODS
161         $journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);
162
163         foreach my $t (@{$ref->{subject}->{topic}}) {
164                 marc_add($m_cache,'610','a', $t);
165                 marc_rep($m_cache,'610');
166         }
167
168         my $fld_700 = '700';
169         my $fld_710 = '710';
170
171         foreach my $name (@{$ref->{name}}) {
172                 my $role = $name->{role}->{roleTerm}->{content};
173                 next if (! $role);
174                 if ($role eq "author") {
175                         marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
176                         marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
177                         marc_add($m_cache,$fld_700,'4',$role);
178
179                         marc_rep($m_cache,$fld_700);
180
181                         # first author goes in 700, others in 701
182                         $fld_700 = '701';
183                 } elsif ($role eq "editor" or $role eq "illustrator") {
184                         marc_add($m_cache,'702','a',$name->{namePart}->{family});
185                         marc_add($m_cache,'702','b',$name->{namePart}->{given});
186                         marc_add($m_cache,'702','4',$role);
187                         marc_rep($m_cache,'702');
188                 } elsif ($role eq "corporate") {
189                         marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
190                         $fld_710 = '711';
191                 } elsif ($role eq "conference") {
192                         marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
193                         $fld_710 = '711';
194                 } else {
195                         die "FATAL: don't know how to map role '$role'" if ($role);
196                 }
197         }
198
199         my $note = $ref->{note};
200
201         if ($note) {
202                 foreach my $n (split(/\s*;\s+/, $note)) {
203                         if ($n =~ s/bibliogr:\s+//i) {
204                                 marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
205                         } elsif ($n =~ s/ilustr:\s+//i) {
206                                 marc_add($m_cache,'215','c', $n);
207                         } else {
208                                 marc_add_rep($m_cache,'320','a',$n);
209                         }
210                 }
211         }
212
213
214         my $type = $ref->{identifier}->{type};
215
216         if ($type) {
217                 if ($type eq "isbn") {
218                         marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
219                 } elsif ($type eq "issn") {
220                         marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
221                 } elsif ($type eq "uri") {
222                         marc_add_rep($m_cache,'856','u',$ref->{identifier}->{content});
223                 } else {
224                         die "unknown identifier type $type";
225                 }
226         }
227
228         my $phy_desc = $ref->{physicalDescription};
229         if ($phy_desc) {
230                 my $tmp;
231                 foreach my $t (split(/\s*;\s+/, $phy_desc)) {
232                         if ($t =~ m/([^:]+):\s+(.+)$/) {
233                                 $tmp->{$1} = $2;
234                         } else {
235                                 print STDERR "can't parse '$t' in ",Dumper($phy_desc);
236                         }
237                 }
238                 my $data = $tmp->{pagin};
239                 $data .= ", " if ($data);
240                 if ($tmp->{str}) {
241                         $data .= $tmp->{str}." str";
242                 }
243                 marc_add($m_cache,'215','a', $data) if ($data);
244                 marc_add($m_cache,'215','d', $tmp->{visina});
245         }
246         marc_rep($m_cache,'215');
247
248         my $mfn = $ref->{recordInfo}->{recordIdentifier};
249         $mfn =~ s/[^0-9]//g;
250         marc_add_single($m_cache,'001',undef,$mfn);
251
252         marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
253         marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
254         marc_single($m_cache,'200');
255
256         foreach my $c (@{$ref->{classification}}) {
257                 if ($c->{'authority'} eq "udc") {
258                         marc_add_rep($m_cache,'675','a', $c->{'content'});
259                 }
260         }
261
262         foreach my $ri (@{$ref->{relatedItem}}) {
263                 my $related = $ri->{type};
264                 if ($related) {
265                         if ($related eq "series") {
266                                 marc_add_rep($m_cache,'225','a',$ri->{titleInfo}->{title});
267                                 foreach my $pn (@{$ri->{titleInfo}->{partNumber}}) {
268                                         if ($journal) {
269                                                 marc_add_rep($m_cache,'999','a',$pn);
270                                         } else {
271                                                 marc_add_rep($m_cache,'225','v',$pn);
272                                         }
273                                 }
274                         } elsif ($related eq "preceding") {
275                                 marc_add($m_cache,'520','a',$ri->{titleInfo}->{title});
276                                 if ($ri->{identifier}) {
277                                         if ($ri->{identifier}->{type} eq "issn") {
278                                                 marc_add($m_cache,'520','x',$ri->{identifier}->{content});
279                                         } else {
280                                                 die "can't store identifier type $type";
281                                         }
282                                 }
283                                 marc_rep($m_cache,'520');
284                         } else {
285                                 die "can't parse related item type $related" if ($related);
286                         }
287                 }
288         }
289
290         marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});
291
292         marc_add($m_cache,'210','a',$ref->{originInfo}->{place});
293
294         my $publisher = $ref->{originInfo}->{publisher};
295         if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) {
296                 marc_add($m_cache,'210','a', $2);
297                 marc_add($m_cache,'210','c', $1);
298         } else {
299                 marc_add($m_cache,'210','c', $publisher);
300         }
301
302         marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});
303
304         marc_single($m_cache,'210');
305
306         marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
307
308         $nr++;
309         print "$nr " if ($nr % 100 == 0);
310
311         # dump record
312         my $bib_level = "m";
313         $bib_level = "s" if ($journal);
314         my $m=$marc->createrecord({leader=>"00000na".$bib_level."  2200000 a 4500"});
315
316         foreach my $fld (keys %{$m_cache->{array}}) {
317                 foreach my $arr (@{$m_cache->{array}->{$fld}}) {
318 #print "array = ",Dumper($arr);
319                         my ($i1,$i2);
320                         # do we have indicators?
321                         if ($fld =~ m/^(.+)\t(.)(.)$/) {
322                                 $fld = $1;
323                                 ($i1,$i2) = ($2,$3);
324                         }
325                         $marc->addfield({record=>$m,
326                                 field=>$fld,
327                                 i1=>$i1,
328                                 i2=>$i2,
329                                 value=>$arr
330                         });
331                 }
332         }
333
334         foreach my $fld (keys %{$m_cache->{single}}) {
335 #print "single = ",Dumper($m_cache->{single}->{$fld});
336                 my ($i1,$i2);
337                 # do we have indicators?
338                 if ($fld =~ m/^(.+)\t(.)(.)$/) {
339                         $fld = $1;
340                         ($i1,$i2) = ($2,$3);
341                 }
342                 $marc->addfield({record=>$m,
343                         field=>$fld,
344                         i1=>$i1,
345                         i2=>$i2,
346                         value=>$m_cache->{single}->{$fld}
347                 });
348         }
349
350         $m_cache = {};
351
352         $t->purge;           # frees the memory
353 }
354