1 package WebPAC::Normalize::MARC;
4 marc marc_indicators marc_repeatable_subfield
5 marc_compose marc_leader marc_fixed
6 marc_duplicate marc_remove marc_count
14 use Storable qw/dclone/;
15 use Data::Dump qw/dump/;
18 use WebPAC::Normalize;
22 my ($marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
23 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
29 WebPAC::Normalize::MARC - create MARC/ISO2709 records
38 from => 225, to => 440,
52 'a, |x ; |v. |n, |p ; |v',
57 Returns number of records produced.
63 warn "## marc_template(",dump($args),")",$/ if $debug;
65 foreach ( qw/subfields_rename isis_template marc_template/ ) {
66 # warn "ref($_) = ",ref($args->{$_}) if $debug;
67 die "$_ not ARRAY" if defined($args->{$_}) && ref($args->{$_}) ne 'ARRAY';
70 die "marc_template needs isis_template or marc_template"
71 if ! defined $args->{isis_template} && ! defined $args->{marc_template};
73 my $rec = WebPAC::Normalize::_get_rec() || die "_get_rec?";
74 my $r = $rec->{ $args->{from} } || return;
75 die "record field ", $args->{from}, " isn't array ",dump( $rec ) unless (ref($r) eq 'ARRAY');
77 my @subfields_rename = @{ $args->{subfields_rename} };
78 # warn "### subfields_rename [$#subfields_rename] = ",dump( @subfields_rename ) if $debug;
80 confess "need mapping in pairs for subfields_rename"
81 if $#subfields_rename % 2 != 1;
83 my ( $subfields_rename, $from_subfields );
84 our $to_subfields = {};
85 while ( my ( $from, $to ) = splice(@subfields_rename, 0, 2) ) {
87 $from_subfields->{ $from }++,
88 $to_subfields->{ $to }++
90 $subfields_rename->{ $from }->[ $f ] = [ $to => $t ];
92 warn "### subfields_rename = ",dump( $subfields_rename ),$/ if $debug;
93 warn "### from_subfields = ", dump( $from_subfields ),$/ if $debug;
94 warn "### to_subfields = ", dump( $to_subfields ),$/ if $debug;
98 $_template->{isis}->{fields_re} = join('|', keys %$from_subfields );
99 $_template->{marc}->{fields_re} = join('|', keys %$to_subfields );
103 sub _parse_template {
104 my ( $name, $templates ) = @_;
106 my $fields_re = $_template->{$name}->{fields_re} || die "can't find $name in ",dump( $_template->{$name}->{fields_re} );
108 foreach my $template ( @{ $templates } ) {
113 my $nr = $count->{$sf}++;
114 push @order, [ $sf, $nr ];
117 my $pos_template = $template;
118 $pos_template =~ s/($fields_re)/my_count($1)/ge;
119 my $count_key = dump( $count );
120 warn "### template: |$template| -> |$pos_template| count = $count_key order = ",dump( @order ),$/ if $debug;
121 $_template->{$name}->{pos}->{ $count_key } = $pos_template;
122 $_template->{$name}->{order}->{ $pos_template } = [ @order ];
124 warn "### from ",dump( $templates ), " using $fields_re created ", dump( $_template ),$/ if $debug;
127 _parse_template( 'marc', $args->{marc_template} );
128 _parse_template( 'isis', $args->{isis_template} );
129 warn "### _template = ",dump( $_template ),$/ if $debug;
133 foreach my $r ( @{ $rec->{ $args->{from} } } ) {
135 my $i1 = $r->{i1} || ' ';
136 my $i2 = $r->{i2} || ' ';
137 $m = [ $args->{to}, $i1, $i2 ];
139 warn "### r = ",dump( $r ),$/ if $debug;
141 my ( $from_mapping, $to_mapping, $from_count, $to_count );
142 foreach my $from_sf ( keys %{$r} ) {
143 # skip everything which isn't one char subfield (e.g. 'subfields')
144 next unless $from_sf =~ m/^\w$/;
145 my $from_nr = $from_count->{$from_sf}++;
146 my $rename_to = $subfields_rename->{ $from_sf } ||
147 die "can't find subfield rename for $from_sf/$from_nr in ", dump( $subfields_rename );
148 my ( $to_sf, $to_nr ) = @{ $rename_to->[$from_nr] };
149 $to_mapping->{ $to_sf }->[ $to_nr ] = [ $from_sf => $from_nr ];
151 my $to_nr2 = $to_count->{ $to_sf }++;
152 $from_mapping->{ $from_sf }->[ $from_nr ] = [ $to_sf => $to_nr2 ];
154 warn "### from $from_sf/$from_nr -> $to_sf/$to_nr\tto $from_sf/$from_nr -> $to_sf/$to_nr2\n" if $debug;
157 warn "### from_mapping = ",dump( $from_mapping ), "\n### to_mapping = ",dump( $to_mapping ),$/ if $debug;
160 from => dump( $from_count ),
161 to => dump( $to_count),
164 warn "### count_key = ",dump( $count_key ),$/ if $debug;
166 my $processed_templates = 0;
168 # this defines order of traversal
169 foreach ( qw/isis:from marc:to/ ) {
170 my ($name,$count_name) = split(/:/);
172 my $ckey = $count_key->{$count_name} || die "can't find count_key $count_name in ",dump( $count_key );
174 my $template = $_template->{$name}->{pos}->{ $ckey } || next;
175 $processed_templates++;
177 warn "### traverse $name $count_name selected template: |$template|\n" if $debug;
181 my @templates = split(/\|/, $template );
182 @templates = ( $template ) unless @templates;
184 warn "### templates = ",dump( @templates ),$/ if $debug;
186 foreach my $sf ( @templates ) {
188 my ( $name, $r, $pre, $sf, $nr, $post ) = @_;
189 warn "#### fill_in( $name, r, '$pre', $sf, $nr, '$post' )\n" if $debug;
190 my ( $from_sf, $from_nr );
191 if ( $name eq 'marc' ) {
192 die "no $sf/$nr in to_mapping: ",dump( $to_mapping ), " form record ",dump( $r ) unless defined $to_mapping->{$sf}->[$nr];
193 ( $from_sf, $from_nr ) = @{ $to_mapping->{$sf}->[$nr] };
195 ( $from_sf, $from_nr ) = ( $sf, $nr );
197 my $v = $r->{ $from_sf }; # || die "no $from_sf/$from_nr";
198 if ( ref( $v ) eq 'ARRAY' ) {
199 $v = $pre . $v->[$from_nr] . $post;
200 } elsif ( $from_nr == 0 ) {
201 $v = $pre . $v . $post;
203 die "requested subfield $from_sf/$from_nr but it's ",dump( $v );
205 warn "#### fill_in( $sf, $nr ) = $from_sf/$from_nr >>>> ",dump( $v ),$/ if $debug;
206 $fill_in->{$sf}->[$nr] = $v;
209 my $fields_re = $_template->{$name}->{fields_re} || die "can't find $name in ",dump( $_template->{$name}->{fields_re} );
210 warn "#### $sf <<<< $fields_re\n" if $debug;
211 $sf =~ s/^(.*?)($fields_re)(\d+)(.*?)$/fill_in($name,$r,$1,$2,$3,$4)/ge;
212 warn "#### >>>> $sf with fill_in = ",dump( $fill_in ),$/ if $debug;
215 warn "## template: |$template|\n## _template->$name = ",dump( $_template->{$name} ),$/ if $debug;
217 foreach my $sf ( @{ $_template->{$name}->{order}->{$template} } ) {
218 my ( $sf, $nr ) = @$sf;
219 my $v = $fill_in->{$sf}->[$nr] || die "can't find fill_in $sf/$nr";
220 if ( $name eq 'isis') {
221 ( $sf, $nr ) = @{ $from_mapping->{$sf}->[$nr] };
223 warn "++ $sf/$nr |$v|\n" if $debug;
224 push @$m, ( $sf, $v );
227 warn "#### >>>> created MARC record: ", dump( $m ),$/ if $debug;
234 die "I don't have template for fields ",dump( $count_key ), "\n## available templates\n", dump( $_template ) unless $processed_templates;
235 warn ">>> $processed_templates templates applied to data\n",$/ if $debug;
241 foreach my $marc ( @marc_out ) {
242 warn "+++ ",dump( $marc ),$/ if $debug;
247 warn "### marc_template produced $recs MARC records: ",dump( @marc_out ),$/ if $debug;
254 Setup fields within MARC leader or get leader
256 marc_leader('05','c');
257 my $leader = marc_leader();
262 my ($offset,$value) = @_;
265 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
268 if (defined($marc_leader)) {
269 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
270 return $marc_leader->[ $marc_record_offset ];
279 Create control/indentifier fields with values in fixed positions
281 marc_fixed('008', 00, '070402');
282 marc_fixed('008', 39, '|');
284 Positions not specified will be filled with spaces (C<0x20>).
286 There will be no effort to extend last specified value to full length of
292 my ($f, $pos, $val) = @_;
293 die "need marc(field, position, value)" unless defined($f) && defined($pos);
295 confess "need val" unless defined $val;
302 if (length($old) <= $pos) {
303 $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
304 warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
306 $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
307 warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
311 } @{ $marc_record->[ $marc_record_offset ] };
314 my $v = ' ' x $pos . $val;
315 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
316 warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
322 Save value for MARC field
324 marc('900','a', rec('200','a') );
325 marc('001', rec('000') );
330 my $f = shift or die "marc needs field";
331 die "marc field must be numer" unless ($f =~ /^\d+$/);
335 $sf = shift or die "marc needs subfield";
339 my $v = $_; # make var read-write for Encode
340 next unless (defined($v) && $v !~ /^\s*$/);
341 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
343 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
345 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
350 =head2 marc_repeatable_subfield
352 Save values for MARC repetable subfield
354 marc_repeatable_subfield('910', 'z', rec('909') );
358 sub marc_repeatable_subfield {
360 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
361 $marc_repeatable_subfield->{ $f . $sf }++;
365 =head2 marc_indicators
367 Set both indicators for MARC field
369 marc_indicators('900', ' ', 1);
371 Any indicator value other than C<0-9> will be treated as undefined.
375 sub marc_indicators {
376 my $f = shift || die "marc_indicators need field!\n";
378 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
379 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
381 $i1 = ' ' if ($i1 !~ /^\d$/);
382 $i2 = ' ' if ($i2 !~ /^\d$/);
383 @{ $marc_indicators->{$f} } = ($i1,$i2);
388 Save values for each MARC subfield explicitly
397 If you specify C<+> for subfield, value will be appended
398 to previous defined subfield.
403 my $f = shift or die "marc_compose needs field";
404 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
406 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
407 my $m = [ $f, $i1, $i2 ];
409 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
412 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
419 next unless (defined($v) && $v !~ /^\s*$/);
420 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
422 push @$m, ( $sf, $v );
428 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
430 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
433 =head2 marc_duplicate
435 Generate copy of current MARC record and continue working on copy
439 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
440 C<< _get_marc_fields( offset => 42 ) >>.
445 my $m = $marc_record->[ -1 ];
446 die "can't duplicate record which isn't defined" unless ($m);
447 push @{ $marc_record }, dclone( $m );
448 push @{ $marc_leader }, dclone( marc_leader() );
449 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
450 $marc_record_offset = $#{ $marc_record };
451 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
457 Remove some field or subfield from MARC record.
460 marc_remove('200','a');
462 This will erase field C<200> or C<200^a> from current MARC record.
466 Will remove all fields in current MARC record.
468 This is useful after calling C<marc_duplicate> or on it's own (but, you
469 should probably just remove that subfield definition if you are not
470 using C<marc_duplicate>).
472 FIXME: support fields < 10.
479 die "marc_remove needs record number" unless defined($f);
481 my $marc = $marc_record->[ $marc_record_offset ];
483 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
487 delete( $marc_record->[ $marc_record_offset ] );
488 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
493 foreach ( 0 .. $#{ $marc } ) {
494 last unless (defined $marc->[$i]);
495 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
496 if ($marc->[$i]->[0] eq $f) {
499 splice @$marc, $i, 1;
500 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
503 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
504 my $o = ($j * 2) + 3;
505 if ($marc->[$i]->[$o] eq $sf) {
507 splice @{$marc->[$i]}, $o, 2;
508 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
509 # is record now empty?
510 if ($#{ $marc->[$i] } == 2) {
511 splice @$marc, $i, 1;
512 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
522 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
524 $marc_record->[ $marc_record_offset ] = $marc;
527 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
530 =head2 marc_original_order
532 Copy all subfields preserving original order to marc field.
534 marc_original_order( marc_field_number, original_input_field_number );
536 Please note that field numbers are consistent with other commands (marc
537 field number first), but somewhat counter-intuitive (destination and then
540 You might want to use this command if you are just renaming subfields or
541 using pre-processing modify_record in C<config.yml> and don't need any
542 post-processing or want to preserve order of original subfields.
547 sub marc_original_order {
549 my ($to, $from) = @_;
550 die "marc_original_order needs from and to fields\n" unless ($from && $to);
552 return unless defined($rec->{$from});
554 my $r = $rec->{$from};
555 die "record field $from isn't array ",dump( $rec ) unless (ref($r) eq 'ARRAY');
557 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
558 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
560 foreach my $d (@$r) {
562 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
563 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
567 my @sfs = @{ $d->{subfields} };
569 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
571 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
573 my $m = [ $to, $i1, $i2 ];
575 while (my $sf = shift @sfs) {
577 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
578 my $offset = shift @sfs;
579 die "corrupted sufields specification for field $from\n" unless defined($offset);
582 if (ref($d->{$sf}) eq 'ARRAY') {
583 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
584 } elsif ($offset == 0) {
587 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
589 push @$m, ( $sf, $v ) if (defined($v));
593 push @{ $marc_record->[ $marc_record_offset ] }, $m;
597 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
603 Return number of MARC records created using L</marc_duplicate>.
605 print "created ", marc_count(), " records";
610 return $#{ $marc_record };
620 my $marc = shift || die "no marc?";
621 push @{ $marc_record->[ $marc_record_offset ] }, $marc;
624 =head1 PRIVATE FUNCTIONS
628 Clean internal structures
633 ($marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
634 ($marc_record_offset, $marc_fetch_offset) = (0,0);
638 =head2 _get_marc_fields
640 Get all fields defined by calls to C<marc>
642 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
644 We are using I<magic> which detect repeatable fields only from
645 sequence of field/subfield data generated by normalization.
647 Repeatable field is created when there is second occurence of same subfield or
648 if any of indicators are different.
650 This is sane for most cases. Something like:
656 will be created from any combination of:
658 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
662 marc('900','a', rec('200','a') );
663 marc('900','b', rec('200','b') );
664 marc('900','c', rec('200','c') );
666 which might not be what you have in mind. If you need repeatable subfield,
667 define it using C<marc_repeatable_subfield> like this:
669 marc_repeatable_subfield('900','a');
670 marc('900','a', rec('200','a') );
671 marc('900','b', rec('200','b') );
672 marc('900','c', rec('200','c') );
676 900a-1 900a-2 900a-3 900b-1 900c-1
679 There is also support for returning next or specific using:
681 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
682 # do something with $mf
685 will always return fields from next MARC record or
687 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
689 will return 42th copy record (if it exists).
695 sub _get_marc_fields {
698 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
699 $fetch_pos = $marc_fetch_offset;
700 if ($arg->{offset}) {
701 $fetch_pos = $arg->{offset};
702 } elsif($arg->{fetch_next}) {
703 $marc_fetch_offset++;
706 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
708 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
710 my $marc_rec = $marc_record->[ $fetch_pos ];
712 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
714 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
716 # first, sort all existing fields
717 # XXX might not be needed, but modern perl might randomize elements in hash
718 my @sorted_marc_record = sort {
719 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
722 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
727 # count unique field-subfields (used for offset when walking to next subfield)
729 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
732 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
733 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
734 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
735 warn "## subfield count = ", dump( $u ), $/;
738 my $len = $#sorted_marc_record;
743 foreach ( 0 .. $len ) {
745 # find next element which isn't visited
746 while ($visited->{$i}) {
747 $i = ($i + 1) % ($len + 1);
753 my $row = dclone( $sorted_marc_record[$i] );
755 # field and subfield which is key for
756 # marc_repeatable_subfield and u
757 my $fsf = $row->[0] . ( $row->[3] || '' );
761 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
762 print "### this [$i]: ", dump( $row ),$/;
763 print "### sf: ", $row->[3], " vs ", $field->[3],
764 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
770 if ( $#$field >= 0 ) {
772 $row->[0] ne $field->[0] || # field
773 $row->[1] ne $field->[1] || # i1
774 $row->[2] ne $field->[2] # i2
777 warn "## saved/1 ", dump( $field ),$/ if ($debug);
781 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
783 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
784 ! $marc_repeatable_subfield->{ $fsf }
788 warn "## saved/2 ", dump( $field ),$/ if ($debug);
792 # append new subfields to existing field
793 push @$field, ( $row->[3], $row->[4] );
800 if (! $marc_repeatable_subfield->{ $fsf }) {
801 # make step to next subfield
802 $i = ($i + $u->{ $fsf } ) % ($len + 1);
808 warn "## saved/3 ", dump( $field ),$/ if ($debug);
814 =head2 _get_marc_leader
816 Return leader from currently fetched record by L</_get_marc_fields>
818 print WebPAC::Normalize::MARC::_get_marc_leader();
822 sub _get_marc_leader {
823 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
824 return $marc_leader->[ $fetch_pos ];
827 =head2 _created_marc_records
829 my $nr_records = _created_marc_records;
833 sub _created_marc_records {
834 return $#{ $marc_record } + 1 if $marc_record;