1 package WebPAC::Normalize;
10 marc marc_indicators marc_repeatable_subfield
11 marc_compose marc_leader
12 marc_duplicate marc_remove
16 regex prefix suffix surround
17 first lookup join_with
25 #use base qw/WebPAC::Common/;
26 use Data::Dump qw/dump/;
27 use Storable qw/dclone/;
35 WebPAC::Normalize - describe normalisaton rules using sets
43 our $VERSION = '0.19';
47 This module uses C<conf/normalize/*.pl> files to perform normalisation
48 from input records using perl functions which are specialized for set
51 Sets are implemented as arrays, and normalisation file is valid perl, which
52 means that you check it's validity before running WebPAC using
53 C<perl -c normalize.pl>.
55 Normalisation can generate multiple output normalized data. For now, supported output
56 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
61 Functions which start with C<_> are private and used by WebPAC internally.
62 All other functions are available for use within normalisation rules.
68 my $ds = WebPAC::Normalize::data_structure(
69 lookup => $lookup->lookup_hash,
71 rules => $normalize_pl_config,
72 marc_encoding => 'utf-8',
76 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
79 This function will B<die> if normalizastion can't be evaled.
81 Since this function isn't exported you have to call it with
82 C<WebPAC::Normalize::data_structure>.
89 die "need row argument" unless ($arg->{row});
90 die "need normalisation argument" unless ($arg->{rules});
93 _set_lookup( $arg->{lookup} );
94 _set_rec( $arg->{row} );
95 _set_config( $arg->{config} );
96 _clean_ds( %{ $arg } );
98 die "error evaling $arg->{rules}: $@\n" if ($@);
105 Set current record hash
114 $rec = shift or die "no record hash";
119 Set current config hash
121 _set_config( $config );
129 Code of current database
147 Return hash formatted as data structure
153 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
154 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
162 Clean data structure hash for next record
170 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
171 ($marc_record_offset, $marc_fetch_offset) = (0,0);
172 $marc_encoding = $a->{marc_encoding};
177 Set current lookup hash
179 _set_lookup( $lookup );
189 =head2 _get_marc_fields
191 Get all fields defined by calls to C<marc>
193 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
195 We are using I<magic> which detect repeatable fields only from
196 sequence of field/subfield data generated by normalization.
198 Repeatable field is created when there is second occurence of same subfield or
199 if any of indicators are different.
201 This is sane for most cases. Something like:
207 will be created from any combination of:
209 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
213 marc('900','a', rec('200','a') );
214 marc('900','b', rec('200','b') );
215 marc('900','c', rec('200','c') );
217 which might not be what you have in mind. If you need repeatable subfield,
218 define it using C<marc_repeatable_subfield> like this:
220 marc_repeatable_subfield('900','a');
221 marc('900','a', rec('200','a') );
222 marc('900','b', rec('200','b') );
223 marc('900','c', rec('200','c') );
227 900a-1 900a-2 900a-3 900b-1 900c-1
230 There is also support for returning next or specific using:
232 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
233 # do something with $mf
236 will always return fields from next MARC record or
238 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
240 will return 42th copy record (if it exists).
244 sub _get_marc_fields {
247 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
248 my $offset = $marc_fetch_offset;
249 if ($arg->{offset}) {
250 $offset = $arg->{offset};
251 } elsif($arg->{fetch_next}) {
252 $marc_fetch_offset++;
255 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
257 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
259 my $marc_rec = $marc_record->[ $offset ];
261 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
263 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
265 # first, sort all existing fields
266 # XXX might not be needed, but modern perl might randomize elements in hash
267 my @sorted_marc_record = sort {
268 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
271 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
276 # count unique field-subfields (used for offset when walking to next subfield)
278 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
281 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
282 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
283 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
284 warn "## subfield count = ", dump( $u ), $/;
287 my $len = $#sorted_marc_record;
292 foreach ( 0 .. $len ) {
294 # find next element which isn't visited
295 while ($visited->{$i}) {
296 $i = ($i + 1) % ($len + 1);
302 my $row = dclone( $sorted_marc_record[$i] );
304 # field and subfield which is key for
305 # marc_repeatable_subfield and u
306 my $fsf = $row->[0] . ( $row->[3] || '' );
310 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
311 print "### this [$i]: ", dump( $row ),$/;
312 print "### sf: ", $row->[3], " vs ", $field->[3],
313 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
319 if ( $#$field >= 0 ) {
321 $row->[0] ne $field->[0] || # field
322 $row->[1] ne $field->[1] || # i1
323 $row->[2] ne $field->[2] # i2
326 warn "## saved/1 ", dump( $field ),$/ if ($debug);
330 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
332 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
333 ! $marc_repeatable_subfield->{ $fsf }
337 warn "## saved/2 ", dump( $field ),$/ if ($debug);
341 # append new subfields to existing field
342 push @$field, ( $row->[3], $row->[4] );
349 if (! $marc_repeatable_subfield->{ $fsf }) {
350 # make step to next subfield
351 $i = ($i + $u->{ $fsf } ) % ($len + 1);
357 warn "## saved/3 ", dump( $field ),$/ if ($debug);
365 Change level of debug warnings
373 return $debug unless defined($l);
374 warn "debug level $l",$/ if ($l > 0);
378 =head1 Functions to create C<data_structure>
380 Those functions generally have to first in your normalization file.
384 Define new tag for I<search> and I<display>.
386 tag('Title', rec('200','a') );
392 my $name = shift or die "tag needs name as first argument";
393 my @o = grep { defined($_) && $_ ne '' } @_;
395 $out->{$name}->{tag} = $name;
396 $out->{$name}->{search} = \@o;
397 $out->{$name}->{display} = \@o;
402 Define tag just for I<display>
404 @v = display('Title', rec('200','a') );
409 my $name = shift or die "display needs name as first argument";
410 my @o = grep { defined($_) && $_ ne '' } @_;
412 $out->{$name}->{tag} = $name;
413 $out->{$name}->{display} = \@o;
418 Prepare values just for I<search>
420 @v = search('Title', rec('200','a') );
425 my $name = shift or die "search needs name as first argument";
426 my @o = grep { defined($_) && $_ ne '' } @_;
428 $out->{$name}->{tag} = $name;
429 $out->{$name}->{search} = \@o;
434 Setup fields within MARC leader or get leader
436 marc_leader('05','c');
437 my $leader = marc_leader();
442 my ($offset,$value) = @_;
445 $out->{' leader'}->{ $offset } = $value;
447 return $out->{' leader'};
453 Save value for MARC field
455 marc('900','a', rec('200','a') );
456 marc('001', rec('000') );
461 my $f = shift or die "marc needs field";
462 die "marc field must be numer" unless ($f =~ /^\d+$/);
466 $sf = shift or die "marc needs subfield";
470 my $v = $_; # make var read-write for Encode
471 next unless (defined($v) && $v !~ /^\s*$/);
472 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
474 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
476 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
481 =head2 marc_repeatable_subfield
483 Save values for MARC repetable subfield
485 marc_repeatable_subfield('910', 'z', rec('909') );
489 sub marc_repeatable_subfield {
491 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
492 $marc_repeatable_subfield->{ $f . $sf }++;
496 =head2 marc_indicators
498 Set both indicators for MARC field
500 marc_indicators('900', ' ', 1);
502 Any indicator value other than C<0-9> will be treated as undefined.
506 sub marc_indicators {
507 my $f = shift || die "marc_indicators need field!\n";
509 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
510 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
512 $i1 = ' ' if ($i1 !~ /^\d$/);
513 $i2 = ' ' if ($i2 !~ /^\d$/);
514 @{ $marc_indicators->{$f} } = ($i1,$i2);
519 Save values for each MARC subfield explicitly
528 If you specify C<+> for subfield, value will be appended
529 to previous defined subfield.
534 my $f = shift or die "marc_compose needs field";
535 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
537 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
538 my $m = [ $f, $i1, $i2 ];
540 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
543 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
550 next unless (defined($v) && $v !~ /^\s*$/);
551 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
553 push @$m, ( $sf, $v );
559 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
561 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
564 =head2 marc_duplicate
566 Generate copy of current MARC record and continue working on copy
570 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
571 C<< _get_marc_fields( offset => 42 ) >>.
576 my $m = $marc_record->[ -1 ];
577 die "can't duplicate record which isn't defined" unless ($m);
578 push @{ $marc_record }, dclone( $m );
579 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
580 $marc_record_offset = $#{ $marc_record };
581 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
586 Remove some field or subfield from MARC record.
589 marc_remove('200','a');
591 This will erase field C<200> or C<200^a> from current MARC record.
593 This is useful after calling C<marc_duplicate> or on it's own (but, you
594 should probably just remove that subfield definition if you are not
595 using C<marc_duplicate>).
597 FIXME: support fields < 10.
604 die "marc_remove needs record number" unless defined($f);
606 my $marc = $marc_record->[ $marc_record_offset ];
608 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
611 foreach ( 0 .. $#{ $marc } ) {
612 last unless (defined $marc->[$i]);
613 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
614 if ($marc->[$i]->[0] eq $f) {
617 splice @$marc, $i, 1;
618 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
621 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
622 my $o = ($j * 2) + 3;
623 if ($marc->[$i]->[$o] eq $sf) {
625 splice @{$marc->[$i]}, $o, 2;
626 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
627 # is record now empty?
628 if ($#{ $marc->[$i] } == 2) {
629 splice @$marc, $i, 1;
630 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
640 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
642 $marc_record->[ $marc_record_offset ] = $marc;
644 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
647 =head2 marc_original_order
649 Copy all subfields preserving original order to marc field.
651 marc_original_order( marc_field_number, original_input_field_number );
653 Please note that field numbers are consistent with other commands (marc
654 field number first), but somewhat counter-intuitive (destination and then
657 You might want to use this command if you are just renaming subfields or
658 using pre-processing modify_record in C<config.yml> and don't need any
659 post-processing or want to preserve order of original subfields.
664 sub marc_original_order {
666 my ($to, $from) = @_;
667 die "marc_original_order needs from and to fields\n" unless ($from && $to);
669 return unless defined($rec->{$from});
671 my $r = $rec->{$from};
672 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
674 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
675 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
677 foreach my $d (@$r) {
679 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
680 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
684 my @sfs = @{ $d->{subfields} };
686 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
688 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
690 my $m = [ $to, $i1, $i2 ];
692 while (my $sf = shift @sfs) {
694 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
695 my $offset = shift @sfs;
696 die "corrupted sufields specification for field $from\n" unless defined($offset);
699 if (ref($d->{$sf}) eq 'ARRAY') {
700 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
701 } elsif ($offset == 0) {
704 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
706 push @$m, ( $sf, $v ) if (defined($v));
710 push @{ $marc_record->[ $marc_record_offset ] }, $m;
714 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
718 =head1 Functions to extract data from input
720 This function should be used inside functions to create C<data_structure> described
723 =head2 _pack_subfields_hash
725 @values = _pack_subfields_hash( $h, $include_subfields )
729 sub _pack_subfields_hash {
731 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
733 my ($h,$include_subfields) = @_;
735 if ( defined($h->{subfields}) ) {
736 my $sfs = delete $h->{subfields} || die "no subfields?";
739 my $sf = shift @$sfs;
740 push @out, '^' . $sf if ($include_subfields);
742 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
743 # single element subfields are not arrays
744 warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
746 push @out, $h->{$sf};
748 warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
749 push @out, $h->{$sf}->[$o];
752 if ($include_subfields) {
753 return join('', @out);
758 if ($include_subfields) {
760 foreach my $sf (keys %$h) {
761 if (ref($h->{$sf}) eq 'ARRAY') {
762 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
764 $out .= '^' . $sf . $h->{$sf};
769 # FIXME this should probably be in alphabetical order instead of hash order
777 Return all values in some field
781 TODO: order of values is probably same as in source data, need to investigate that
787 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
788 return unless (defined($rec) && defined($rec->{$f}));
789 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
790 if (ref($rec->{$f}) eq 'ARRAY') {
792 foreach my $h ( @{ $rec->{$f} } ) {
793 if (ref($h) eq 'HASH') {
794 push @out, ( _pack_subfields_hash( $h ) );
800 } elsif( defined($rec->{$f}) ) {
807 Return all values in specific field and subfield
815 return unless (defined($rec && $rec->{$f}));
817 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
819 if (ref($_->{$sf}) eq 'ARRAY') {
824 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
852 Apply regex to some or all values
854 @v = regex( 's/foo/bar/g', @v );
861 #warn "r: $r\n", dump(\@_);
865 push @out, $t if ($t && $t ne '');
872 Prefix all values with a string
874 @v = prefix( 'my_', @v );
879 my $p = shift or return;
880 return map { $p . $_ } grep { defined($_) } @_;
885 suffix all values with a string
887 @v = suffix( '_my', @v );
892 my $s = shift or die "suffix needs string as first argument";
893 return map { $_ . $s } grep { defined($_) } @_;
898 surround all values with a two strings
900 @v = surround( 'prefix_', '_suffix', @v );
905 my $p = shift or die "surround need prefix as first argument";
906 my $s = shift or die "surround needs suffix as second argument";
907 return map { $p . $_ . $s } grep { defined($_) } @_;
925 Consult lookup hashes for some value
933 my $k = shift or return;
934 return unless (defined($lookup->{$k}));
935 if (ref($lookup->{$k}) eq 'ARRAY') {
936 return @{ $lookup->{$k} };
938 return $lookup->{$k};
944 Consult config values stored in C<config.yml>
946 # return database code (key under databases in yaml)
947 $database_code = config(); # use _ from hash
948 $database_name = config('name');
949 $database_input_name = config('input name');
950 $tag = config('input normalize tag');
952 Up to three levels are supported.
957 return unless ($config);
965 warn "### getting config($p)\n" if ($debug > 1);
967 my @p = split(/\s+/,$p);
969 $v = $config->{ '_' }; # special, database code
972 my $c = dclone( $config );
975 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
976 if (ref($c) eq 'ARRAY') {
978 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
982 if (! defined($c->{$k}) ) {
993 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
994 warn "config( '$p' ) is empty\n" if (! $v);
1001 Returns unique id of this record
1005 Returns C<42/2> for 2nd occurence of MFN 42.
1010 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1011 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1016 Joins walues with some delimiter
1018 $v = join_with(", ", @v);
1024 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1025 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1026 return '' unless defined($v);
1032 Split record subfield on some regex and take one of parts out
1034 $a_before_semi_column =
1035 split_rec_on('200','a', /\s*;\s*/, $part);
1037 C<$part> is optional number of element. First element is
1040 If there is no C<$part> parameter or C<$part> is 0, this function will
1041 return all values produced by splitting.
1046 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1048 my ($fld, $sf, $regex, $part) = @_;
1049 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1051 my @r = rec( $fld, $sf );
1053 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1055 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1057 my @s = split( $regex, $v );
1058 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1059 if ($part && $part > 0) {
1060 return $s[ $part - 1 ];