1 package WebPAC::Normalize;
10 marc marc_indicators marc_repeatable_subfield
11 marc_compose marc_leader
12 marc_duplicate marc_remove
16 regex prefix suffix surround
17 first lookup join_with
26 #use base qw/WebPAC::Common/;
27 use Data::Dump qw/dump/;
28 use Storable qw/dclone/;
36 WebPAC::Normalize - describe normalisaton rules using sets
44 our $VERSION = '0.20';
48 This module uses C<conf/normalize/*.pl> files to perform normalisation
49 from input records using perl functions which are specialized for set
52 Sets are implemented as arrays, and normalisation file is valid perl, which
53 means that you check it's validity before running WebPAC using
54 C<perl -c normalize.pl>.
56 Normalisation can generate multiple output normalized data. For now, supported output
57 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
62 Functions which start with C<_> are private and used by WebPAC internally.
63 All other functions are available for use within normalisation rules.
69 my $ds = WebPAC::Normalize::data_structure(
70 lookup => $lookup_variable,
72 rules => $normalize_pl_config,
73 marc_encoding => 'utf-8',
77 Options C<row>, C<rules> and C<log> are mandatory while all
80 This function will B<die> if normalizastion can't be evaled.
82 Since this function isn't exported you have to call it with
83 C<WebPAC::Normalize::data_structure>.
90 die "need row argument" unless ($arg->{row});
91 die "need normalisation argument" unless ($arg->{rules});
94 _set_lookup( $arg->{lookup} ) if (defined( $arg->{lookup} ));
95 _set_rec( $arg->{row} );
96 _set_config( $arg->{config} ) if (defined( $arg->{config} ));
97 _clean_ds( %{ $arg } );
99 die "error evaling $arg->{rules}: $@\n" if ($@);
106 Set current record hash
115 $rec = shift or die "no record hash";
120 Set current config hash
122 _set_config( $config );
130 Code of current database
148 Return hash formatted as data structure
154 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
155 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
163 Clean data structure hash for next record
171 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
172 ($marc_record_offset, $marc_fetch_offset) = (0,0);
173 $marc_encoding = $a->{marc_encoding};
178 Set current lookup hash
180 _set_lookup( $lookup );
192 Get current lookup hash
194 my $lookup = _get_lookup();
202 =head2 _get_marc_fields
204 Get all fields defined by calls to C<marc>
206 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
208 We are using I<magic> which detect repeatable fields only from
209 sequence of field/subfield data generated by normalization.
211 Repeatable field is created when there is second occurence of same subfield or
212 if any of indicators are different.
214 This is sane for most cases. Something like:
220 will be created from any combination of:
222 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
226 marc('900','a', rec('200','a') );
227 marc('900','b', rec('200','b') );
228 marc('900','c', rec('200','c') );
230 which might not be what you have in mind. If you need repeatable subfield,
231 define it using C<marc_repeatable_subfield> like this:
233 marc_repeatable_subfield('900','a');
234 marc('900','a', rec('200','a') );
235 marc('900','b', rec('200','b') );
236 marc('900','c', rec('200','c') );
240 900a-1 900a-2 900a-3 900b-1 900c-1
243 There is also support for returning next or specific using:
245 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
246 # do something with $mf
249 will always return fields from next MARC record or
251 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
253 will return 42th copy record (if it exists).
257 sub _get_marc_fields {
260 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
261 my $offset = $marc_fetch_offset;
262 if ($arg->{offset}) {
263 $offset = $arg->{offset};
264 } elsif($arg->{fetch_next}) {
265 $marc_fetch_offset++;
268 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
270 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
272 my $marc_rec = $marc_record->[ $offset ];
274 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
276 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
278 # first, sort all existing fields
279 # XXX might not be needed, but modern perl might randomize elements in hash
280 my @sorted_marc_record = sort {
281 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
284 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
289 # count unique field-subfields (used for offset when walking to next subfield)
291 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
294 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
295 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
296 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
297 warn "## subfield count = ", dump( $u ), $/;
300 my $len = $#sorted_marc_record;
305 foreach ( 0 .. $len ) {
307 # find next element which isn't visited
308 while ($visited->{$i}) {
309 $i = ($i + 1) % ($len + 1);
315 my $row = dclone( $sorted_marc_record[$i] );
317 # field and subfield which is key for
318 # marc_repeatable_subfield and u
319 my $fsf = $row->[0] . ( $row->[3] || '' );
323 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
324 print "### this [$i]: ", dump( $row ),$/;
325 print "### sf: ", $row->[3], " vs ", $field->[3],
326 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
332 if ( $#$field >= 0 ) {
334 $row->[0] ne $field->[0] || # field
335 $row->[1] ne $field->[1] || # i1
336 $row->[2] ne $field->[2] # i2
339 warn "## saved/1 ", dump( $field ),$/ if ($debug);
343 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
345 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
346 ! $marc_repeatable_subfield->{ $fsf }
350 warn "## saved/2 ", dump( $field ),$/ if ($debug);
354 # append new subfields to existing field
355 push @$field, ( $row->[3], $row->[4] );
362 if (! $marc_repeatable_subfield->{ $fsf }) {
363 # make step to next subfield
364 $i = ($i + $u->{ $fsf } ) % ($len + 1);
370 warn "## saved/3 ", dump( $field ),$/ if ($debug);
378 Change level of debug warnings
386 return $debug unless defined($l);
387 warn "debug level $l",$/ if ($l > 0);
391 =head1 Functions to create C<data_structure>
393 Those functions generally have to first in your normalization file.
397 Define new tag for I<search> and I<display>.
399 tag('Title', rec('200','a') );
405 my $name = shift or die "tag needs name as first argument";
406 my @o = grep { defined($_) && $_ ne '' } @_;
408 $out->{$name}->{tag} = $name;
409 $out->{$name}->{search} = \@o;
410 $out->{$name}->{display} = \@o;
415 Define tag just for I<display>
417 @v = display('Title', rec('200','a') );
422 my $name = shift or die "display needs name as first argument";
423 my @o = grep { defined($_) && $_ ne '' } @_;
425 $out->{$name}->{tag} = $name;
426 $out->{$name}->{display} = \@o;
431 Prepare values just for I<search>
433 @v = search('Title', rec('200','a') );
438 my $name = shift or die "search needs name as first argument";
439 my @o = grep { defined($_) && $_ ne '' } @_;
441 $out->{$name}->{tag} = $name;
442 $out->{$name}->{search} = \@o;
447 Setup fields within MARC leader or get leader
449 marc_leader('05','c');
450 my $leader = marc_leader();
455 my ($offset,$value) = @_;
458 $out->{' leader'}->{ $offset } = $value;
460 return $out->{' leader'};
466 Save value for MARC field
468 marc('900','a', rec('200','a') );
469 marc('001', rec('000') );
474 my $f = shift or die "marc needs field";
475 die "marc field must be numer" unless ($f =~ /^\d+$/);
479 $sf = shift or die "marc needs subfield";
483 my $v = $_; # make var read-write for Encode
484 next unless (defined($v) && $v !~ /^\s*$/);
485 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
487 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
489 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
494 =head2 marc_repeatable_subfield
496 Save values for MARC repetable subfield
498 marc_repeatable_subfield('910', 'z', rec('909') );
502 sub marc_repeatable_subfield {
504 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
505 $marc_repeatable_subfield->{ $f . $sf }++;
509 =head2 marc_indicators
511 Set both indicators for MARC field
513 marc_indicators('900', ' ', 1);
515 Any indicator value other than C<0-9> will be treated as undefined.
519 sub marc_indicators {
520 my $f = shift || die "marc_indicators need field!\n";
522 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
523 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
525 $i1 = ' ' if ($i1 !~ /^\d$/);
526 $i2 = ' ' if ($i2 !~ /^\d$/);
527 @{ $marc_indicators->{$f} } = ($i1,$i2);
532 Save values for each MARC subfield explicitly
541 If you specify C<+> for subfield, value will be appended
542 to previous defined subfield.
547 my $f = shift or die "marc_compose needs field";
548 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
550 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
551 my $m = [ $f, $i1, $i2 ];
553 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
556 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
563 next unless (defined($v) && $v !~ /^\s*$/);
564 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
566 push @$m, ( $sf, $v );
572 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
574 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
577 =head2 marc_duplicate
579 Generate copy of current MARC record and continue working on copy
583 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
584 C<< _get_marc_fields( offset => 42 ) >>.
589 my $m = $marc_record->[ -1 ];
590 die "can't duplicate record which isn't defined" unless ($m);
591 push @{ $marc_record }, dclone( $m );
592 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
593 $marc_record_offset = $#{ $marc_record };
594 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
599 Remove some field or subfield from MARC record.
602 marc_remove('200','a');
604 This will erase field C<200> or C<200^a> from current MARC record.
606 This is useful after calling C<marc_duplicate> or on it's own (but, you
607 should probably just remove that subfield definition if you are not
608 using C<marc_duplicate>).
610 FIXME: support fields < 10.
617 die "marc_remove needs record number" unless defined($f);
619 my $marc = $marc_record->[ $marc_record_offset ];
621 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
624 foreach ( 0 .. $#{ $marc } ) {
625 last unless (defined $marc->[$i]);
626 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
627 if ($marc->[$i]->[0] eq $f) {
630 splice @$marc, $i, 1;
631 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
634 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
635 my $o = ($j * 2) + 3;
636 if ($marc->[$i]->[$o] eq $sf) {
638 splice @{$marc->[$i]}, $o, 2;
639 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
640 # is record now empty?
641 if ($#{ $marc->[$i] } == 2) {
642 splice @$marc, $i, 1;
643 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
653 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
655 $marc_record->[ $marc_record_offset ] = $marc;
657 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
660 =head2 marc_original_order
662 Copy all subfields preserving original order to marc field.
664 marc_original_order( marc_field_number, original_input_field_number );
666 Please note that field numbers are consistent with other commands (marc
667 field number first), but somewhat counter-intuitive (destination and then
670 You might want to use this command if you are just renaming subfields or
671 using pre-processing modify_record in C<config.yml> and don't need any
672 post-processing or want to preserve order of original subfields.
677 sub marc_original_order {
679 my ($to, $from) = @_;
680 die "marc_original_order needs from and to fields\n" unless ($from && $to);
682 return unless defined($rec->{$from});
684 my $r = $rec->{$from};
685 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
687 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
688 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
690 foreach my $d (@$r) {
692 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
693 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
697 my @sfs = @{ $d->{subfields} };
699 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
701 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
703 my $m = [ $to, $i1, $i2 ];
705 while (my $sf = shift @sfs) {
707 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
708 my $offset = shift @sfs;
709 die "corrupted sufields specification for field $from\n" unless defined($offset);
712 if (ref($d->{$sf}) eq 'ARRAY') {
713 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
714 } elsif ($offset == 0) {
717 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
719 push @$m, ( $sf, $v ) if (defined($v));
723 push @{ $marc_record->[ $marc_record_offset ] }, $m;
727 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
731 =head1 Functions to extract data from input
733 This function should be used inside functions to create C<data_structure> described
736 =head2 _pack_subfields_hash
738 @subfields = _pack_subfields_hash( $h );
739 $subfields = _pack_subfields_hash( $h, 1 );
741 Return each subfield value in array or pack them all together and return scalar
742 with subfields (denoted by C<^>) and values.
746 sub _pack_subfields_hash {
748 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
750 my ($h,$include_subfields) = @_;
752 if ( defined($h->{subfields}) ) {
753 my $sfs = delete $h->{subfields} || die "no subfields?";
756 my $sf = shift @$sfs;
757 push @out, '^' . $sf if ($include_subfields);
759 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
760 # single element subfields are not arrays
761 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
763 push @out, $h->{$sf};
765 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
766 push @out, $h->{$sf}->[$o];
769 if ($include_subfields) {
770 return join('', @out);
775 if ($include_subfields) {
777 foreach my $sf (sort keys %$h) {
778 if (ref($h->{$sf}) eq 'ARRAY') {
779 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
781 $out .= '^' . $sf . $h->{$sf};
786 # FIXME this should probably be in alphabetical order instead of hash order
794 Return all values in some field
798 TODO: order of values is probably same as in source data, need to investigate that
804 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
805 return unless (defined($rec) && defined($rec->{$f}));
806 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
807 if (ref($rec->{$f}) eq 'ARRAY') {
809 foreach my $h ( @{ $rec->{$f} } ) {
810 if (ref($h) eq 'HASH') {
811 push @out, ( _pack_subfields_hash( $h ) );
817 } elsif( defined($rec->{$f}) ) {
824 Return all values in specific field and subfield
832 return unless (defined($rec && $rec->{$f}));
834 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
836 if (ref($_->{$sf}) eq 'ARRAY') {
841 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
869 Apply regex to some or all values
871 @v = regex( 's/foo/bar/g', @v );
878 #warn "r: $r\n", dump(\@_);
882 push @out, $t if ($t && $t ne '');
889 Prefix all values with a string
891 @v = prefix( 'my_', @v );
896 my $p = shift or return;
897 return map { $p . $_ } grep { defined($_) } @_;
902 suffix all values with a string
904 @v = suffix( '_my', @v );
909 my $s = shift or die "suffix needs string as first argument";
910 return map { $_ . $s } grep { defined($_) } @_;
915 surround all values with a two strings
917 @v = surround( 'prefix_', '_suffix', @v );
922 my $p = shift or die "surround need prefix as first argument";
923 my $s = shift or die "surround needs suffix as second argument";
924 return map { $p . $_ . $s } grep { defined($_) } @_;
942 Consult lookup hashes for some value
947 FIXME B<currently this one is broken!>
952 my $k = shift or return;
953 return unless (defined($lookup->{$k}));
954 if (ref($lookup->{$k}) eq 'ARRAY') {
955 return @{ $lookup->{$k} };
957 return $lookup->{$k};
961 =head2 save_into_lookup
963 Save value into lookup.
965 save_into_lookup($database,$input,$key,sub {
966 # code which produce one or more values
969 This function shouldn't be called directly, it's called from code created by L<WebPAC::Parser>.
973 sub save_into_lookup {
974 my ($database,$input,$key,$coderef) = @_;
975 die "save_into_lookup needs database" unless defined($database);
976 die "save_into_lookup needs input" unless defined($input);
977 die "save_into_lookup needs key" unless defined($key);
978 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
979 my $mfn = $rec->{'000'}->[0] || die "mfn not defined or zero";
980 foreach my $v ( $coderef->() ) {
981 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
982 warn "# saved lookup $database/$input/$key [$v] $mfn\n"; #if ($debug > 1);
988 Consult config values stored in C<config.yml>
990 # return database code (key under databases in yaml)
991 $database_code = config(); # use _ from hash
992 $database_name = config('name');
993 $database_input_name = config('input name');
994 $tag = config('input normalize tag');
996 Up to three levels are supported.
1001 return unless ($config);
1009 warn "### getting config($p)\n" if ($debug > 1);
1011 my @p = split(/\s+/,$p);
1013 $v = $config->{ '_' }; # special, database code
1016 my $c = dclone( $config );
1018 foreach my $k (@p) {
1019 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1020 if (ref($c) eq 'ARRAY') {
1022 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1026 if (! defined($c->{$k}) ) {
1037 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1038 warn "config( '$p' ) is empty\n" if (! $v);
1045 Returns unique id of this record
1049 Returns C<42/2> for 2nd occurence of MFN 42.
1054 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1055 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1060 Joins walues with some delimiter
1062 $v = join_with(", ", @v);
1068 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1069 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1070 return '' unless defined($v);
1076 Split record subfield on some regex and take one of parts out
1078 $a_before_semi_column =
1079 split_rec_on('200','a', /\s*;\s*/, $part);
1081 C<$part> is optional number of element. First element is
1084 If there is no C<$part> parameter or C<$part> is 0, this function will
1085 return all values produced by splitting.
1090 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1092 my ($fld, $sf, $regex, $part) = @_;
1093 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1095 my @r = rec( $fld, $sf );
1097 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1099 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1101 my @s = split( $regex, $v );
1102 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1103 if ($part && $part > 0) {
1104 return $s[ $part - 1 ];