1 package WebPAC::Normalize;
9 marc marc_indicators marc_repeatable_subfield
10 marc_compose marc_leader
11 marc_duplicate marc_remove
14 regex prefix suffix surround
15 first lookup join_with
23 #use base qw/WebPAC::Common/;
24 use Data::Dump qw/dump/;
25 use Encode qw/from_to/;
26 use Storable qw/dclone/;
34 WebPAC::Normalize - describe normalisaton rules using sets
42 our $VERSION = '0.15';
46 This module uses C<conf/normalize/*.pl> files to perform normalisation
47 from input records using perl functions which are specialized for set
50 Sets are implemented as arrays, and normalisation file is valid perl, which
51 means that you check it's validity before running WebPAC using
52 C<perl -c normalize.pl>.
54 Normalisation can generate multiple output normalized data. For now, supported output
55 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
60 Functions which start with C<_> are private and used by WebPAC internally.
61 All other functions are available for use within normalisation rules.
67 my $ds = WebPAC::Normalize::data_structure(
68 lookup => $lookup->lookup_hash,
70 rules => $normalize_pl_config,
71 marc_encoding => 'utf-8',
75 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
78 This function will B<die> if normalizastion can't be evaled.
80 Since this function isn't exported you have to call it with
81 C<WebPAC::Normalize::data_structure>.
88 die "need row argument" unless ($arg->{row});
89 die "need normalisation argument" unless ($arg->{rules});
92 _set_lookup( $arg->{lookup} );
93 _set_rec( $arg->{row} );
94 _set_config( $arg->{config} );
95 _clean_ds( %{ $arg } );
97 die "error evaling $arg->{rules}: $@\n" if ($@);
104 Set current record hash
113 $rec = shift or die "no record hash";
118 Set current config hash
120 _set_config( $config );
128 Code of current database
146 Return hash formatted as data structure
152 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
153 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
161 Clean data structure hash for next record
169 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
170 ($marc_record_offset, $marc_fetch_offset) = (0,0);
171 $marc_encoding = $a->{marc_encoding};
176 Set current lookup hash
178 _set_lookup( $lookup );
188 =head2 _get_marc_fields
190 Get all fields defined by calls to C<marc>
192 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
194 We are using I<magic> which detect repeatable fields only from
195 sequence of field/subfield data generated by normalization.
197 Repeatable field is created when there is second occurence of same subfield or
198 if any of indicators are different.
200 This is sane for most cases. Something like:
206 will be created from any combination of:
208 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
212 marc('900','a', rec('200','a') );
213 marc('900','b', rec('200','b') );
214 marc('900','c', rec('200','c') );
216 which might not be what you have in mind. If you need repeatable subfield,
217 define it using C<marc_repeatable_subfield> like this:
219 marc_repeatable_subfield('900','a');
220 marc('900','a', rec('200','a') );
221 marc('900','b', rec('200','b') );
222 marc('900','c', rec('200','c') );
226 900a-1 900a-2 900a-3 900b-1 900c-1
229 There is also support for returning next or specific using:
231 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
232 # do something with $mf
235 will always return fields from next MARC record or
237 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
239 will return 42th copy record (if it exists).
243 sub _get_marc_fields {
246 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
247 my $offset = $marc_fetch_offset;
248 if ($arg->{offset}) {
249 $offset = $arg->{offset};
250 } elsif($arg->{fetch_next}) {
251 $marc_fetch_offset++;
254 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
256 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
258 my $marc_rec = $marc_record->[ $offset ];
260 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
262 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
264 # first, sort all existing fields
265 # XXX might not be needed, but modern perl might randomize elements in hash
266 my @sorted_marc_record = sort {
267 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
270 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
275 # count unique field-subfields (used for offset when walking to next subfield)
277 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
280 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
281 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
282 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
283 warn "## subfield count = ", dump( $u ), $/;
286 my $len = $#sorted_marc_record;
291 foreach ( 0 .. $len ) {
293 # find next element which isn't visited
294 while ($visited->{$i}) {
295 $i = ($i + 1) % ($len + 1);
301 my $row = dclone( $sorted_marc_record[$i] );
303 # field and subfield which is key for
304 # marc_repeatable_subfield and u
305 my $fsf = $row->[0] . ( $row->[3] || '' );
309 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
310 print "### this [$i]: ", dump( $row ),$/;
311 print "### sf: ", $row->[3], " vs ", $field->[3],
312 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
318 if ( $#$field >= 0 ) {
320 $row->[0] ne $field->[0] || # field
321 $row->[1] ne $field->[1] || # i1
322 $row->[2] ne $field->[2] # i2
325 warn "## saved/1 ", dump( $field ),$/ if ($debug);
329 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
331 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
332 ! $marc_repeatable_subfield->{ $fsf }
336 warn "## saved/2 ", dump( $field ),$/ if ($debug);
340 # append new subfields to existing field
341 push @$field, ( $row->[3], $row->[4] );
348 if (! $marc_repeatable_subfield->{ $fsf }) {
349 # make step to next subfield
350 $i = ($i + $u->{ $fsf } ) % ($len + 1);
356 warn "## saved/3 ", dump( $field ),$/ if ($debug);
364 Change level of debug warnings
372 return $debug unless defined($l);
373 warn "debug level $l",$/ if ($l > 0);
377 =head1 Functions to create C<data_structure>
379 Those functions generally have to first in your normalization file.
383 Define new tag for I<search> and I<display>.
385 tag('Title', rec('200','a') );
391 my $name = shift or die "tag needs name as first argument";
392 my @o = grep { defined($_) && $_ ne '' } @_;
394 $out->{$name}->{tag} = $name;
395 $out->{$name}->{search} = \@o;
396 $out->{$name}->{display} = \@o;
401 Define tag just for I<display>
403 @v = display('Title', rec('200','a') );
408 my $name = shift or die "display needs name as first argument";
409 my @o = grep { defined($_) && $_ ne '' } @_;
411 $out->{$name}->{tag} = $name;
412 $out->{$name}->{display} = \@o;
417 Prepare values just for I<search>
419 @v = search('Title', rec('200','a') );
424 my $name = shift or die "search needs name as first argument";
425 my @o = grep { defined($_) && $_ ne '' } @_;
427 $out->{$name}->{tag} = $name;
428 $out->{$name}->{search} = \@o;
433 Setup fields within MARC leader or get leader
435 marc_leader('05','c');
436 my $leader = marc_leader();
441 my ($offset,$value) = @_;
444 $out->{' leader'}->{ $offset } = $value;
446 return $out->{' leader'};
452 Save value for MARC field
454 marc('900','a', rec('200','a') );
455 marc('001', rec('000') );
460 my $f = shift or die "marc needs field";
461 die "marc field must be numer" unless ($f =~ /^\d+$/);
465 $sf = shift or die "marc needs subfield";
469 my $v = $_; # make var read-write for Encode
470 next unless (defined($v) && $v !~ /^\s*$/);
471 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
472 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
474 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
476 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
481 =head2 marc_repeatable_subfield
483 Save values for MARC repetable subfield
485 marc_repeatable_subfield('910', 'z', rec('909') );
489 sub marc_repeatable_subfield {
491 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
492 $marc_repeatable_subfield->{ $f . $sf }++;
496 =head2 marc_indicators
498 Set both indicators for MARC field
500 marc_indicators('900', ' ', 1);
502 Any indicator value other than C<0-9> will be treated as undefined.
506 sub marc_indicators {
507 my $f = shift || die "marc_indicators need field!\n";
509 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
510 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
512 $i1 = ' ' if ($i1 !~ /^\d$/);
513 $i2 = ' ' if ($i2 !~ /^\d$/);
514 @{ $marc_indicators->{$f} } = ($i1,$i2);
519 Save values for each MARC subfield explicitly
528 If you specify C<+> for subfield, value will be appended
529 to previous defined subfield.
534 my $f = shift or die "marc_compose needs field";
535 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
537 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
538 my $m = [ $f, $i1, $i2 ];
540 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
543 my $sf = shift or die "marc_compose $f needs subfield";
546 next unless (defined($v) && $v !~ /^\s*$/);
547 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
548 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
550 push @$m, ( $sf, $v );
556 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
558 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
561 =head2 marc_duplicate
563 Generate copy of current MARC record and continue working on copy
567 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
568 C<< _get_marc_fields( offset => 42 ) >>.
573 my $m = $marc_record->[ -1 ];
574 die "can't duplicate record which isn't defined" unless ($m);
575 push @{ $marc_record }, dclone( $m );
576 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
577 $marc_record_offset = $#{ $marc_record };
578 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
583 Remove some field or subfield from MARC record.
586 marc_remove('200','a');
588 This will erase field C<200> or C<200^a> from current MARC record.
590 This is useful after calling C<marc_duplicate> or on it's own (but, you
591 should probably just remove that subfield definition if you are not
592 using C<marc_duplicate>).
594 FIXME: support fields < 10.
601 die "marc_remove needs record number" unless defined($f);
603 my $marc = $marc_record->[ $marc_record_offset ];
605 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
608 foreach ( 0 .. $#{ $marc } ) {
609 last unless (defined $marc->[$i]);
610 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
611 if ($marc->[$i]->[0] eq $f) {
614 splice @$marc, $i, 1;
615 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
618 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
619 my $o = ($j * 2) + 3;
620 if ($marc->[$i]->[$o] eq $sf) {
622 splice @{$marc->[$i]}, $o, 2;
623 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
624 # is record now empty?
625 if ($#{ $marc->[$i] } == 2) {
626 splice @$marc, $i, 1;
627 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
637 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
639 $marc_record->[ $marc_record_offset ] = $marc;
641 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
644 =head1 Functions to extract data from input
646 This function should be used inside functions to create C<data_structure> described
651 Return all values in some field
655 TODO: order of values is probably same as in source data, need to investigate that
661 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
662 return unless (defined($rec) && defined($rec->{$f}));
663 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
664 if (ref($rec->{$f}) eq 'ARRAY') {
666 if (ref($_) eq 'HASH') {
672 } elsif( defined($rec->{$f}) ) {
679 Return all values in specific field and subfield
687 return unless (defined($rec && $rec->{$f}));
689 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
691 if (ref($_->{$sf}) eq 'ARRAY') {
696 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
724 Apply regex to some or all values
726 @v = regex( 's/foo/bar/g', @v );
733 #warn "r: $r\n", dump(\@_);
737 push @out, $t if ($t && $t ne '');
744 Prefix all values with a string
746 @v = prefix( 'my_', @v );
751 my $p = shift or return;
752 return map { $p . $_ } grep { defined($_) } @_;
757 suffix all values with a string
759 @v = suffix( '_my', @v );
764 my $s = shift or die "suffix needs string as first argument";
765 return map { $_ . $s } grep { defined($_) } @_;
770 surround all values with a two strings
772 @v = surround( 'prefix_', '_suffix', @v );
777 my $p = shift or die "surround need prefix as first argument";
778 my $s = shift or die "surround needs suffix as second argument";
779 return map { $p . $_ . $s } grep { defined($_) } @_;
797 Consult lookup hashes for some value
805 my $k = shift or return;
806 return unless (defined($lookup->{$k}));
807 if (ref($lookup->{$k}) eq 'ARRAY') {
808 return @{ $lookup->{$k} };
810 return $lookup->{$k};
816 Consult config values stored in C<config.yml>
818 # return database code (key under databases in yaml)
819 $database_code = config(); # use _ from hash
820 $database_name = config('name');
821 $database_input_name = config('input name');
822 $tag = config('input normalize tag');
824 Up to three levels are supported.
829 return unless ($config);
837 warn "### getting config($p)\n" if ($debug > 1);
839 my @p = split(/\s+/,$p);
841 $v = $config->{ '_' }; # special, database code
844 my $c = dclone( $config );
847 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
848 if (ref($c) eq 'ARRAY') {
850 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
854 if (! defined($c->{$k}) ) {
865 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
866 warn "config( '$p' ) is empty\n" if (! $v);
873 Returns unique id of this record
877 Returns C<42/2> for 2nd occurence of MFN 42.
882 my $mfn = $config->{_mfn} || die "no _mfn in config data";
883 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
888 Joins walues with some delimiter
890 $v = join_with(", ", @v);
896 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
897 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
898 return '' unless defined($v);
904 Split record subfield on some regex and take one of parts out
906 $a_before_semi_column =
907 split_rec_on('200','a', /\s*;\s*/, $part);
909 C<$part> is optional number of element. First element is
912 If there is no C<$part> parameter or C<$part> is 0, this function will
913 return all values produced by splitting.
918 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
920 my ($fld, $sf, $regex, $part) = @_;
921 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
923 my @r = rec( $fld, $sf );
925 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
927 return '' if( ! defined($v) || $v =~ /^\s*$/);
929 my @s = split( $regex, $v );
930 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
931 if ($part && $part > 0) {
932 return $s[ $part - 1 ];