1 package WebPAC::Normalize;
9 marc marc_indicators marc_repeatable_subfield
10 marc_compose marc_leader
11 marc_duplicate marc_remove
14 regex prefix suffix surround
15 first lookup join_with
23 #use base qw/WebPAC::Common/;
24 use Data::Dump qw/dump/;
25 use Encode qw/from_to/;
26 use Storable qw/dclone/;
34 WebPAC::Normalize - describe normalisaton rules using sets
42 our $VERSION = '0.11';
46 This module uses C<conf/normalize/*.pl> files to perform normalisation
47 from input records using perl functions which are specialized for set
50 Sets are implemented as arrays, and normalisation file is valid perl, which
51 means that you check it's validity before running WebPAC using
52 C<perl -c normalize.pl>.
54 Normalisation can generate multiple output normalized data. For now, supported output
55 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
60 Functions which start with C<_> are private and used by WebPAC internally.
61 All other functions are available for use within normalisation rules.
67 my $ds = WebPAC::Normalize::data_structure(
68 lookup => $lookup->lookup_hash,
70 rules => $normalize_pl_config,
71 marc_encoding => 'utf-8',
74 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
77 This function will B<die> if normalizastion can't be evaled.
79 Since this function isn't exported you have to call it with
80 C<WebPAC::Normalize::data_structure>.
87 die "need row argument" unless ($arg->{row});
88 die "need normalisation argument" unless ($arg->{rules});
91 _set_lookup( $arg->{lookup} );
92 _set_rec( $arg->{row} );
93 _clean_ds( %{ $arg } );
95 die "error evaling $arg->{rules}: $@\n" if ($@);
102 Set current record hash
111 $rec = shift or die "no record hash";
116 Return hash formatted as data structure
122 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
123 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
131 Clean data structure hash for next record
139 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
140 ($marc_record_offset, $marc_fetch_offset) = (0,0);
141 $marc_encoding = $a->{marc_encoding};
146 Set current lookup hash
148 _set_lookup( $lookup );
158 =head2 _get_marc_fields
160 Get all fields defined by calls to C<marc>
162 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
164 We are using I<magic> which detect repeatable fields only from
165 sequence of field/subfield data generated by normalization.
167 Repeatable field is created when there is second occurence of same subfield or
168 if any of indicators are different.
170 This is sane for most cases. Something like:
176 will be created from any combination of:
178 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
182 marc('900','a', rec('200','a') );
183 marc('900','b', rec('200','b') );
184 marc('900','c', rec('200','c') );
186 which might not be what you have in mind. If you need repeatable subfield,
187 define it using C<marc_repeatable_subfield> like this:
189 marc_repeatable_subfield('900','a');
190 marc('900','a', rec('200','a') );
191 marc('900','b', rec('200','b') );
192 marc('900','c', rec('200','c') );
196 900a-1 900a-2 900a-3 900b-1 900c-1
199 There is also support for returning next or specific using:
201 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
202 # do something with $mf
205 will always return fields from next MARC record or
207 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
209 will return 42th copy record (if it exists).
213 sub _get_marc_fields {
216 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
217 my $offset = $marc_fetch_offset;
218 if ($arg->{offset}) {
219 $offset = $arg->{offset};
220 } elsif($arg->{fetch_next}) {
221 $marc_fetch_offset++;
224 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
226 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
228 my $marc_rec = $marc_record->[ $offset ];
230 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
232 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
234 # first, sort all existing fields
235 # XXX might not be needed, but modern perl might randomize elements in hash
236 my @sorted_marc_record = sort {
237 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
240 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
245 # count unique field-subfields (used for offset when walking to next subfield)
247 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
250 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
251 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
252 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
253 warn "## subfield count = ", dump( $u ), $/;
256 my $len = $#sorted_marc_record;
261 foreach ( 0 .. $len ) {
263 # find next element which isn't visited
264 while ($visited->{$i}) {
265 $i = ($i + 1) % ($len + 1);
271 my $row = dclone( $sorted_marc_record[$i] );
273 # field and subfield which is key for
274 # marc_repeatable_subfield and u
275 my $fsf = $row->[0] . ( $row->[3] || '' );
279 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
280 print "### this [$i]: ", dump( $row ),$/;
281 print "### sf: ", $row->[3], " vs ", $field->[3],
282 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
288 if ( $#$field >= 0 ) {
290 $row->[0] ne $field->[0] || # field
291 $row->[1] ne $field->[1] || # i1
292 $row->[2] ne $field->[2] # i2
295 warn "## saved/1 ", dump( $field ),$/ if ($debug);
299 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
301 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
302 ! $marc_repeatable_subfield->{ $fsf }
306 warn "## saved/2 ", dump( $field ),$/ if ($debug);
310 # append new subfields to existing field
311 push @$field, ( $row->[3], $row->[4] );
318 if (! $marc_repeatable_subfield->{ $fsf }) {
319 # make step to next subfield
320 $i = ($i + $u->{ $fsf } ) % ($len + 1);
326 warn "## saved/3 ", dump( $field ),$/ if ($debug);
334 Change level of debug warnings
342 return $debug unless defined($l);
343 warn "debug level $l",$/ if ($l > 0);
347 =head1 Functions to create C<data_structure>
349 Those functions generally have to first in your normalization file.
353 Define new tag for I<search> and I<display>.
355 tag('Title', rec('200','a') );
361 my $name = shift or die "tag needs name as first argument";
362 my @o = grep { defined($_) && $_ ne '' } @_;
364 $out->{$name}->{tag} = $name;
365 $out->{$name}->{search} = \@o;
366 $out->{$name}->{display} = \@o;
371 Define tag just for I<display>
373 @v = display('Title', rec('200','a') );
378 my $name = shift or die "display needs name as first argument";
379 my @o = grep { defined($_) && $_ ne '' } @_;
381 $out->{$name}->{tag} = $name;
382 $out->{$name}->{display} = \@o;
387 Prepare values just for I<search>
389 @v = search('Title', rec('200','a') );
394 my $name = shift or die "search needs name as first argument";
395 my @o = grep { defined($_) && $_ ne '' } @_;
397 $out->{$name}->{tag} = $name;
398 $out->{$name}->{search} = \@o;
403 Setup fields within MARC leader or get leader
405 marc_leader('05','c');
406 my $leader = marc_leader();
411 my ($offset,$value) = @_;
414 $out->{' leader'}->{ $offset } = $value;
416 return $out->{' leader'};
422 Save value for MARC field
424 marc('900','a', rec('200','a') );
425 marc('001', rec('000') );
430 my $f = shift or die "marc needs field";
431 die "marc field must be numer" unless ($f =~ /^\d+$/);
435 $sf = shift or die "marc needs subfield";
439 my $v = $_; # make var read-write for Encode
440 next unless (defined($v) && $v !~ /^\s*$/);
441 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
442 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
444 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
446 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
451 =head2 marc_repeatable_subfield
453 Save values for MARC repetable subfield
455 marc_repeatable_subfield('910', 'z', rec('909') );
459 sub marc_repeatable_subfield {
461 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
462 $marc_repeatable_subfield->{ $f . $sf }++;
466 =head2 marc_indicators
468 Set both indicators for MARC field
470 marc_indicators('900', ' ', 1);
472 Any indicator value other than C<0-9> will be treated as undefined.
476 sub marc_indicators {
477 my $f = shift || die "marc_indicators need field!\n";
479 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
480 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
482 $i1 = ' ' if ($i1 !~ /^\d$/);
483 $i2 = ' ' if ($i2 !~ /^\d$/);
484 @{ $marc_indicators->{$f} } = ($i1,$i2);
489 Save values for each MARC subfield explicitly
501 my $f = shift or die "marc_compose needs field";
502 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
504 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
505 my $m = [ $f, $i1, $i2 ];
508 my $sf = shift or die "marc_compose $f needs subfield";
511 next unless (defined($v) && $v !~ /^\s*$/);
512 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
513 push @$m, ( $sf, $v );
514 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
517 warn "## marc_compose(d) ", dump( $m ),$/ if ($debug > 1);
519 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
522 =head2 marc_duplicate
524 Generate copy of current MARC record and continue working on copy
528 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
529 C<< _get_marc_fields( offset => 42 ) >>.
534 my $m = $marc_record->[ -1 ];
535 die "can't duplicate record which isn't defined" unless ($m);
536 push @{ $marc_record }, dclone( $m );
537 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
538 $marc_record_offset = $#{ $marc_record };
539 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
544 Remove some field or subfield from MARC record.
547 marc_remove('200','a');
549 This will erase field C<200> or C<200^a> from current MARC record.
551 This is useful after calling C<marc_duplicate> or on it's own (but, you
552 should probably just remove that subfield definition if you are not
553 using C<marc_duplicate>).
555 FIXME: support fields < 10.
562 die "marc_remove needs record number" unless defined($f);
564 my $marc = $marc_record->[ $marc_record_offset ];
566 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
569 foreach ( 0 .. $#{ $marc } ) {
570 last unless (defined $marc->[$i]);
571 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
572 if ($marc->[$i]->[0] eq $f) {
575 splice @$marc, $i, 1;
576 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
579 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
580 my $o = ($j * 2) + 3;
581 if ($marc->[$i]->[$o] eq $sf) {
583 splice @{$marc->[$i]}, $o, 2;
584 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
585 # is record now empty?
586 if ($#{ $marc->[$i] } == 2) {
587 splice @$marc, $i, 1;
588 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
598 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
600 $marc_record->[ $marc_record_offset ] = $marc;
602 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
605 =head1 Functions to extract data from input
607 This function should be used inside functions to create C<data_structure> described
612 Return all values in some field
616 TODO: order of values is probably same as in source data, need to investigate that
622 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
623 return unless (defined($rec) && defined($rec->{$f}));
624 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
625 if (ref($rec->{$f}) eq 'ARRAY') {
627 if (ref($_) eq 'HASH') {
633 } elsif( defined($rec->{$f}) ) {
640 Return all values in specific field and subfield
648 return unless (defined($rec && $rec->{$f}));
650 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
672 Apply regex to some or all values
674 @v = regex( 's/foo/bar/g', @v );
681 #warn "r: $r\n", dump(\@_);
685 push @out, $t if ($t && $t ne '');
692 Prefix all values with a string
694 @v = prefix( 'my_', @v );
699 my $p = shift or die "prefix needs string as first argument";
700 return map { $p . $_ } grep { defined($_) } @_;
705 suffix all values with a string
707 @v = suffix( '_my', @v );
712 my $s = shift or die "suffix needs string as first argument";
713 return map { $_ . $s } grep { defined($_) } @_;
718 surround all values with a two strings
720 @v = surround( 'prefix_', '_suffix', @v );
725 my $p = shift or die "surround need prefix as first argument";
726 my $s = shift or die "surround needs suffix as second argument";
727 return map { $p . $_ . $s } grep { defined($_) } @_;
745 Consult lookup hashes for some value
753 my $k = shift or return;
754 return unless (defined($lookup->{$k}));
755 if (ref($lookup->{$k}) eq 'ARRAY') {
756 return @{ $lookup->{$k} };
758 return $lookup->{$k};
764 Joins walues with some delimiter
766 $v = join_with(", ", @v);
772 return join($d, grep { defined($_) && $_ ne '' } @_);
777 Split record subfield on some regex and take one of parts out
779 $a_before_semi_column =
780 split_rec_on('200','a', /\s*;\s*/, $part);
782 C<$part> is optional number of element. First element is
785 If there is no C<$part> parameter or C<$part> is 0, this function will
786 return all values produced by splitting.
791 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
793 my ($fld, $sf, $regex, $part) = @_;
794 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
796 my @r = rec( $fld, $sf );
798 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
800 return '' if( ! defined($v) || $v =~ /^\s*$/);
802 my @s = split( $regex, $v );
803 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
804 if ($part && $part > 0) {
805 return $s[ $part - 1 ];