1 package WebPAC::Normalize;
9 marc marc_indicators marc_repeatable_subfield
10 marc_compose marc_leader
13 regex prefix suffix surround
14 first lookup join_with
22 #use base qw/WebPAC::Common/;
23 use Data::Dump qw/dump/;
24 use Encode qw/from_to/;
32 WebPAC::Normalize - describe normalisaton rules using sets
40 our $VERSION = '0.09';
44 This module uses C<conf/normalize/*.pl> files to perform normalisation
45 from input records using perl functions which are specialized for set
48 Sets are implemented as arrays, and normalisation file is valid perl, which
49 means that you check it's validity before running WebPAC using
50 C<perl -c normalize.pl>.
52 Normalisation can generate multiple output normalized data. For now, supported output
53 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
58 Functions which start with C<_> are private and used by WebPAC internally.
59 All other functions are available for use within normalisation rules.
65 my $ds = WebPAC::Normalize::data_structure(
66 lookup => $lookup->lookup_hash,
68 rules => $normalize_pl_config,
69 marc_encoding => 'utf-8',
72 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
75 This function will B<die> if normalizastion can't be evaled.
77 Since this function isn't exported you have to call it with
78 C<WebPAC::Normalize::data_structure>.
85 die "need row argument" unless ($arg->{row});
86 die "need normalisation argument" unless ($arg->{rules});
89 _set_lookup( $arg->{lookup} );
90 _set_rec( $arg->{row} );
91 _clean_ds( %{ $arg } );
93 die "error evaling $arg->{rules}: $@\n" if ($@);
100 Set current record hash
109 $rec = shift or die "no record hash";
114 Return hash formatted as data structure
120 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
128 Clean data structure hash for next record
136 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
137 $marc_encoding = $a->{marc_encoding};
142 Set current lookup hash
144 _set_lookup( $lookup );
154 =head2 _get_marc_fields
156 Get all fields defined by calls to C<marc>
158 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
160 We are using I<magic> which detect repeatable fields only from
161 sequence of field/subfield data generated by normalization.
163 Repeatable field is created when there is second occurence of same subfield or
164 if any of indicators are different.
166 This is sane for most cases. Something like:
172 will be created from any combination of:
174 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
178 marc('900','a', rec('200','a') );
179 marc('900','b', rec('200','b') );
180 marc('900','c', rec('200','c') );
182 which might not be what you have in mind. If you need repeatable subfield,
183 define it using C<marc_repeatable_subfield> like this:
189 sub _get_marc_fields {
191 return if (! $marc_record || ref($marc_record) ne 'ARRAY' || $#{ $marc_record } < 0);
193 # first, sort all existing fields
194 # XXX might not be needed, but modern perl might randomize elements in hash
195 my @sorted_marc_record = sort {
196 $a->[0] . $a->[3] cmp $b->[0] . $b->[3]
199 @sorted_marc_record = @{ $marc_record }; ### FIXME disable sorting
204 # count unique field-subfields (used for offset when walking to next subfield)
206 map { $u->{ $_->[0] . $_->[3] }++ } @sorted_marc_record;
209 warn "## marc_repeatable_subfield ", dump( $marc_repeatable_subfield ), $/;
210 warn "## marc_record ", dump( $marc_record ), $/;
211 warn "## sorted_marc_record ", dump( \@sorted_marc_record ), $/;
212 warn "## subfield count ", dump( $u ), $/;
215 my $len = $#sorted_marc_record;
220 foreach ( 0 .. $len ) {
222 # find next element which isn't visited
223 while ($visited->{$i}) {
224 $i = ($i + 1) % ($len + 1);
230 my $row = $sorted_marc_record[$i];
232 # field and subfield which is key for
233 # marc_repeatable_subfield and u
234 my $fsf = $row->[0] . $row->[3];
238 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
239 print "### this [$i]: ", dump( $row ),$/;
240 print "### sf: ", $row->[3], " vs ", $field->[3],
241 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
247 if ( $#$field >= 0 ) {
249 $row->[0] ne $field->[0] || # field
250 $row->[1] ne $field->[1] || # i1
251 $row->[2] ne $field->[2] # i2
254 warn "## saved/1 ", dump( $field ),$/ if ($debug);
258 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
260 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
261 ! $marc_repeatable_subfield->{ $fsf }
265 warn "## saved/2 ", dump( $field ),$/ if ($debug);
269 # append new subfields to existing field
270 push @$field, ( $row->[3], $row->[4] );
277 if (! $marc_repeatable_subfield->{ $fsf }) {
278 # make step to next subfield
279 $i = ($i + $u->{ $fsf } ) % ($len + 1);
285 warn "## saved/3 ", dump( $field ),$/ if ($debug);
293 Change level of debug warnings
301 return $debug unless defined($l);
302 warn "debug level $l" if ($l > 0);
306 =head1 Functions to create C<data_structure>
308 Those functions generally have to first in your normalization file.
312 Define new tag for I<search> and I<display>.
314 tag('Title', rec('200','a') );
320 my $name = shift or die "tag needs name as first argument";
321 my @o = grep { defined($_) && $_ ne '' } @_;
323 $out->{$name}->{tag} = $name;
324 $out->{$name}->{search} = \@o;
325 $out->{$name}->{display} = \@o;
330 Define tag just for I<display>
332 @v = display('Title', rec('200','a') );
337 my $name = shift or die "display needs name as first argument";
338 my @o = grep { defined($_) && $_ ne '' } @_;
340 $out->{$name}->{tag} = $name;
341 $out->{$name}->{display} = \@o;
346 Prepare values just for I<search>
348 @v = search('Title', rec('200','a') );
353 my $name = shift or die "search needs name as first argument";
354 my @o = grep { defined($_) && $_ ne '' } @_;
356 $out->{$name}->{tag} = $name;
357 $out->{$name}->{search} = \@o;
362 Setup fields within MARC leader or get leader
364 marc_leader('05','c');
365 my $leader = marc_leader();
370 my ($offset,$value) = @_;
373 $out->{' leader'}->{ $offset } = $value;
375 return $out->{' leader'};
381 Save value for MARC field
383 marc('900','a', rec('200','a') );
388 my $f = shift or die "marc needs field";
389 die "marc field must be numer" unless ($f =~ /^\d+$/);
391 my $sf = shift or die "marc needs subfield";
394 my $v = $_; # make var read-write for Encode
395 next unless (defined($v) && $v !~ /^\s*$/);
396 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
397 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
398 push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
402 =head2 marc_repeatable_subfield
404 Save values for MARC repetable subfield
406 marc_repeatable_subfield('910', 'z', rec('909') );
410 sub marc_repeatable_subfield {
412 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
413 $marc_repeatable_subfield->{ $f . $sf }++;
417 =head2 marc_indicators
419 Set both indicators for MARC field
421 marc_indicators('900', ' ', 1);
423 Any indicator value other than C<0-9> will be treated as undefined.
427 sub marc_indicators {
428 my $f = shift || die "marc_indicators need field!\n";
430 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
431 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
433 $i1 = ' ' if ($i1 !~ /^\d$/);
434 $i2 = ' ' if ($i2 !~ /^\d$/);
435 @{ $marc_indicators->{$f} } = ($i1,$i2);
440 Save values for each MARC subfield explicitly
452 my $f = shift or die "marc_compose needs field";
453 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
455 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
456 my $m = [ $f, $i1, $i2 ];
459 my $sf = shift or die "marc_compose $f needs subfield";
460 my $v = shift or die "marc_compose $f needs value for subfield $sf";
462 next unless (defined($v) && $v !~ /^\s*$/);
463 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
464 push @$m, ( $sf, $v );
465 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ) if ($debug > 1);
468 warn "## marc_compose(d) ", dump( $m ) if ($debug > 1);
470 push @{ $marc_record }, $m;
474 =head1 Functions to extract data from input
476 This function should be used inside functions to create C<data_structure> described
481 Return all values in some field
485 TODO: order of values is probably same as in source data, need to investigate that
491 return unless (defined($rec) && defined($rec->{$f}));
492 if (ref($rec->{$f}) eq 'ARRAY') {
494 if (ref($_) eq 'HASH') {
500 } elsif( defined($rec->{$f}) ) {
507 Return all values in specific field and subfield
515 return unless (defined($rec && $rec->{$f}));
517 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
539 Apply regex to some or all values
541 @v = regex( 's/foo/bar/g', @v );
548 #warn "r: $r\n", dump(\@_);
552 push @out, $t if ($t && $t ne '');
559 Prefix all values with a string
561 @v = prefix( 'my_', @v );
566 my $p = shift or die "prefix needs string as first argument";
567 return map { $p . $_ } grep { defined($_) } @_;
572 suffix all values with a string
574 @v = suffix( '_my', @v );
579 my $s = shift or die "suffix needs string as first argument";
580 return map { $_ . $s } grep { defined($_) } @_;
585 surround all values with a two strings
587 @v = surround( 'prefix_', '_suffix', @v );
592 my $p = shift or die "surround need prefix as first argument";
593 my $s = shift or die "surround needs suffix as second argument";
594 return map { $p . $_ . $s } grep { defined($_) } @_;
612 Consult lookup hashes for some value
620 my $k = shift or return;
621 return unless (defined($lookup->{$k}));
622 if (ref($lookup->{$k}) eq 'ARRAY') {
623 return @{ $lookup->{$k} };
625 return $lookup->{$k};
631 Joins walues with some delimiter
633 $v = join_with(", ", @v);
639 return join($d, grep { defined($_) && $_ ne '' } @_);
644 Split record subfield on some regex and take one of parts out
646 $a_before_semi_column =
647 split_rec_on('200','a', /\s*;\s*/, $part);
649 C<$part> is optional number of element. First element is
652 If there is no C<$part> parameter or C<$part> is 0, this function will
653 return all values produced by splitting.
658 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
660 my ($fld, $sf, $regex, $part) = @_;
661 warn "### regex ", ref($regex), $regex if ($debug > 2);
663 my @r = rec( $fld, $sf );
665 warn "### first rec($fld,$sf) = ",dump($v) if ($debug > 2);
667 my @s = split( $regex, $v );
668 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s) if ($debug > 1);
670 return $s[ $part - 1 ];