1 package WebPAC::Normalize;
11 search_display search display sorted
15 regex prefix suffix surround
16 first lookup join_with
29 #use base qw/WebPAC::Common/;
30 use Data::Dump qw/dump/;
38 use WebPAC::Normalize::ISBN;
39 push @EXPORT, ( 'isbn_10', 'isbn_13' );
41 use WebPAC::Normalize::MARC;
43 marc marc_indicators marc_repeatable_subfield
44 marc_compose marc_leader marc_fixed
45 marc_duplicate marc_remove marc_count
50 use Storable qw/dclone/;
54 WebPAC::Normalize - describe normalisaton rules using sets
58 our $VERSION = '0.36';
62 This module uses C<conf/normalize/*.pl> files to perform normalisation
63 from input records using perl functions which are specialized for set
66 Sets are implemented as arrays, and normalisation file is valid perl, which
67 means that you check it's validity before running WebPAC using
68 C<perl -c normalize.pl>.
70 Normalisation can generate multiple output normalized data. For now, supported output
71 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
76 Functions which start with C<_> are private and used by WebPAC internally.
77 All other functions are available for use within normalisation rules.
83 my $ds = WebPAC::Normalize::data_structure(
84 lookup => $lookup_hash,
86 rules => $normalize_pl_config,
87 marc_encoding => 'utf-8',
89 load_row_coderef => sub {
90 my ($database,$input,$mfn) = @_;
91 $store->load_row( database => $database, input => $input, id => $mfn );
95 Options C<row>, C<rules> and C<log> are mandatory while all
98 C<load_row_coderef> is closure only used when executing lookups, so they will
99 die if it's not defined.
101 This function will B<die> if normalizastion can't be evaled.
103 Since this function isn't exported you have to call it with
104 C<WebPAC::Normalize::data_structure>.
108 my $load_row_coderef;
113 die "need row argument" unless ($arg->{row});
114 die "need normalisation argument" unless ($arg->{rules});
116 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
117 _set_ds( $arg->{row} );
118 _set_config( $arg->{config} ) if defined($arg->{config});
119 _clean_ds( %{ $arg } );
120 $load_row_coderef = $arg->{load_row_coderef};
123 no warnings 'redefine';
124 eval "$arg->{rules};";
125 die "error evaling $arg->{rules}: $@\n" if ($@);
132 Set current record hash
141 $rec = shift or die "no record hash";
142 $WebPAC::Normalize::MARC::rec = $rec;
147 my $rec = _get_rec();
151 sub _get_rec { $rec };
154 my $d = $rec->{ $_[0] };
155 return @$d if ref($d) eq 'ARRAY';
156 die "field $_[0] not array: ",dump( $d );
161 Set current config hash
163 _set_config( $config );
171 Code of current database
189 Return hash formatted as data structure
198 #warn "## out = ",dump($out);
204 Clean data structure hash for next record
213 WebPAC::Normalize::MARC::_clean();
218 Set current lookup hash
220 _set_lookup( $lookup );
232 Get current lookup hash
234 my $lookup = _get_lookup();
244 Setup code reference which will return L<data_structure> from
248 my ($database,$input,$mfn) = @_;
249 $store->load_row( database => $database, input => $input, id => $mfn );
256 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
258 $load_row_coderef = $coderef;
263 Change level of debug warnings
271 return $debug unless defined($l);
272 warn "debug level $l",$/ if ($l > 0);
274 $WebPAC::Normalize::MARC::debug = $debug;
277 =head1 Functions to create C<data_structure>
279 Those functions generally have to first in your normalization file.
283 Generic way to set values for some name
285 to('field-name', 'name-value' => rec('200','a') );
287 There are many helpers defined below which might be easier to use.
292 my $type = shift or confess "need type -- BUG?";
293 my $name = shift or confess "needs name as first argument";
294 my @o = grep { defined($_) && $_ ne '' } @_;
296 $out->{$name}->{$type} = \@o;
299 =head2 search_display
301 Define output for L<search> and L<display> at the same time
303 search_display('Title', rec('200','a') );
308 my $name = shift or die "search_display needs name as first argument";
309 my @o = grep { defined($_) && $_ ne '' } @_;
311 $out->{$name}->{search} = \@o;
312 $out->{$name}->{display} = \@o;
317 Old name for L<search_display>, it will probably be removed at one point.
322 search_display( @_ );
327 Define output just for I<display>
329 @v = display('Title', rec('200','a') );
333 sub display { to( 'display', @_ ) }
337 Prepare values just for I<search>
339 @v = search('Title', rec('200','a') );
343 sub search { to( 'search', @_ ) }
347 Insert into lists which will be automatically sorted
349 sorted('Title', rec('200','a') );
353 sub sorted { to( 'sorted', @_ ) }
357 Insert new row of data into output module
359 row( column => 'foo', column2 => 'bar' );
363 use Data::Dump qw/dump/;
366 die "array doesn't have even number of elements but $#_: ",dump( @_ ) if $#_ % 2 != 1;
368 push @{ $out->{'_rows'} }, {@_};
372 =head1 Functions to extract data from input
374 This function should be used inside functions to create C<data_structure> described
377 =head2 _pack_subfields_hash
379 @subfields = _pack_subfields_hash( $h );
380 $subfields = _pack_subfields_hash( $h, 1 );
382 Return each subfield value in array or pack them all together and return scalar
383 with subfields (denoted by C<^>) and values.
387 sub _pack_subfields_hash {
389 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
391 my ($hash,$include_subfields) = @_;
393 # sanity and ease of use
394 return $hash if (ref($hash) ne 'HASH');
396 my $h = dclone( $hash );
398 if ( defined($h->{subfields}) ) {
399 my $sfs = delete $h->{subfields} || die "no subfields?";
402 my $sf = shift @$sfs;
403 push @out, '^' . $sf if ($include_subfields);
405 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
406 # single element subfields are not arrays
407 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
409 push @out, $h->{$sf};
411 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
412 push @out, $h->{$sf}->[$o];
415 if ($include_subfields) {
416 return join('', @out);
421 if ($include_subfields) {
423 foreach my $sf (sort keys %$h) {
424 if (ref($h->{$sf}) eq 'ARRAY') {
425 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
427 $out .= '^' . $sf . $h->{$sf};
432 # FIXME this should probably be in alphabetical order instead of hash order
440 Return all values in some field
444 TODO: order of values is probably same as in source data, need to investigate that
450 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
451 return unless (defined($rec) && defined($rec->{$f}));
452 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
453 if (ref($rec->{$f}) eq 'ARRAY') {
455 foreach my $h ( @{ $rec->{$f} } ) {
456 if (ref($h) eq 'HASH') {
457 push @out, ( _pack_subfields_hash( $h ) );
463 } elsif( defined($rec->{$f}) ) {
470 Return all values in specific field and subfield
478 return unless (defined($rec && $rec->{$f}));
480 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
482 if (ref($_->{$sf}) eq 'ARRAY') {
487 } grep { ref($_) eq 'HASH' && defined $_->{$sf} } @{ $rec->{$f} };
497 If rec() returns just single value, it will
498 return scalar, not array.
509 if ($#out == 0 && ! wantarray) {
520 Returns first value from field
523 $v = frec('200','a');
529 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
537 Check if first values from two fields are same or different
539 if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
542 # values are different
545 Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
546 could write something like:
548 if ( frec( '900','a' ) eq frec( '910','c' ) ) {
552 but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
553 in order to parse text and create invalid function C<eqfrec>.
558 my ( $f1,$sf1, $f2, $sf2 ) = @_;
559 return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
563 return ! frec_eq( @_ );
568 Apply regex to some or all values
570 @v = regex( 's/foo/bar/g', @v );
577 #warn "r: $r\n", dump(\@_);
581 push @out, $t if ($t && $t ne '');
588 Prefix all values with a string
590 @v = prefix( 'my_', @v );
596 return @_ unless defined( $p );
597 return map { $p . $_ } grep { defined($_) } @_;
602 suffix all values with a string
604 @v = suffix( '_my', @v );
610 return @_ unless defined( $s );
611 return map { $_ . $s } grep { defined($_) } @_;
616 surround all values with a two strings
618 @v = surround( 'prefix_', '_suffix', @v );
625 $p = '' unless defined( $p );
626 $s = '' unless defined( $s );
627 return map { $p . $_ . $s } grep { defined($_) } @_;
645 Consult lookup hashes for some value
649 'ffkk/peri/mfn'.rec('000')
651 'ffkk','peri','200-a-200-e',
653 first(rec(200,'a')).' '.first(rec('200','e'))
657 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
658 normal lookup definition in C<conf/lookup/something.pl> which looks like:
661 # which results to return from record recorded in lookup
662 sub { 'ffkk/peri/mfn' . rec('000') },
663 # from which database and input
665 # such that following values match
666 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
667 # if this part is missing, we will try to match same fields
668 # from lookup record and current one, or you can override
669 # which records to use from current record using
670 sub { rec('900','x') . ' ' . rec('900','y') },
673 You can think about this lookup as SQL (if that helps):
680 sub { filter from lookuped record }
682 sub { optional filter on current record }
689 my ($what, $database, $input, $key, $having) = @_;
691 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
693 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
694 return unless (defined($lookup->{$database}->{$input}->{$key}));
696 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
699 my @having = $having->();
701 warn "## having = ", dump( @having ) if ($debug > 2);
703 foreach my $h ( @having ) {
704 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
705 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
706 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
710 return unless ($mfns);
712 my @mfns = sort keys %$mfns;
714 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
719 foreach my $mfn (@mfns) {
720 $rec = $load_row_coderef->( $database, $input, $mfn );
722 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
724 my @vals = $what->();
726 push @out, ( @vals );
728 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
731 # if (ref($lookup->{$k}) eq 'ARRAY') {
732 # return @{ $lookup->{$k} };
734 # return $lookup->{$k};
739 warn "## lookup returns = ", dump(@out), $/ if ($debug);
748 =head2 save_into_lookup
750 Save value into lookup. It associates current database, input
751 and specific keys with one or more values which will be
754 MFN will be extracted from first occurence current of field 000
755 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
757 my $nr = save_into_lookup($database,$input,$key,sub {
758 # code which produce one or more values
761 It returns number of items saved.
763 This function shouldn't be called directly, it's called from code created by
768 sub save_into_lookup {
769 my ($database,$input,$key,$coderef) = @_;
770 die "save_into_lookup needs database" unless defined($database);
771 die "save_into_lookup needs input" unless defined($input);
772 die "save_into_lookup needs key" unless defined($key);
773 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
775 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
778 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
779 defined($config->{_mfn}) ? $config->{_mfn} :
780 die "mfn not defined or zero";
784 foreach my $v ( $coderef->() ) {
785 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
786 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
795 Consult config values stored in C<config.yml>
797 # return database code (key under databases in yaml)
798 $database_code = config(); # use _ from hash
799 $database_name = config('name');
800 $database_input_name = config('input name');
802 Up to three levels are supported.
807 return unless ($config);
815 warn "### getting config($p)\n" if ($debug > 1);
817 my @p = split(/\s+/,$p);
819 $v = $config->{ '_' }; # special, database code
822 my $c = dclone( $config );
825 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
826 if (ref($c) eq 'ARRAY') {
828 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
832 if (! defined($c->{$k}) ) {
843 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
844 warn "config( '$p' ) is empty\n" if (! $v);
851 Returns unique id of this record
855 Returns C<42/2> for 2nd occurence of MFN 42.
860 my $mfn = $config->{_mfn} || die "no _mfn in config data";
861 return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() || '' );
866 Joins walues with some delimiter
868 $v = join_with(", ", @v);
874 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
875 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
876 return '' unless defined($v);
882 Split record subfield on some regex and take one of parts out
884 $a_before_semi_column =
885 split_rec_on('200','a', /\s*;\s*/, $part);
887 C<$part> is optional number of element. First element is
890 If there is no C<$part> parameter or C<$part> is 0, this function will
891 return all values produced by splitting.
896 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
898 my ($fld, $sf, $regex, $part) = @_;
899 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
901 my @r = rec( $fld, $sf );
903 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
905 return '' if ( ! defined($v) || $v =~ /^\s*$/);
907 my @s = split( $regex, $v );
908 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
909 if ($part && $part > 0) {
910 return $s[ $part - 1 ];
920 set( key => 'value' );
926 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
937 my $k = shift || return;
939 warn "## get $k = ", dump( $v ), $/ if ( $debug );
945 if ( count( @result ) == 1 ) {
946 # do something if only 1 result is there
952 warn "## count ",dump(@_),$/ if ( $debug );