1 package WebPAC::Normalize;
11 search_display search display sorted
15 regex prefix suffix surround
16 first lookup join_with
32 #use base qw/WebPAC::Common/;
33 use Data::Dump qw/dump/;
41 use WebPAC::Normalize::ISBN;
42 push @EXPORT, ( 'isbn_10', 'isbn_13' );
44 use WebPAC::Normalize::MARC;
46 marc marc_indicators marc_repeatable_subfield
47 marc_compose marc_leader marc_fixed
48 marc_duplicate marc_remove marc_count
53 use Storable qw/dclone/;
57 WebPAC::Normalize - describe normalisaton rules using sets
61 our $VERSION = '0.36';
65 This module uses C<conf/normalize/*.pl> files to perform normalisation
66 from input records using perl functions which are specialized for set
69 Sets are implemented as arrays, and normalisation file is valid perl, which
70 means that you check it's validity before running WebPAC using
71 C<perl -c normalize.pl>.
73 Normalisation can generate multiple output normalized data. For now, supported output
74 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
79 Functions which start with C<_> are private and used by WebPAC internally.
80 All other functions are available for use within normalisation rules.
86 my $ds = WebPAC::Normalize::data_structure(
87 lookup => $lookup_hash,
89 rules => $normalize_pl_config,
90 marc_encoding => 'utf-8',
92 load_row_coderef => sub {
93 my ($database,$input,$mfn) = @_;
94 $store->load_row( database => $database, input => $input, id => $mfn );
98 Options C<row>, C<rules> and C<log> are mandatory while all
101 C<load_row_coderef> is closure only used when executing lookups, so they will
102 die if it's not defined.
104 This function will B<die> if normalizastion can't be evaled.
106 Since this function isn't exported you have to call it with
107 C<WebPAC::Normalize::data_structure>.
111 my $load_row_coderef;
116 die "need row argument" unless ($arg->{row});
117 die "need normalisation argument" unless ($arg->{rules});
119 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
120 _set_ds( $arg->{row} );
121 _set_config( $arg->{config} ) if defined($arg->{config});
122 _clean_ds( %{ $arg } );
123 $load_row_coderef = $arg->{load_row_coderef};
126 no warnings 'redefine';
127 eval "$arg->{rules};";
128 die "error evaling $arg->{rules}: $@\n" if ($@);
135 Set current record hash
144 $rec = shift or die "no record hash";
145 $WebPAC::Normalize::MARC::rec = $rec;
150 my $rec = _get_rec();
154 sub _get_rec { $rec };
157 my $d = $rec->{ $_[0] };
158 return @$d if ref($d) eq 'ARRAY';
159 die "field $_[0] not array: ",dump( $d );
164 Set current config hash
166 _set_config( $config );
174 Code of current database
192 Return hash formatted as data structure
201 #warn "## out = ",dump($out);
207 Clean data structure hash for next record
216 WebPAC::Normalize::MARC::_clean();
221 Set current lookup hash
223 _set_lookup( $lookup );
235 Get current lookup hash
237 my $lookup = _get_lookup();
247 Setup code reference which will return L<data_structure> from
251 my ($database,$input,$mfn) = @_;
252 $store->load_row( database => $database, input => $input, id => $mfn );
259 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
261 $load_row_coderef = $coderef;
266 Change level of debug warnings
274 return $debug unless defined($l);
275 warn "debug level $l",$/ if ($l > 0);
277 $WebPAC::Normalize::MARC::debug = $debug;
280 =head1 Functions to create C<data_structure>
282 Those functions generally have to first in your normalization file.
286 Generic way to set values for some name
288 to('field-name', 'name-value' => rec('200','a') );
290 There are many helpers defined below which might be easier to use.
295 my $type = shift or confess "need type -- BUG?";
296 my $name = shift or confess "needs name as first argument";
297 my @o = grep { defined($_) && $_ ne '' } @_;
299 $out->{$name}->{$type} = \@o;
302 =head2 search_display
304 Define output for L<search> and L<display> at the same time
306 search_display('Title', rec('200','a') );
311 my $name = shift or die "search_display needs name as first argument";
312 my @o = grep { defined($_) && $_ ne '' } @_;
314 $out->{$name}->{search} = \@o;
315 $out->{$name}->{display} = \@o;
320 Old name for L<search_display>, it will probably be removed at one point.
325 search_display( @_ );
330 Define output just for I<display>
332 @v = display('Title', rec('200','a') );
336 sub display { to( 'display', @_ ) }
340 Prepare values just for I<search>
342 @v = search('Title', rec('200','a') );
346 sub search { to( 'search', @_ ) }
350 Insert into lists which will be automatically sorted
352 sorted('Title', rec('200','a') );
356 sub sorted { to( 'sorted', @_ ) }
360 Insert new row of data into output module
362 row( column => 'foo', column2 => 'bar' );
366 use Data::Dump qw/dump/;
369 die "array doesn't have even number of elements but $#_: ",dump( @_ ) if $#_ % 2 != 1;
371 push @{ $out->{'_rows'} }, {@_};
375 =head1 Functions to extract data from input
377 This function should be used inside functions to create C<data_structure> described
380 =head2 _pack_subfields_hash
382 @subfields = _pack_subfields_hash( $h );
383 $subfields = _pack_subfields_hash( $h, 1 );
385 Return each subfield value in array or pack them all together and return scalar
386 with subfields (denoted by C<^>) and values.
390 sub _pack_subfields_hash {
392 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
394 my ($hash,$include_subfields) = @_;
396 # sanity and ease of use
397 return $hash if (ref($hash) ne 'HASH');
399 my $h = dclone( $hash );
401 if ( defined($h->{subfields}) ) {
402 my $sfs = delete $h->{subfields} || die "no subfields?";
405 my $sf = shift @$sfs;
406 push @out, '^' . $sf if ($include_subfields);
408 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
409 # single element subfields are not arrays
410 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
412 push @out, $h->{$sf};
414 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
415 push @out, $h->{$sf}->[$o];
418 if ($include_subfields) {
419 return join('', @out);
424 if ($include_subfields) {
426 foreach my $sf (sort keys %$h) {
427 if (ref($h->{$sf}) eq 'ARRAY') {
428 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
430 $out .= '^' . $sf . $h->{$sf};
435 # FIXME this should probably be in alphabetical order instead of hash order
443 Return all values in some field
447 TODO: order of values is probably same as in source data, need to investigate that
453 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
454 return unless (defined($rec) && defined($rec->{$f}));
455 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
456 if (ref($rec->{$f}) eq 'ARRAY') {
458 foreach my $h ( @{ $rec->{$f} } ) {
459 if (ref($h) eq 'HASH') {
460 push @out, ( _pack_subfields_hash( $h ) );
466 } elsif( defined($rec->{$f}) ) {
473 Return all values in specific field and subfield
481 return unless (defined($rec && $rec->{$f}));
483 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
485 if (ref($_->{$sf}) eq 'ARRAY') {
490 } grep { ref($_) eq 'HASH' && defined $_->{$sf} } @{ $rec->{$f} };
500 If rec() returns just single value, it will
501 return scalar, not array.
512 if ($#out == 0 && ! wantarray) {
523 Returns first value from field
526 $v = frec('200','a');
532 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
540 Check if first values from two fields are same or different
542 if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
545 # values are different
548 Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
549 could write something like:
551 if ( frec( '900','a' ) eq frec( '910','c' ) ) {
555 but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
556 in order to parse text and create invalid function C<eqfrec>.
561 my ( $f1,$sf1, $f2, $sf2 ) = @_;
562 return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
566 return ! frec_eq( @_ );
571 Apply regex to some or all values
573 @v = regex( 's/foo/bar/g', @v );
580 #warn "r: $r\n", dump(\@_);
584 push @out, $t if ($t && $t ne '');
591 Prefix all values with a string
593 @v = prefix( 'my_', @v );
599 return @_ unless defined( $p );
600 return map { $p . $_ } grep { defined($_) } @_;
605 suffix all values with a string
607 @v = suffix( '_my', @v );
613 return @_ unless defined( $s );
614 return map { $_ . $s } grep { defined($_) } @_;
619 surround all values with a two strings
621 @v = surround( 'prefix_', '_suffix', @v );
628 $p = '' unless defined( $p );
629 $s = '' unless defined( $s );
630 return map { $p . $_ . $s } grep { defined($_) } @_;
648 Consult lookup hashes for some value
652 'ffkk/peri/mfn'.rec('000')
654 'ffkk','peri','200-a-200-e',
656 first(rec(200,'a')).' '.first(rec('200','e'))
660 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
661 normal lookup definition in C<conf/lookup/something.pl> which looks like:
664 # which results to return from record recorded in lookup
665 sub { 'ffkk/peri/mfn' . rec('000') },
666 # from which database and input
668 # such that following values match
669 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
670 # if this part is missing, we will try to match same fields
671 # from lookup record and current one, or you can override
672 # which records to use from current record using
673 sub { rec('900','x') . ' ' . rec('900','y') },
676 You can think about this lookup as SQL (if that helps):
683 sub { filter from lookuped record }
685 sub { optional filter on current record }
692 my ($what, $database, $input, $key, $having) = @_;
694 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
696 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
697 return unless (defined($lookup->{$database}->{$input}->{$key}));
699 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
702 my @having = $having->();
704 warn "## having = ", dump( @having ) if ($debug > 2);
706 foreach my $h ( @having ) {
707 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
708 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
709 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
713 return unless ($mfns);
715 my @mfns = sort keys %$mfns;
717 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
722 foreach my $mfn (@mfns) {
723 $rec = $load_row_coderef->( $database, $input, $mfn );
725 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
727 my @vals = $what->();
729 push @out, ( @vals );
731 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
734 # if (ref($lookup->{$k}) eq 'ARRAY') {
735 # return @{ $lookup->{$k} };
737 # return $lookup->{$k};
742 warn "## lookup returns = ", dump(@out), $/ if ($debug);
751 =head2 save_into_lookup
753 Save value into lookup. It associates current database, input
754 and specific keys with one or more values which will be
757 MFN will be extracted from first occurence current of field 000
758 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
760 my $nr = save_into_lookup($database,$input,$key,sub {
761 # code which produce one or more values
764 It returns number of items saved.
766 This function shouldn't be called directly, it's called from code created by
771 sub save_into_lookup {
772 my ($database,$input,$key,$coderef) = @_;
773 die "save_into_lookup needs database" unless defined($database);
774 die "save_into_lookup needs input" unless defined($input);
775 die "save_into_lookup needs key" unless defined($key);
776 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
778 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
781 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
782 defined($config->{_mfn}) ? $config->{_mfn} :
783 die "mfn not defined or zero";
787 foreach my $v ( $coderef->() ) {
788 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
789 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
798 Consult config values stored in C<config.yml>
800 # return database code (key under databases in yaml)
801 $database_code = config(); # use _ from hash
802 $database_name = config('name');
803 $database_input_name = config('input name');
805 Up to three levels are supported.
810 return unless ($config);
818 warn "### getting config($p)\n" if ($debug > 1);
820 my @p = split(/\s+/,$p);
822 $v = $config->{ '_' }; # special, database code
825 my $c = dclone( $config );
828 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
829 if (ref($c) eq 'ARRAY') {
831 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
835 if (! defined($c->{$k}) ) {
846 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
847 warn "config( '$p' ) is empty\n" if (! $v);
854 Returns unique id of this record
858 Returns C<42/2> for 2nd occurence of MFN 42.
863 my $mfn = $config->{_mfn} || die "no _mfn in config data";
864 return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() || '' );
869 Joins walues with some delimiter
871 $v = join_with(", ", @v);
877 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
878 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
879 return '' unless defined($v);
885 Split record subfield on some regex and take one of parts out
887 $a_before_semi_column =
888 split_rec_on('200','a', /\s*;\s*/, $part);
890 C<$part> is optional number of element. First element is
893 If there is no C<$part> parameter or C<$part> is 0, this function will
894 return all values produced by splitting.
899 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
901 my ($fld, $sf, $regex, $part) = @_;
902 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
904 my @r = rec( $fld, $sf );
906 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
908 return '' if ( ! defined($v) || $v =~ /^\s*$/);
910 my @s = split( $regex, $v );
911 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
912 if ($part && $part > 0) {
913 return $s[ $part - 1 ];
923 set( key => 'value' );
929 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
940 my $k = shift || return;
942 warn "## get $k = ", dump( $v ), $/ if ( $debug );
948 if ( count( @result ) == 1 ) {
949 # do something if only 1 result is there
955 warn "## count ",dump(@_),$/ if ( $debug );