1 package WebPAC::Normalize;
11 search_display search display sorted
15 regex prefix suffix surround
16 first lookup join_with
32 #use base qw/WebPAC::Common/;
33 use Data::Dump qw/dump/;
41 use WebPAC::Normalize::ISBN;
42 push @EXPORT, ( 'isbn_10', 'isbn_13' );
44 use WebPAC::Normalize::MARC;
46 marc marc_indicators marc_repeatable_subfield
47 marc_compose marc_leader marc_fixed
48 marc_duplicate marc_remove marc_count
53 use Storable qw/dclone/;
57 WebPAC::Normalize - describe normalisaton rules using sets
61 our $VERSION = '0.36';
65 This module uses C<conf/normalize/*.pl> files to perform normalisation
66 from input records using perl functions which are specialized for set
69 Sets are implemented as arrays, and normalisation file is valid perl, which
70 means that you check it's validity before running WebPAC using
71 C<perl -c normalize.pl>.
73 Normalisation can generate multiple output normalized data. For now, supported output
74 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
79 Functions which start with C<_> are private and used by WebPAC internally.
80 All other functions are available for use within normalisation rules.
86 my $ds = WebPAC::Normalize::data_structure(
87 lookup => $lookup_hash,
89 rules => $normalize_pl_config,
90 marc_encoding => 'utf-8',
92 load_row_coderef => sub {
93 my ($database,$input,$mfn) = @_;
94 $store->load_row( database => $database, input => $input, id => $mfn );
98 Options C<row>, C<rules> and C<log> are mandatory while all
101 C<load_row_coderef> is closure only used when executing lookups, so they will
102 die if it's not defined.
104 This function will B<die> if normalizastion can't be evaled.
106 Since this function isn't exported you have to call it with
107 C<WebPAC::Normalize::data_structure>.
111 my $load_row_coderef;
116 die "need row argument" unless ($arg->{row});
117 die "need normalisation argument" unless ($arg->{rules});
119 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
120 _set_ds( $arg->{row} );
121 _set_config( $arg->{config} ) if defined($arg->{config});
122 _clean_ds( %{ $arg } );
123 $load_row_coderef = $arg->{load_row_coderef};
126 no warnings 'redefine';
127 eval "$arg->{rules};";
128 die "error evaling $arg->{rules}: $@\n" if ($@);
135 Set current record hash
144 $rec = shift or die "no record hash";
145 $WebPAC::Normalize::MARC::rec = $rec;
150 my $rec = _get_rec();
154 sub _get_rec { $rec };
158 Set current config hash
160 _set_config( $config );
168 Code of current database
186 Return hash formatted as data structure
195 #warn "## out = ",dump($out);
201 Clean data structure hash for next record
210 WebPAC::Normalize::MARC::_clean();
215 Set current lookup hash
217 _set_lookup( $lookup );
229 Get current lookup hash
231 my $lookup = _get_lookup();
241 Setup code reference which will return L<data_structure> from
245 my ($database,$input,$mfn) = @_;
246 $store->load_row( database => $database, input => $input, id => $mfn );
253 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
255 $load_row_coderef = $coderef;
260 Change level of debug warnings
268 return $debug unless defined($l);
269 warn "debug level $l",$/ if ($l > 0);
271 $WebPAC::Normalize::MARC::debug = $debug;
274 =head1 Functions to create C<data_structure>
276 Those functions generally have to first in your normalization file.
280 Generic way to set values for some name
282 to('field-name', 'name-value' => rec('200','a') );
284 There are many helpers defined below which might be easier to use.
289 my $type = shift or confess "need type -- BUG?";
290 my $name = shift or confess "needs name as first argument";
291 my @o = grep { defined($_) && $_ ne '' } @_;
293 $out->{$name}->{$type} = \@o;
296 =head2 search_display
298 Define output for L<search> and L<display> at the same time
300 search_display('Title', rec('200','a') );
305 my $name = shift or die "search_display needs name as first argument";
306 my @o = grep { defined($_) && $_ ne '' } @_;
308 $out->{$name}->{search} = \@o;
309 $out->{$name}->{display} = \@o;
314 Old name for L<search_display>, it will probably be removed at one point.
319 search_display( @_ );
324 Define output just for I<display>
326 @v = display('Title', rec('200','a') );
330 sub display { to( 'display', @_ ) }
334 Prepare values just for I<search>
336 @v = search('Title', rec('200','a') );
340 sub search { to( 'search', @_ ) }
344 Insert into lists which will be automatically sorted
346 sorted('Title', rec('200','a') );
350 sub sorted { to( 'sorted', @_ ) }
354 Insert new row of data into output module
356 row( column => 'foo', column2 => 'bar' );
360 use Data::Dump qw/dump/;
363 die "array doesn't have odd number of elements but $#_: ",dump( @_ ) if $#_ % 2 == 1;
364 my $table = shift @_;
365 push @{ $out->{'_rows'}->{$table} }, {@_};
369 =head1 Functions to extract data from input
371 This function should be used inside functions to create C<data_structure> described
374 =head2 _pack_subfields_hash
376 @subfields = _pack_subfields_hash( $h );
377 $subfields = _pack_subfields_hash( $h, 1 );
379 Return each subfield value in array or pack them all together and return scalar
380 with subfields (denoted by C<^>) and values.
384 sub _pack_subfields_hash {
386 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
388 my ($hash,$include_subfields) = @_;
390 # sanity and ease of use
391 return $hash if (ref($hash) ne 'HASH');
393 my $h = dclone( $hash );
395 if ( defined($h->{subfields}) ) {
396 my $sfs = delete $h->{subfields} || die "no subfields?";
399 my $sf = shift @$sfs;
400 push @out, '^' . $sf if ($include_subfields);
402 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
403 # single element subfields are not arrays
404 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
406 push @out, $h->{$sf};
408 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
409 push @out, $h->{$sf}->[$o];
412 if ($include_subfields) {
413 return join('', @out);
418 if ($include_subfields) {
420 foreach my $sf (sort keys %$h) {
421 if (ref($h->{$sf}) eq 'ARRAY') {
422 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
424 $out .= '^' . $sf . $h->{$sf};
429 # FIXME this should probably be in alphabetical order instead of hash order
437 Return all values in some field
441 TODO: order of values is probably same as in source data, need to investigate that
447 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
448 return unless (defined($rec) && defined($rec->{$f}));
449 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
450 if (ref($rec->{$f}) eq 'ARRAY') {
452 foreach my $h ( @{ $rec->{$f} } ) {
453 if (ref($h) eq 'HASH') {
454 push @out, ( _pack_subfields_hash( $h ) );
460 } elsif( defined($rec->{$f}) ) {
467 Return all values in specific field and subfield
475 return unless (defined($rec && $rec->{$f}));
477 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
479 if (ref($_->{$sf}) eq 'ARRAY') {
484 } grep { ref($_) eq 'HASH' && defined $_->{$sf} } @{ $rec->{$f} };
494 If rec() returns just single value, it will
495 return scalar, not array.
506 if ($#out == 0 && ! wantarray) {
517 Returns first value from field
520 $v = frec('200','a');
526 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
534 Check if first values from two fields are same or different
536 if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
539 # values are different
542 Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
543 could write something like:
545 if ( frec( '900','a' ) eq frec( '910','c' ) ) {
549 but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
550 in order to parse text and create invalid function C<eqfrec>.
555 my ( $f1,$sf1, $f2, $sf2 ) = @_;
556 return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
560 return ! frec_eq( @_ );
565 Apply regex to some or all values
567 @v = regex( 's/foo/bar/g', @v );
574 #warn "r: $r\n", dump(\@_);
578 push @out, $t if ($t && $t ne '');
585 Prefix all values with a string
587 @v = prefix( 'my_', @v );
593 return @_ unless defined( $p );
594 return map { $p . $_ } grep { defined($_) } @_;
599 suffix all values with a string
601 @v = suffix( '_my', @v );
607 return @_ unless defined( $s );
608 return map { $_ . $s } grep { defined($_) } @_;
613 surround all values with a two strings
615 @v = surround( 'prefix_', '_suffix', @v );
622 $p = '' unless defined( $p );
623 $s = '' unless defined( $s );
624 return map { $p . $_ . $s } grep { defined($_) } @_;
642 Consult lookup hashes for some value
646 'ffkk/peri/mfn'.rec('000')
648 'ffkk','peri','200-a-200-e',
650 first(rec(200,'a')).' '.first(rec('200','e'))
654 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
655 normal lookup definition in C<conf/lookup/something.pl> which looks like:
658 # which results to return from record recorded in lookup
659 sub { 'ffkk/peri/mfn' . rec('000') },
660 # from which database and input
662 # such that following values match
663 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
664 # if this part is missing, we will try to match same fields
665 # from lookup record and current one, or you can override
666 # which records to use from current record using
667 sub { rec('900','x') . ' ' . rec('900','y') },
670 You can think about this lookup as SQL (if that helps):
677 sub { filter from lookuped record }
679 sub { optional filter on current record }
686 my ($what, $database, $input, $key, $having) = @_;
688 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
690 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
691 return unless (defined($lookup->{$database}->{$input}->{$key}));
693 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
696 my @having = $having->();
698 warn "## having = ", dump( @having ) if ($debug > 2);
700 foreach my $h ( @having ) {
701 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
702 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
703 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
707 return unless ($mfns);
709 my @mfns = sort keys %$mfns;
711 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
716 foreach my $mfn (@mfns) {
717 $rec = $load_row_coderef->( $database, $input, $mfn );
719 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
721 my @vals = $what->();
723 push @out, ( @vals );
725 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
728 # if (ref($lookup->{$k}) eq 'ARRAY') {
729 # return @{ $lookup->{$k} };
731 # return $lookup->{$k};
736 warn "## lookup returns = ", dump(@out), $/ if ($debug);
745 =head2 save_into_lookup
747 Save value into lookup. It associates current database, input
748 and specific keys with one or more values which will be
751 MFN will be extracted from first occurence current of field 000
752 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
754 my $nr = save_into_lookup($database,$input,$key,sub {
755 # code which produce one or more values
758 It returns number of items saved.
760 This function shouldn't be called directly, it's called from code created by
765 sub save_into_lookup {
766 my ($database,$input,$key,$coderef) = @_;
767 die "save_into_lookup needs database" unless defined($database);
768 die "save_into_lookup needs input" unless defined($input);
769 die "save_into_lookup needs key" unless defined($key);
770 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
772 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
775 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
776 defined($config->{_mfn}) ? $config->{_mfn} :
777 die "mfn not defined or zero";
781 foreach my $v ( $coderef->() ) {
782 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
783 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
792 Consult config values stored in C<config.yml>
794 # return database code (key under databases in yaml)
795 $database_code = config(); # use _ from hash
796 $database_name = config('name');
797 $database_input_name = config('input name');
799 Up to three levels are supported.
804 return unless ($config);
812 warn "### getting config($p)\n" if ($debug > 1);
814 my @p = split(/\s+/,$p);
816 $v = $config->{ '_' }; # special, database code
819 my $c = dclone( $config );
822 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
823 if (ref($c) eq 'ARRAY') {
825 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
829 if (! defined($c->{$k}) ) {
840 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
841 warn "config( '$p' ) is empty\n" if (! $v);
848 Returns unique id of this record
852 Returns C<42/2> for 2nd occurence of MFN 42.
857 my $mfn = $config->{_mfn} || die "no _mfn in config data";
858 return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() || '' );
863 Joins walues with some delimiter
865 $v = join_with(", ", @v);
871 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
872 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
873 return '' unless defined($v);
879 Split record subfield on some regex and take one of parts out
881 $a_before_semi_column =
882 split_rec_on('200','a', /\s*;\s*/, $part);
884 C<$part> is optional number of element. First element is
887 If there is no C<$part> parameter or C<$part> is 0, this function will
888 return all values produced by splitting.
893 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
895 my ($fld, $sf, $regex, $part) = @_;
896 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
898 my @r = rec( $fld, $sf );
900 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
902 return '' if ( ! defined($v) || $v =~ /^\s*$/);
904 my @s = split( $regex, $v );
905 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
906 if ($part && $part > 0) {
907 return $s[ $part - 1 ];
917 set( key => 'value' );
923 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
934 my $k = shift || return;
936 warn "## get $k = ", dump( $v ), $/ if ( $debug );
942 if ( count( @result ) == 1 ) {
943 # do something if only 1 result is there
949 warn "## count ",dump(@_),$/ if ( $debug );
955 Always return field as array
957 foreach my $d ( rec_array('field') {
964 my $d = $rec->{ $_[0] };
965 return @$d if ref($d) eq 'ARRAY';