1 package WebPAC::Normalize;
11 search_display search display sorted
15 regex prefix suffix surround
16 first lookup join_with
29 #use base qw/WebPAC::Common/;
30 use Data::Dump qw/dump/;
38 use WebPAC::Normalize::ISBN;
39 push @EXPORT, ( 'isbn_10', 'isbn_13' );
41 use WebPAC::Normalize::MARC;
43 marc marc_indicators marc_repeatable_subfield
44 marc_compose marc_leader marc_fixed
45 marc_duplicate marc_remove marc_count
50 use Storable qw/dclone/;
54 WebPAC::Normalize - describe normalisaton rules using sets
58 our $VERSION = '0.36';
62 This module uses C<conf/normalize/*.pl> files to perform normalisation
63 from input records using perl functions which are specialized for set
66 Sets are implemented as arrays, and normalisation file is valid perl, which
67 means that you check it's validity before running WebPAC using
68 C<perl -c normalize.pl>.
70 Normalisation can generate multiple output normalized data. For now, supported output
71 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
76 Functions which start with C<_> are private and used by WebPAC internally.
77 All other functions are available for use within normalisation rules.
83 my $ds = WebPAC::Normalize::data_structure(
84 lookup => $lookup_hash,
86 rules => $normalize_pl_config,
87 marc_encoding => 'utf-8',
89 load_row_coderef => sub {
90 my ($database,$input,$mfn) = @_;
91 $store->load_row( database => $database, input => $input, id => $mfn );
95 Options C<row>, C<rules> and C<log> are mandatory while all
98 C<load_row_coderef> is closure only used when executing lookups, so they will
99 die if it's not defined.
101 This function will B<die> if normalizastion can't be evaled.
103 Since this function isn't exported you have to call it with
104 C<WebPAC::Normalize::data_structure>.
108 my $load_row_coderef;
113 die "need row argument" unless ($arg->{row});
114 die "need normalisation argument" unless ($arg->{rules});
116 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
117 _set_ds( $arg->{row} );
118 _set_config( $arg->{config} ) if defined($arg->{config});
119 _clean_ds( %{ $arg } );
120 $load_row_coderef = $arg->{load_row_coderef};
123 no warnings 'redefine';
124 eval "$arg->{rules};";
125 die "error evaling $arg->{rules}: $@\n" if ($@);
132 Set current record hash
141 $rec = shift or die "no record hash";
142 $WebPAC::Normalize::MARC::rec = $rec;
147 my $rec = _get_rec();
151 sub _get_rec { $rec };
155 Set current config hash
157 _set_config( $config );
165 Code of current database
183 Return hash formatted as data structure
192 #warn "## out = ",dump($out);
198 Clean data structure hash for next record
207 WebPAC::Normalize::MARC::_clean();
212 Set current lookup hash
214 _set_lookup( $lookup );
226 Get current lookup hash
228 my $lookup = _get_lookup();
238 Setup code reference which will return L<data_structure> from
242 my ($database,$input,$mfn) = @_;
243 $store->load_row( database => $database, input => $input, id => $mfn );
250 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
252 $load_row_coderef = $coderef;
257 Change level of debug warnings
265 return $debug unless defined($l);
266 warn "debug level $l",$/ if ($l > 0);
268 $WebPAC::Normalize::MARC::debug = $debug;
271 =head1 Functions to create C<data_structure>
273 Those functions generally have to first in your normalization file.
277 Generic way to set values for some name
279 to('field-name', 'name-value' => rec('200','a') );
281 There are many helpers defined below which might be easier to use.
286 my $type = shift or confess "need type -- BUG?";
287 my $name = shift or confess "needs name as first argument";
288 my @o = grep { defined($_) && $_ ne '' } @_;
290 $out->{$name}->{$type} = \@o;
293 =head2 search_display
295 Define output for L<search> and L<display> at the same time
297 search_display('Title', rec('200','a') );
302 my $name = shift or die "search_display needs name as first argument";
303 my @o = grep { defined($_) && $_ ne '' } @_;
305 $out->{$name}->{search} = \@o;
306 $out->{$name}->{display} = \@o;
311 Old name for L<search_display>, it will probably be removed at one point.
316 search_display( @_ );
321 Define output just for I<display>
323 @v = display('Title', rec('200','a') );
327 sub display { to( 'display', @_ ) }
331 Prepare values just for I<search>
333 @v = search('Title', rec('200','a') );
337 sub search { to( 'search', @_ ) }
341 Insert into lists which will be automatically sorted
343 sorted('Title', rec('200','a') );
347 sub sorted { to( 'sorted', @_ ) }
350 =head1 Functions to extract data from input
352 This function should be used inside functions to create C<data_structure> described
355 =head2 _pack_subfields_hash
357 @subfields = _pack_subfields_hash( $h );
358 $subfields = _pack_subfields_hash( $h, 1 );
360 Return each subfield value in array or pack them all together and return scalar
361 with subfields (denoted by C<^>) and values.
365 sub _pack_subfields_hash {
367 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
369 my ($hash,$include_subfields) = @_;
371 # sanity and ease of use
372 return $hash if (ref($hash) ne 'HASH');
374 my $h = dclone( $hash );
376 if ( defined($h->{subfields}) ) {
377 my $sfs = delete $h->{subfields} || die "no subfields?";
380 my $sf = shift @$sfs;
381 push @out, '^' . $sf if ($include_subfields);
383 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
384 # single element subfields are not arrays
385 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
387 push @out, $h->{$sf};
389 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
390 push @out, $h->{$sf}->[$o];
393 if ($include_subfields) {
394 return join('', @out);
399 if ($include_subfields) {
401 foreach my $sf (sort keys %$h) {
402 if (ref($h->{$sf}) eq 'ARRAY') {
403 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
405 $out .= '^' . $sf . $h->{$sf};
410 # FIXME this should probably be in alphabetical order instead of hash order
418 Return all values in some field
422 TODO: order of values is probably same as in source data, need to investigate that
428 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
429 return unless (defined($rec) && defined($rec->{$f}));
430 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
431 if (ref($rec->{$f}) eq 'ARRAY') {
433 foreach my $h ( @{ $rec->{$f} } ) {
434 if (ref($h) eq 'HASH') {
435 push @out, ( _pack_subfields_hash( $h ) );
441 } elsif( defined($rec->{$f}) ) {
448 Return all values in specific field and subfield
456 return unless (defined($rec && $rec->{$f}));
458 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
460 if (ref($_->{$sf}) eq 'ARRAY') {
465 } grep { ref($_) eq 'HASH' && defined $_->{$sf} } @{ $rec->{$f} };
475 If rec() returns just single value, it will
476 return scalar, not array.
487 if ($#out == 0 && ! wantarray) {
498 Returns first value from field
501 $v = frec('200','a');
507 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
515 Check if first values from two fields are same or different
517 if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
520 # values are different
523 Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
524 could write something like:
526 if ( frec( '900','a' ) eq frec( '910','c' ) ) {
530 but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
531 in order to parse text and create invalid function C<eqfrec>.
536 my ( $f1,$sf1, $f2, $sf2 ) = @_;
537 return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
541 return ! frec_eq( @_ );
546 Apply regex to some or all values
548 @v = regex( 's/foo/bar/g', @v );
555 #warn "r: $r\n", dump(\@_);
559 push @out, $t if ($t && $t ne '');
566 Prefix all values with a string
568 @v = prefix( 'my_', @v );
574 return @_ unless defined( $p );
575 return map { $p . $_ } grep { defined($_) } @_;
580 suffix all values with a string
582 @v = suffix( '_my', @v );
588 return @_ unless defined( $s );
589 return map { $_ . $s } grep { defined($_) } @_;
594 surround all values with a two strings
596 @v = surround( 'prefix_', '_suffix', @v );
603 $p = '' unless defined( $p );
604 $s = '' unless defined( $s );
605 return map { $p . $_ . $s } grep { defined($_) } @_;
623 Consult lookup hashes for some value
627 'ffkk/peri/mfn'.rec('000')
629 'ffkk','peri','200-a-200-e',
631 first(rec(200,'a')).' '.first(rec('200','e'))
635 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
636 normal lookup definition in C<conf/lookup/something.pl> which looks like:
639 # which results to return from record recorded in lookup
640 sub { 'ffkk/peri/mfn' . rec('000') },
641 # from which database and input
643 # such that following values match
644 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
645 # if this part is missing, we will try to match same fields
646 # from lookup record and current one, or you can override
647 # which records to use from current record using
648 sub { rec('900','x') . ' ' . rec('900','y') },
651 You can think about this lookup as SQL (if that helps):
658 sub { filter from lookuped record }
660 sub { optional filter on current record }
667 my ($what, $database, $input, $key, $having) = @_;
669 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
671 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
672 return unless (defined($lookup->{$database}->{$input}->{$key}));
674 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
677 my @having = $having->();
679 warn "## having = ", dump( @having ) if ($debug > 2);
681 foreach my $h ( @having ) {
682 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
683 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
684 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
688 return unless ($mfns);
690 my @mfns = sort keys %$mfns;
692 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
697 foreach my $mfn (@mfns) {
698 $rec = $load_row_coderef->( $database, $input, $mfn );
700 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
702 my @vals = $what->();
704 push @out, ( @vals );
706 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
709 # if (ref($lookup->{$k}) eq 'ARRAY') {
710 # return @{ $lookup->{$k} };
712 # return $lookup->{$k};
717 warn "## lookup returns = ", dump(@out), $/ if ($debug);
726 =head2 save_into_lookup
728 Save value into lookup. It associates current database, input
729 and specific keys with one or more values which will be
732 MFN will be extracted from first occurence current of field 000
733 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
735 my $nr = save_into_lookup($database,$input,$key,sub {
736 # code which produce one or more values
739 It returns number of items saved.
741 This function shouldn't be called directly, it's called from code created by
746 sub save_into_lookup {
747 my ($database,$input,$key,$coderef) = @_;
748 die "save_into_lookup needs database" unless defined($database);
749 die "save_into_lookup needs input" unless defined($input);
750 die "save_into_lookup needs key" unless defined($key);
751 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
753 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
756 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
757 defined($config->{_mfn}) ? $config->{_mfn} :
758 die "mfn not defined or zero";
762 foreach my $v ( $coderef->() ) {
763 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
764 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
773 Consult config values stored in C<config.yml>
775 # return database code (key under databases in yaml)
776 $database_code = config(); # use _ from hash
777 $database_name = config('name');
778 $database_input_name = config('input name');
780 Up to three levels are supported.
785 return unless ($config);
793 warn "### getting config($p)\n" if ($debug > 1);
795 my @p = split(/\s+/,$p);
797 $v = $config->{ '_' }; # special, database code
800 my $c = dclone( $config );
803 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
804 if (ref($c) eq 'ARRAY') {
806 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
810 if (! defined($c->{$k}) ) {
821 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
822 warn "config( '$p' ) is empty\n" if (! $v);
829 Returns unique id of this record
833 Returns C<42/2> for 2nd occurence of MFN 42.
838 my $mfn = $config->{_mfn} || die "no _mfn in config data";
839 return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() || '' );
844 Joins walues with some delimiter
846 $v = join_with(", ", @v);
852 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
853 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
854 return '' unless defined($v);
860 Split record subfield on some regex and take one of parts out
862 $a_before_semi_column =
863 split_rec_on('200','a', /\s*;\s*/, $part);
865 C<$part> is optional number of element. First element is
868 If there is no C<$part> parameter or C<$part> is 0, this function will
869 return all values produced by splitting.
874 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
876 my ($fld, $sf, $regex, $part) = @_;
877 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
879 my @r = rec( $fld, $sf );
881 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
883 return '' if ( ! defined($v) || $v =~ /^\s*$/);
885 my @s = split( $regex, $v );
886 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
887 if ($part && $part > 0) {
888 return $s[ $part - 1 ];
898 set( key => 'value' );
904 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
915 my $k = shift || return;
917 warn "## get $k = ", dump( $v ), $/ if ( $debug );
923 if ( count( @result ) == 1 ) {
924 # do something if only 1 result is there
930 warn "## count ",dump(@_),$/ if ( $debug );