1 package WebPAC::Normalize;
10 search_display search display sorted
14 regex prefix suffix surround
15 first lookup join_with
28 #use base qw/WebPAC::Common/;
29 use Data::Dump qw/dump/;
37 use WebPAC::Normalize::ISBN;
38 push @EXPORT, ( 'isbn_10', 'isbn_13' );
40 use WebPAC::Normalize::MARC;
42 marc marc_indicators marc_repeatable_subfield
43 marc_compose marc_leader marc_fixed
44 marc_duplicate marc_remove marc_count
51 WebPAC::Normalize - describe normalisaton rules using sets
55 our $VERSION = '0.35';
59 This module uses C<conf/normalize/*.pl> files to perform normalisation
60 from input records using perl functions which are specialized for set
63 Sets are implemented as arrays, and normalisation file is valid perl, which
64 means that you check it's validity before running WebPAC using
65 C<perl -c normalize.pl>.
67 Normalisation can generate multiple output normalized data. For now, supported output
68 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
73 Functions which start with C<_> are private and used by WebPAC internally.
74 All other functions are available for use within normalisation rules.
80 my $ds = WebPAC::Normalize::data_structure(
81 lookup => $lookup_hash,
83 rules => $normalize_pl_config,
84 marc_encoding => 'utf-8',
86 load_row_coderef => sub {
87 my ($database,$input,$mfn) = @_;
88 $store->load_row( database => $database, input => $input, id => $mfn );
92 Options C<row>, C<rules> and C<log> are mandatory while all
95 C<load_row_coderef> is closure only used when executing lookups, so they will
96 die if it's not defined.
98 This function will B<die> if normalizastion can't be evaled.
100 Since this function isn't exported you have to call it with
101 C<WebPAC::Normalize::data_structure>.
105 my $load_row_coderef;
110 die "need row argument" unless ($arg->{row});
111 die "need normalisation argument" unless ($arg->{rules});
113 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
114 _set_ds( $arg->{row} );
115 _set_config( $arg->{config} ) if defined($arg->{config});
116 _clean_ds( %{ $arg } );
117 $load_row_coderef = $arg->{load_row_coderef};
120 no warnings 'redefine';
121 eval "$arg->{rules};";
122 die "error evaling $arg->{rules}: $@\n" if ($@);
129 Set current record hash
138 $rec = shift or die "no record hash";
139 $WebPAC::Normalize::MARC::rec = $rec;
144 my $rec = _get_rec();
148 sub _get_rec { $rec };
152 Set current config hash
154 _set_config( $config );
162 Code of current database
180 Return hash formatted as data structure
189 #warn "## out = ",dump($out);
195 Clean data structure hash for next record
204 WebPAC::Normalize::MARC::_clean();
209 Set current lookup hash
211 _set_lookup( $lookup );
223 Get current lookup hash
225 my $lookup = _get_lookup();
235 Setup code reference which will return L<data_structure> from
239 my ($database,$input,$mfn) = @_;
240 $store->load_row( database => $database, input => $input, id => $mfn );
247 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
249 $load_row_coderef = $coderef;
254 Change level of debug warnings
262 return $debug unless defined($l);
263 warn "debug level $l",$/ if ($l > 0);
265 $WebPAC::Normalize::MARC::debug = $debug;
268 =head1 Functions to create C<data_structure>
270 Those functions generally have to first in your normalization file.
272 =head2 search_display
274 Define output for L<search> and L<display> at the same time
276 search_display('Title', rec('200','a') );
282 my $name = shift or die "search_display needs name as first argument";
283 my @o = grep { defined($_) && $_ ne '' } @_;
285 $out->{$name}->{search} = \@o;
286 $out->{$name}->{display} = \@o;
291 Old name for L<search_display>, but supported
296 search_display( @_ );
301 Define output just for I<display>
303 @v = display('Title', rec('200','a') );
308 my $type = shift or confess "need type -- BUG?";
309 my $name = shift or confess "needs name as first argument";
310 my @o = grep { defined($_) && $_ ne '' } @_;
312 $out->{$name}->{$type} = \@o;
315 sub display { _field( 'display', @_ ) }
319 Prepare values just for I<search>
321 @v = search('Title', rec('200','a') );
325 sub search { _field( 'search', @_ ) }
329 Insert into lists which will be automatically sorted
331 sorted('Title', rec('200','a') );
335 sub sorted { _field( 'sorted', @_ ) }
339 =head1 Functions to extract data from input
341 This function should be used inside functions to create C<data_structure> described
344 =head2 _pack_subfields_hash
346 @subfields = _pack_subfields_hash( $h );
347 $subfields = _pack_subfields_hash( $h, 1 );
349 Return each subfield value in array or pack them all together and return scalar
350 with subfields (denoted by C<^>) and values.
354 sub _pack_subfields_hash {
356 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
358 my ($h,$include_subfields) = @_;
360 # sanity and ease of use
361 return $h if (ref($h) ne 'HASH');
363 if ( defined($h->{subfields}) ) {
364 my $sfs = delete $h->{subfields} || die "no subfields?";
367 my $sf = shift @$sfs;
368 push @out, '^' . $sf if ($include_subfields);
370 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
371 # single element subfields are not arrays
372 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
374 push @out, $h->{$sf};
376 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
377 push @out, $h->{$sf}->[$o];
380 if ($include_subfields) {
381 return join('', @out);
386 if ($include_subfields) {
388 foreach my $sf (sort keys %$h) {
389 if (ref($h->{$sf}) eq 'ARRAY') {
390 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
392 $out .= '^' . $sf . $h->{$sf};
397 # FIXME this should probably be in alphabetical order instead of hash order
405 Return all values in some field
409 TODO: order of values is probably same as in source data, need to investigate that
415 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
416 return unless (defined($rec) && defined($rec->{$f}));
417 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
418 if (ref($rec->{$f}) eq 'ARRAY') {
420 foreach my $h ( @{ $rec->{$f} } ) {
421 if (ref($h) eq 'HASH') {
422 push @out, ( _pack_subfields_hash( $h ) );
428 } elsif( defined($rec->{$f}) ) {
435 Return all values in specific field and subfield
443 return unless (defined($rec && $rec->{$f}));
445 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
447 if (ref($_->{$sf}) eq 'ARRAY') {
452 } grep { ref($_) eq 'HASH' && defined $_->{$sf} } @{ $rec->{$f} };
462 If rec() returns just single value, it will
463 return scalar, not array.
474 if ($#out == 0 && ! wantarray) {
485 Returns first value from field
488 $v = frec('200','a');
494 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
502 Check if first values from two fields are same or different
504 if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
507 # values are different
510 Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
511 could write something like:
513 if ( frec( '900','a' ) eq frec( '910','c' ) ) {
517 but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
518 in order to parse text and create invalid function C<eqfrec>.
523 my ( $f1,$sf1, $f2, $sf2 ) = @_;
524 return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
528 return ! frec_eq( @_ );
533 Apply regex to some or all values
535 @v = regex( 's/foo/bar/g', @v );
542 #warn "r: $r\n", dump(\@_);
546 push @out, $t if ($t && $t ne '');
553 Prefix all values with a string
555 @v = prefix( 'my_', @v );
561 return @_ unless defined( $p );
562 return map { $p . $_ } grep { defined($_) } @_;
567 suffix all values with a string
569 @v = suffix( '_my', @v );
575 return @_ unless defined( $s );
576 return map { $_ . $s } grep { defined($_) } @_;
581 surround all values with a two strings
583 @v = surround( 'prefix_', '_suffix', @v );
590 $p = '' unless defined( $p );
591 $s = '' unless defined( $s );
592 return map { $p . $_ . $s } grep { defined($_) } @_;
610 Consult lookup hashes for some value
614 'ffkk/peri/mfn'.rec('000')
616 'ffkk','peri','200-a-200-e',
618 first(rec(200,'a')).' '.first(rec('200','e'))
622 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
623 normal lookup definition in C<conf/lookup/something.pl> which looks like:
626 # which results to return from record recorded in lookup
627 sub { 'ffkk/peri/mfn' . rec('000') },
628 # from which database and input
630 # such that following values match
631 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
632 # if this part is missing, we will try to match same fields
633 # from lookup record and current one, or you can override
634 # which records to use from current record using
635 sub { rec('900','x') . ' ' . rec('900','y') },
638 You can think about this lookup as SQL (if that helps):
645 sub { filter from lookuped record }
647 sub { optional filter on current record }
654 my ($what, $database, $input, $key, $having) = @_;
656 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
658 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
659 return unless (defined($lookup->{$database}->{$input}->{$key}));
661 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
664 my @having = $having->();
666 warn "## having = ", dump( @having ) if ($debug > 2);
668 foreach my $h ( @having ) {
669 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
670 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
671 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
675 return unless ($mfns);
677 my @mfns = sort keys %$mfns;
679 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
684 foreach my $mfn (@mfns) {
685 $rec = $load_row_coderef->( $database, $input, $mfn );
687 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
689 my @vals = $what->();
691 push @out, ( @vals );
693 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
696 # if (ref($lookup->{$k}) eq 'ARRAY') {
697 # return @{ $lookup->{$k} };
699 # return $lookup->{$k};
704 warn "## lookup returns = ", dump(@out), $/ if ($debug);
713 =head2 save_into_lookup
715 Save value into lookup. It associates current database, input
716 and specific keys with one or more values which will be
719 MFN will be extracted from first occurence current of field 000
720 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
722 my $nr = save_into_lookup($database,$input,$key,sub {
723 # code which produce one or more values
726 It returns number of items saved.
728 This function shouldn't be called directly, it's called from code created by
733 sub save_into_lookup {
734 my ($database,$input,$key,$coderef) = @_;
735 die "save_into_lookup needs database" unless defined($database);
736 die "save_into_lookup needs input" unless defined($input);
737 die "save_into_lookup needs key" unless defined($key);
738 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
740 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
743 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
744 defined($config->{_mfn}) ? $config->{_mfn} :
745 die "mfn not defined or zero";
749 foreach my $v ( $coderef->() ) {
750 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
751 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
760 Consult config values stored in C<config.yml>
762 # return database code (key under databases in yaml)
763 $database_code = config(); # use _ from hash
764 $database_name = config('name');
765 $database_input_name = config('input name');
767 Up to three levels are supported.
772 return unless ($config);
780 warn "### getting config($p)\n" if ($debug > 1);
782 my @p = split(/\s+/,$p);
784 $v = $config->{ '_' }; # special, database code
787 my $c = dclone( $config );
790 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
791 if (ref($c) eq 'ARRAY') {
793 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
797 if (! defined($c->{$k}) ) {
808 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
809 warn "config( '$p' ) is empty\n" if (! $v);
816 Returns unique id of this record
820 Returns C<42/2> for 2nd occurence of MFN 42.
825 my $mfn = $config->{_mfn} || die "no _mfn in config data";
826 return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() || '' );
831 Joins walues with some delimiter
833 $v = join_with(", ", @v);
839 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
840 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
841 return '' unless defined($v);
847 Split record subfield on some regex and take one of parts out
849 $a_before_semi_column =
850 split_rec_on('200','a', /\s*;\s*/, $part);
852 C<$part> is optional number of element. First element is
855 If there is no C<$part> parameter or C<$part> is 0, this function will
856 return all values produced by splitting.
861 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
863 my ($fld, $sf, $regex, $part) = @_;
864 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
866 my @r = rec( $fld, $sf );
868 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
870 return '' if ( ! defined($v) || $v =~ /^\s*$/);
872 my @s = split( $regex, $v );
873 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
874 if ($part && $part > 0) {
875 return $s[ $part - 1 ];
885 set( key => 'value' );
891 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
902 my $k = shift || return;
904 warn "## get $k = ", dump( $v ), $/ if ( $debug );
910 if ( count( @result ) == 1 ) {
911 # do something if only 1 result is there
917 warn "## count ",dump(@_),$/ if ( $debug );