1 package WebPAC::Normalize;
11 search_display search display sorted
15 regex prefix suffix surround
16 first lookup join_with
29 #use base qw/WebPAC::Common/;
30 use Data::Dump qw/dump/;
38 use WebPAC::Normalize::ISBN;
39 push @EXPORT, ( 'isbn_10', 'isbn_13' );
41 use WebPAC::Normalize::MARC;
43 marc marc_indicators marc_repeatable_subfield
44 marc_compose marc_leader marc_fixed
45 marc_duplicate marc_remove marc_count
52 WebPAC::Normalize - describe normalisaton rules using sets
56 our $VERSION = '0.36';
60 This module uses C<conf/normalize/*.pl> files to perform normalisation
61 from input records using perl functions which are specialized for set
64 Sets are implemented as arrays, and normalisation file is valid perl, which
65 means that you check it's validity before running WebPAC using
66 C<perl -c normalize.pl>.
68 Normalisation can generate multiple output normalized data. For now, supported output
69 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
74 Functions which start with C<_> are private and used by WebPAC internally.
75 All other functions are available for use within normalisation rules.
81 my $ds = WebPAC::Normalize::data_structure(
82 lookup => $lookup_hash,
84 rules => $normalize_pl_config,
85 marc_encoding => 'utf-8',
87 load_row_coderef => sub {
88 my ($database,$input,$mfn) = @_;
89 $store->load_row( database => $database, input => $input, id => $mfn );
93 Options C<row>, C<rules> and C<log> are mandatory while all
96 C<load_row_coderef> is closure only used when executing lookups, so they will
97 die if it's not defined.
99 This function will B<die> if normalizastion can't be evaled.
101 Since this function isn't exported you have to call it with
102 C<WebPAC::Normalize::data_structure>.
106 my $load_row_coderef;
111 die "need row argument" unless ($arg->{row});
112 die "need normalisation argument" unless ($arg->{rules});
114 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
115 _set_ds( $arg->{row} );
116 _set_config( $arg->{config} ) if defined($arg->{config});
117 _clean_ds( %{ $arg } );
118 $load_row_coderef = $arg->{load_row_coderef};
121 no warnings 'redefine';
122 eval "$arg->{rules};";
123 die "error evaling $arg->{rules}: $@\n" if ($@);
130 Set current record hash
139 $rec = shift or die "no record hash";
140 $WebPAC::Normalize::MARC::rec = $rec;
145 my $rec = _get_rec();
149 sub _get_rec { $rec };
153 Set current config hash
155 _set_config( $config );
163 Code of current database
181 Return hash formatted as data structure
190 #warn "## out = ",dump($out);
196 Clean data structure hash for next record
205 WebPAC::Normalize::MARC::_clean();
210 Set current lookup hash
212 _set_lookup( $lookup );
224 Get current lookup hash
226 my $lookup = _get_lookup();
236 Setup code reference which will return L<data_structure> from
240 my ($database,$input,$mfn) = @_;
241 $store->load_row( database => $database, input => $input, id => $mfn );
248 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
250 $load_row_coderef = $coderef;
255 Change level of debug warnings
263 return $debug unless defined($l);
264 warn "debug level $l",$/ if ($l > 0);
266 $WebPAC::Normalize::MARC::debug = $debug;
269 =head1 Functions to create C<data_structure>
271 Those functions generally have to first in your normalization file.
275 Generic way to set values for some name
277 to('field-name', 'name-value' => rec('200','a') );
279 There are many helpers defined below which might be easier to use.
284 my $type = shift or confess "need type -- BUG?";
285 my $name = shift or confess "needs name as first argument";
286 my @o = grep { defined($_) && $_ ne '' } @_;
288 $out->{$name}->{$type} = \@o;
291 =head2 search_display
293 Define output for L<search> and L<display> at the same time
295 search_display('Title', rec('200','a') );
300 my $name = shift or die "search_display needs name as first argument";
301 my @o = grep { defined($_) && $_ ne '' } @_;
303 $out->{$name}->{search} = \@o;
304 $out->{$name}->{display} = \@o;
309 Old name for L<search_display>, it will probably be removed at one point.
314 search_display( @_ );
319 Define output just for I<display>
321 @v = display('Title', rec('200','a') );
325 sub display { to( 'display', @_ ) }
329 Prepare values just for I<search>
331 @v = search('Title', rec('200','a') );
335 sub search { to( 'search', @_ ) }
339 Insert into lists which will be automatically sorted
341 sorted('Title', rec('200','a') );
345 sub sorted { to( 'sorted', @_ ) }
348 =head1 Functions to extract data from input
350 This function should be used inside functions to create C<data_structure> described
353 =head2 _pack_subfields_hash
355 @subfields = _pack_subfields_hash( $h );
356 $subfields = _pack_subfields_hash( $h, 1 );
358 Return each subfield value in array or pack them all together and return scalar
359 with subfields (denoted by C<^>) and values.
363 sub _pack_subfields_hash {
365 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
367 my ($h,$include_subfields) = @_;
369 # sanity and ease of use
370 return $h if (ref($h) ne 'HASH');
372 if ( defined($h->{subfields}) ) {
373 my $sfs = delete $h->{subfields} || die "no subfields?";
376 my $sf = shift @$sfs;
377 push @out, '^' . $sf if ($include_subfields);
379 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
380 # single element subfields are not arrays
381 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
383 push @out, $h->{$sf};
385 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
386 push @out, $h->{$sf}->[$o];
389 if ($include_subfields) {
390 return join('', @out);
395 if ($include_subfields) {
397 foreach my $sf (sort keys %$h) {
398 if (ref($h->{$sf}) eq 'ARRAY') {
399 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
401 $out .= '^' . $sf . $h->{$sf};
406 # FIXME this should probably be in alphabetical order instead of hash order
414 Return all values in some field
418 TODO: order of values is probably same as in source data, need to investigate that
424 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
425 return unless (defined($rec) && defined($rec->{$f}));
426 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
427 if (ref($rec->{$f}) eq 'ARRAY') {
429 foreach my $h ( @{ $rec->{$f} } ) {
430 if (ref($h) eq 'HASH') {
431 push @out, ( _pack_subfields_hash( $h ) );
437 } elsif( defined($rec->{$f}) ) {
444 Return all values in specific field and subfield
452 return unless (defined($rec && $rec->{$f}));
454 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
456 if (ref($_->{$sf}) eq 'ARRAY') {
461 } grep { ref($_) eq 'HASH' && defined $_->{$sf} } @{ $rec->{$f} };
471 If rec() returns just single value, it will
472 return scalar, not array.
483 if ($#out == 0 && ! wantarray) {
494 Returns first value from field
497 $v = frec('200','a');
503 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
511 Check if first values from two fields are same or different
513 if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
516 # values are different
519 Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
520 could write something like:
522 if ( frec( '900','a' ) eq frec( '910','c' ) ) {
526 but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
527 in order to parse text and create invalid function C<eqfrec>.
532 my ( $f1,$sf1, $f2, $sf2 ) = @_;
533 return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
537 return ! frec_eq( @_ );
542 Apply regex to some or all values
544 @v = regex( 's/foo/bar/g', @v );
551 #warn "r: $r\n", dump(\@_);
555 push @out, $t if ($t && $t ne '');
562 Prefix all values with a string
564 @v = prefix( 'my_', @v );
570 return @_ unless defined( $p );
571 return map { $p . $_ } grep { defined($_) } @_;
576 suffix all values with a string
578 @v = suffix( '_my', @v );
584 return @_ unless defined( $s );
585 return map { $_ . $s } grep { defined($_) } @_;
590 surround all values with a two strings
592 @v = surround( 'prefix_', '_suffix', @v );
599 $p = '' unless defined( $p );
600 $s = '' unless defined( $s );
601 return map { $p . $_ . $s } grep { defined($_) } @_;
619 Consult lookup hashes for some value
623 'ffkk/peri/mfn'.rec('000')
625 'ffkk','peri','200-a-200-e',
627 first(rec(200,'a')).' '.first(rec('200','e'))
631 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
632 normal lookup definition in C<conf/lookup/something.pl> which looks like:
635 # which results to return from record recorded in lookup
636 sub { 'ffkk/peri/mfn' . rec('000') },
637 # from which database and input
639 # such that following values match
640 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
641 # if this part is missing, we will try to match same fields
642 # from lookup record and current one, or you can override
643 # which records to use from current record using
644 sub { rec('900','x') . ' ' . rec('900','y') },
647 You can think about this lookup as SQL (if that helps):
654 sub { filter from lookuped record }
656 sub { optional filter on current record }
663 my ($what, $database, $input, $key, $having) = @_;
665 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
667 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
668 return unless (defined($lookup->{$database}->{$input}->{$key}));
670 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
673 my @having = $having->();
675 warn "## having = ", dump( @having ) if ($debug > 2);
677 foreach my $h ( @having ) {
678 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
679 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
680 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
684 return unless ($mfns);
686 my @mfns = sort keys %$mfns;
688 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
693 foreach my $mfn (@mfns) {
694 $rec = $load_row_coderef->( $database, $input, $mfn );
696 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
698 my @vals = $what->();
700 push @out, ( @vals );
702 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
705 # if (ref($lookup->{$k}) eq 'ARRAY') {
706 # return @{ $lookup->{$k} };
708 # return $lookup->{$k};
713 warn "## lookup returns = ", dump(@out), $/ if ($debug);
722 =head2 save_into_lookup
724 Save value into lookup. It associates current database, input
725 and specific keys with one or more values which will be
728 MFN will be extracted from first occurence current of field 000
729 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
731 my $nr = save_into_lookup($database,$input,$key,sub {
732 # code which produce one or more values
735 It returns number of items saved.
737 This function shouldn't be called directly, it's called from code created by
742 sub save_into_lookup {
743 my ($database,$input,$key,$coderef) = @_;
744 die "save_into_lookup needs database" unless defined($database);
745 die "save_into_lookup needs input" unless defined($input);
746 die "save_into_lookup needs key" unless defined($key);
747 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
749 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
752 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
753 defined($config->{_mfn}) ? $config->{_mfn} :
754 die "mfn not defined or zero";
758 foreach my $v ( $coderef->() ) {
759 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
760 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
769 Consult config values stored in C<config.yml>
771 # return database code (key under databases in yaml)
772 $database_code = config(); # use _ from hash
773 $database_name = config('name');
774 $database_input_name = config('input name');
776 Up to three levels are supported.
781 return unless ($config);
789 warn "### getting config($p)\n" if ($debug > 1);
791 my @p = split(/\s+/,$p);
793 $v = $config->{ '_' }; # special, database code
796 my $c = dclone( $config );
799 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
800 if (ref($c) eq 'ARRAY') {
802 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
806 if (! defined($c->{$k}) ) {
817 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
818 warn "config( '$p' ) is empty\n" if (! $v);
825 Returns unique id of this record
829 Returns C<42/2> for 2nd occurence of MFN 42.
834 my $mfn = $config->{_mfn} || die "no _mfn in config data";
835 return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() || '' );
840 Joins walues with some delimiter
842 $v = join_with(", ", @v);
848 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
849 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
850 return '' unless defined($v);
856 Split record subfield on some regex and take one of parts out
858 $a_before_semi_column =
859 split_rec_on('200','a', /\s*;\s*/, $part);
861 C<$part> is optional number of element. First element is
864 If there is no C<$part> parameter or C<$part> is 0, this function will
865 return all values produced by splitting.
870 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
872 my ($fld, $sf, $regex, $part) = @_;
873 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
875 my @r = rec( $fld, $sf );
877 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
879 return '' if ( ! defined($v) || $v =~ /^\s*$/);
881 my @s = split( $regex, $v );
882 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
883 if ($part && $part > 0) {
884 return $s[ $part - 1 ];
894 set( key => 'value' );
900 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
911 my $k = shift || return;
913 warn "## get $k = ", dump( $v ), $/ if ( $debug );
919 if ( count( @result ) == 1 ) {
920 # do something if only 1 result is there
926 warn "## count ",dump(@_),$/ if ( $debug );