1 package WebPAC::Normalize;
10 search_display search display sorted
14 regex prefix suffix surround
15 first lookup join_with
28 #use base qw/WebPAC::Common/;
29 use Data::Dump qw/dump/;
37 use WebPAC::Normalize::ISBN;
38 push @EXPORT, ( 'isbn_10', 'isbn_13' );
40 use WebPAC::Normalize::MARC;
42 marc marc_indicators marc_repeatable_subfield
43 marc_compose marc_leader marc_fixed
44 marc_duplicate marc_remove marc_count
51 WebPAC::Normalize - describe normalisaton rules using sets
55 our $VERSION = '0.35';
59 This module uses C<conf/normalize/*.pl> files to perform normalisation
60 from input records using perl functions which are specialized for set
63 Sets are implemented as arrays, and normalisation file is valid perl, which
64 means that you check it's validity before running WebPAC using
65 C<perl -c normalize.pl>.
67 Normalisation can generate multiple output normalized data. For now, supported output
68 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
73 Functions which start with C<_> are private and used by WebPAC internally.
74 All other functions are available for use within normalisation rules.
80 my $ds = WebPAC::Normalize::data_structure(
81 lookup => $lookup_hash,
83 rules => $normalize_pl_config,
84 marc_encoding => 'utf-8',
86 load_row_coderef => sub {
87 my ($database,$input,$mfn) = @_;
88 $store->load_row( database => $database, input => $input, id => $mfn );
92 Options C<row>, C<rules> and C<log> are mandatory while all
95 C<load_row_coderef> is closure only used when executing lookups, so they will
96 die if it's not defined.
98 This function will B<die> if normalizastion can't be evaled.
100 Since this function isn't exported you have to call it with
101 C<WebPAC::Normalize::data_structure>.
105 my $load_row_coderef;
110 die "need row argument" unless ($arg->{row});
111 die "need normalisation argument" unless ($arg->{rules});
113 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
114 _set_ds( $arg->{row} );
115 _set_config( $arg->{config} ) if defined($arg->{config});
116 _clean_ds( %{ $arg } );
117 $load_row_coderef = $arg->{load_row_coderef};
120 no warnings 'redefine';
121 eval "$arg->{rules};";
122 die "error evaling $arg->{rules}: $@\n" if ($@);
129 Set current record hash
138 $rec = shift or die "no record hash";
139 $WebPAC::Normalize::MARC::rec = $rec;
144 my $rec = _get_rec();
148 sub _get_rec { $rec };
152 Set current config hash
154 _set_config( $config );
162 Code of current database
180 Return hash formatted as data structure
189 #warn "## out = ",dump($out);
195 Clean data structure hash for next record
204 WebPAC::Normalize::MARC::_clean();
209 Set current lookup hash
211 _set_lookup( $lookup );
223 Get current lookup hash
225 my $lookup = _get_lookup();
235 Setup code reference which will return L<data_structure> from
239 my ($database,$input,$mfn) = @_;
240 $store->load_row( database => $database, input => $input, id => $mfn );
247 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
249 $load_row_coderef = $coderef;
254 Change level of debug warnings
262 return $debug unless defined($l);
263 warn "debug level $l",$/ if ($l > 0);
265 $WebPAC::Normalize::MARC::debug = $debug;
266 warn "#### MARC::debug = ",dump($WebPAC::Normalize::MARC::debug);
269 =head1 Functions to create C<data_structure>
271 Those functions generally have to first in your normalization file.
273 =head2 search_display
275 Define output for L<search> and L<display> at the same time
277 search_display('Title', rec('200','a') );
283 my $name = shift or die "search_display needs name as first argument";
284 my @o = grep { defined($_) && $_ ne '' } @_;
286 $out->{$name}->{search} = \@o;
287 $out->{$name}->{display} = \@o;
292 Old name for L<search_display>, but supported
297 search_display( @_ );
302 Define output just for I<display>
304 @v = display('Title', rec('200','a') );
309 my $type = shift or confess "need type -- BUG?";
310 my $name = shift or confess "needs name as first argument";
311 my @o = grep { defined($_) && $_ ne '' } @_;
313 $out->{$name}->{$type} = \@o;
316 sub display { _field( 'display', @_ ) }
320 Prepare values just for I<search>
322 @v = search('Title', rec('200','a') );
326 sub search { _field( 'search', @_ ) }
330 Insert into lists which will be automatically sorted
332 sorted('Title', rec('200','a') );
336 sub sorted { _field( 'sorted', @_ ) }
340 =head1 Functions to extract data from input
342 This function should be used inside functions to create C<data_structure> described
345 =head2 _pack_subfields_hash
347 @subfields = _pack_subfields_hash( $h );
348 $subfields = _pack_subfields_hash( $h, 1 );
350 Return each subfield value in array or pack them all together and return scalar
351 with subfields (denoted by C<^>) and values.
355 sub _pack_subfields_hash {
357 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
359 my ($h,$include_subfields) = @_;
361 # sanity and ease of use
362 return $h if (ref($h) ne 'HASH');
364 if ( defined($h->{subfields}) ) {
365 my $sfs = delete $h->{subfields} || die "no subfields?";
368 my $sf = shift @$sfs;
369 push @out, '^' . $sf if ($include_subfields);
371 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
372 # single element subfields are not arrays
373 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
375 push @out, $h->{$sf};
377 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
378 push @out, $h->{$sf}->[$o];
381 if ($include_subfields) {
382 return join('', @out);
387 if ($include_subfields) {
389 foreach my $sf (sort keys %$h) {
390 if (ref($h->{$sf}) eq 'ARRAY') {
391 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
393 $out .= '^' . $sf . $h->{$sf};
398 # FIXME this should probably be in alphabetical order instead of hash order
406 Return all values in some field
410 TODO: order of values is probably same as in source data, need to investigate that
416 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
417 return unless (defined($rec) && defined($rec->{$f}));
418 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
419 if (ref($rec->{$f}) eq 'ARRAY') {
421 foreach my $h ( @{ $rec->{$f} } ) {
422 if (ref($h) eq 'HASH') {
423 push @out, ( _pack_subfields_hash( $h ) );
429 } elsif( defined($rec->{$f}) ) {
436 Return all values in specific field and subfield
444 return unless (defined($rec && $rec->{$f}));
446 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
448 if (ref($_->{$sf}) eq 'ARRAY') {
453 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
463 If rec() returns just single value, it will
464 return scalar, not array.
475 if ($#out == 0 && ! wantarray) {
486 Returns first value from field
489 $v = frec('200','a');
495 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
503 Check if first values from two fields are same or different
505 if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
508 # values are different
511 Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
512 could write something like:
514 if ( frec( '900','a' ) eq frec( '910','c' ) ) {
518 but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
519 in order to parse text and create invalid function C<eqfrec>.
524 my ( $f1,$sf1, $f2, $sf2 ) = @_;
525 return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
529 return ! frec_eq( @_ );
534 Apply regex to some or all values
536 @v = regex( 's/foo/bar/g', @v );
543 #warn "r: $r\n", dump(\@_);
547 push @out, $t if ($t && $t ne '');
554 Prefix all values with a string
556 @v = prefix( 'my_', @v );
562 return @_ unless defined( $p );
563 return map { $p . $_ } grep { defined($_) } @_;
568 suffix all values with a string
570 @v = suffix( '_my', @v );
576 return @_ unless defined( $s );
577 return map { $_ . $s } grep { defined($_) } @_;
582 surround all values with a two strings
584 @v = surround( 'prefix_', '_suffix', @v );
591 $p = '' unless defined( $p );
592 $s = '' unless defined( $s );
593 return map { $p . $_ . $s } grep { defined($_) } @_;
611 Consult lookup hashes for some value
615 'ffkk/peri/mfn'.rec('000')
617 'ffkk','peri','200-a-200-e',
619 first(rec(200,'a')).' '.first(rec('200','e'))
623 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
624 normal lookup definition in C<conf/lookup/something.pl> which looks like:
627 # which results to return from record recorded in lookup
628 sub { 'ffkk/peri/mfn' . rec('000') },
629 # from which database and input
631 # such that following values match
632 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
633 # if this part is missing, we will try to match same fields
634 # from lookup record and current one, or you can override
635 # which records to use from current record using
636 sub { rec('900','x') . ' ' . rec('900','y') },
639 You can think about this lookup as SQL (if that helps):
646 sub { filter from lookuped record }
648 sub { optional filter on current record }
655 my ($what, $database, $input, $key, $having) = @_;
657 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
659 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
660 return unless (defined($lookup->{$database}->{$input}->{$key}));
662 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
665 my @having = $having->();
667 warn "## having = ", dump( @having ) if ($debug > 2);
669 foreach my $h ( @having ) {
670 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
671 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
672 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
676 return unless ($mfns);
678 my @mfns = sort keys %$mfns;
680 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
685 foreach my $mfn (@mfns) {
686 $rec = $load_row_coderef->( $database, $input, $mfn );
688 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
690 my @vals = $what->();
692 push @out, ( @vals );
694 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
697 # if (ref($lookup->{$k}) eq 'ARRAY') {
698 # return @{ $lookup->{$k} };
700 # return $lookup->{$k};
705 warn "## lookup returns = ", dump(@out), $/ if ($debug);
714 =head2 save_into_lookup
716 Save value into lookup. It associates current database, input
717 and specific keys with one or more values which will be
720 MFN will be extracted from first occurence current of field 000
721 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
723 my $nr = save_into_lookup($database,$input,$key,sub {
724 # code which produce one or more values
727 It returns number of items saved.
729 This function shouldn't be called directly, it's called from code created by
734 sub save_into_lookup {
735 my ($database,$input,$key,$coderef) = @_;
736 die "save_into_lookup needs database" unless defined($database);
737 die "save_into_lookup needs input" unless defined($input);
738 die "save_into_lookup needs key" unless defined($key);
739 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
741 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
744 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
745 defined($config->{_mfn}) ? $config->{_mfn} :
746 die "mfn not defined or zero";
750 foreach my $v ( $coderef->() ) {
751 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
752 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
761 Consult config values stored in C<config.yml>
763 # return database code (key under databases in yaml)
764 $database_code = config(); # use _ from hash
765 $database_name = config('name');
766 $database_input_name = config('input name');
768 Up to three levels are supported.
773 return unless ($config);
781 warn "### getting config($p)\n" if ($debug > 1);
783 my @p = split(/\s+/,$p);
785 $v = $config->{ '_' }; # special, database code
788 my $c = dclone( $config );
791 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
792 if (ref($c) eq 'ARRAY') {
794 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
798 if (! defined($c->{$k}) ) {
809 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
810 warn "config( '$p' ) is empty\n" if (! $v);
817 Returns unique id of this record
821 Returns C<42/2> for 2nd occurence of MFN 42.
826 my $mfn = $config->{_mfn} || die "no _mfn in config data";
827 return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() || '' );
832 Joins walues with some delimiter
834 $v = join_with(", ", @v);
840 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
841 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
842 return '' unless defined($v);
848 Split record subfield on some regex and take one of parts out
850 $a_before_semi_column =
851 split_rec_on('200','a', /\s*;\s*/, $part);
853 C<$part> is optional number of element. First element is
856 If there is no C<$part> parameter or C<$part> is 0, this function will
857 return all values produced by splitting.
862 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
864 my ($fld, $sf, $regex, $part) = @_;
865 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
867 my @r = rec( $fld, $sf );
869 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
871 return '' if ( ! defined($v) || $v =~ /^\s*$/);
873 my @s = split( $regex, $v );
874 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
875 if ($part && $part > 0) {
876 return $s[ $part - 1 ];
886 set( key => 'value' );
892 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
903 my $k = shift || return;
905 warn "## get $k = ", dump( $v ), $/ if ( $debug );
911 if ( count( @result ) == 1 ) {
912 # do something if only 1 result is there
918 warn "## count ",dump(@_),$/ if ( $debug );