1 package WebPAC::Normalize;
11 search_display search display sorted
15 regex prefix suffix surround
16 first lookup join_with
32 #use base qw/WebPAC::Common/;
33 use Data::Dump qw/dump/;
41 use WebPAC::Normalize::ISBN;
42 push @EXPORT, ( 'isbn_10', 'isbn_13' );
44 use WebPAC::Normalize::MARC;
46 marc marc_indicators marc_repeatable_subfield
47 marc_compose marc_leader marc_fixed
48 marc_duplicate marc_remove marc_count
53 use Storable qw/dclone/;
57 WebPAC::Normalize - describe normalisaton rules using sets
61 our $VERSION = '0.36';
65 This module uses C<conf/normalize/*.pl> files to perform normalisation
66 from input records using perl functions which are specialized for set
69 Sets are implemented as arrays, and normalisation file is valid perl, which
70 means that you check it's validity before running WebPAC using
71 C<perl -c normalize.pl>.
73 Normalisation can generate multiple output normalized data. For now, supported output
74 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
79 Functions which start with C<_> are private and used by WebPAC internally.
80 All other functions are available for use within normalisation rules.
86 my $ds = WebPAC::Normalize::data_structure(
87 lookup => $lookup_hash,
89 rules => $normalize_pl_config,
90 marc_encoding => 'utf-8',
92 load_row_coderef => sub {
93 my ($database,$input,$mfn) = @_;
94 $store->load_row( database => $database, input => $input, id => $mfn );
98 Options C<row>, C<rules> and C<log> are mandatory while all
101 C<load_row_coderef> is closure only used when executing lookups, so they will
102 die if it's not defined.
104 This function will B<die> if normalizastion can't be evaled.
106 Since this function isn't exported you have to call it with
107 C<WebPAC::Normalize::data_structure>.
111 my $load_row_coderef;
116 die "need row argument" unless ($arg->{row});
117 die "need normalisation argument" unless ($arg->{rules});
119 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
120 _set_ds( $arg->{row} );
121 _set_config( $arg->{config} ) if defined($arg->{config});
122 _clean_ds( %{ $arg } );
123 $load_row_coderef = $arg->{load_row_coderef};
127 eval "$arg->{rules};";
128 die "error evaling [$@] using rules " . $arg->{rules} . "\n" if ($@);
135 Set current record hash
144 $rec = shift or die "no record hash";
145 $WebPAC::Normalize::MARC::rec = $rec;
150 my $rec = _get_rec();
154 sub _get_rec { $rec };
162 sub _set_rec { $rec = $_[0] }
166 Set current config hash
168 _set_config( $config );
176 Code of current database
194 Return hash formatted as data structure
203 #warn "## out = ",dump($out);
209 Clean data structure hash for next record
218 WebPAC::Normalize::MARC::_clean();
223 Set current lookup hash
225 _set_lookup( $lookup );
237 Get current lookup hash
239 my $lookup = _get_lookup();
249 Setup code reference which will return L<data_structure> from
253 my ($database,$input,$mfn) = @_;
254 $store->load_row( database => $database, input => $input, id => $mfn );
261 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
263 $load_row_coderef = $coderef;
268 Change level of debug warnings
276 return $debug unless defined($l);
277 warn "debug level $l",$/ if ($l > 0);
279 $WebPAC::Normalize::MARC::debug = $debug;
282 =head1 Functions to create C<data_structure>
284 Those functions generally have to first in your normalization file.
288 Generic way to set values for some name
290 to('field-name', 'name-value' => rec('200','a') );
292 There are many helpers defined below which might be easier to use.
297 my $type = shift or confess "need type -- BUG?";
298 my $name = shift or confess "needs name as first argument";
299 my @o = grep { defined($_) && $_ ne '' } @_;
301 $out->{$name}->{$type} = \@o;
304 =head2 search_display
306 Define output for L<search> and L<display> at the same time
308 search_display('Title', rec('200','a') );
313 my $name = shift or die "search_display needs name as first argument";
314 my @o = grep { defined($_) && $_ ne '' } @_;
316 $out->{$name}->{search} = \@o;
317 $out->{$name}->{display} = \@o;
322 Old name for L<search_display>, it will probably be removed at one point.
327 search_display( @_ );
332 Define output just for I<display>
334 @v = display('Title', rec('200','a') );
338 sub display { to( 'display', @_ ) }
342 Prepare values just for I<search>
344 @v = search('Title', rec('200','a') );
348 sub search { to( 'search', @_ ) }
352 Insert into lists which will be automatically sorted
354 sorted('Title', rec('200','a') );
358 sub sorted { to( 'sorted', @_ ) }
362 Insert new row of data into output module
364 row( column => 'foo', column2 => 'bar' );
368 use Data::Dump qw/dump/;
371 die "array doesn't have odd number of elements but $#_: ",dump( @_ ) if $#_ % 2 == 1;
372 my $table = shift @_;
373 push @{ $out->{'_rows'}->{$table} }, {@_};
377 =head1 Functions to extract data from input
379 This function should be used inside functions to create C<data_structure> described
382 =head2 _pack_subfields_hash
384 @subfields = _pack_subfields_hash( $h );
385 $subfields = _pack_subfields_hash( $h, 1 );
387 Return each subfield value in array or pack them all together and return scalar
388 with subfields (denoted by C<^>) and values.
392 sub _pack_subfields_hash {
394 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
396 my ($hash,$include_subfields) = @_;
398 # sanity and ease of use
399 return $hash if (ref($hash) ne 'HASH');
401 my $h = dclone( $hash );
403 if ( defined($h->{subfields}) ) {
404 my $sfs = delete $h->{subfields} || die "no subfields?";
407 my $sf = shift @$sfs;
408 push @out, '^' . $sf if ($include_subfields);
410 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
411 # single element subfields are not arrays
412 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
414 push @out, $h->{$sf};
416 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
417 push @out, $h->{$sf}->[$o];
420 if ($include_subfields) {
421 return join('', @out);
426 if ($include_subfields) {
428 foreach my $sf (sort keys %$h) {
429 if (ref($h->{$sf}) eq 'ARRAY') {
430 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
432 $out .= '^' . $sf . $h->{$sf};
437 # FIXME this should probably be in alphabetical order instead of hash order
445 Return all values in some field
449 TODO: order of values is probably same as in source data, need to investigate that
455 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
456 return unless (defined($rec) && defined($rec->{$f}));
457 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
458 if (ref($rec->{$f}) eq 'ARRAY') {
460 foreach my $h ( @{ $rec->{$f} } ) {
461 if (ref($h) eq 'HASH') {
462 push @out, ( _pack_subfields_hash( $h ) );
468 } elsif( defined($rec->{$f}) ) {
475 Return all values in specific field and subfield
483 return unless (defined($rec && $rec->{$f}));
485 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
487 if (ref($_->{$sf}) eq 'ARRAY') {
492 } grep { ref($_) eq 'HASH' && defined $_->{$sf} } @{ $rec->{$f} };
502 If rec() returns just single value, it will
503 return scalar, not array.
514 if ($#out == 0 && ! wantarray) {
525 Returns first value from field
528 $v = frec('200','a');
534 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
542 Check if first values from two fields are same or different
544 if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
547 # values are different
550 Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
551 could write something like:
553 if ( frec( '900','a' ) eq frec( '910','c' ) ) {
557 but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
558 in order to parse text and create invalid function C<eqfrec>.
563 my ( $f1,$sf1, $f2, $sf2 ) = @_;
564 return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
568 return ! frec_eq( @_ );
573 Apply regex to some or all values
575 @v = regex( 's/foo/bar/g', @v );
582 #warn "r: $r\n", dump(\@_);
586 push @out, $t if ($t && $t ne '');
593 Prefix all values with a string
595 @v = prefix( 'my_', @v );
601 return @_ unless defined( $p );
602 return map { $p . $_ } grep { defined($_) } @_;
607 suffix all values with a string
609 @v = suffix( '_my', @v );
615 return @_ unless defined( $s );
616 return map { $_ . $s } grep { defined($_) } @_;
621 surround all values with a two strings
623 @v = surround( 'prefix_', '_suffix', @v );
630 $p = '' unless defined( $p );
631 $s = '' unless defined( $s );
632 return map { $p . $_ . $s } grep { defined($_) } @_;
650 Consult lookup hashes for some value
654 'ffkk/peri/mfn'.rec('000')
656 'ffkk','peri','200-a-200-e',
658 first(rec(200,'a')).' '.first(rec('200','e'))
662 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
663 normal lookup definition in C<conf/lookup/something.pl> which looks like:
666 # which results to return from record recorded in lookup
667 sub { 'ffkk/peri/mfn' . rec('000') },
668 # from which database and input
670 # such that following values match
671 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
672 # if this part is missing, we will try to match same fields
673 # from lookup record and current one, or you can override
674 # which records to use from current record using
675 sub { rec('900','x') . ' ' . rec('900','y') },
678 You can think about this lookup as SQL (if that helps):
685 sub { filter from lookuped record }
687 sub { optional filter on current record }
694 my ($what, $database, $input, $key, $having) = @_;
696 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
698 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
699 return unless (defined($lookup->{$database}->{$input}->{$key}));
701 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
704 my @having = $having->();
706 warn "## having = ", dump( @having ) if ($debug > 2);
708 foreach my $h ( @having ) {
709 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
710 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
711 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
715 return unless ($mfns);
717 my @mfns = sort keys %$mfns;
719 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
724 foreach my $mfn (@mfns) {
725 $rec = $load_row_coderef->( $database, $input, $mfn );
727 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
729 my @vals = $what->();
731 push @out, ( @vals );
733 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
736 # if (ref($lookup->{$k}) eq 'ARRAY') {
737 # return @{ $lookup->{$k} };
739 # return $lookup->{$k};
744 warn "## lookup returns = ", dump(@out), $/ if ($debug);
753 =head2 save_into_lookup
755 Save value into lookup. It associates current database, input
756 and specific keys with one or more values which will be
759 MFN will be extracted from first occurence current of field 000
760 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
762 my $nr = save_into_lookup($database,$input,$key,sub {
763 # code which produce one or more values
766 It returns number of items saved.
768 This function shouldn't be called directly, it's called from code created by
773 sub save_into_lookup {
774 my ($database,$input,$key,$coderef) = @_;
775 die "save_into_lookup needs database" unless defined($database);
776 die "save_into_lookup needs input" unless defined($input);
777 die "save_into_lookup needs key" unless defined($key);
778 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
780 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
783 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
784 defined($config->{_mfn}) ? $config->{_mfn} :
785 die "mfn not defined or zero";
789 foreach my $v ( $coderef->() ) {
790 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
791 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
800 Consult config values stored in C<config.yml>
802 # return database code (key under databases in yaml)
803 $database_code = config(); # use _ from hash
804 $database_name = config('name');
805 $database_input_name = config('input name');
807 Up to three levels are supported.
812 return unless ($config);
820 warn "### getting config($p)\n" if ($debug > 1);
822 my @p = split(/\s+/,$p);
824 $v = $config->{ '_' }; # special, database code
827 my $c = dclone( $config );
830 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
831 if (ref($c) eq 'ARRAY') {
833 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
837 if (! defined($c->{$k}) ) {
848 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
849 warn "config( '$p' ) is empty\n" if (! $v);
856 Returns unique id of this record
860 Returns C<42/2> for 2nd occurence of MFN 42.
865 my $mfn = $config->{_mfn} || die "no _mfn in config data";
866 return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() || '' );
871 Joins walues with some delimiter
873 $v = join_with(", ", @v);
879 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
880 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
881 return '' unless defined($v);
887 Split record subfield on some regex and take one of parts out
889 $a_before_semi_column =
890 split_rec_on('200','a', /\s*;\s*/, $part);
892 C<$part> is optional number of element. First element is
895 If there is no C<$part> parameter or C<$part> is 0, this function will
896 return all values produced by splitting.
901 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
903 my ($fld, $sf, $regex, $part) = @_;
904 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
906 my @r = rec( $fld, $sf );
908 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
910 return '' if ( ! defined($v) || $v =~ /^\s*$/);
912 my @s = split( $regex, $v );
913 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
914 if ($part && $part > 0) {
915 return $s[ $part - 1 ];
925 set( key => 'value' );
931 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
942 my $k = shift || return;
944 warn "## get $k = ", dump( $v ), $/ if ( $debug );
950 if ( count( @result ) == 1 ) {
951 # do something if only 1 result is there
957 warn "## count ",dump(@_),$/ if ( $debug );
963 Always return field as array
965 foreach my $d ( rec_array('field') ) {
972 my $d = $rec->{ $_[0] };
973 return @$d if ref($d) eq 'ARRAY';