From: Dobrica Pavlinusic Date: Mon, 26 Jun 2006 16:39:51 +0000 (+0000) Subject: r719@llin: dpavlin | 2006-06-26 18:40:57 +0200 X-Git-Url: http://git.rot13.org/?a=commitdiff_plain;h=bbacd62d35e8d7b421f5e8cc0134d9282d6310dc;p=webpac2 r719@llin: dpavlin | 2006-06-26 18:40:57 +0200 big refacture: depriciate and remove all normalisation formats except .pl sets (but old code is still available in WebPAC::Lookup::Normalize because lookups use it) [2.20] git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@536 07558da8-63fa-0310-ba24-9fe276d99e06 --- diff --git a/MANIFEST b/MANIFEST index 469227b..94d97ae 100644 --- a/MANIFEST +++ b/MANIFEST @@ -10,7 +10,6 @@ lib/WebPAC/Store.pm lib/WebPAC/Input.pm lib/WebPAC/Input/ISIS.pm lib/WebPAC/Normalize.pm -lib/WebPAC/Normalize/XML.pm lib/WebPAC/Output.pm lib/WebPAC/Output/CDBI.pm lib/WebPAC/Output/Estraier.pm @@ -33,4 +32,3 @@ t/pod-coverage.t t/pod.t conf/lookup/example.pm conf/lookup/isis.pm -conf/normalize/isis.xml diff --git a/Makefile.PL b/Makefile.PL index 2261619..ee3c0d3 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -15,10 +15,8 @@ WriteMakefile( 'Log::Log4perl' => 1.02, 'Data::Dumper' => 0, 'Cwd' => 0, - 'Text::Iconv' => 0, 'Storable' => 0, 'DBM::Deep' => 0, - 'XML::Simple' => 0, 'Template' => 0, 'Time::HiRes' => 0, 'File::Temp' => 0, diff --git a/TODO b/TODO index a643614..4171ec9 100644 --- a/TODO +++ b/TODO @@ -16,7 +16,7 @@ + added --stats to report field and subfield usage [2.14] + add validator for input data [2.15] + add Excel input format [2.16] -- fix varous annoyences in code [2.17] ++ remove WebPAC::Normalize::XML and promote WebPAC::Normalize::Set to WebPAC::Normalize [2.20] - support arrays for normalize/path and lookup - add dBase input format - remove delimiters characters from index and query entered diff --git a/conf/normalize/isis.xml b/conf/normalize/isis.xml deleted file mode 100644 index c94018a..0000000 --- a/conf/normalize/isis.xml +++ /dev/null @@ -1,294 +0,0 @@ - - - - - - - - - %s]]> - - - - - v250^a - [v251] - / [v562^4] v562^a - filter{CROVOC}v800 - - - - out/thes/v000.html - - - - v250 v450 v258 v458 v253 v453 v254 v454 v330 v338 - - - - - v800 -- v901^c - - - - line - - - - eval{"v901^a" eq "Deskriptor"}v250^a - eval{"s901^a" eq "Deskriptor"}v450^a - eval{"v901^a" eq "Deskriptor"}v250^a / [v562^4] v562^a - eval{qq#v250^a# and "v901^a" eq "Deskriptor"}filter{CROVOC}v800 - - eval{qq#v450^a#}filter{CROVOC}v450^a >> s250^a / [s562^4] s562^a s800 - [v251] - v250^a - eval{qq#v250^a#}filter{CROVOC}v800 - - - - - * - v450^a - - - - - - ]]> - v330^a - - - - - - eval{"v901^a" eq "Podruèje"}v250^a - eval{"v901^a" eq "Podruèje"}[v251] v250^a - - [v561^4] - v561^1;;v561^a - lookup{crovoc:v561^1} - - - - [v562^4] - v562^1;;v562^a - lookup{crovoc:v562^1} - - - - v856^u - - - - -
  • ]]> - v556^1;;v556^a lookup{crovoc:v556^1} - - -
  • ]]> - v461^1;;v461^a lookup{crovoc:v461^1} - - - - - -
  • ]]> - v553^1;;v553^a lookup{crovoc:v553^1} - - - -
  • ]]> - eval{"v901^a" eq "Podruèje"}lookup{a:v251::};;lookup{d:lookup{a:v251::}} - - -
  • ]]> - eval{"v901^a" eq "Mikrotezaurus"}lookup{a:v561^4:v251:};;lookup{d:lookup{a:v561^4:v251:}} - - -
  • ]]> - eval{"v901^a" eq "Deskriptor"}lookup{a:v561^4:v562^4:v900};;lookup{d:lookup{a:v561^4:v562^4:v900}} - - - - - - * - v550^1;;v550^a lookup{crovoc:v550^1} - - - - - - , - v440^1;;v440^a lookup{crovoc:v440^1} - - - - - - , - v430^1;;v430^a lookup{crovoc:v430^1} - - - - - line - - - - eval{"v901^a" eq "Deskriptor"}v258^a - eval{"s901^a" eq "Deskriptor"}v458^a - eval{"v901^a" eq "Deskriptor"}v258^a / [v572^4] v572^a - eval{qq#v458^a#}filter{CROVOC}v458^a >> s258^a / [s572^4] s572^a s800 - v258^a - - - - - ]]> - v338^a - - - - - - * - v458^a - - - - - - v571 v572 - [v251] - v571^a. v572^a - - - - - eval{"v253^9" eq "1"} line - - - - eval{"v901^a" eq "Deskriptor"}v253^a - eval{"s901^a" eq "Deskriptor"}v453^a - eval{"v901^a" eq "Deskriptor"}v253^a / [v572^4] v572^a - eval{qq#v453^a#}filter{CROVOC}v453^a >> s253^a / [s572^4] s572^a s800 - v253^a - - - - v453^a - - - - v573 v574 - [v251] - v573^a. v574^a - - - - eval{"v254^9" eq "1"} line - - - - eval{"v901^a" eq "Deskriptor"}v254^a - eval{"s901^a" eq "Deskriptor"}v454^a - eval{"v901^a" eq "Deskriptor"}v254^a / [v572^4] v572^a - eval{qq#v454^a#}filter{CROVOC}v454^a >> s254^a / [s572^4] s572^a s800 - v254^a - - - - v454^a - - - - v575 v576 - [v251] - v575^a. v576^a. - - - - - - v900 - v900 - - - - eval{"v901^a" eq "Deskriptor"} / [v562^4] v562^a - eval{"v901^a" eq "Deskriptor"} [v251] - - - - - diff --git a/conf/normalize/isis_ffzg.xml b/conf/normalize/isis_ffzg.xml deleted file mode 100644 index 54383f1..0000000 --- a/conf/normalize/isis_ffzg.xml +++ /dev/null @@ -1,285 +0,0 @@ - - - - - - v000 - - - - v10 - - - - v11 - - - - v10 v11 - - - - v101 - - - - ]*>/)}v200^a]]> - - - - v200^9 - - - - v200^e - - - - v200^c - - - - v200^d - - - - v200^f ; v200^g - - - - v200^f - - - - v200^g - - - - v200^v - - - - v205^a - - - - v207^a - - - - v209^a - - - - v210^a - - - - v210^c - - - - v210^d - - - - v215^a : v215^c ; v215^d - - - - v215^a - - - - v215^c - - - - v215^e - - - - v225^a = v225^d : v225^e ; v225^v. v225h, v225^i ; v225^w - - - - v225^a - - - - v300 - - - - v305 - - - - v307 - - - - v314 - - - - v320 - - - - v326 - - - - v327 - - - - v330 - - - - v337 - - - - v423^z: v423^a / v423^c v423^b - - - - lookup{dio-jzav:v900} - lookup{id-dio-jzav:v900} - - - - lookup{naslov-efzg:001v001} - - - - v463^1 - - - - lookup{naslov-efzg:s463^1} - - - - lookup{podnaslov-efzg:s463^1} - - - - v463^v - - - - - v464^a / v464^g v464^f - - - - v500^a. v503^b - - - - v532 - - -
    - v608 -
    - - - v610 - - - - v675 - - - - v675^a - - - - v675^b - - - - v686 - - - - v700^a, v700^b - v700^a, v700^b - - - - v701^a, v701^b - v701^a, v701^b - - - - v702^a, v702^b - v702^a, v702^b - - - - v700^a, v700^b - v701^a, v701^b - v702^a, v702^b - v700^a, v700^b - v701^a, v701^b - v702^a, v702^b - - - - v710^a - - - - v711^a - - - - v801 - - - - v856^u - - - - v909 - - - - v900 - 001v001 - - - - lookup{set-jzav:v946^1} - v946^1 - lookup{set-efzg:v461^1} - v461^1 - - - - lookup{set-jzav:lookup{parent-id:v946^1}} - - - - v990 - - - - v991 - - -
    -
    diff --git a/lib/WebPAC.pm b/lib/WebPAC.pm index b1970c0..567b5c8 100644 --- a/lib/WebPAC.pm +++ b/lib/WebPAC.pm @@ -9,30 +9,25 @@ WebPAC - core module =head1 VERSION -Version 2.17 +Version 2.20 =cut -our $VERSION = '2.17'; +our $VERSION = '2.20'; =head1 SYNOPSIS -This is quick description of what WebPAC is. This is third iteration of -WebPAC design (second one was semi-private creatation of CD ROM with L -module). This code will eventually become official WebPAC version 2. +This is quick description of what WebPAC is. This is another iteration of +WebPAC design (first was system with XML files and CGI, second one was semi-private +creatation of CD ROM with L module and third was older version 2 +with supprot for lagacy XML and YAML). + +Current version supports different input formats and normalisation using set rules. =head1 AUTHOR Dobrica Pavlinusic, C<< >> -=head1 BUGS - -Please report any bugs or feature requests to -C, or through the web interface at -L. -I will be notified, and then you'll automatically be notified of progress on -your bug as I make changes. - =head1 SEE ALSO To undestand concpets behind WebPAC examine L, and then @@ -40,7 +35,7 @@ respective documentation for each component. =head1 COPYRIGHT & LICENSE -Copyright 2005 Dobrica Pavlinusic, All Rights Reserved. +Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. diff --git a/lib/WebPAC/Lookup.pm b/lib/WebPAC/Lookup.pm index 2f1ba12..db772fe 100644 --- a/lib/WebPAC/Lookup.pm +++ b/lib/WebPAC/Lookup.pm @@ -3,7 +3,7 @@ package WebPAC::Lookup; use warnings; use strict; -use base qw/WebPAC::Common WebPAC::Normalize/; +use base qw/WebPAC::Common WebPAC::Lookup::Normalize/; use File::Slurp; use YAML qw/LoadFile/; use Data::Dumper; @@ -219,7 +219,7 @@ Dobrica Pavlinusic, C<< >> =head1 COPYRIGHT & LICENSE -Copyright 2005 Dobrica Pavlinusic, All Rights Reserved. +Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. diff --git a/lib/WebPAC/Lookup/Normalize.pm b/lib/WebPAC/Lookup/Normalize.pm new file mode 100644 index 0000000..dd10717 --- /dev/null +++ b/lib/WebPAC/Lookup/Normalize.pm @@ -0,0 +1,795 @@ +package WebPAC::Lookup::Normalize; + +use warnings; +use strict; +use blib; +use WebPAC::Common; +use base 'WebPAC::Common'; +use Data::Dumper; + +=head1 NAME + +WebPAC::Lookup::Normalize - data mungling for normalisation + +=head1 VERSION + +Version 0.09 + +=cut + +our $VERSION = '0.09'; + +=head1 SYNOPSIS + +This package contains code that mungle data to produce normalized format. + +B + +This code is obsolete. It moved to here so that I don't have to re-write +L to use set configuration files (using L) +just yet. But it will dissapear real soon! + +It contains several assumptions: + +=over + +=item * + +format of fields is defined using C notation for repeatable fields +or C for single (or first) value, where C<123> is field number and +C is subfield. + +=item * + +source data records (C<$rec>) have unique identifiers in field C<000> + +=item * + +optional C tag at B will be +perl code that is evaluated before producing output (value of field will be +interpolated before that) + +=item * + +optional C at B will apply perl +code defined as code ref on format after field substitution to producing +output + +There is one built-in filter called C which can be use like this: + + filter{regex(s/foo/bar/)} + +=item * + +optional C will be then performed. See C. + +=item * + +at end, optional Cs rules are resolved. Format rules are similar to +C and can also contain C which is performed after +values are inserted in format. + +=back + +This also describes order in which transformations are applied (eval, +filter, lookup, format) which is important to undestand when deciding how to +solve your data mungling and normalisation process. + + + + +=head1 FUNCTIONS + +=head2 new + +Create new normalisation object + + my $n = new WebPAC::Lookup::Normalize::Something( + filter => { + 'filter_name_1' => sub { + # filter code + return length($_); + }, ... + }, + db => $db_obj, + lookup_regex => $lookup->regex, + lookup => $lookup_obj, + prefix => 'foobar', + ); + +Parametar C defines user supplied snippets of perl code which can +be use with C notation. + +C is used to form filename for database record (to support multiple +source files which are joined in one database). + +Recommended parametar C is used to enable parsing of lookups +in structures. If you pass this parametar, you must also pass C +which is C object. + +=cut + +sub new { + my $class = shift; + my $self = {@_}; + bless($self, $class); + + my $r = $self->{'lookup_regex'} ? 1 : 0; + my $l = $self->{'lookup'} ? 1 : 0; + + my $log = $self->_get_logger(); + + # those two must be in pair + if ( ($r & $l) != ($r || $l) ) { + my $log = $self->_get_logger(); + $log->logdie("lookup_regex and lookup must be in pair"); + } + + $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup')); + + $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'}); + + $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l); + + if (! $self->{filter} || ! $self->{filter}->{regex}) { + $log->debug("adding built-in filter regex"); + $self->{filter}->{regex} = sub { + my ($val, $regex) = @_; + eval "\$val =~ $regex"; + return $val; + }; + } + + $self ? return $self : return undef; +} + +=head2 all_tags + +Returns all tags in document in specified order + + my $sorted_tags = $self->all_tags(); + +=cut + +sub all_tags { + my $self = shift; + + if (! $self->{_tags_by_order}) { + + my $log = $self->_get_logger; + # sanity check + $log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer}); + + my @tags = keys %{ $self->{'import_xml'}->{'indexer'}}; + $log->debug("unsorted tags: " . join(", ", @tags)); + + @tags = sort { $self->_sort_by_order } @tags; + + $log->debug("sorted tags: " . join(",", @tags) ); + + $self->{_tags_by_order} = \@tags; + } + + return $self->{_tags_by_order}; +} + + + +=head2 data_structure + +Create in-memory data structure which represents normalized layout from +C. + +This structures are used to produce output. + + my $ds = $webpac->data_structure($rec); + +=cut + +sub data_structure { + my $self = shift; + + my $log = $self->_get_logger(); + + my $rec = shift; + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + $log->debug("data_structure rec = ", sub { Dumper($rec) }); + + $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'})); + + my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!"); + + my $cache_file; + + if ($self->{'db'}) { + my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} ); + $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) }); + return $ds if ($ds); + $log->debug("cache miss, creating"); + } + + my $tags = $self->all_tags(); + + $log->debug("tags: ",sub { join(", ",@{ $tags }) }); + + my $ds; + + foreach my $field (@{ $tags }) { + + my $row; + +#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}); + + foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) { + my $format; + + $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH'); + $format = $tag->{'value'} || $tag->{'content'}; + + my @v; + if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) { + @v = $self->_rec_to_arr($rec,$format,'fill_in'); + } else { + @v = $self->_rec_to_arr($rec,$format,'parse'); + } + if (! @v) { + $log->debug("$field <",$self->{tag},"> format: $format no values"); + next; + } else { + $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v)); + } + + if ($tag->{'sort'}) { + @v = $self->sort_arr(@v); + } + + # use format? + if ($tag->{'format_name'}) { + @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v; + } + + # delimiter will join repeatable fields + if ($tag->{'delimiter'}) { + @v = ( join($tag->{'delimiter'}, @v) ); + } + + # default types + my @types = qw(display search); + # override by type attribute + @types = ( $tag->{'type'} ) if ($tag->{'type'}); + + foreach my $type (@types) { + # append to previous line? + $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append'); + if ($tag->{'append'}) { + + # I will delimit appended part with + # delimiter (or ,) + my $d = $tag->{'delimiter'}; + # default delimiter + $d ||= " "; + + my $last = pop @{$row->{$type}}; + $d = "" if (! $last); + $last .= $d . join($d, @v); + push @{$row->{$type}}, $last; + + } else { + push @{$row->{$type}}, @v; + } + } + + + } + + if ($row) { + $row->{'tag'} = $field; + + # TODO: name_sigular, name_plural + my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'}; + my $row_name = $name ? $self->_x($name) : $field; + + # post-sort all values in field + if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) { + $log->warn("sort at field tag not implemented"); + } + + $ds->{$row_name} = $row; + + $log->debug("row $field: ",sub { Dumper($row) }); + } + + } + + $self->{'db'}->save_ds( + id => $id, + ds => $ds, + prefix => $self->{prefix}, + ) if ($self->{'db'}); + + $log->debug("ds: ", sub { Dumper($ds) }); + + $log->logconfess("data structure returned is not array any more!") if wantarray; + + return $ds; + +} + +=head2 parse + +Perform smart parsing of string, skipping delimiters for fields which aren't +defined. It can also eval code in format starting with C and +return output or nothing depending on eval code. + + my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i); + +Filters are implemented here. While simple form of filters looks like this: + + filter{name_of_filter} + +but, filters can also have variable number of parametars like this: + + filter{name_of_filter(param,param,param)} + +=cut + +my $warn_once; + +sub parse { + my $self = shift; + + my ($rec, $format_utf8, $i, $rec_size) = @_; + + return if (! $format_utf8); + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + $i = 0 if (! $i); + + my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'}); + + my @out; + + $log->debug("format: $format [$i]"); + + my $eval_code; + # remove eval{...} from beginning + $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); + + my $filter_name; + # remove filter{...} from beginning + $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s); + + # did we found any (att all) field from format in row? + my $found_any; + # prefix before first field which we preserve it $found_any + my $prefix; + + my $f_step = 1; + + while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) { + + my $del = $1 || ''; + $prefix = $del if ($f_step == 1); + + my $fld_type = lc($2); + + # repeatable index + my $r = $i; + if ($fld_type eq 's') { + if ($found_any->{'v'}) { + $r = 0; + } else { + return; + } + } + + my $found = 0; + my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size); + + if ($found) { + $found_any->{$fld_type} += $found; + + # we will skip delimiter before first occurence of field! + push @out, $del unless($found_any->{$fld_type} == 1); + push @out, $tmp if ($tmp); + } + $f_step++; + } + + # test if any fields found? + return if (! $found_any->{'v'} && ! $found_any->{'s'}); + + my $out = join('',@out); + + if ($out) { + # add rest of format (suffix) + $out .= $format; + + # add prefix if not there + $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/); + + $log->debug("result: $out"); + } + + if ($eval_code) { + my $eval = $self->fill_in($rec,$eval_code,$i) || return; + $log->debug("about to eval{$eval} format: $out"); + return if (! $self->_eval($eval)); + } + + if ($filter_name) { + my @filter_args; + if ($filter_name =~ s/(\w+)\((.*)\)/$1/) { + @filter_args = split(/,/, $2); + } + if ($self->{'filter'}->{$filter_name}) { + $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args)); + unshift @filter_args, $out; + $out = $self->{'filter'}->{$filter_name}->(@filter_args); + return unless(defined($out)); + $log->debug("filter result: $out"); + } elsif (! $warn_once->{$filter_name}) { + $log->warn("trying to use undefined filter $filter_name"); + $warn_once->{$filter_name}++; + } + } + + return $out; +} + +=head2 fill_in + +Workhourse of all: takes record from in-memory structure of database and +strings with placeholders and returns string or array of with substituted +values from record. + + my $text = $webpac->fill_in($rec,'v250^a'); + +Optional argument is ordinal number for repeatable fields. By default, +it's assume to be first repeatable field (fields are perl array, so first +element is 0). +Following example will read second value from repeatable field. + + my $text = $webpac->fill_in($rec,'Title: v250^a',1); + +This function B perform parsing of format to inteligenty skip +delimiters before fields which aren't used. + +This method will automatically decode UTF-8 string to local code page +if needed. + +There is optional parametar C<$record_size> which can be used to get sizes of +all C combinations in this format. + + my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size); + +=cut + +sub fill_in { + my $self = shift; + + my $log = $self->_get_logger(); + + my ($rec,$format,$i,$rec_size) = @_; + + $log->logconfess("need data record") unless ($rec); + $log->logconfess("need format to parse") unless($format); + + # iteration (for repeatable fields) + $i ||= 0; + + $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999)); + + # FIXME remove for speedup? + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + if (utf8::is_utf8($format)) { + $format = $self->_x($format); + } + + my $found = 0; + my $just_single = 1; + + my $eval_code; + # remove eval{...} from beginning + $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); + + my $filter_name; + # remove filter{...} from beginning + $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s); + + { + # fix warnings + no warnings 'uninitialized'; + + # do actual replacement of placeholders + # repeatable fields + if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) { + $just_single = 0; + } + + # non-repeatable fields + if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) { + return if ($i > 0 && $just_single); + } + } + + if ($found) { + $log->debug("format: $format"); + if ($eval_code) { + my $eval = $self->fill_in($rec,$eval_code,$i); + return if (! $self->_eval($eval)); + } + if ($filter_name && $self->{'filter'}->{$filter_name}) { + $log->debug("filter '$filter_name' for $format"); + $format = $self->{'filter'}->{$filter_name}->($format); + return unless(defined($format)); + $log->debug("filter result: $format"); + } + # do we have lookups? + if ($self->{'lookup'}) { + if ($self->{'lookup'}->can('lookup')) { + my @lookup = $self->{lookup}->lookup($format); + $log->debug("lookup $format", join(", ", @lookup)); + return @lookup; + } else { + $log->warn("Have lookup object but can't invoke lookup method"); + } + } else { + return $format; + } + } else { + return; + } +} + + +=head2 _rec_to_arr + +Similar to C and C, but returns array of all repeatable fields. Usable +for fields which have lookups, so they shouldn't be parsed but rather +Cd or Ced. Last argument is name of operation: C or C. + + my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste'); + +=cut + +sub _rec_to_arr { + my $self = shift; + + my ($rec, $format_utf8, $code) = @_; + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + return if (! $format_utf8); + + $log->debug("using $code on $format_utf8"); + + my $i = 0; + my $max = 0; + my @arr; + my $rec_size = {}; + + while ($i <= $max) { + my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size); + if ($rec_size) { + foreach my $f (keys %{ $rec_size }) { + $max = $rec_size->{$f} if ($rec_size->{$f} > $max); + } + $log->debug("max set to $max"); + undef $rec_size; + } + if (@v) { + push @arr, @v; + } else { + push @arr, '' if ($max > $i); + } + } + + $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); + + return @arr; +} + + +=head2 get_data + +Returns value from record. + + my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size); + +Required arguments are: + +=over 8 + +=item C<$rec> + +record reference + +=item C<$f> + +field + +=item C<$sf> + +optional subfield + +=item C<$i> + +index offset for repeatable values ( 0 ... $rec_size->{'400^a'} ) + +=item C<$found> + +optional variable that will be incremeted if preset + +=item C<$rec_size> + +hash to hold maximum occurances of C combinations +(which can be accessed using keys in same format) + +=back + +Returns value or empty string, updates C<$found> and C +if present. + +=cut + +sub get_data { + my $self = shift; + + my ($rec,$f,$sf,$i,$found,$cache) = @_; + + return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY'); + + if (defined($$cache)) { + $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} }; + } + + return '' unless ($$rec->{$f}->[$i]); + + { + no strict 'refs'; + if (defined($sf)) { + $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf}); + return $$rec->{$f}->[$i]->{$sf}; + } else { + $$found++ if (defined($$found)); + # it still might have subfields, just + # not specified, so we'll dump some debug info + if ($$rec->{$f}->[$i] =~ /HASH/o) { + my $out; + foreach my $k (keys %{$$rec->{$f}->[$i]}) { + my $v = $$rec->{$f}->[$i]->{$k}; + $out .= '$' . $k .':' . $v if ($v); + } + return $out; + } else { + return $$rec->{$f}->[$i]; + } + } + } +} + + +=head2 apply_format + +Apply format specified in tag with C and +C. + + my $text = $webpac->apply_format($format_name,$format_delimiter,$data); + +Formats can contain C if you need them. + +=cut + +sub apply_format { + my $self = shift; + + my ($name,$delimiter,$data) = @_; + + my $log = $self->_get_logger(); + + if (! $self->{'import_xml'}->{'format'}->{$name}) { + $log->warn(" is not defined in ",$self->{'import_xml_file'}); + return $data; + } + + $log->warn("no delimiter for format $name") if (! $delimiter); + + my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); + + my @data = split(/\Q$delimiter\E/, $data); + + my $out = sprintf($format, @data); + $log->debug("using format $name [$format] on $data to produce: $out"); + + if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) { + return $self->{'lookup'}->lookup($out); + } else { + return $out; + } + +} + +=head2 sort_arr + +Sort array ignoring case and html in data + + my @sorted = $webpac->sort_arr(@unsorted); + +=cut + +sub sort_arr { + my $self = shift; + + my $log = $self->_get_logger(); + + # FIXME add Schwartzian Transformation? + + my @sorted = sort { + $a =~ s#<[^>]+/*>##; + $b =~ s#<[^>]+/*>##; + lc($b) cmp lc($a) + } @_; + $log->debug("sorted values: ",sub { join(", ",@sorted) }); + + return @sorted; +} + + +=head1 INTERNAL METHODS + +=head2 _sort_by_order + +Sort xml tags data structure accoding to C attribute. + +=cut + +sub _sort_by_order { + my $self = shift; + + my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} || + $self->{'import_xml'}->{'indexer'}->{$a}; + my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} || + $self->{'import_xml'}->{'indexer'}->{$b}; + + return $va <=> $vb; +} + +=head2 _x + +Convert strings from C encoding into application +specific encoding (optinally specified using C to C +constructor). + + my $text = $n->_x('normalize text string'); + +This is a stub so that other modules doesn't have to implement it. + +=cut + +sub _x { + my $self = shift; + return shift; +} + + +=head1 AUTHOR + +Dobrica Pavlinusic, C<< >> + +=head1 COPYRIGHT & LICENSE + +Copyright 2005 Dobrica Pavlinusic, All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it +under the same terms as Perl itself. + +=cut + +1; # End of WebPAC::Lookup::Normalize diff --git a/lib/WebPAC/Normalize.pm b/lib/WebPAC/Normalize.pm index 93810ab..cf78071 100644 --- a/lib/WebPAC/Normalize.pm +++ b/lib/WebPAC/Normalize.pm @@ -1,789 +1,342 @@ package WebPAC::Normalize; +use Exporter 'import'; +@EXPORT = qw/ + set_rec set_lookup + get_ds clean_ds + tag search display + rec1 rec2 rec + regex prefix suffix surround + first lookup join_with +/; use warnings; use strict; -use blib; -use WebPAC::Common; -use base 'WebPAC::Common'; + +#use base qw/WebPAC::Common/; use Data::Dumper; =head1 NAME -WebPAC::Normalize - data mungling for normalisation +WebPAC::Normalize - describe normalisaton rules using sets =head1 VERSION -Version 0.09 +Version 0.04 =cut -our $VERSION = '0.09'; +our $VERSION = '0.04'; =head1 SYNOPSIS -This package contains code that mungle data to produce normalized format. - -It contains several assumptions: - -=over - -=item * - -format of fields is defined using C notation for repeatable fields -or C for single (or first) value, where C<123> is field number and -C is subfield. - -=item * - -source data records (C<$rec>) have unique identifiers in field C<000> - -=item * - -optional C tag at B will be -perl code that is evaluated before producing output (value of field will be -interpolated before that) - -=item * - -optional C at B will apply perl -code defined as code ref on format after field substitution to producing -output - -There is one built-in filter called C which can be use like this: - - filter{regex(s/foo/bar/)} - -=item * - -optional C will be then performed. See C. - -=item * - -at end, optional Cs rules are resolved. Format rules are similar to -C and can also contain C which is performed after -values are inserted in format. - -=back - -This also describes order in which transformations are applied (eval, -filter, lookup, format) which is important to undestand when deciding how to -solve your data mungling and normalisation process. - +This module uses C files to perform normalisation +from input records using perl functions which are specialized for set +processing. +Sets are implemented as arrays, and normalisation file is valid perl, which +means that you check it's validity before running WebPAC using +C. +Normalisation can generate multiple output normalized data. For now, supported output +types (on the left side of definition) are: C, C and C. =head1 FUNCTIONS -=head2 new - -Create new normalisation object - - my $n = new WebPAC::Normalize::Something( - filter => { - 'filter_name_1' => sub { - # filter code - return length($_); - }, ... - }, - db => $db_obj, - lookup_regex => $lookup->regex, - lookup => $lookup_obj, - prefix => 'foobar', - ); +=head2 data_structure -Parametar C defines user supplied snippets of perl code which can -be use with C notation. +Return data structure -C is used to form filename for database record (to support multiple -source files which are joined in one database). + my $ds = WebPAC::Normalize( + lookup => $lookup->lookup_hash, + row => $row, + rules => $normalize_pl_config, + ); -Recommended parametar C is used to enable parsing of lookups -in structures. If you pass this parametar, you must also pass C -which is C object. +This function will B if normalizastion can't be evaled. =cut -sub new { - my $class = shift; - my $self = {@_}; - bless($self, $class); - - my $r = $self->{'lookup_regex'} ? 1 : 0; - my $l = $self->{'lookup'} ? 1 : 0; - - my $log = $self->_get_logger(); - - # those two must be in pair - if ( ($r & $l) != ($r || $l) ) { - my $log = $self->_get_logger(); - $log->logdie("lookup_regex and lookup must be in pair"); - } - - $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup')); - - $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'}); - - $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l); - - if (! $self->{filter} || ! $self->{filter}->{regex}) { - $log->debug("adding built-in filter regex"); - $self->{filter}->{regex} = sub { - my ($val, $regex) = @_; - eval "\$val =~ $regex"; - return $val; - }; - } - - $self ? return $self : return undef; +sub data_structure { + my $arg = {@_}; + + die "need row argument" unless ($arg->{row}); + die "need normalisation argument" unless ($arg->{rules}); + + no strict 'subs'; + set_lookup( $arg->{lookup} ); + set_rec( $arg->{row} ); + clean_ds(); + eval "$arg->{rules}"; + die "error evaling $arg->{rules}: $@\n" if ($@); + return get_ds(); } -=head2 all_tags +=head2 set_rec -Returns all tags in document in specified order +Set current record hash - my $sorted_tags = $self->all_tags(); + set_rec( $rec ); =cut -sub all_tags { - my $self = shift; - - if (! $self->{_tags_by_order}) { - - my $log = $self->_get_logger; - # sanity check - $log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer}); - - my @tags = keys %{ $self->{'import_xml'}->{'indexer'}}; - $log->debug("unsorted tags: " . join(", ", @tags)); +my $rec; - @tags = sort { $self->_sort_by_order } @tags; - - $log->debug("sorted tags: " . join(",", @tags) ); - - $self->{_tags_by_order} = \@tags; - } - - return $self->{_tags_by_order}; +sub set_rec { + $rec = shift or die "no record hash"; } +=head2 tag +Define new tag for I and I. -=head2 data_structure - -Create in-memory data structure which represents normalized layout from -C. - -This structures are used to produce output. + tag('Title', rec('200','a') ); - my $ds = $webpac->data_structure($rec); =cut -sub data_structure { - my $self = shift; - - my $log = $self->_get_logger(); - - my $rec = shift; - $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); - - $log->debug("data_structure rec = ", sub { Dumper($rec) }); - - $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'})); - - my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!"); - - my $cache_file; - - if ($self->{'db'}) { - my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} ); - $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) }); - return $ds if ($ds); - $log->debug("cache miss, creating"); - } - - my $tags = $self->all_tags(); - - $log->debug("tags: ",sub { join(", ",@{ $tags }) }); - - my $ds; - - foreach my $field (@{ $tags }) { - - my $row; - -#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}); - - foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) { - my $format; - - $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH'); - $format = $tag->{'value'} || $tag->{'content'}; - - my @v; - if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) { - @v = $self->_rec_to_arr($rec,$format,'fill_in'); - } else { - @v = $self->_rec_to_arr($rec,$format,'parse'); - } - if (! @v) { - $log->debug("$field <",$self->{tag},"> format: $format no values"); - next; - } else { - $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v)); - } - - if ($tag->{'sort'}) { - @v = $self->sort_arr(@v); - } - - # use format? - if ($tag->{'format_name'}) { - @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v; - } - - # delimiter will join repeatable fields - if ($tag->{'delimiter'}) { - @v = ( join($tag->{'delimiter'}, @v) ); - } - - # default types - my @types = qw(display search); - # override by type attribute - @types = ( $tag->{'type'} ) if ($tag->{'type'}); - - foreach my $type (@types) { - # append to previous line? - $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append'); - if ($tag->{'append'}) { - - # I will delimit appended part with - # delimiter (or ,) - my $d = $tag->{'delimiter'}; - # default delimiter - $d ||= " "; - - my $last = pop @{$row->{$type}}; - $d = "" if (! $last); - $last .= $d . join($d, @v); - push @{$row->{$type}}, $last; - - } else { - push @{$row->{$type}}, @v; - } - } - - - } - - if ($row) { - $row->{'tag'} = $field; - - # TODO: name_sigular, name_plural - my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'}; - my $row_name = $name ? $self->_x($name) : $field; - - # post-sort all values in field - if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) { - $log->warn("sort at field tag not implemented"); - } - - $ds->{$row_name} = $row; - - $log->debug("row $field: ",sub { Dumper($row) }); - } - - } - - $self->{'db'}->save_ds( - id => $id, - ds => $ds, - prefix => $self->{prefix}, - ) if ($self->{'db'}); - - $log->debug("ds: ", sub { Dumper($ds) }); - - $log->logconfess("data structure returned is not array any more!") if wantarray; - - return $ds; +my $out; +sub tag { + my $name = shift or die "tag needs name as first argument"; + my @o = grep { defined($_) && $_ ne '' } @_; + return unless (@o); + $out->{$name}->{tag} = $name; + $out->{$name}->{search} = \@o; + $out->{$name}->{display} = \@o; } -=head2 parse - -Perform smart parsing of string, skipping delimiters for fields which aren't -defined. It can also eval code in format starting with C and -return output or nothing depending on eval code. - - my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i); - -Filters are implemented here. While simple form of filters looks like this: +=head2 display - filter{name_of_filter} +Define tag just for I -but, filters can also have variable number of parametars like this: - - filter{name_of_filter(param,param,param)} + @v = display('Title', rec('200','a') ); =cut -my $warn_once; - -sub parse { - my $self = shift; - - my ($rec, $format_utf8, $i, $rec_size) = @_; - - return if (! $format_utf8); - - my $log = $self->_get_logger(); - - $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); - - $i = 0 if (! $i); - - my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'}); - - my @out; - - $log->debug("format: $format [$i]"); - - my $eval_code; - # remove eval{...} from beginning - $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); - - my $filter_name; - # remove filter{...} from beginning - $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s); - - # did we found any (att all) field from format in row? - my $found_any; - # prefix before first field which we preserve it $found_any - my $prefix; - - my $f_step = 1; - - while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) { - - my $del = $1 || ''; - $prefix = $del if ($f_step == 1); - - my $fld_type = lc($2); - - # repeatable index - my $r = $i; - if ($fld_type eq 's') { - if ($found_any->{'v'}) { - $r = 0; - } else { - return; - } - } +sub display { + my $name = shift or die "display needs name as first argument"; + my @o = grep { defined($_) && $_ ne '' } @_; + return unless (@o); + $out->{$name}->{tag} = $name; + $out->{$name}->{display} = \@o; +} - my $found = 0; - my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size); +=head2 search - if ($found) { - $found_any->{$fld_type} += $found; +Prepare values just for I - # we will skip delimiter before first occurence of field! - push @out, $del unless($found_any->{$fld_type} == 1); - push @out, $tmp if ($tmp); - } - $f_step++; - } + @v = search('Title', rec('200','a') ); - # test if any fields found? - return if (! $found_any->{'v'} && ! $found_any->{'s'}); +=cut - my $out = join('',@out); +sub search { + my $name = shift or die "search needs name as first argument"; + my @o = grep { defined($_) && $_ ne '' } @_; + return unless (@o); + $out->{$name}->{tag} = $name; + $out->{$name}->{search} = \@o; +} - if ($out) { - # add rest of format (suffix) - $out .= $format; +=head2 get_ds - # add prefix if not there - $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/); +Return hash formatted as data structure - $log->debug("result: $out"); - } + my $ds = get_ds(); - if ($eval_code) { - my $eval = $self->fill_in($rec,$eval_code,$i) || return; - $log->debug("about to eval{$eval} format: $out"); - return if (! $self->_eval($eval)); - } - - if ($filter_name) { - my @filter_args; - if ($filter_name =~ s/(\w+)\((.*)\)/$1/) { - @filter_args = split(/,/, $2); - } - if ($self->{'filter'}->{$filter_name}) { - $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args)); - unshift @filter_args, $out; - $out = $self->{'filter'}->{$filter_name}->(@filter_args); - return unless(defined($out)); - $log->debug("filter result: $out"); - } elsif (! $warn_once->{$filter_name}) { - $log->warn("trying to use undefined filter $filter_name"); - $warn_once->{$filter_name}++; - } - } +=cut +sub get_ds { return $out; } -=head2 fill_in - -Workhourse of all: takes record from in-memory structure of database and -strings with placeholders and returns string or array of with substituted -values from record. +=head2 clean_ds - my $text = $webpac->fill_in($rec,'v250^a'); +Clean data structure hash for next record -Optional argument is ordinal number for repeatable fields. By default, -it's assume to be first repeatable field (fields are perl array, so first -element is 0). -Following example will read second value from repeatable field. - - my $text = $webpac->fill_in($rec,'Title: v250^a',1); - -This function B perform parsing of format to inteligenty skip -delimiters before fields which aren't used. - -This method will automatically decode UTF-8 string to local code page -if needed. - -There is optional parametar C<$record_size> which can be used to get sizes of -all C combinations in this format. - - my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size); + clean_ds(); =cut -sub fill_in { - my $self = shift; - - my $log = $self->_get_logger(); - - my ($rec,$format,$i,$rec_size) = @_; +sub clean_ds { + $out = undef; +} - $log->logconfess("need data record") unless ($rec); - $log->logconfess("need format to parse") unless($format); +=head2 set_lookup - # iteration (for repeatable fields) - $i ||= 0; +Set current lookup hash - $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999)); + set_lookup( $lookup ); - # FIXME remove for speedup? - $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); +=cut - if (utf8::is_utf8($format)) { - $format = $self->_x($format); - } +my $lookup; - my $found = 0; - my $just_single = 1; +sub set_lookup { + $lookup = shift; +} - my $eval_code; - # remove eval{...} from beginning - $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); +=head2 rec1 - my $filter_name; - # remove filter{...} from beginning - $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s); +Return all values in some field - { - # fix warnings - no warnings 'uninitialized'; + @v = rec1('200') - # do actual replacement of placeholders - # repeatable fields - if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) { - $just_single = 0; - } +TODO: order of values is probably same as in source data, need to investigate that - # non-repeatable fields - if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) { - return if ($i > 0 && $just_single); - } - } +=cut - if ($found) { - $log->debug("format: $format"); - if ($eval_code) { - my $eval = $self->fill_in($rec,$eval_code,$i); - return if (! $self->_eval($eval)); - } - if ($filter_name && $self->{'filter'}->{$filter_name}) { - $log->debug("filter '$filter_name' for $format"); - $format = $self->{'filter'}->{$filter_name}->($format); - return unless(defined($format)); - $log->debug("filter result: $format"); - } - # do we have lookups? - if ($self->{'lookup'}) { - if ($self->{'lookup'}->can('lookup')) { - my @lookup = $self->{lookup}->lookup($format); - $log->debug("lookup $format", join(", ", @lookup)); - return @lookup; +sub rec1 { + my $f = shift; + return unless (defined($rec) && defined($rec->{$f})); + if (ref($rec->{$f}) eq 'ARRAY') { + return map { + if (ref($_) eq 'HASH') { + values %{$_}; } else { - $log->warn("Have lookup object but can't invoke lookup method"); + $_; } - } else { - return $format; - } - } else { - return; + } @{ $rec->{$f} }; + } elsif( defined($rec->{$f}) ) { + return $rec->{$f}; } } +=head2 rec2 -=head2 _rec_to_arr - -Similar to C and C, but returns array of all repeatable fields. Usable -for fields which have lookups, so they shouldn't be parsed but rather -Cd or Ced. Last argument is name of operation: C or C. +Return all values in specific field and subfield - my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste'); + @v = rec2('200','a') =cut -sub _rec_to_arr { - my $self = shift; - - my ($rec, $format_utf8, $code) = @_; +sub rec2 { + my $f = shift; + return unless (defined($rec && $rec->{$f})); + my $sf = shift; + return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} }; +} - my $log = $self->_get_logger(); +=head2 rec - $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); - return if (! $format_utf8); +syntaxtic sugar for - $log->debug("using $code on $format_utf8"); + @v = rec('200') + @v = rec('200','a') - my $i = 0; - my $max = 0; - my @arr; - my $rec_size = {}; +=cut - while ($i <= $max) { - my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size); - if ($rec_size) { - foreach my $f (keys %{ $rec_size }) { - $max = $rec_size->{$f} if ($rec_size->{$f} > $max); - } - $log->debug("max set to $max"); - undef $rec_size; - } - if (@v) { - push @arr, @v; - } else { - push @arr, '' if ($max > $i); - } +sub rec { + if ($#_ == 0) { + return rec1(@_); + } elsif ($#_ == 1) { + return rec2(@_); } - - $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); - - return @arr; } +=head2 regex -=head2 get_data - -Returns value from record. - - my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size); - -Required arguments are: +Apply regex to some or all values -=over 8 - -=item C<$rec> - -record reference - -=item C<$f> - -field - -=item C<$sf> - -optional subfield - -=item C<$i> - -index offset for repeatable values ( 0 ... $rec_size->{'400^a'} ) - -=item C<$found> - -optional variable that will be incremeted if preset - -=item C<$rec_size> - -hash to hold maximum occurances of C combinations -(which can be accessed using keys in same format) - -=back - -Returns value or empty string, updates C<$found> and C -if present. + @v = regex( 's/foo/bar/g', @v ); =cut -sub get_data { - my $self = shift; - - my ($rec,$f,$sf,$i,$found,$cache) = @_; - - return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY'); - - if (defined($$cache)) { - $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} }; - } - - return '' unless ($$rec->{$f}->[$i]); - - { - no strict 'refs'; - if (defined($sf)) { - $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf}); - return $$rec->{$f}->[$i]->{$sf}; - } else { - $$found++ if (defined($$found)); - # it still might have subfields, just - # not specified, so we'll dump some debug info - if ($$rec->{$f}->[$i] =~ /HASH/o) { - my $out; - foreach my $k (keys %{$$rec->{$f}->[$i]}) { - my $v = $$rec->{$f}->[$i]->{$k}; - $out .= '$' . $k .':' . $v if ($v); - } - return $out; - } else { - return $$rec->{$f}->[$i]; - } - } +sub regex { + my $r = shift; + my @out; + #warn "r: $r\n",Dumper(\@_); + foreach my $t (@_) { + next unless ($t); + eval "\$t =~ $r"; + push @out, $t if ($t && $t ne ''); } + return @out; } +=head2 prefix -=head2 apply_format - -Apply format specified in tag with C and -C. - - my $text = $webpac->apply_format($format_name,$format_delimiter,$data); +Prefix all values with a string -Formats can contain C if you need them. + @v = prefix( 'my_', @v ); =cut -sub apply_format { - my $self = shift; - - my ($name,$delimiter,$data) = @_; - - my $log = $self->_get_logger(); - - if (! $self->{'import_xml'}->{'format'}->{$name}) { - $log->warn(" is not defined in ",$self->{'import_xml_file'}); - return $data; - } - - $log->warn("no delimiter for format $name") if (! $delimiter); +sub prefix { + my $p = shift or die "prefix needs string as first argument"; + return map { $p . $_ } grep { defined($_) } @_; +} - my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); +=head2 suffix - my @data = split(/\Q$delimiter\E/, $data); +suffix all values with a string - my $out = sprintf($format, @data); - $log->debug("using format $name [$format] on $data to produce: $out"); + @v = suffix( '_my', @v ); - if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) { - return $self->{'lookup'}->lookup($out); - } else { - return $out; - } +=cut +sub suffix { + my $s = shift or die "suffix needs string as first argument"; + return map { $_ . $s } grep { defined($_) } @_; } -=head2 sort_arr +=head2 surround -Sort array ignoring case and html in data +surround all values with a two strings - my @sorted = $webpac->sort_arr(@unsorted); + @v = surround( 'prefix_', '_suffix', @v ); =cut -sub sort_arr { - my $self = shift; - - my $log = $self->_get_logger(); - - # FIXME add Schwartzian Transformation? - - my @sorted = sort { - $a =~ s#<[^>]+/*>##; - $b =~ s#<[^>]+/*>##; - lc($b) cmp lc($a) - } @_; - $log->debug("sorted values: ",sub { join(", ",@sorted) }); - - return @sorted; +sub surround { + my $p = shift or die "surround need prefix as first argument"; + my $s = shift or die "surround needs suffix as second argument"; + return map { $p . $_ . $s } grep { defined($_) } @_; } +=head2 first -=head1 INTERNAL METHODS - -=head2 _sort_by_order +Return first element -Sort xml tags data structure accoding to C attribute. + $v = first( @v ); =cut -sub _sort_by_order { - my $self = shift; - - my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} || - $self->{'import_xml'}->{'indexer'}->{$a}; - my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} || - $self->{'import_xml'}->{'indexer'}->{$b}; - - return $va <=> $vb; +sub first { + my $r = shift; + return $r; } -=head2 _x - -Convert strings from C encoding into application -specific encoding (optinally specified using C to C -constructor). +=head2 lookup - my $text = $n->_x('normalize text string'); +Consult lookup hashes for some value -This is a stub so that other modules doesn't have to implement it. + @v = lookup( $v ); + @v = lookup( @v ); =cut -sub _x { - my $self = shift; - return shift; +sub lookup { + my $k = shift or return; + return unless (defined($lookup->{$k})); + if (ref($lookup->{$k}) eq 'ARRAY') { + return @{ $lookup->{$k} }; + } else { + return $lookup->{$k}; + } } +=head2 join_with -=head1 AUTHOR - -Dobrica Pavlinusic, C<< >> - -=head1 COPYRIGHT & LICENSE +Joins walues with some delimiter -Copyright 2005 Dobrica Pavlinusic, All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it -under the same terms as Perl itself. + $v = join_with(", ", @v); =cut -1; # End of WebPAC::Normalize +sub join_with { + my $d = shift; + return join($d, grep { defined($_) && $_ ne '' } @_); +} + +# END +1; diff --git a/lib/WebPAC/Normalize/Set.pm b/lib/WebPAC/Normalize/Set.pm deleted file mode 100644 index 0b8a17e..0000000 --- a/lib/WebPAC/Normalize/Set.pm +++ /dev/null @@ -1,342 +0,0 @@ -package WebPAC::Normalize::Set; -use Exporter 'import'; -@EXPORT = qw/ - set_rec set_lookup - get_ds clean_ds - tag search display - rec1 rec2 rec - regex prefix suffix surround - first lookup join_with -/; - -use warnings; -use strict; - -#use base qw/WebPAC::Common/; -use Data::Dumper; - -=head1 NAME - -WebPAC::Normalize::Set - describe normalisaton rules using sets - -=head1 VERSION - -Version 0.04 - -=cut - -our $VERSION = '0.04'; - -=head1 SYNOPSIS - -This module uses C files to perform normalisation -from input records using perl functions which are specialized for set -processing. - -Sets are implemented as arrays, and normalisation file is valid perl, which -means that you check it's validity before running WebPAC using -C. - -Normalisation can generate multiple output normalized data. For now, supported output -types (on the left side of definition) are: C, C and C. - -=head1 FUNCTIONS - -=head2 data_structure - -Return data structure - - my $ds = WebPAC::Normalize::Set( - lookup => $lookup->lookup_hash, - row => $row, - rules => $normalize_pl_config, - ); - -This function will B if normalizastion can't be evaled. - -=cut - -sub data_structure { - my $arg = {@_}; - - die "need row argument" unless ($arg->{row}); - die "need normalisation argument" unless ($arg->{rules}); - - no strict 'subs'; - set_lookup( $arg->{lookup} ); - set_rec( $arg->{row} ); - clean_ds(); - eval "$arg->{rules}"; - die "error evaling $arg->{rules}: $@\n" if ($@); - return get_ds(); -} - -=head2 set_rec - -Set current record hash - - set_rec( $rec ); - -=cut - -my $rec; - -sub set_rec { - $rec = shift or die "no record hash"; -} - -=head2 tag - -Define new tag for I and I. - - tag('Title', rec('200','a') ); - - -=cut - -my $out; - -sub tag { - my $name = shift or die "tag needs name as first argument"; - my @o = grep { defined($_) && $_ ne '' } @_; - return unless (@o); - $out->{$name}->{tag} = $name; - $out->{$name}->{search} = \@o; - $out->{$name}->{display} = \@o; -} - -=head2 display - -Define tag just for I - - @v = display('Title', rec('200','a') ); - -=cut - -sub display { - my $name = shift or die "display needs name as first argument"; - my @o = grep { defined($_) && $_ ne '' } @_; - return unless (@o); - $out->{$name}->{tag} = $name; - $out->{$name}->{display} = \@o; -} - -=head2 search - -Prepare values just for I - - @v = search('Title', rec('200','a') ); - -=cut - -sub search { - my $name = shift or die "search needs name as first argument"; - my @o = grep { defined($_) && $_ ne '' } @_; - return unless (@o); - $out->{$name}->{tag} = $name; - $out->{$name}->{search} = \@o; -} - -=head2 get_ds - -Return hash formatted as data structure - - my $ds = get_ds(); - -=cut - -sub get_ds { - return $out; -} - -=head2 clean_ds - -Clean data structure hash for next record - - clean_ds(); - -=cut - -sub clean_ds { - $out = undef; -} - -=head2 set_lookup - -Set current lookup hash - - set_lookup( $lookup ); - -=cut - -my $lookup; - -sub set_lookup { - $lookup = shift; -} - -=head2 rec1 - -Return all values in some field - - @v = rec1('200') - -TODO: order of values is probably same as in source data, need to investigate that - -=cut - -sub rec1 { - my $f = shift; - return unless (defined($rec) && defined($rec->{$f})); - if (ref($rec->{$f}) eq 'ARRAY') { - return map { - if (ref($_) eq 'HASH') { - values %{$_}; - } else { - $_; - } - } @{ $rec->{$f} }; - } elsif( defined($rec->{$f}) ) { - return $rec->{$f}; - } -} - -=head2 rec2 - -Return all values in specific field and subfield - - @v = rec2('200','a') - -=cut - -sub rec2 { - my $f = shift; - return unless (defined($rec && $rec->{$f})); - my $sf = shift; - return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} }; -} - -=head2 rec - -syntaxtic sugar for - - @v = rec('200') - @v = rec('200','a') - -=cut - -sub rec { - if ($#_ == 0) { - return rec1(@_); - } elsif ($#_ == 1) { - return rec2(@_); - } -} - -=head2 regex - -Apply regex to some or all values - - @v = regex( 's/foo/bar/g', @v ); - -=cut - -sub regex { - my $r = shift; - my @out; - #warn "r: $r\n",Dumper(\@_); - foreach my $t (@_) { - next unless ($t); - eval "\$t =~ $r"; - push @out, $t if ($t && $t ne ''); - } - return @out; -} - -=head2 prefix - -Prefix all values with a string - - @v = prefix( 'my_', @v ); - -=cut - -sub prefix { - my $p = shift or die "prefix needs string as first argument"; - return map { $p . $_ } grep { defined($_) } @_; -} - -=head2 suffix - -suffix all values with a string - - @v = suffix( '_my', @v ); - -=cut - -sub suffix { - my $s = shift or die "suffix needs string as first argument"; - return map { $_ . $s } grep { defined($_) } @_; -} - -=head2 surround - -surround all values with a two strings - - @v = surround( 'prefix_', '_suffix', @v ); - -=cut - -sub surround { - my $p = shift or die "surround need prefix as first argument"; - my $s = shift or die "surround needs suffix as second argument"; - return map { $p . $_ . $s } grep { defined($_) } @_; -} - -=head2 first - -Return first element - - $v = first( @v ); - -=cut - -sub first { - my $r = shift; - return $r; -} - -=head2 lookup - -Consult lookup hashes for some value - - @v = lookup( $v ); - @v = lookup( @v ); - -=cut - -sub lookup { - my $k = shift or return; - return unless (defined($lookup->{$k})); - if (ref($lookup->{$k}) eq 'ARRAY') { - return @{ $lookup->{$k} }; - } else { - return $lookup->{$k}; - } -} - -=head2 join_with - -Joins walues with some delimiter - - $v = join_with(", ", @v); - -=cut - -sub join_with { - my $d = shift; - return join($d, grep { defined($_) && $_ ne '' } @_); -} - -# END -1; diff --git a/lib/WebPAC/Normalize/XML.pm b/lib/WebPAC/Normalize/XML.pm deleted file mode 100644 index 7e829b5..0000000 --- a/lib/WebPAC/Normalize/XML.pm +++ /dev/null @@ -1,164 +0,0 @@ -package WebPAC::Normalize::XML; - -use warnings; -use strict; - -use base qw/WebPAC::Common WebPAC::Normalize/; -use XML::Simple; -use Data::Dumper; -use Text::Iconv; -use YAML qw/Dump LoadFile/; - -=head1 NAME - -WebPAC::Normalize::XML - apply XML or YAML normalisaton rules - -=head1 VERSION - -Version 0.03 - -=cut - -our $VERSION = '0.03'; - -=head1 SYNOPSIS - -This module uses C files to perform normalisation -from input records - -=cut - -=head1 FUNCTIONS - -=head2 open - -Read normalisation rules defined using XML from C and -parse it. - - my $n = new WebPAC::Normalize::XML; - $n->open( - tag => 'isis', - xml_file => '/path/to/conf/normalize/isis.xml', - ); - -C defines tag to use within C - -C defines path to normalize XML - -C define additional tags that can be forced (and an be array). - -=cut - -sub open { - my $self = shift; - - my $arg = {@_}; - - my $log = $self->_get_logger(); - - foreach my $req (qw/tag xml_file/) { - $log->logconfess("need argument $req") unless $arg->{$req}; - } - - $self->{'tag'} = $arg->{'tag'}; - my $xml_file = $arg->{'xml_file'}; - - $log->info("using $xml_file tag <",$self->{'tag'},">"); - - $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file); - - $self->{'import_xml_file'} = $xml_file; - - my @force_array = [ $self->{'tag'}, 'config', 'format' ]; - push @force_array, $self->{'tags'} if ($self->{'tags'}); - - $self->{'import_xml'} = XMLin($xml_file, - ForceArray => @force_array, - ForceContent => 1, - ); - - $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }, $self->{lookup} ? " using lookups" : "lookups disabled"); - - #print STDERR Dump($self->{import_xml}); - - return $self; -} - -=head2 open_yaml - -Read normalisation rules defined in YAML file located usually at -C and parse it. - - my $n = new WebPAC::Normalize::XML; - $n->open_yaml( - tag => 'isis', - path => '/path/to/conf/normalize/isis.yml', - ); - -=cut - -sub open_yaml { - my $self = shift; - - my $arg = {@_}; - - my $log = $self->_get_logger(); - - foreach my $req (qw/tag path/) { - $log->logconfess("need argument $req") unless $arg->{$req}; - } - - my $path = $arg->{path}; - $self->{tag} = $arg->{tag}; - - $log->logdie("normalisation yaml file '$path' doesn't exist!") if (! -e $path); - - $log->info("using $path normalization YAML"); - - $self->{'import_xml'} = LoadFile( $path ) || $log->die("can't load $path: $!"); - - $log->debug("import yaml is ",sub { Dumper($self->{'import_xml'}) }, $self->{lookup} ? " using lookups" : "lookups disabled"); - - $self->{_skip_x} = 1; - - return $self; -} - -=head2 _x - -Convert string from XML UTF-8 encoding to code page defined in C. - - my $text = $n->_x('utf8 text'); - -Default application code page is C. You will probably want to -change that when creating new instance of object based on this one. - -=cut - -sub _x { - my $self = shift; - my $utf8 = shift || return; - return $utf8 if ($self->{_skip_x}); - - # create UTF-8 convertor for import_xml files - $self->{'utf2cp'} ||= Text::Iconv->new('UTF-8' ,$self->{'code_page'} || 'ISO-8859-2'); - - return $self->{'utf2cp'}->convert($utf8) || - $self->_get_logger()->logwarn("can't convert '$utf8'"); -} - - -=head1 AUTHOR - -Dobrica Pavlinusic, C<< >> - -=head1 COPYRIGHT & LICENSE - -Copyright 2005 Dobrica Pavlinusic, All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it -under the same terms as Perl itself. - -=cut - -1; # End of WebPAC::Normalize::XML diff --git a/lib/WebPAC/Output/KinoSearch.pm b/lib/WebPAC/Output/KinoSearch.pm index 7f562a4..6c50ac3 100644 --- a/lib/WebPAC/Output/KinoSearch.pm +++ b/lib/WebPAC/Output/KinoSearch.pm @@ -9,6 +9,7 @@ use KinoSearch::InvIndexer; use KinoSearch::Analysis::PolyAnalyzer; use Encode qw/from_to/; use Data::Dumper; +use Storable; =head1 NAME @@ -16,11 +17,11 @@ WebPAC::Output::KinoSearch - Create KinoSearch full text index =head1 VERSION -Version 0.01 +Version 0.02 =cut -our $VERSION = '0.01'; +our $VERSION = '0.02'; =head1 SYNOPSIS @@ -99,6 +100,18 @@ sub new { analyzer => $analyzer, ); + my $fields_path = $self->{index_path} . '/fields.storable'; + $fields_path =~ s#//#/#g; + if (-e $fields_path) { + $self->{fields} = retrieve($fields_path) || + $log->warn("can't open $fields_path: $!"); + } else { + $log->error("This will be dummy run since no fields statistics are found!"); + $log->error("You will have to re-run indexing to get search results!"); + $self->{dummy_run} = 1; + } + $self->{fields_path} = $fields_path; + foreach my $f (@{ $self->{fields} }) { $self->{invindex}->spec_field( name => $f, @@ -158,6 +171,10 @@ sub add { sub add_value($$$$$) { my ($self,$log,$doc,$n,$v) = @_; return unless ($v); + + $self->{value_usage}->{$n}++; + return if ($self->{dummy_run}); + eval { $doc->set_value($n, $self->convert($v) ) }; $log->warn("can't insert: $n = $v") if ($@); } @@ -208,8 +225,20 @@ Close index sub finish { my $self = shift; - $self->_get_logger()->info("finish index writing to disk"); + my $log = $self->_get_logger(); + + $log->info("finish index writing to disk"); $self->{invindex}->finish; + + $log->info("writing value usage file"); + + # add fields from last run + map { $self->{value_usage}->{$_}++ } @{ $self->{fields} }; + + my @fields = keys %{ $self->{value_usage} }; + store \@fields, $self->{fields_path} || + $log->warn("can't write $self->{fields_path}: $!"); + } =head2 convert diff --git a/run.pl b/run.pl index b8fb7ce..825facd 100755 --- a/run.pl +++ b/run.pl @@ -11,8 +11,7 @@ use WebPAC::Common 0.02; use WebPAC::Lookup; use WebPAC::Input 0.03; use WebPAC::Store 0.03; -use WebPAC::Normalize::XML; -use WebPAC::Normalize::Set; +use WebPAC::Normalize; use WebPAC::Output::TT; use WebPAC::Validate; use YAML qw/LoadFile/; @@ -54,11 +53,6 @@ or C from input path to YAML configuration file -=item --force-set - -force conversion C<< normalize->path >> in C from -C<.xml> to C<.pl> - =item --stats disable indexing and dump statistics about field and subfield @@ -79,7 +73,6 @@ my $clean = 0; my $config = 'conf/config.yml'; my $debug = 0; my $only_filter; -my $force_set = 0; my $stats = 0; my $validate_path; @@ -91,7 +84,6 @@ GetOptions( "only=s" => \$only_filter, "config" => \$config, "debug" => \$debug, - "force-set" => \$force_set, "stats" => \$stats, "validate=s" => \$validate_path, ); @@ -228,43 +220,12 @@ while (my ($database, $db_config) = each %{ $config->{databases} }) { %{ $input }, ); - my $n = new WebPAC::Normalize::XML( - # filter => { 'foo' => sub { shift } }, - db => $db, - lookup_regex => $lookup ? $lookup->regex : undef, - lookup => $lookup, - prefix => $input->{name}, - ); - my $rules; my $normalize_path = $input->{normalize}->{path}; - if ($force_set) { - my $new_norm_path = $normalize_path; - $new_norm_path =~ s/\.xml$/.pl/; - if (-e $new_norm_path) { - $log->debug("--force-set replaced $normalize_path with $new_norm_path"); - $normalize_path = $new_norm_path; - } else { - $log->debug("--force-set failed on $new_norm_path, fallback to $normalize_path"); - } - } + $log->logdie("Found '$normalize_path' as normalization file which isn't supported any more!") unless ( $normalize_path =~ m!\.pl$!i ); - if ($normalize_path =~ m/\.xml$/i) { - $n->open( - tag => $input->{normalize}->{tag}, - xml_file => $normalize_path, - ); - } elsif ($normalize_path =~ m/\.(?:yml|yaml)$/i) { - $n->open_yaml( - path => $normalize_path, - tag => $input->{normalize}->{tag}, - ); - } elsif ($normalize_path =~ m/\.(?:pl)$/i) { - $n = undef; - $log->info("using WebPAC::Normalize::Set to process $normalize_path"); - $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!"; - } + my $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!"; foreach my $pos ( 0 ... $input_db->size ) { @@ -285,22 +246,17 @@ while (my ($database, $db_config) = each %{ $config->{databases} }) { } - my $ds; - if ($n) { - $ds = $n->data_structure($row); - } else { - $ds = WebPAC::Normalize::Set::data_structure( - row => $row, - rules => $rules, - lookup => $lookup ? $lookup->lookup_hash : undef, - ); - - $db->save_ds( - id => $mfn, - ds => $ds, - prefix => $input->{name}, - ) if ($ds && !$stats); - } + my $ds = WebPAC::Normalize::data_structure( + row => $row, + rules => $rules, + lookup => $lookup ? $lookup->lookup_hash : undef, + ); + + $db->save_ds( + id => $mfn, + ds => $ds, + prefix => $input->{name}, + ) if ($ds && !$stats); $indexer->add( id => $input->{name} . "/" . $mfn, diff --git a/t/0-load.t b/t/0-load.t index 06d6667..633eab2 100644 --- a/t/0-load.t +++ b/t/0-load.t @@ -11,7 +11,7 @@ use_ok( 'WebPAC::Input' ); use_ok( 'WebPAC::Input::ISIS' ); use_ok( 'WebPAC::Store' ); use_ok( 'WebPAC::Lookup' ); -use_ok( 'WebPAC::Normalize::XML' ); +use_ok( 'WebPAC::Normalize' ); use_ok( 'WebPAC::Output' ); use_ok( 'WebPAC::Output::Estraier' ); use_ok( 'WebPAC::Output::TT' ); diff --git a/t/3-normalize-set.t b/t/3-normalize-set.t deleted file mode 100755 index da7a881..0000000 --- a/t/3-normalize-set.t +++ /dev/null @@ -1,345 +0,0 @@ -#!/usr/bin/perl -w - -use strict; - -use Test::More tests => 67; -use Test::Exception; -use Cwd qw/abs_path/; -use blib; -use File::Slurp; - -use Data::Dumper; -my $debug = shift @ARGV; - -BEGIN { - use_ok( 'WebPAC::Normalize::Set' ); -} - -ok(my $abs_path = abs_path($0), "abs_path"); -$abs_path =~ s#/[^/]*$#/#; -diag "abs_path: $abs_path" if ($debug); - -#throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup"; - -my $rec1 = { - '200' => [{ - 'a' => '200a', - 'b' => '200b', - },{ - 'c' => '200c', - 'd' => '200d', - },{ - 'a' => '200a*2', - 'd' => '200d*2', - }], - '201' => [{ - 'x' => '201x', - 'y' => '201y', - }], - '900' => [ - '900-no_subfield' - ], - '901' => [{ - 'a' => '900a', - }], - '902' => [{ - 'z' => '900', - }], -}; - -my $rec2 = { - '675' => [ { - 'a' => '159.9' - } ], - '210' => [ { - 'c' => 'New York University press', - 'a' => 'New York', - 'd' => 'cop. 1988' - } ], - '700' => [ { - 'a' => 'Haynal', - 'b' => 'André' - } ], - '801' => [ 'FFZG' ], - '991' => [ '8302' ], - '000' => [ 1 ], - '702' => [ { - 'a' => 'Holder', - 'b' => 'Elizabeth' - } ], - '215' => [ { - 'c' => 'ilustr', - 'a' => 'xix, 202 str', - 'd' => '23cm' - } ], - '990' => [ - '2140', - '88', - 'HAY' - ], - '200' => [ { - 'e' => 'from Freud and Ferenczi to Michael balint', - 'a' => 'Controversies in psychoanalytic method', - 'g' => 'translated by Elizabeth Holder on the basisi of a first draft by Archie Hooton ; with a preface by Daniel N. Stern', - 'f' => 'by André E. Haynal' - } ], - '610' => [ 'povijest psihoanalize' ], - '994' => [ { - 'c' => '', - 'a' => 'PS', - 'b' => 'MG' - } ], - '320' => [ 'Kazalo' ], - '101' => [ 'ENG' ], - '686' => [ '2140' ], - '300' => [ 'Prijevod djela: ' ], -}; - - -my $lookup1 = { - '00900' => [ - 'lookup 1', - 'lookup 2', - ], -}; - -my $lookup2 = { - '00900' => 'lookup', -}; - - -sub test { - print Dumper( @_ ), ("-" x 78), "\n"; - ok( defined(@_) ); -} - -# how much of string evaled to display? -my $max_eval_output = 170; - -sub dump_error { - my ($msg,$code) = @_; - - my @l = split(/[\n\r]/, $code); - my $out = "$msg\n"; - - foreach my $i ( 0 .. $#l ) { - $out .= sprintf("%2d: %s\n", $i, $l[$i]); - } - - return $out; -} - -sub test_s { - my $t = shift || die; - - my $eval_t = $t; - $eval_t =~ s/[\n\r\s]+/ /gs; - $eval_t = substr($eval_t,0,$max_eval_output) . '...' if (length($eval_t) > $max_eval_output); - - eval "$t"; - ok(! $@, $@ ? dump_error($@, $t) : "eval: $eval_t"); -} - -{ - no strict 'subs'; - use WebPAC::Normalize::Set; - - ok(! set_lookup( undef ), "set_lookup(undef)"); - - set_rec( $rec1 ); - - cmp_ok( join(",", rec2('200','a') ), 'eq', '200a,200a*2', 'join rec2' ); - cmp_ok( join(",", rec2('200','a'), rec2('200','b') ), 'eq', '200a,200a*2,200b', 'join rec2 rec2' ); - cmp_ok( join(" * ", sort(rec1('200'), rec1('201') )), 'eq', '200a * 200a*2 * 200b * 200c * 200d * 200d*2 * 201x * 201y', 'join sort rec1 rec1' ); - diag "is_deeply checks\n"; - is_deeply( \[ rec1('200') ], \[ qw/200a 200b 200c 200d 200a*2 200d*2/ ] ); - is_deeply( \[ regex( 's/0/o/g', rec1('200') ) ], \[ qw/2ooa 2oob 2ooc 2ood 2ooa*2 2ood*2/ ]); - is_deeply( \[ grep { /\*/ } regex( 's/0/o/g', rec1('200') ) ], \[ qw/2ooa*2 2ood*2/ ]); - is_deeply( \[ rec('902') ], \[ '900' ] ); - - cmp_ok( rec('902'), 'eq', rec('902','z'), 'rec sf' ); - - # simple list manipulatons - cmp_ok( join('', prefix('ab', 'cd') ), 'eq', 'abcd', 'prefix'); - cmp_ok( join('', suffix('xy', 'cd') ), 'eq', 'cdxy', 'suffix'); - cmp_ok( join('', surround('->', '<-', 'a','b','c') ), 'eq', '->a<-->b<-->c<-', 'surround'); - - - set_lookup( $lookup1 ); - - cmp_ok( - join_with(" i ", - lookup( - regex( 's/^/00/', - rec2('902','z') - ) - ) - ), - 'eq', 'lookup 1 i lookup 2', 'join lookup regex rec2'); - - # check join_with operations - - sub test_join_with_2 { - my ($a,$b,$e) = @_; - - cmp_ok( - join_with(" <1> ", - rec('201',$a), - rec('201',$b), - ), - 'eq', $e, "join_with $a <1> $b = $e"); - } - - test_join_with_2('_','_',''); - test_join_with_2('x','_','201x'); - test_join_with_2('_','x','201x'); - test_join_with_2('x','y','201x <1> 201y'); - - sub test_join_with_3 { - my ($a,$b,$c,$e) = @_; - - cmp_ok( - join_with(" <1> ", rec('201',$a), - join_with(" <2> ", rec('201',$b), - rec('201',$c), - ) - ), - 'eq', $e, "join_with $a <1> $b <2> $c = $e"); - }; - - test_join_with_3('_','_','_',''); - test_join_with_3('x','_','_','201x'); - test_join_with_3('_','x','_','201x'); - test_join_with_3('_','_','x','201x'); - test_join_with_3('x','y','_','201x <1> 201y'); - test_join_with_3('x','_','y','201x <1> 201y'); - test_join_with_3('_','x','y','201x <2> 201y'); - test_join_with_3('x','_','y','201x <1> 201y'); - test_join_with_3('x','y','x','201x <1> 201y <2> 201x'); - - # test lookups - - set_lookup( $lookup2 ); - - is_deeply( \[ lookup( prefix( '00', rec('902') ) ) ], \[ 'lookup' ], 'lookup prefix' ); - - ok(! lookup('non-existent'), 'lookup non-existant' ); - - set_rec( $rec2 ); - - test_s(qq{ - tag('Title', - rec('200','a') - ); - }); - test_s(qq{ - tag('Who', - join_with(" ", - rec('702','a'), - rec('702','b') - ) - ); - }); - - test_s(qq{ - display('Publisher', - rec('210','c') - ) - }); - - test_s(qq{ - search('Year', - regex( 's/[^\\d]+//', - rec('210','d') - ) - ) - }); - - ok(my $ds = get_ds(), "get_ds"); - diag "ds = ", Dumper($ds) if ($debug); - - - sub test_check_ds { - - my $t = shift; - - ok($ds = get_ds(), 'get_ds'); - diag Dumper( $ds ) if ($debug); - - ok( $ds && $ds->{something}, 'get_ds->something exists' ); - ok( $ds && $ds->{something}->{$t}, 'get_ds->something->'.$t.' exists') if ($t); - ok( $ds && !$ds->{empty}, 'get_ds->empty doesn\'t' ); - - return $ds; - } - - clean_ds(); - test_s(qq{ search('something', '42'); }); - test_s(qq{ search('empty', ''); }); - test_check_ds('search'); - - clean_ds(); - test_s(qq{ display('something', '42'); }); - test_s(qq{ display('empty', ''); }); - test_check_ds('display'); - - clean_ds(); - test_s(qq{ tag('something', '42'); }); - test_s(qq{ tag('empty', ''); }); - test_check_ds('search'); - test_check_ds('display'); - - clean_ds(); - my $n = read_file( "$abs_path/data/normalize.pl" ); - $n .= "\n1;\n"; - #diag "normalize code:\n$n\n"; - test_s( $n ); - - ok($ds = get_ds(), "get_ds"); - diag "ds = ", Dumper($ds) if ($debug); - - my $rec = { - '200' => [{ - 'a' => '200a', - 'b' => '200b', - }], - }; - my $rules = qq{ search('mixed', rec('200') ) }; - - clean_ds(); - set_rec( $rec ); - test_s( $rules ); - ok($ds = get_ds(), "get_ds"); - is_deeply( $ds, { - 'mixed' => { - 'search' => [ '200a', '200b' ], - 'tag' => 'mixed' - } - }, 'correct get_ds'); - - ok(my $ds2 = WebPAC::Normalize::Set::data_structure( - row => $rec, - rules => $rules, - ), 'data_structure'); - is_deeply( $ds, $ds2, 'data_structure(s) same'); - - # wird and non-valid structure which is supported anyway - clean_ds(); - set_rec({ - '200' => [{ - 'a' => '200a', - }, - '200-solo' - ] - }); - test_s(qq{ search('mixed', rec('200') ) }); - ok($ds = get_ds(), "get_ds"); - is_deeply( $ds, { - 'mixed' => { - 'search' => [ '200a', '200-solo' ], - 'tag' => 'mixed' - } - }, 'correct get_ds'); - -} - diff --git a/t/3-normalize-xml.t b/t/3-normalize-xml.t deleted file mode 100755 index a55e52b..0000000 --- a/t/3-normalize-xml.t +++ /dev/null @@ -1,263 +0,0 @@ -#!/usr/bin/perl -w - -use Test::More tests => 140; -use Test::Exception; -use Cwd qw/abs_path/; -use blib; -use strict; -use Data::Dumper; - -BEGIN { -use_ok( 'WebPAC::Normalize::XML' ); -} - -ok(my $abs_path = abs_path($0), "abs_path"); -$abs_path =~ s#/[^/]*$#/#; -#diag "abs_path: $abs_path"; - -throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup"; -throws_ok { new WebPAC::Normalize::XML( lookup => 'bar' ) } qr/pair/, "lookup without lookup_regex"; - -ok(my $n = new WebPAC::Normalize::XML( - debug => 0, - filter => { - regex => sub { - my ($val, $regex) = @_; - eval "\$val =~ $regex"; - return $val; - }, - }, -), "new"); - -throws_ok { $n->open() } qr/tag/, "open without tag"; -throws_ok { $n->open( tag => 'isis' ) } qr/xml_file/, "open without xml_file"; -throws_ok { $n->open( tag => 'isis', xml_file => '/foo/bar/baz' ) } qr/file.*doesn't exist/, "open with invalid xml_file"; -ok( $n->open( - tag => 'isis', - xml_file => "$abs_path/data/normalize.xml", -), "open"); - -my $rec = { - '675' => [ - { - 'a' => '159.9' - } - ], - '210' => [ - { - 'c' => 'New York University press', - 'a' => 'New York', - 'd' => 'cop. 1988' - } - ], - '700' => [ - { - 'a' => 'Haynal', - 'b' => 'André' - } - ], - '801' => [ - 'FFZG' - ], - '991' => [ - '8302' - ], - '000' => [ - 1 - ], - '702' => [ - { - 'a' => 'Holder', - 'b' => 'Elizabeth' - } - ], - '215' => [ - { - 'c' => 'ilustr', - 'a' => 'xix, 202 str', - 'd' => '23cm' - } - ], - '990' => [ - '2140', - '88', - 'HAY' - ], - '200' => [ - { - 'e' => 'from Freud and Ferenczi to Michael balint', - 'a' => 'Controversies in psychoanalytic method', - 'g' => 'translated by Elizabeth Holder on the basisi of a first draft by Archie Hooton ; with a preface by Daniel N. Stern', - 'f' => 'by André E. Haynal' - } - ], - '610' => [ - 'povijest psihoanalize' - ], - '994' => [ - { - 'c' => '', - 'a' => 'PS', - 'b' => 'MG' - } - ], - '320' => [ - 'Kazalo' - ], - '101' => [ - 'ENG' - ], - '686' => [ - '2140' - ], - '300' => [ - 'Prijevod djela: ' - ] -}; - -foreach my $fld (keys %$rec) { - my $r = 0; - foreach my $item ($rec->{$fld}) { - if (ref($item) eq 'HASH') { - foreach my $sf (keys %$item) { - my $found = 0; - ok($n->get_data(\$rec, $fld, $sf, $r, \$found), "v${fld}^${sf} / $r"); - ok($found, "found"); - } - my $found = 0; - ok(! $n->get_data(\$rec, $fld, 'x', $r, \$found), "no v${fld}^x / $r"); - ok(! $found, "not found"); - } else { - my $found = 0; - ok($n->get_data(\$rec, $fld, undef, $r, \$found), "v${fld} / $r"); - ok($found, "found"); - } - } - my $found = 0; - ok(! $n->get_data(\$rec, '999', '', $r, \$found), "no v${fld} / $r"); - ok(! $found, "not found"); -} - -ok(my $ds = $n->data_structure( $rec ), "data_structure"); - -#diag Dumper($rec, $ds); - -# fake load of our test normalisation data -$n->{tag} = 'isis'; - -#diag Dumper($n->{import_xml}->{indexer}); - - -$rec = { - '900' => [ { - 'a' => '1', - 'b' => '2', - 'c' => '3', - 'x' => 'yap', - } ], -}; - -my $import = { - 'Tag' => { 'isis' => [ - { content => 'v900^a + v900^b = v900^c [v900^x]' }, - ] }, -}; - -sub parse_test($$$$$) { - my ($import,$tag,$rec,$i,$r) = @_; - $n->{import_xml}->{indexer} = $import; - # erase internal cache (yak!) - delete($n->{_tags_by_order}); - push @{$rec->{'000'}}, 42 unless ($rec->{'000'}); - #diag "test normalisation of: ",Dumper($n->{import_xml}->{indexer}, $rec); - ok(my $ds = $n->data_structure( $rec ), "data_structure"); - #diag Dumper($ds->{$tag}->{display}); # if ($i == 0); - cmp_ok($ds->{$tag}->{display}->[$i], 'eq', $r, "parse $tag/$i - $r"); -} - -parse_test($import, 'Tag', $rec, 0, '1 + 2 = 3 [yap]'); - -delete($rec->{'900'}->[0]->{'b'}); -parse_test($import, 'Tag', $rec, 0, '1 = 3 [yap]'); - -$rec->{'900'}->[0]->{'b'} = 5; -$rec->{'900'}->[0]->{'c'} = 6; -parse_test($import, 'Tag', $rec, 0, '1 + 5 = 6 [yap]'); - -delete($rec->{'900'}->[0]->{'c'}); -$rec->{'900'}->[0]->{'x'} = 'hmmm'; -parse_test($import, 'Tag', $rec, 0, '1 + 5 [hmmm]'); - -$rec->{'900'}->[0]->{'x'} = 'nope!'; -delete($rec->{'900'}->[0]->{'a'}); -parse_test($import, 'Tag', $rec, 0, '5 [nope!]'); - -$rec = { - '900' => [ { - 'b' => 'b1', - 'x' => 'b1', - },{ - 'a' => 'a1', - 'b' => 'b2', - 'x' => 'a1_b2', - },{ - 'b' => 'b3', - 'c' => 'c1', - 'x' => 'b3_c1', - },{ - 'a' => 'a2', - 'b' => 'b4', - 'c' => 'c2', - 'x' => 'a2_b4_c2', - } ], -}; - -$import = { - 'Tag' => { 'isis' => [ - { content => '0 v900^a 1 v900^b 2 v900^c 3 v900^x 4' }, - ] }, -}; - -parse_test($import, 'Tag', $rec, 0, '0 b1 3 b1 4'); -parse_test($import, 'Tag', $rec, 1, '0 a1 1 b2 3 a1_b2 4'); -parse_test($import, 'Tag', $rec, 2, '0 b3 2 c1 3 b3_c1 4'); -parse_test($import, 'Tag', $rec, 3, '0 a2 1 b4 2 c2 3 a2_b4_c2 4'); - -sub parse_test_arr($$$$) { - my ($import, $tag, $rec, $arr) = @_; - my $i = 0; - foreach my $res (@{$arr}) { - parse_test($import, $tag, $rec, $i, $res); - $i++; - } -} - -$import = { - 'Tag_a' => { 'isis' => [ - { content => 'v900^a' }, - ] }, - 'Tag_b' => { 'isis' => [ - { content => 'v900^b' }, - ] }, - 'Tag_c' => { 'isis' => [ - { content => 'v900^c' }, - ] }, - 'Tag_x' => { 'isis' => [ - { content => 'v900^x' }, - ] }, - 'Tag_s1' => { 'isis' => [ - { content => 'v900^a = v900^c' }, - ] }, - 'Tag_noval' => { 'isis' => [ - { content => 'v911^1' }, - { content => 'v900^c' }, - ] }, -}; - -parse_test_arr($import, 'Tag_a', $rec, [ '','a1','','a2' ] ); -parse_test_arr($import, 'Tag_b', $rec, [ 'b1','b2','b3','b4' ] ); -parse_test_arr($import, 'Tag_c', $rec, [ '','','c1','c2' ] ); -parse_test_arr($import, 'Tag_x', $rec, [ 'b1','a1_b2','b3_c1','a2_b4_c2' ] ); -parse_test_arr($import, 'Tag_s1', $rec, [ '', 'a1', 'c1', 'a2 = c2' ] ); -parse_test_arr($import, 'Tag_noval', $rec, [ '','','c1','c2' ] ); - diff --git a/t/3-normalize.t b/t/3-normalize.t new file mode 100755 index 0000000..7321bf5 --- /dev/null +++ b/t/3-normalize.t @@ -0,0 +1,345 @@ +#!/usr/bin/perl -w + +use strict; + +use Test::More tests => 67; +use Test::Exception; +use Cwd qw/abs_path/; +use blib; +use File::Slurp; + +use Data::Dumper; +my $debug = shift @ARGV; + +BEGIN { + use_ok( 'WebPAC::Normalize' ); +} + +ok(my $abs_path = abs_path($0), "abs_path"); +$abs_path =~ s#/[^/]*$#/#; +diag "abs_path: $abs_path" if ($debug); + +#throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup"; + +my $rec1 = { + '200' => [{ + 'a' => '200a', + 'b' => '200b', + },{ + 'c' => '200c', + 'd' => '200d', + },{ + 'a' => '200a*2', + 'd' => '200d*2', + }], + '201' => [{ + 'x' => '201x', + 'y' => '201y', + }], + '900' => [ + '900-no_subfield' + ], + '901' => [{ + 'a' => '900a', + }], + '902' => [{ + 'z' => '900', + }], +}; + +my $rec2 = { + '675' => [ { + 'a' => '159.9' + } ], + '210' => [ { + 'c' => 'New York University press', + 'a' => 'New York', + 'd' => 'cop. 1988' + } ], + '700' => [ { + 'a' => 'Haynal', + 'b' => 'André' + } ], + '801' => [ 'FFZG' ], + '991' => [ '8302' ], + '000' => [ 1 ], + '702' => [ { + 'a' => 'Holder', + 'b' => 'Elizabeth' + } ], + '215' => [ { + 'c' => 'ilustr', + 'a' => 'xix, 202 str', + 'd' => '23cm' + } ], + '990' => [ + '2140', + '88', + 'HAY' + ], + '200' => [ { + 'e' => 'from Freud and Ferenczi to Michael balint', + 'a' => 'Controversies in psychoanalytic method', + 'g' => 'translated by Elizabeth Holder on the basisi of a first draft by Archie Hooton ; with a preface by Daniel N. Stern', + 'f' => 'by André E. Haynal' + } ], + '610' => [ 'povijest psihoanalize' ], + '994' => [ { + 'c' => '', + 'a' => 'PS', + 'b' => 'MG' + } ], + '320' => [ 'Kazalo' ], + '101' => [ 'ENG' ], + '686' => [ '2140' ], + '300' => [ 'Prijevod djela: ' ], +}; + + +my $lookup1 = { + '00900' => [ + 'lookup 1', + 'lookup 2', + ], +}; + +my $lookup2 = { + '00900' => 'lookup', +}; + + +sub test { + print Dumper( @_ ), ("-" x 78), "\n"; + ok( defined(@_) ); +} + +# how much of string evaled to display? +my $max_eval_output = 170; + +sub dump_error { + my ($msg,$code) = @_; + + my @l = split(/[\n\r]/, $code); + my $out = "$msg\n"; + + foreach my $i ( 0 .. $#l ) { + $out .= sprintf("%2d: %s\n", $i, $l[$i]); + } + + return $out; +} + +sub test_s { + my $t = shift || die; + + my $eval_t = $t; + $eval_t =~ s/[\n\r\s]+/ /gs; + $eval_t = substr($eval_t,0,$max_eval_output) . '...' if (length($eval_t) > $max_eval_output); + + eval "$t"; + ok(! $@, $@ ? dump_error($@, $t) : "eval: $eval_t"); +} + +{ + no strict 'subs'; + use WebPAC::Normalize; + + ok(! set_lookup( undef ), "set_lookup(undef)"); + + set_rec( $rec1 ); + + cmp_ok( join(",", rec2('200','a') ), 'eq', '200a,200a*2', 'join rec2' ); + cmp_ok( join(",", rec2('200','a'), rec2('200','b') ), 'eq', '200a,200a*2,200b', 'join rec2 rec2' ); + cmp_ok( join(" * ", sort(rec1('200'), rec1('201') )), 'eq', '200a * 200a*2 * 200b * 200c * 200d * 200d*2 * 201x * 201y', 'join sort rec1 rec1' ); + diag "is_deeply checks\n"; + is_deeply( \[ rec1('200') ], \[ qw/200a 200b 200c 200d 200a*2 200d*2/ ] ); + is_deeply( \[ regex( 's/0/o/g', rec1('200') ) ], \[ qw/2ooa 2oob 2ooc 2ood 2ooa*2 2ood*2/ ]); + is_deeply( \[ grep { /\*/ } regex( 's/0/o/g', rec1('200') ) ], \[ qw/2ooa*2 2ood*2/ ]); + is_deeply( \[ rec('902') ], \[ '900' ] ); + + cmp_ok( rec('902'), 'eq', rec('902','z'), 'rec sf' ); + + # simple list manipulatons + cmp_ok( join('', prefix('ab', 'cd') ), 'eq', 'abcd', 'prefix'); + cmp_ok( join('', suffix('xy', 'cd') ), 'eq', 'cdxy', 'suffix'); + cmp_ok( join('', surround('->', '<-', 'a','b','c') ), 'eq', '->a<-->b<-->c<-', 'surround'); + + + set_lookup( $lookup1 ); + + cmp_ok( + join_with(" i ", + lookup( + regex( 's/^/00/', + rec2('902','z') + ) + ) + ), + 'eq', 'lookup 1 i lookup 2', 'join lookup regex rec2'); + + # check join_with operations + + sub test_join_with_2 { + my ($a,$b,$e) = @_; + + cmp_ok( + join_with(" <1> ", + rec('201',$a), + rec('201',$b), + ), + 'eq', $e, "join_with $a <1> $b = $e"); + } + + test_join_with_2('_','_',''); + test_join_with_2('x','_','201x'); + test_join_with_2('_','x','201x'); + test_join_with_2('x','y','201x <1> 201y'); + + sub test_join_with_3 { + my ($a,$b,$c,$e) = @_; + + cmp_ok( + join_with(" <1> ", rec('201',$a), + join_with(" <2> ", rec('201',$b), + rec('201',$c), + ) + ), + 'eq', $e, "join_with $a <1> $b <2> $c = $e"); + }; + + test_join_with_3('_','_','_',''); + test_join_with_3('x','_','_','201x'); + test_join_with_3('_','x','_','201x'); + test_join_with_3('_','_','x','201x'); + test_join_with_3('x','y','_','201x <1> 201y'); + test_join_with_3('x','_','y','201x <1> 201y'); + test_join_with_3('_','x','y','201x <2> 201y'); + test_join_with_3('x','_','y','201x <1> 201y'); + test_join_with_3('x','y','x','201x <1> 201y <2> 201x'); + + # test lookups + + set_lookup( $lookup2 ); + + is_deeply( \[ lookup( prefix( '00', rec('902') ) ) ], \[ 'lookup' ], 'lookup prefix' ); + + ok(! lookup('non-existent'), 'lookup non-existant' ); + + set_rec( $rec2 ); + + test_s(qq{ + tag('Title', + rec('200','a') + ); + }); + test_s(qq{ + tag('Who', + join_with(" ", + rec('702','a'), + rec('702','b') + ) + ); + }); + + test_s(qq{ + display('Publisher', + rec('210','c') + ) + }); + + test_s(qq{ + search('Year', + regex( 's/[^\\d]+//', + rec('210','d') + ) + ) + }); + + ok(my $ds = get_ds(), "get_ds"); + diag "ds = ", Dumper($ds) if ($debug); + + + sub test_check_ds { + + my $t = shift; + + ok($ds = get_ds(), 'get_ds'); + diag Dumper( $ds ) if ($debug); + + ok( $ds && $ds->{something}, 'get_ds->something exists' ); + ok( $ds && $ds->{something}->{$t}, 'get_ds->something->'.$t.' exists') if ($t); + ok( $ds && !$ds->{empty}, 'get_ds->empty doesn\'t' ); + + return $ds; + } + + clean_ds(); + test_s(qq{ search('something', '42'); }); + test_s(qq{ search('empty', ''); }); + test_check_ds('search'); + + clean_ds(); + test_s(qq{ display('something', '42'); }); + test_s(qq{ display('empty', ''); }); + test_check_ds('display'); + + clean_ds(); + test_s(qq{ tag('something', '42'); }); + test_s(qq{ tag('empty', ''); }); + test_check_ds('search'); + test_check_ds('display'); + + clean_ds(); + my $n = read_file( "$abs_path/data/normalize.pl" ); + $n .= "\n1;\n"; + #diag "normalize code:\n$n\n"; + test_s( $n ); + + ok($ds = get_ds(), "get_ds"); + diag "ds = ", Dumper($ds) if ($debug); + + my $rec = { + '200' => [{ + 'a' => '200a', + 'b' => '200b', + }], + }; + my $rules = qq{ search('mixed', rec('200') ) }; + + clean_ds(); + set_rec( $rec ); + test_s( $rules ); + ok($ds = get_ds(), "get_ds"); + is_deeply( $ds, { + 'mixed' => { + 'search' => [ '200a', '200b' ], + 'tag' => 'mixed' + } + }, 'correct get_ds'); + + ok(my $ds2 = WebPAC::Normalize::data_structure( + row => $rec, + rules => $rules, + ), 'data_structure'); + is_deeply( $ds, $ds2, 'data_structure(s) same'); + + # wird and non-valid structure which is supported anyway + clean_ds(); + set_rec({ + '200' => [{ + 'a' => '200a', + }, + '200-solo' + ] + }); + test_s(qq{ search('mixed', rec('200') ) }); + ok($ds = get_ds(), "get_ds"); + is_deeply( $ds, { + 'mixed' => { + 'search' => [ '200a', '200-solo' ], + 'tag' => 'mixed' + } + }, 'correct get_ds'); + +} + diff --git a/t/6-unit.t b/t/6-unit.t index a17aa4a..5658ce6 100755 --- a/t/6-unit.t +++ b/t/6-unit.t @@ -2,7 +2,7 @@ use strict; -use Test::More tests => 41; +use Test::More tests => 24; use Test::Exception; use Cwd qw/abs_path/; use File::Temp qw/tempdir/; @@ -17,8 +17,8 @@ BEGIN { use_ok( 'WebPAC::Lookup' ); use_ok( 'WebPAC::Input' ); use_ok( 'WebPAC::Store' ); -use_ok( 'WebPAC::Normalize::XML' ); -use_ok( 'WebPAC::Normalize::Set' ); +use_ok( 'WebPAC::Lookup::Normalize' ); +use_ok( 'WebPAC::Normalize' ); use_ok( 'WebPAC::Output::TT' ); } @@ -28,15 +28,13 @@ diag "abs_path: $abs_path" if ($debug); my $isis_file = "$abs_path../t/winisis/BIBL"; #$isis_file = '/data/hidra/THS/THS'; -$isis_file = '/data/isis_data/ffkk/'; +#$isis_file = '/data/isis_data/ffkk/'; diag "isis_file: $isis_file" if ($debug); my $normalize_set_pl = "$abs_path/data/normalize.pl"; my $lookup_file = "$abs_path../conf/lookup/isis.pm"; -my ($t1,$t2) = (0,0); - ok(my $lookup = new WebPAC::Lookup( lookup_file => $lookup_file, ), "new Lookup"); @@ -61,19 +59,6 @@ ok(my $db = new WebPAC::Store( database => '.', ), "new Store"); -ok(my $n = new WebPAC::Normalize::XML( -# filter => { 'foo' => sub { shift } }, - db => $db, - lookup_regex => $lookup->regex, - lookup => $lookup, - no_progress_bar => 1, -), "new Normalize::XML"); - -ok($n->open( - tag => 'isis', - xml_file => "$abs_path/data/normalize.xml", -), "Normalize::XML->open"); - ok(my $norm_pl = read_file( $normalize_set_pl ), "set definitions: $normalize_set_pl" ); ok(my $out = new WebPAC::Output::TT( @@ -83,6 +68,8 @@ ok(my $out = new WebPAC::Output::TT( diag " lookup => ",Dumper($lookup->lookup_hash) if ($debug); +my $t_norm = 0; + foreach my $pos ( 0 ... $isis->size ) { my $row = $isis->fetch || next; @@ -90,21 +77,14 @@ foreach my $pos ( 0 ... $isis->size ) { diag " row $pos => ",Dumper($row) if ($debug); my $t = time(); - ok(my $ds = $n->data_structure($row), "XML data_structure"); - $t1 += time() - $t; - - diag " ds $pos => ",Dumper($ds) if ($debug); - - $t = time(); - ok( my $ds2 = WebPAC::Normalize::Set::data_structure( + ok( my $ds = WebPAC::Normalize::data_structure( lookup => $lookup->lookup_hash, row => $row, rules => $norm_pl, ), "Set data_structure"); - $t2 += time() - $t; + $t_norm += time() - $t; - diag " ds2 $pos => ",Dumper($ds2) if ($debug); - is_deeply( $ds, $ds2, 'ds same for xml and sets'); + diag " ds $pos => ",Dumper($ds) if ($debug); ok(my $html = $out->apply( template => 'html.tt', @@ -117,4 +97,4 @@ foreach my $pos ( 0 ... $isis->size ) { }; -diag sprintf("timings: %.2fs vs %.2fs [%1.2f%%]\n", $t1, $t2, ($t1 / $t2) * 100); +diag sprintf("timings: %.2fs\n", $t_norm);