1 package WebPAC::Normalize;
5 use base 'WebPAC::Common';
10 WebPAC::Normalize - data mungling for normalisation
18 our $VERSION = '0.06';
22 This package contains code that mungle data to produce normalized format.
24 It contains several assumptions:
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
36 source data records (C<$rec>) have unique identifiers in field C<000>
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
50 There is one built-in filter called C<regex> which can be use like this:
52 filter{regex(s/foo/bar/)}
56 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
60 at end, optional C<format>s rules are resolved. Format rules are similar to
61 C<sprintf> and can also contain C<lookup{...}> which is performed after
62 values are inserted in format.
66 This also describes order in which transformations are applied (eval,
67 filter, lookup, format) which is important to undestand when deciding how to
68 solve your data mungling and normalisation process.
77 Create new normalisation object
79 my $n = new WebPAC::Normalize::Something(
81 'filter_name_1' => sub {
87 lookup_regex => $lookup->regex,
88 lookup => $lookup_obj,
92 Parametar C<filter> defines user supplied snippets of perl code which can
93 be use with C<filter{...}> notation.
95 C<prefix> is used to form filename for database record (to support multiple
96 source files which are joined in one database).
98 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99 in structures. If you pass this parametar, you must also pass C<lookup>
100 which is C<WebPAC::Lookup> object.
107 bless($self, $class);
109 my $r = $self->{'lookup_regex'} ? 1 : 0;
110 my $l = $self->{'lookup'} ? 1 : 0;
112 my $log = $self->_get_logger();
114 # those two must be in pair
115 if ( ($r & $l) != ($r || $l) ) {
116 my $log = $self->_get_logger();
117 $log->logdie("lookup_regex and lookup must be in pair");
120 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
122 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
124 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
126 if ($self->{filter} && ! $self->{filter}->{regex}) {
127 $log->debug("adding built-in filter regex");
128 $self->{filter}->{regex} = sub {
129 my ($val, $regex) = @_;
130 eval "\$val =~ $regex";
135 $self ? return $self : return undef;
139 =head2 data_structure
141 Create in-memory data structure which represents normalized layout from
142 C<conf/normalize/*.xml>.
144 This structures are used to produce output.
146 my $ds = $webpac->data_structure($rec);
153 my $log = $self->_get_logger();
156 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
158 $log->debug("data_structure rec = ", sub { Dumper($rec) });
160 $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
162 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
167 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
170 $log->debug("cache miss, creating");
173 undef $self->{'currnet_filename'};
174 undef $self->{'headline'};
177 if ($self->{tags_by_order}) {
178 @sorted_tags = @{$self->{tags_by_order}};
180 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
181 $self->{tags_by_order} = \@sorted_tags;
186 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
188 foreach my $field (@sorted_tags) {
192 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
194 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
197 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
198 $format = $tag->{'value'} || $tag->{'content'};
200 $log->debug("format: $format");
203 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
204 @v = $self->fill_in_to_arr($rec,$format);
206 @v = $self->parse_to_arr($rec,$format);
210 if ($tag->{'sort'}) {
211 @v = $self->sort_arr(@v);
215 if ($tag->{'format_name'}) {
216 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
219 # delimiter will join repeatable fields
220 if ($tag->{'delimiter'}) {
221 @v = ( join($tag->{'delimiter'}, @v) );
225 my @types = qw(display search);
226 # override by type attribute
227 @types = ( $tag->{'type'} ) if ($tag->{'type'});
229 foreach my $type (@types) {
230 # append to previous line?
231 $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
232 if ($tag->{'append'}) {
234 # I will delimit appended part with
236 my $d = $tag->{'delimiter'};
240 my $last = pop @{$row->{$type}};
241 $d = "" if (! $last);
242 $last .= $d . join($d, @v);
243 push @{$row->{$type}}, $last;
246 push @{$row->{$type}}, @v;
254 $row->{'tag'} = $field;
256 # TODO: name_sigular, name_plural
257 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
258 my $row_name = $name ? $self->_x($name) : $field;
260 # post-sort all values in field
261 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
262 $log->warn("sort at field tag not implemented");
265 $ds->{$row_name} = $row;
267 $log->debug("row $field: ",sub { Dumper($row) });
272 $self->{'db'}->save_ds(
275 prefix => $self->{prefix},
276 ) if ($self->{'db'});
278 $log->debug("ds: ", sub { Dumper($ds) });
280 $log->logconfess("data structure returned is not array any more!") if wantarray;
288 Perform smart parsing of string, skipping delimiters for fields which aren't
289 defined. It can also eval code in format starting with C<eval{...}> and
290 return output or nothing depending on eval code.
292 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
294 Filters are implemented here. While simple form of filters looks like this:
296 filter{name_of_filter}
298 but, filters can also have variable number of parametars like this:
300 filter{name_of_filter(param,param,param)}
309 my ($rec, $format_utf8, $i) = @_;
311 return if (! $format_utf8);
313 my $log = $self->_get_logger();
315 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
319 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
323 $log->debug("format: $format");
326 # remove eval{...} from beginning
327 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
330 # remove filter{...} from beginning
331 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
336 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
339 $prefix ||= $del if ($all_found == 0);
343 $r = 0 if (lc("$2") eq 's');
346 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
351 $all_found += $found;
355 return if (! $all_found);
357 my $out = join('',@out);
360 # add rest of format (suffix)
363 # add prefix if not there
364 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
366 $log->debug("result: $out");
370 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
371 $log->debug("about to eval{$eval} format: $out");
372 return if (! $self->_eval($eval));
377 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
378 @filter_args = split(/,/, $2);
380 if ($self->{'filter'}->{$filter_name}) {
381 $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
382 unshift @filter_args, $out;
383 $out = $self->{'filter'}->{$filter_name}->(@filter_args);
384 return unless(defined($out));
385 $log->debug("filter result: $out");
386 } elsif (! $warn_once->{$filter_name}) {
387 $log->warn("trying to use undefined filter $filter_name");
388 $warn_once->{$filter_name}++;
397 Similar to C<parse>, but returns array of all repeatable fields
399 my @arr = $webpac->parse_to_arr($rec,'v250^a');
406 my ($rec, $format_utf8) = @_;
408 my $log = $self->_get_logger();
410 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
411 return if (! $format_utf8);
416 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
420 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
428 Workhourse of all: takes record from in-memory structure of database and
429 strings with placeholders and returns string or array of with substituted
432 my $text = $webpac->fill_in($rec,'v250^a');
434 Optional argument is ordinal number for repeatable fields. By default,
435 it's assume to be first repeatable field (fields are perl array, so first
437 Following example will read second value from repeatable field.
439 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
441 This function B<does not> perform parsing of format to inteligenty skip
442 delimiters before fields which aren't used.
444 This method will automatically decode UTF-8 string to local code page
452 my $log = $self->_get_logger();
454 my $rec = shift || $log->logconfess("need data record");
455 my $format = shift || $log->logconfess("need format to parse");
456 # iteration (for repeatable fields)
459 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
461 # FIXME remove for speedup?
462 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
464 if (utf8::is_utf8($format)) {
465 $format = $self->_x($format);
471 # remove eval{...} from beginning
472 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
475 # remove filter{...} from beginning
476 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
478 # do actual replacement of placeholders
480 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
481 # non-repeatable fields
482 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
485 $log->debug("format: $format");
487 my $eval = $self->fill_in($rec,$eval_code,$i);
488 return if (! $self->_eval($eval));
490 if ($filter_name && $self->{'filter'}->{$filter_name}) {
491 $log->debug("filter '$filter_name' for $format");
492 $format = $self->{'filter'}->{$filter_name}->($format);
493 return unless(defined($format));
494 $log->debug("filter result: $format");
496 # do we have lookups?
497 if ($self->{'lookup'}) {
498 if ($self->{'lookup'}->can('lookup')) {
499 my @lookup = $self->{lookup}->lookup($format);
500 $log->debug("lookup $format", join(", ", @lookup));
503 $log->warn("Have lookup object but can't invoke lookup method");
514 =head2 fill_in_to_arr
516 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
517 for fields which have lookups, so they shouldn't be parsed but rather
520 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
527 my ($rec, $format_utf8) = @_;
529 my $log = $self->_get_logger();
531 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
532 return if (! $format_utf8);
537 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
541 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
549 Returns value from record.
551 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
554 record reference C<$rec>,
556 optional subfiled C<$sf>,
557 index for repeatable values C<$i>.
559 Optinal variable C<$found> will be incremeted if there
562 Returns value or empty string.
569 my ($rec,$f,$sf,$i,$found) = @_;
572 return '' if (! $$rec->{$f}->[$i]);
574 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
575 $$found++ if (defined($$found));
576 return $$rec->{$f}->[$i]->{$sf};
577 } elsif (! $sf && $$rec->{$f}->[$i]) {
578 $$found++ if (defined($$found));
579 # it still might have subfield, just
580 # not specified, so we'll dump all
581 if ($$rec->{$f}->[$i] =~ /HASH/o) {
583 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
584 $out .= $$rec->{$f}->[$i]->{$k}." ";
588 return $$rec->{$f}->[$i];
601 Apply format specified in tag with C<format_name="name"> and
602 C<format_delimiter=";;">.
604 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
606 Formats can contain C<lookup{...}> if you need them.
613 my ($name,$delimiter,$data) = @_;
615 my $log = $self->_get_logger();
617 if (! $self->{'import_xml'}->{'format'}->{$name}) {
618 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
622 $log->warn("no delimiter for format $name") if (! $delimiter);
624 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
626 my @data = split(/\Q$delimiter\E/, $data);
628 my $out = sprintf($format, @data);
629 $log->debug("using format $name [$format] on $data to produce: $out");
631 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
632 return $self->{'lookup'}->lookup($out);
641 Sort array ignoring case and html in data
643 my @sorted = $webpac->sort_arr(@unsorted);
650 my $log = $self->_get_logger();
652 # FIXME add Schwartzian Transformation?
659 $log->debug("sorted values: ",sub { join(", ",@sorted) });
665 =head1 INTERNAL METHODS
667 =head2 _sort_by_order
669 Sort xml tags data structure accoding to C<order=""> attribute.
676 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
677 $self->{'import_xml'}->{'indexer'}->{$a};
678 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
679 $self->{'import_xml'}->{'indexer'}->{$b};
686 Convert strings from C<conf/normalize/*.xml> encoding into application
687 specific encoding (optinally specified using C<code_page> to C<new>
690 my $text = $n->_x('normalize text string');
692 This is a stub so that other modules doesn't have to implement it.
704 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
706 =head1 COPYRIGHT & LICENSE
708 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
710 This program is free software; you can redistribute it and/or modify it
711 under the same terms as Perl itself.
715 1; # End of WebPAC::Normalize