1 package WebPAC::Normalize;
5 use base 'WebPAC::Common';
10 WebPAC::Normalize - data mungling for normalisation
18 our $VERSION = '0.06';
22 This package contains code that mungle data to produce normalized format.
24 It contains several assumptions:
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
36 source data records (C<$rec>) have unique identifiers in field C<000>
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
50 There is one built-in filter called C<regex> which can be use like this:
52 filter{regex(s/foo/bar/)}
56 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
60 at end, optional C<format>s rules are resolved. Format rules are similar to
61 C<sprintf> and can also contain C<lookup{...}> which is performed after
62 values are inserted in format.
66 This also describes order in which transformations are applied (eval,
67 filter, lookup, format) which is important to undestand when deciding how to
68 solve your data mungling and normalisation process.
77 Create new normalisation object
79 my $n = new WebPAC::Normalize::Something(
81 'filter_name_1' => sub {
87 lookup_regex => $lookup->regex,
88 lookup => $lookup_obj,
92 Parametar C<filter> defines user supplied snippets of perl code which can
93 be use with C<filter{...}> notation.
95 C<prefix> is used to form filename for database record (to support multiple
96 source files which are joined in one database).
98 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99 in structures. If you pass this parametar, you must also pass C<lookup>
100 which is C<WebPAC::Lookup> object.
107 bless($self, $class);
109 my $r = $self->{'lookup_regex'} ? 1 : 0;
110 my $l = $self->{'lookup'} ? 1 : 0;
112 my $log = $self->_get_logger();
114 # those two must be in pair
115 if ( ($r & $l) != ($r || $l) ) {
116 my $log = $self->_get_logger();
117 $log->logdie("lookup_regex and lookup must be in pair");
120 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
122 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
124 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
126 if ($self->{filter} && ! $self->{filter}->{regex}) {
127 $log->debug("adding built-in filter regex");
128 $self->{filter}->{regex} = sub {
129 my ($val, $regex) = @_;
130 eval "\$val =~ $regex";
135 $self ? return $self : return undef;
139 =head2 data_structure
141 Create in-memory data structure which represents normalized layout from
142 C<conf/normalize/*.xml>.
144 This structures are used to produce output.
146 my $ds = $webpac->data_structure($rec);
153 my $log = $self->_get_logger();
156 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
158 $log->debug("data_structure rec = ", sub { Dumper($rec) });
160 $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
162 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
167 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
170 $log->debug("cache miss, creating");
173 undef $self->{'currnet_filename'};
174 undef $self->{'headline'};
177 if ($self->{tags_by_order}) {
178 @sorted_tags = @{$self->{tags_by_order}};
180 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
181 $self->{tags_by_order} = \@sorted_tags;
186 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
188 foreach my $field (@sorted_tags) {
192 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
194 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
197 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
198 $format = $tag->{'value'} || $tag->{'content'};
200 $log->debug("format: $format");
203 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
204 @v = $self->fill_in_to_arr($rec,$format);
206 @v = $self->parse_to_arr($rec,$format);
210 if ($tag->{'sort'}) {
211 @v = $self->sort_arr(@v);
215 if ($tag->{'format_name'}) {
216 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
219 # delimiter will join repeatable fields
220 if ($tag->{'delimiter'}) {
221 @v = ( join($tag->{'delimiter'}, @v) );
225 my @types = qw(display search);
226 # override by type attribute
227 @types = ( $tag->{'type'} ) if ($tag->{'type'});
229 foreach my $type (@types) {
230 # append to previous line?
231 $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
232 if ($tag->{'append'}) {
234 # I will delimit appended part with
236 my $d = $tag->{'delimiter'};
240 my $last = pop @{$row->{$type}};
241 $d = "" if (! $last);
242 $last .= $d . join($d, @v);
243 push @{$row->{$type}}, $last;
246 push @{$row->{$type}}, @v;
254 $row->{'tag'} = $field;
256 # TODO: name_sigular, name_plural
257 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
258 my $row_name = $name ? $self->_x($name) : $field;
260 # post-sort all values in field
261 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
262 $log->warn("sort at field tag not implemented");
265 $ds->{$row_name} = $row;
267 $log->debug("row $field: ",sub { Dumper($row) });
272 $self->{'db'}->save_ds(
275 prefix => $self->{prefix},
276 ) if ($self->{'db'});
278 $log->debug("ds: ", sub { Dumper($ds) });
280 $log->logconfess("data structure returned is not array any more!") if wantarray;
288 Perform smart parsing of string, skipping delimiters for fields which aren't
289 defined. It can also eval code in format starting with C<eval{...}> and
290 return output or nothing depending on eval code.
292 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
294 Filters are implemented here. While simple form of filters looks like this:
296 filter{name_of_filter}
298 but, filters can also have variable number of parametars like this:
300 filter{name_of_filter(param,param,param)}
307 my ($rec, $format_utf8, $i) = @_;
309 return if (! $format_utf8);
311 my $log = $self->_get_logger();
313 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
317 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
321 $log->debug("format: $format");
324 # remove eval{...} from beginning
325 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
328 # remove filter{...} from beginning
329 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
334 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
337 $prefix ||= $del if ($all_found == 0);
341 $r = 0 if (lc("$2") eq 's');
344 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
349 $all_found += $found;
353 return if (! $all_found);
355 my $out = join('',@out);
358 # add rest of format (suffix)
361 # add prefix if not there
362 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
364 $log->debug("result: $out");
368 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
369 $log->debug("about to eval{$eval} format: $out");
370 return if (! $self->_eval($eval));
375 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
376 @filter_args = split(/,/, $2);
378 if ($self->{'filter'}->{$filter_name}) {
379 $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
380 unshift @filter_args, $out;
381 $out = $self->{'filter'}->{$filter_name}->(@filter_args);
382 return unless(defined($out));
383 $log->debug("filter result: $out");
385 $log->warn("trying to use undefined filter $filter_name");
394 Similar to C<parse>, but returns array of all repeatable fields
396 my @arr = $webpac->parse_to_arr($rec,'v250^a');
403 my ($rec, $format_utf8) = @_;
405 my $log = $self->_get_logger();
407 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
408 return if (! $format_utf8);
413 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
417 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
425 Workhourse of all: takes record from in-memory structure of database and
426 strings with placeholders and returns string or array of with substituted
429 my $text = $webpac->fill_in($rec,'v250^a');
431 Optional argument is ordinal number for repeatable fields. By default,
432 it's assume to be first repeatable field (fields are perl array, so first
434 Following example will read second value from repeatable field.
436 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
438 This function B<does not> perform parsing of format to inteligenty skip
439 delimiters before fields which aren't used.
441 This method will automatically decode UTF-8 string to local code page
449 my $log = $self->_get_logger();
451 my $rec = shift || $log->logconfess("need data record");
452 my $format = shift || $log->logconfess("need format to parse");
453 # iteration (for repeatable fields)
456 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
458 # FIXME remove for speedup?
459 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
461 if (utf8::is_utf8($format)) {
462 $format = $self->_x($format);
468 # remove eval{...} from beginning
469 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
472 # remove filter{...} from beginning
473 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
475 # do actual replacement of placeholders
477 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
478 # non-repeatable fields
479 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
482 $log->debug("format: $format");
484 my $eval = $self->fill_in($rec,$eval_code,$i);
485 return if (! $self->_eval($eval));
487 if ($filter_name && $self->{'filter'}->{$filter_name}) {
488 $log->debug("filter '$filter_name' for $format");
489 $format = $self->{'filter'}->{$filter_name}->($format);
490 return unless(defined($format));
491 $log->debug("filter result: $format");
493 # do we have lookups?
494 if ($self->{'lookup'}) {
495 if ($self->{'lookup'}->can('lookup')) {
496 my @lookup = $self->{lookup}->lookup($format);
497 $log->debug("lookup $format", join(", ", @lookup));
500 $log->warn("Have lookup object but can't invoke lookup method");
511 =head2 fill_in_to_arr
513 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
514 for fields which have lookups, so they shouldn't be parsed but rather
517 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
524 my ($rec, $format_utf8) = @_;
526 my $log = $self->_get_logger();
528 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
529 return if (! $format_utf8);
534 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
538 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
546 Returns value from record.
548 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
551 record reference C<$rec>,
553 optional subfiled C<$sf>,
554 index for repeatable values C<$i>.
556 Optinal variable C<$found> will be incremeted if there
559 Returns value or empty string.
566 my ($rec,$f,$sf,$i,$found) = @_;
569 return '' if (! $$rec->{$f}->[$i]);
571 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
572 $$found++ if (defined($$found));
573 return $$rec->{$f}->[$i]->{$sf};
574 } elsif (! $sf && $$rec->{$f}->[$i]) {
575 $$found++ if (defined($$found));
576 # it still might have subfield, just
577 # not specified, so we'll dump all
578 if ($$rec->{$f}->[$i] =~ /HASH/o) {
580 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
581 $out .= $$rec->{$f}->[$i]->{$k}." ";
585 return $$rec->{$f}->[$i];
598 Apply format specified in tag with C<format_name="name"> and
599 C<format_delimiter=";;">.
601 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
603 Formats can contain C<lookup{...}> if you need them.
610 my ($name,$delimiter,$data) = @_;
612 my $log = $self->_get_logger();
614 if (! $self->{'import_xml'}->{'format'}->{$name}) {
615 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
619 $log->warn("no delimiter for format $name") if (! $delimiter);
621 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
623 my @data = split(/\Q$delimiter\E/, $data);
625 my $out = sprintf($format, @data);
626 $log->debug("using format $name [$format] on $data to produce: $out");
628 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
629 return $self->{'lookup'}->lookup($out);
638 Sort array ignoring case and html in data
640 my @sorted = $webpac->sort_arr(@unsorted);
647 my $log = $self->_get_logger();
649 # FIXME add Schwartzian Transformation?
656 $log->debug("sorted values: ",sub { join(", ",@sorted) });
662 =head1 INTERNAL METHODS
664 =head2 _sort_by_order
666 Sort xml tags data structure accoding to C<order=""> attribute.
673 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
674 $self->{'import_xml'}->{'indexer'}->{$a};
675 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
676 $self->{'import_xml'}->{'indexer'}->{$b};
683 Convert strings from C<conf/normalize/*.xml> encoding into application
684 specific encoding (optinally specified using C<code_page> to C<new>
687 my $text = $n->_x('normalize text string');
689 This is a stub so that other modules doesn't have to implement it.
701 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
703 =head1 COPYRIGHT & LICENSE
705 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
707 This program is free software; you can redistribute it and/or modify it
708 under the same terms as Perl itself.
712 1; # End of WebPAC::Normalize