1 package WebPAC::Normalize;
5 use base 'WebPAC::Common';
10 WebPAC::Normalize - data mungling for normalisation
18 our $VERSION = '0.01';
22 This package contains code that mungle data to produce normalized format.
24 It contains several assumptions:
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
36 source data records (C<$rec>) have unique identifiers in field C<000>
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
56 at end, optional C<format>s rules are resolved. Format rules are similar to
57 C<sprintf> and can also contain C<lookup{...}> which is performed after
58 values are inserted in format.
62 This also describes order in which transformations are applied (eval,
63 filter, lookup, format) which is important to undestand when deciding how to
64 solve your data mungling and normalisation process.
73 Create new normalisation object
75 my $n = new WebPAC::Normalize::Something(
77 'filter_name_1' => sub {
83 lookup_regex => $lookup->regex,
84 lookup => $lookup_obj,
87 Parametar C<filter> defines user supplied snippets of perl code which can
88 be use with C<filter{...}> notation.
90 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91 in structures. If you pass this parametar, you must also pass C<lookup>
92 which is C<WebPAC::Lookup> object.
101 my $r = $self->{'lookup_regex'} ? 1 : 0;
102 my $l = $self->{'lookup'} ? 1 : 0;
104 my $log = $self->_get_logger();
106 # those two must be in pair
107 if ( ($r & $l) != ($r || $l) ) {
108 my $log = $self->_get_logger();
109 $log->logdie("lookup_regex and lookup must be in pair");
112 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
114 $self ? return $self : return undef;
118 =head2 data_structure
120 Create in-memory data structure which represents normalized layout from
121 C<conf/normalize/*.xml>.
123 This structures are used to produce output.
125 my @ds = $webpac->data_structure($rec);
127 B<Note: historical oddity follows>
129 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131 C<< <headline> >> tag.
138 my $log = $self->_get_logger();
141 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
146 my @ds = $self->{'db'}->load_ds($rec);
147 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
148 return @ds if ($#ds > 0);
149 $log->debug("cache miss, creating");
152 undef $self->{'currnet_filename'};
153 undef $self->{'headline'};
156 if ($self->{tags_by_order}) {
157 @sorted_tags = @{$self->{tags_by_order}};
159 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160 $self->{tags_by_order} = \@sorted_tags;
165 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
167 foreach my $field (@sorted_tags) {
171 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
173 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
176 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177 $format = $tag->{'value'} || $tag->{'content'};
179 $log->debug("format: $format");
182 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183 @v = $self->fill_in_to_arr($rec,$format);
185 @v = $self->parse_to_arr($rec,$format);
189 if ($tag->{'sort'}) {
190 @v = $self->sort_arr(@v);
194 if ($tag->{'format_name'}) {
195 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
198 if ($field eq 'filename') {
199 $self->{'current_filename'} = join('',@v);
200 $log->debug("filename: ",$self->{'current_filename'});
201 } elsif ($field eq 'headline') {
202 $self->{'headline'} .= join('',@v);
203 $log->debug("headline: ",$self->{'headline'});
204 next; # don't return headline in data_structure!
207 # delimiter will join repeatable fields
208 if ($tag->{'delimiter'}) {
209 @v = ( join($tag->{'delimiter'}, @v) );
213 my @types = qw(display swish);
214 # override by type attribute
215 @types = ( $tag->{'type'} ) if ($tag->{'type'});
217 foreach my $type (@types) {
218 # append to previous line?
219 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
220 if ($tag->{'append'}) {
222 # I will delimit appended part with
224 my $d = $tag->{'delimiter'};
228 my $last = pop @{$row->{$type}};
229 $d = "" if (! $last);
230 $last .= $d . join($d, @v);
231 push @{$row->{$type}}, $last;
234 push @{$row->{$type}}, @v;
242 $row->{'tag'} = $field;
244 # TODO: name_sigular, name_plural
245 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
246 $row->{'name'} = $name ? $self->_x($name) : $field;
248 # post-sort all values in field
249 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
250 $log->warn("sort at field tag not implemented");
255 $log->debug("row $field: ",sub { Dumper($row) });
260 $self->{'db'}->save_ds(
262 current_filename => $self->{'current_filename'},
263 headline => $self->{'headline'},
264 ) if ($self->{'db'});
266 $log->debug("ds: ", sub { Dumper(@ds) });
274 Perform smart parsing of string, skipping delimiters for fields which aren't
275 defined. It can also eval code in format starting with C<eval{...}> and
276 return output or nothing depending on eval code.
278 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
285 my ($rec, $format_utf8, $i) = @_;
287 return if (! $format_utf8);
289 my $log = $self->_get_logger();
291 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
295 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
299 $log->debug("format: $format");
302 # remove eval{...} from beginning
303 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
306 # remove filter{...} from beginning
307 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
312 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
315 $prefix ||= $del if ($all_found == 0);
319 $r = 0 if (lc("$2") eq 's');
322 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
327 $all_found += $found;
331 return if (! $all_found);
333 my $out = join('',@out);
336 # add rest of format (suffix)
339 # add prefix if not there
340 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
342 $log->debug("result: $out");
346 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
347 $log->debug("about to eval{$eval} format: $out");
348 return if (! $self->_eval($eval));
351 if ($filter_name && $self->{'filter'}->{$filter_name}) {
352 $log->debug("about to filter{$filter_name} format: $out");
353 $out = $self->{'filter'}->{$filter_name}->($out);
354 return unless(defined($out));
355 $log->debug("filter result: $out");
363 Similar to C<parse>, but returns array of all repeatable fields
365 my @arr = $webpac->parse_to_arr($rec,'v250^a');
372 my ($rec, $format_utf8) = @_;
374 my $log = $self->_get_logger();
376 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
377 return if (! $format_utf8);
382 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
386 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
394 Workhourse of all: takes record from in-memory structure of database and
395 strings with placeholders and returns string or array of with substituted
398 my $text = $webpac->fill_in($rec,'v250^a');
400 Optional argument is ordinal number for repeatable fields. By default,
401 it's assume to be first repeatable field (fields are perl array, so first
403 Following example will read second value from repeatable field.
405 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
407 This function B<does not> perform parsing of format to inteligenty skip
408 delimiters before fields which aren't used.
410 This method will automatically decode UTF-8 string to local code page
418 my $log = $self->_get_logger();
420 my $rec = shift || $log->logconfess("need data record");
421 my $format = shift || $log->logconfess("need format to parse");
422 # iteration (for repeatable fields)
425 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
427 # FIXME remove for speedup?
428 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
430 if (utf8::is_utf8($format)) {
431 $format = $self->_x($format);
437 # remove eval{...} from beginning
438 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
441 # remove filter{...} from beginning
442 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
444 # do actual replacement of placeholders
446 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
447 # non-repeatable fields
448 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
451 $log->debug("format: $format");
453 my $eval = $self->fill_in($rec,$eval_code,$i);
454 return if (! $self->_eval($eval));
456 if ($filter_name && $self->{'filter'}->{$filter_name}) {
457 $log->debug("filter '$filter_name' for $format");
458 $format = $self->{'filter'}->{$filter_name}->($format);
459 return unless(defined($format));
460 $log->debug("filter result: $format");
462 # do we have lookups?
463 if ($self->{'lookup'}) {
464 if ($self->{'lookup'}->can('lookup')) {
465 return $self->{'lookup'}->lookup($format);
467 $log->warn("Have lookup object but can't invoke lookup method");
478 =head2 fill_in_to_arr
480 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
481 for fields which have lookups, so they shouldn't be parsed but rather
484 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
491 my ($rec, $format_utf8) = @_;
493 my $log = $self->_get_logger();
495 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
496 return if (! $format_utf8);
501 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
505 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
513 Returns value from record.
515 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
518 record reference C<$rec>,
520 optional subfiled C<$sf>,
521 index for repeatable values C<$i>.
523 Optinal variable C<$found> will be incremeted if there
526 Returns value or empty string.
533 my ($rec,$f,$sf,$i,$found) = @_;
536 return '' if (! $$rec->{$f}->[$i]);
538 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
539 $$found++ if (defined($$found));
540 return $$rec->{$f}->[$i]->{$sf};
541 } elsif ($$rec->{$f}->[$i]) {
542 $$found++ if (defined($$found));
543 # it still might have subfield, just
544 # not specified, so we'll dump all
545 if ($$rec->{$f}->[$i] =~ /HASH/o) {
547 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
548 $out .= $$rec->{$f}->[$i]->{$k}." ";
552 return $$rec->{$f}->[$i];
563 Apply format specified in tag with C<format_name="name"> and
564 C<format_delimiter=";;">.
566 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
568 Formats can contain C<lookup{...}> if you need them.
575 my ($name,$delimiter,$data) = @_;
577 my $log = $self->_get_logger();
579 if (! $self->{'import_xml'}->{'format'}->{$name}) {
580 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
584 $log->warn("no delimiter for format $name") if (! $delimiter);
586 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
588 my @data = split(/\Q$delimiter\E/, $data);
590 my $out = sprintf($format, @data);
591 $log->debug("using format $name [$format] on $data to produce: $out");
593 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
594 return $self->{'lookup'}->lookup($out);
603 Sort array ignoring case and html in data
605 my @sorted = $webpac->sort_arr(@unsorted);
612 my $log = $self->_get_logger();
614 # FIXME add Schwartzian Transformation?
621 $log->debug("sorted values: ",sub { join(", ",@sorted) });
627 =head1 INTERNAL METHODS
629 =head2 _sort_by_order
631 Sort xml tags data structure accoding to C<order=""> attribute.
638 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
639 $self->{'import_xml'}->{'indexer'}->{$a};
640 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
641 $self->{'import_xml'}->{'indexer'}->{$b};
648 Convert strings from C<conf/normalize/*.xml> encoding into application
649 specific encoding (optinally specified using C<code_page> to C<new>
652 my $text = $n->_x('normalize text string');
654 This is a stub so that other modules doesn't have to implement it.
666 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
668 =head1 COPYRIGHT & LICENSE
670 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
672 This program is free software; you can redistribute it and/or modify it
673 under the same terms as Perl itself.
677 1; # End of WebPAC::DB