1 package WebPAC::Normalize;
5 use base 'WebPAC::Common';
10 WebPAC::Normalize - data mungling for normalisation
18 our $VERSION = '0.01';
22 This package contains code that mungle data to produce normalized format.
24 It contains several assumptions:
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
36 source data records (C<$rec>) have unique identifiers in field C<000>
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
56 at end, optional C<format>s rules are resolved. Format rules are similar to
57 C<sprintf> and can also contain C<lookup{...}> which is performed after
58 values are inserted in format.
62 This also describes order in which transformations are applied (eval,
63 filter, lookup, format) which is important to undestand when deciding how to
64 solve your data mungling and normalisation process.
73 Create new normalisation object
75 my $n = new WebPAC::Normalize::Something(
77 'filter_name_1' => sub {
83 lookup_regex => $lookup->regex,
84 lookup => $lookup_obj,
87 Parametar C<filter> defines user supplied snippets of perl code which can
88 be use with C<filter{...}> notation.
90 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91 in structures. If you pass this parametar, you must also pass C<lookup>
92 which is C<WebPAC::Lookup> object.
101 my $r = $self->{'lookup_regex'} ? 1 : 0;
102 my $l = $self->{'lookup'} ? 1 : 0;
104 my $log = $self->_get_logger();
106 # those two must be in pair
107 if ( ($r & $l) != ($r || $l) ) {
108 my $log = $self->_get_logger();
109 $log->logdie("lookup_regex and lookup must be in pair");
112 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
114 $self ? return $self : return undef;
118 =head2 data_structure
120 Create in-memory data structure which represents normalized layout from
121 C<conf/normalize/*.xml>.
123 This structures are used to produce output.
125 my @ds = $webpac->data_structure($rec);
127 B<Note: historical oddity follows>
129 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131 C<< <headline> >> tag.
138 my $log = $self->_get_logger();
141 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
146 my @ds = $self->{'db'}->load_ds($rec);
147 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
148 return @ds if ($#ds > 0);
149 $log->debug("cache miss, creating");
152 undef $self->{'currnet_filename'};
153 undef $self->{'headline'};
156 if ($self->{tags_by_order}) {
157 @sorted_tags = @{$self->{tags_by_order}};
159 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160 $self->{tags_by_order} = \@sorted_tags;
165 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
167 foreach my $field (@sorted_tags) {
171 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
173 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
176 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177 $format = $tag->{'value'} || $tag->{'content'};
179 $log->debug("format: $format");
182 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183 @v = $self->fill_in_to_arr($rec,$format);
185 @v = $self->parse_to_arr($rec,$format);
189 if ($tag->{'sort'}) {
190 @v = $self->sort_arr(@v);
194 if ($tag->{'format_name'}) {
195 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
198 if ($field eq 'filename') {
199 $self->{'current_filename'} = join('',@v);
200 $log->debug("filename: ",$self->{'current_filename'});
201 } elsif ($field eq 'headline') {
202 $self->{'headline'} .= join('',@v);
203 $log->debug("headline: ",$self->{'headline'});
204 next; # don't return headline in data_structure!
207 # delimiter will join repeatable fields
208 if ($tag->{'delimiter'}) {
209 @v = ( join($tag->{'delimiter'}, @v) );
213 my @types = qw(display swish);
214 # override by type attribute
215 @types = ( $tag->{'type'} ) if ($tag->{'type'});
217 foreach my $type (@types) {
218 # append to previous line?
219 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
220 if ($tag->{'append'}) {
222 # I will delimit appended part with
224 my $d = $tag->{'delimiter'};
228 my $last = pop @{$row->{$type}};
229 $d = "" if (! $last);
230 $last .= $d . join($d, @v);
231 push @{$row->{$type}}, $last;
234 push @{$row->{$type}}, @v;
242 $row->{'tag'} = $field;
244 # TODO: name_sigular, name_plural
245 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
246 $row->{'name'} = $name ? $self->_x($name) : $field;
248 # post-sort all values in field
249 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
250 $log->warn("sort at field tag not implemented");
255 $log->debug("row $field: ",sub { Dumper($row) });
260 $log->logdie("there is no current_filename defined! Do you have filename tag in conf/normalize/?.xml") unless ($self->{'current_filename'});
262 $self->{'db'}->save_ds(
264 current_filename => $self->{'current_filename'},
265 headline => $self->{'headline'},
266 ) if ($self->{'db'});
268 $log->debug("ds: ", sub { Dumper(@ds) });
276 Perform smart parsing of string, skipping delimiters for fields which aren't
277 defined. It can also eval code in format starting with C<eval{...}> and
278 return output or nothing depending on eval code.
280 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
287 my ($rec, $format_utf8, $i) = @_;
289 return if (! $format_utf8);
291 my $log = $self->_get_logger();
293 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
297 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
301 $log->debug("format: $format");
304 # remove eval{...} from beginning
305 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
308 # remove filter{...} from beginning
309 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
314 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
317 $prefix ||= $del if ($all_found == 0);
321 $r = 0 if (lc("$2") eq 's');
324 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
329 $all_found += $found;
333 return if (! $all_found);
335 my $out = join('',@out);
338 # add rest of format (suffix)
341 # add prefix if not there
342 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
344 $log->debug("result: $out");
348 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
349 $log->debug("about to eval{$eval} format: $out");
350 return if (! $self->_eval($eval));
353 if ($filter_name && $self->{'filter'}->{$filter_name}) {
354 $log->debug("about to filter{$filter_name} format: $out");
355 $out = $self->{'filter'}->{$filter_name}->($out);
356 return unless(defined($out));
357 $log->debug("filter result: $out");
365 Similar to C<parse>, but returns array of all repeatable fields
367 my @arr = $webpac->parse_to_arr($rec,'v250^a');
374 my ($rec, $format_utf8) = @_;
376 my $log = $self->_get_logger();
378 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
379 return if (! $format_utf8);
384 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
388 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
396 Workhourse of all: takes record from in-memory structure of database and
397 strings with placeholders and returns string or array of with substituted
400 my $text = $webpac->fill_in($rec,'v250^a');
402 Optional argument is ordinal number for repeatable fields. By default,
403 it's assume to be first repeatable field (fields are perl array, so first
405 Following example will read second value from repeatable field.
407 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
409 This function B<does not> perform parsing of format to inteligenty skip
410 delimiters before fields which aren't used.
412 This method will automatically decode UTF-8 string to local code page
420 my $log = $self->_get_logger();
422 my $rec = shift || $log->logconfess("need data record");
423 my $format = shift || $log->logconfess("need format to parse");
424 # iteration (for repeatable fields)
427 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
429 # FIXME remove for speedup?
430 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
432 if (utf8::is_utf8($format)) {
433 $format = $self->_x($format);
439 # remove eval{...} from beginning
440 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
443 # remove filter{...} from beginning
444 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
446 # do actual replacement of placeholders
448 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
449 # non-repeatable fields
450 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
453 $log->debug("format: $format");
455 my $eval = $self->fill_in($rec,$eval_code,$i);
456 return if (! $self->_eval($eval));
458 if ($filter_name && $self->{'filter'}->{$filter_name}) {
459 $log->debug("filter '$filter_name' for $format");
460 $format = $self->{'filter'}->{$filter_name}->($format);
461 return unless(defined($format));
462 $log->debug("filter result: $format");
464 # do we have lookups?
465 if ($self->{'lookup'}) {
466 if ($self->{'lookup'}->can('lookup')) {
467 return $self->{'lookup'}->lookup($format);
469 $log->warn("Have lookup object but can't invoke lookup method");
480 =head2 fill_in_to_arr
482 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
483 for fields which have lookups, so they shouldn't be parsed but rather
486 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
493 my ($rec, $format_utf8) = @_;
495 my $log = $self->_get_logger();
497 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
498 return if (! $format_utf8);
503 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
507 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
515 Returns value from record.
517 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
520 record reference C<$rec>,
522 optional subfiled C<$sf>,
523 index for repeatable values C<$i>.
525 Optinal variable C<$found> will be incremeted if there
528 Returns value or empty string.
535 my ($rec,$f,$sf,$i,$found) = @_;
538 return '' if (! $$rec->{$f}->[$i]);
540 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
541 $$found++ if (defined($$found));
542 return $$rec->{$f}->[$i]->{$sf};
543 } elsif (! $sf && $$rec->{$f}->[$i]) {
544 $$found++ if (defined($$found));
545 # it still might have subfield, just
546 # not specified, so we'll dump all
547 if ($$rec->{$f}->[$i] =~ /HASH/o) {
549 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
550 $out .= $$rec->{$f}->[$i]->{$k}." ";
554 return $$rec->{$f}->[$i];
567 Apply format specified in tag with C<format_name="name"> and
568 C<format_delimiter=";;">.
570 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
572 Formats can contain C<lookup{...}> if you need them.
579 my ($name,$delimiter,$data) = @_;
581 my $log = $self->_get_logger();
583 if (! $self->{'import_xml'}->{'format'}->{$name}) {
584 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
588 $log->warn("no delimiter for format $name") if (! $delimiter);
590 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
592 my @data = split(/\Q$delimiter\E/, $data);
594 my $out = sprintf($format, @data);
595 $log->debug("using format $name [$format] on $data to produce: $out");
597 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
598 return $self->{'lookup'}->lookup($out);
607 Sort array ignoring case and html in data
609 my @sorted = $webpac->sort_arr(@unsorted);
616 my $log = $self->_get_logger();
618 # FIXME add Schwartzian Transformation?
625 $log->debug("sorted values: ",sub { join(", ",@sorted) });
631 =head1 INTERNAL METHODS
633 =head2 _sort_by_order
635 Sort xml tags data structure accoding to C<order=""> attribute.
642 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
643 $self->{'import_xml'}->{'indexer'}->{$a};
644 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
645 $self->{'import_xml'}->{'indexer'}->{$b};
652 Convert strings from C<conf/normalize/*.xml> encoding into application
653 specific encoding (optinally specified using C<code_page> to C<new>
656 my $text = $n->_x('normalize text string');
658 This is a stub so that other modules doesn't have to implement it.
670 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
672 =head1 COPYRIGHT & LICENSE
674 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
676 This program is free software; you can redistribute it and/or modify it
677 under the same terms as Perl itself.
681 1; # End of WebPAC::DB