1 package WebPAC::Normalize;
5 use base 'WebPAC::Common';
10 WebPAC::Normalize - data mungling for normalisation
18 our $VERSION = '0.02';
22 This package contains code that mungle data to produce normalized format.
24 It contains several assumptions:
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
36 source data records (C<$rec>) have unique identifiers in field C<000>
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
56 at end, optional C<format>s rules are resolved. Format rules are similar to
57 C<sprintf> and can also contain C<lookup{...}> which is performed after
58 values are inserted in format.
62 This also describes order in which transformations are applied (eval,
63 filter, lookup, format) which is important to undestand when deciding how to
64 solve your data mungling and normalisation process.
73 Create new normalisation object
75 my $n = new WebPAC::Normalize::Something(
77 'filter_name_1' => sub {
83 lookup_regex => $lookup->regex,
84 lookup => $lookup_obj,
87 Parametar C<filter> defines user supplied snippets of perl code which can
88 be use with C<filter{...}> notation.
90 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91 in structures. If you pass this parametar, you must also pass C<lookup>
92 which is C<WebPAC::Lookup> object.
101 my $r = $self->{'lookup_regex'} ? 1 : 0;
102 my $l = $self->{'lookup'} ? 1 : 0;
104 my $log = $self->_get_logger();
106 # those two must be in pair
107 if ( ($r & $l) != ($r || $l) ) {
108 my $log = $self->_get_logger();
109 $log->logdie("lookup_regex and lookup must be in pair");
112 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
114 $self ? return $self : return undef;
118 =head2 data_structure
120 Create in-memory data structure which represents normalized layout from
121 C<conf/normalize/*.xml>.
123 This structures are used to produce output.
125 my $ds = $webpac->data_structure($rec);
127 B<Note: historical oddity follows>
129 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131 C<< <headline> >> tag.
138 my $log = $self->_get_logger();
141 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
146 my $ds = $self->{'db'}->load_ds($rec);
147 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
149 $log->debug("cache miss, creating");
152 undef $self->{'currnet_filename'};
153 undef $self->{'headline'};
156 if ($self->{tags_by_order}) {
157 @sorted_tags = @{$self->{tags_by_order}};
159 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160 $self->{tags_by_order} = \@sorted_tags;
165 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
167 foreach my $field (@sorted_tags) {
171 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
173 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
176 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177 $format = $tag->{'value'} || $tag->{'content'};
179 $log->debug("format: $format");
182 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183 @v = $self->fill_in_to_arr($rec,$format);
185 @v = $self->parse_to_arr($rec,$format);
189 if ($tag->{'sort'}) {
190 @v = $self->sort_arr(@v);
194 if ($tag->{'format_name'}) {
195 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
198 if ($field eq 'filename') {
199 $self->{'current_filename'} = join('',@v);
200 $log->debug("filename: ",$self->{'current_filename'});
201 } elsif ($field eq 'headline') {
202 $self->{'headline'} .= join('',@v);
203 $log->debug("headline: ",$self->{'headline'});
204 next; # don't return headline in data_structure!
207 # delimiter will join repeatable fields
208 if ($tag->{'delimiter'}) {
209 @v = ( join($tag->{'delimiter'}, @v) );
213 my @types = qw(display search);
214 # override by type attribute
215 @types = ( $tag->{'type'} ) if ($tag->{'type'});
217 foreach my $type (@types) {
218 # append to previous line?
219 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
220 if ($tag->{'append'}) {
222 # I will delimit appended part with
224 my $d = $tag->{'delimiter'};
228 my $last = pop @{$row->{$type}};
229 $d = "" if (! $last);
230 $last .= $d . join($d, @v);
231 push @{$row->{$type}}, $last;
234 push @{$row->{$type}}, @v;
242 $row->{'tag'} = $field;
244 # TODO: name_sigular, name_plural
245 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
246 my $row_name = $name ? $self->_x($name) : $field;
248 # post-sort all values in field
249 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
250 $log->warn("sort at field tag not implemented");
253 $ds->{$row_name} = $row;
255 $log->debug("row $field: ",sub { Dumper($row) });
260 $log->logdie("there is no current_filename defined! Do you have filename tag in conf/normalize/?.xml") unless ($self->{'current_filename'});
262 $self->{'db'}->save_ds(
264 current_filename => $self->{'current_filename'},
265 headline => $self->{'headline'},
266 ) if ($self->{'db'});
268 $log->debug("ds: ", sub { Dumper($ds) });
270 $log->logconfess("data structure returned is not array any more!") if wantarray;
278 Perform smart parsing of string, skipping delimiters for fields which aren't
279 defined. It can also eval code in format starting with C<eval{...}> and
280 return output or nothing depending on eval code.
282 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
289 my ($rec, $format_utf8, $i) = @_;
291 return if (! $format_utf8);
293 my $log = $self->_get_logger();
295 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
299 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
303 $log->debug("format: $format");
306 # remove eval{...} from beginning
307 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
310 # remove filter{...} from beginning
311 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
316 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
319 $prefix ||= $del if ($all_found == 0);
323 $r = 0 if (lc("$2") eq 's');
326 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
331 $all_found += $found;
335 return if (! $all_found);
337 my $out = join('',@out);
340 # add rest of format (suffix)
343 # add prefix if not there
344 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
346 $log->debug("result: $out");
350 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
351 $log->debug("about to eval{$eval} format: $out");
352 return if (! $self->_eval($eval));
355 if ($filter_name && $self->{'filter'}->{$filter_name}) {
356 $log->debug("about to filter{$filter_name} format: $out");
357 $out = $self->{'filter'}->{$filter_name}->($out);
358 return unless(defined($out));
359 $log->debug("filter result: $out");
367 Similar to C<parse>, but returns array of all repeatable fields
369 my @arr = $webpac->parse_to_arr($rec,'v250^a');
376 my ($rec, $format_utf8) = @_;
378 my $log = $self->_get_logger();
380 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
381 return if (! $format_utf8);
386 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
390 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
398 Workhourse of all: takes record from in-memory structure of database and
399 strings with placeholders and returns string or array of with substituted
402 my $text = $webpac->fill_in($rec,'v250^a');
404 Optional argument is ordinal number for repeatable fields. By default,
405 it's assume to be first repeatable field (fields are perl array, so first
407 Following example will read second value from repeatable field.
409 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
411 This function B<does not> perform parsing of format to inteligenty skip
412 delimiters before fields which aren't used.
414 This method will automatically decode UTF-8 string to local code page
422 my $log = $self->_get_logger();
424 my $rec = shift || $log->logconfess("need data record");
425 my $format = shift || $log->logconfess("need format to parse");
426 # iteration (for repeatable fields)
429 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
431 # FIXME remove for speedup?
432 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
434 if (utf8::is_utf8($format)) {
435 $format = $self->_x($format);
441 # remove eval{...} from beginning
442 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
445 # remove filter{...} from beginning
446 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
448 # do actual replacement of placeholders
450 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
451 # non-repeatable fields
452 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
455 $log->debug("format: $format");
457 my $eval = $self->fill_in($rec,$eval_code,$i);
458 return if (! $self->_eval($eval));
460 if ($filter_name && $self->{'filter'}->{$filter_name}) {
461 $log->debug("filter '$filter_name' for $format");
462 $format = $self->{'filter'}->{$filter_name}->($format);
463 return unless(defined($format));
464 $log->debug("filter result: $format");
466 # do we have lookups?
467 if ($self->{'lookup'}) {
468 if ($self->{'lookup'}->can('lookup')) {
469 return $self->{'lookup'}->lookup($format);
471 $log->warn("Have lookup object but can't invoke lookup method");
482 =head2 fill_in_to_arr
484 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
485 for fields which have lookups, so they shouldn't be parsed but rather
488 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
495 my ($rec, $format_utf8) = @_;
497 my $log = $self->_get_logger();
499 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
500 return if (! $format_utf8);
505 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
509 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
517 Returns value from record.
519 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
522 record reference C<$rec>,
524 optional subfiled C<$sf>,
525 index for repeatable values C<$i>.
527 Optinal variable C<$found> will be incremeted if there
530 Returns value or empty string.
537 my ($rec,$f,$sf,$i,$found) = @_;
540 return '' if (! $$rec->{$f}->[$i]);
542 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
543 $$found++ if (defined($$found));
544 return $$rec->{$f}->[$i]->{$sf};
545 } elsif (! $sf && $$rec->{$f}->[$i]) {
546 $$found++ if (defined($$found));
547 # it still might have subfield, just
548 # not specified, so we'll dump all
549 if ($$rec->{$f}->[$i] =~ /HASH/o) {
551 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
552 $out .= $$rec->{$f}->[$i]->{$k}." ";
556 return $$rec->{$f}->[$i];
569 Apply format specified in tag with C<format_name="name"> and
570 C<format_delimiter=";;">.
572 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
574 Formats can contain C<lookup{...}> if you need them.
581 my ($name,$delimiter,$data) = @_;
583 my $log = $self->_get_logger();
585 if (! $self->{'import_xml'}->{'format'}->{$name}) {
586 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
590 $log->warn("no delimiter for format $name") if (! $delimiter);
592 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
594 my @data = split(/\Q$delimiter\E/, $data);
596 my $out = sprintf($format, @data);
597 $log->debug("using format $name [$format] on $data to produce: $out");
599 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
600 return $self->{'lookup'}->lookup($out);
609 Sort array ignoring case and html in data
611 my @sorted = $webpac->sort_arr(@unsorted);
618 my $log = $self->_get_logger();
620 # FIXME add Schwartzian Transformation?
627 $log->debug("sorted values: ",sub { join(", ",@sorted) });
633 =head1 INTERNAL METHODS
635 =head2 _sort_by_order
637 Sort xml tags data structure accoding to C<order=""> attribute.
644 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
645 $self->{'import_xml'}->{'indexer'}->{$a};
646 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
647 $self->{'import_xml'}->{'indexer'}->{$b};
654 Convert strings from C<conf/normalize/*.xml> encoding into application
655 specific encoding (optinally specified using C<code_page> to C<new>
658 my $text = $n->_x('normalize text string');
660 This is a stub so that other modules doesn't have to implement it.
672 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
674 =head1 COPYRIGHT & LICENSE
676 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
678 This program is free software; you can redistribute it and/or modify it
679 under the same terms as Perl itself.
683 1; # End of WebPAC::DB