1 package WebPAC::Normalize;
5 use base 'WebPAC::Common';
10 WebPAC::Normalize - data mungling for normalisation
18 our $VERSION = '0.02';
22 This package contains code that mungle data to produce normalized format.
24 It contains several assumptions:
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
36 source data records (C<$rec>) have unique identifiers in field C<000>
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
56 at end, optional C<format>s rules are resolved. Format rules are similar to
57 C<sprintf> and can also contain C<lookup{...}> which is performed after
58 values are inserted in format.
62 This also describes order in which transformations are applied (eval,
63 filter, lookup, format) which is important to undestand when deciding how to
64 solve your data mungling and normalisation process.
73 Create new normalisation object
75 my $n = new WebPAC::Normalize::Something(
77 'filter_name_1' => sub {
83 lookup_regex => $lookup->regex,
84 lookup => $lookup_obj,
87 Parametar C<filter> defines user supplied snippets of perl code which can
88 be use with C<filter{...}> notation.
90 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91 in structures. If you pass this parametar, you must also pass C<lookup>
92 which is C<WebPAC::Lookup> object.
101 my $r = $self->{'lookup_regex'} ? 1 : 0;
102 my $l = $self->{'lookup'} ? 1 : 0;
104 my $log = $self->_get_logger();
106 # those two must be in pair
107 if ( ($r & $l) != ($r || $l) ) {
108 my $log = $self->_get_logger();
109 $log->logdie("lookup_regex and lookup must be in pair");
112 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
114 $self ? return $self : return undef;
118 =head2 data_structure
120 Create in-memory data structure which represents normalized layout from
121 C<conf/normalize/*.xml>.
123 This structures are used to produce output.
125 my $ds = $webpac->data_structure($rec);
132 my $log = $self->_get_logger();
135 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
137 $log->debug("data_structure rec = ", sub { Dumper($rec) });
139 $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
141 my $mfn = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
146 my $ds = $self->{'db'}->load_ds( $mfn );
147 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
149 $log->debug("cache miss, creating");
152 undef $self->{'currnet_filename'};
153 undef $self->{'headline'};
156 if ($self->{tags_by_order}) {
157 @sorted_tags = @{$self->{tags_by_order}};
159 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160 $self->{tags_by_order} = \@sorted_tags;
165 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
167 foreach my $field (@sorted_tags) {
171 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
173 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
176 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177 $format = $tag->{'value'} || $tag->{'content'};
179 $log->debug("format: $format");
182 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183 @v = $self->fill_in_to_arr($rec,$format);
185 @v = $self->parse_to_arr($rec,$format);
189 if ($tag->{'sort'}) {
190 @v = $self->sort_arr(@v);
194 if ($tag->{'format_name'}) {
195 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
198 # delimiter will join repeatable fields
199 if ($tag->{'delimiter'}) {
200 @v = ( join($tag->{'delimiter'}, @v) );
204 my @types = qw(display search);
205 # override by type attribute
206 @types = ( $tag->{'type'} ) if ($tag->{'type'});
208 foreach my $type (@types) {
209 # append to previous line?
210 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
211 if ($tag->{'append'}) {
213 # I will delimit appended part with
215 my $d = $tag->{'delimiter'};
219 my $last = pop @{$row->{$type}};
220 $d = "" if (! $last);
221 $last .= $d . join($d, @v);
222 push @{$row->{$type}}, $last;
225 push @{$row->{$type}}, @v;
233 $row->{'tag'} = $field;
235 # TODO: name_sigular, name_plural
236 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
237 my $row_name = $name ? $self->_x($name) : $field;
239 # post-sort all values in field
240 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
241 $log->warn("sort at field tag not implemented");
244 $ds->{$row_name} = $row;
246 $log->debug("row $field: ",sub { Dumper($row) });
251 $self->{'db'}->save_ds(
254 ) if ($self->{'db'});
256 $log->debug("ds: ", sub { Dumper($ds) });
258 $log->logconfess("data structure returned is not array any more!") if wantarray;
266 Perform smart parsing of string, skipping delimiters for fields which aren't
267 defined. It can also eval code in format starting with C<eval{...}> and
268 return output or nothing depending on eval code.
270 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
277 my ($rec, $format_utf8, $i) = @_;
279 return if (! $format_utf8);
281 my $log = $self->_get_logger();
283 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
287 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
291 $log->debug("format: $format");
294 # remove eval{...} from beginning
295 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
298 # remove filter{...} from beginning
299 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
304 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
307 $prefix ||= $del if ($all_found == 0);
311 $r = 0 if (lc("$2") eq 's');
314 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
319 $all_found += $found;
323 return if (! $all_found);
325 my $out = join('',@out);
328 # add rest of format (suffix)
331 # add prefix if not there
332 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
334 $log->debug("result: $out");
338 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
339 $log->debug("about to eval{$eval} format: $out");
340 return if (! $self->_eval($eval));
343 if ($filter_name && $self->{'filter'}->{$filter_name}) {
344 $log->debug("about to filter{$filter_name} format: $out");
345 $out = $self->{'filter'}->{$filter_name}->($out);
346 return unless(defined($out));
347 $log->debug("filter result: $out");
355 Similar to C<parse>, but returns array of all repeatable fields
357 my @arr = $webpac->parse_to_arr($rec,'v250^a');
364 my ($rec, $format_utf8) = @_;
366 my $log = $self->_get_logger();
368 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
369 return if (! $format_utf8);
374 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
378 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
386 Workhourse of all: takes record from in-memory structure of database and
387 strings with placeholders and returns string or array of with substituted
390 my $text = $webpac->fill_in($rec,'v250^a');
392 Optional argument is ordinal number for repeatable fields. By default,
393 it's assume to be first repeatable field (fields are perl array, so first
395 Following example will read second value from repeatable field.
397 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
399 This function B<does not> perform parsing of format to inteligenty skip
400 delimiters before fields which aren't used.
402 This method will automatically decode UTF-8 string to local code page
410 my $log = $self->_get_logger();
412 my $rec = shift || $log->logconfess("need data record");
413 my $format = shift || $log->logconfess("need format to parse");
414 # iteration (for repeatable fields)
417 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
419 # FIXME remove for speedup?
420 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
422 if (utf8::is_utf8($format)) {
423 $format = $self->_x($format);
429 # remove eval{...} from beginning
430 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
433 # remove filter{...} from beginning
434 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
436 # do actual replacement of placeholders
438 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
439 # non-repeatable fields
440 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
443 $log->debug("format: $format");
445 my $eval = $self->fill_in($rec,$eval_code,$i);
446 return if (! $self->_eval($eval));
448 if ($filter_name && $self->{'filter'}->{$filter_name}) {
449 $log->debug("filter '$filter_name' for $format");
450 $format = $self->{'filter'}->{$filter_name}->($format);
451 return unless(defined($format));
452 $log->debug("filter result: $format");
454 # do we have lookups?
455 if ($self->{'lookup'}) {
456 if ($self->{'lookup'}->can('lookup')) {
457 return $self->{'lookup'}->lookup($format);
459 $log->warn("Have lookup object but can't invoke lookup method");
470 =head2 fill_in_to_arr
472 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
473 for fields which have lookups, so they shouldn't be parsed but rather
476 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
483 my ($rec, $format_utf8) = @_;
485 my $log = $self->_get_logger();
487 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
488 return if (! $format_utf8);
493 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
497 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
505 Returns value from record.
507 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
510 record reference C<$rec>,
512 optional subfiled C<$sf>,
513 index for repeatable values C<$i>.
515 Optinal variable C<$found> will be incremeted if there
518 Returns value or empty string.
525 my ($rec,$f,$sf,$i,$found) = @_;
528 return '' if (! $$rec->{$f}->[$i]);
530 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
531 $$found++ if (defined($$found));
532 return $$rec->{$f}->[$i]->{$sf};
533 } elsif (! $sf && $$rec->{$f}->[$i]) {
534 $$found++ if (defined($$found));
535 # it still might have subfield, just
536 # not specified, so we'll dump all
537 if ($$rec->{$f}->[$i] =~ /HASH/o) {
539 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
540 $out .= $$rec->{$f}->[$i]->{$k}." ";
544 return $$rec->{$f}->[$i];
557 Apply format specified in tag with C<format_name="name"> and
558 C<format_delimiter=";;">.
560 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
562 Formats can contain C<lookup{...}> if you need them.
569 my ($name,$delimiter,$data) = @_;
571 my $log = $self->_get_logger();
573 if (! $self->{'import_xml'}->{'format'}->{$name}) {
574 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
578 $log->warn("no delimiter for format $name") if (! $delimiter);
580 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
582 my @data = split(/\Q$delimiter\E/, $data);
584 my $out = sprintf($format, @data);
585 $log->debug("using format $name [$format] on $data to produce: $out");
587 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
588 return $self->{'lookup'}->lookup($out);
597 Sort array ignoring case and html in data
599 my @sorted = $webpac->sort_arr(@unsorted);
606 my $log = $self->_get_logger();
608 # FIXME add Schwartzian Transformation?
615 $log->debug("sorted values: ",sub { join(", ",@sorted) });
621 =head1 INTERNAL METHODS
623 =head2 _sort_by_order
625 Sort xml tags data structure accoding to C<order=""> attribute.
632 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
633 $self->{'import_xml'}->{'indexer'}->{$a};
634 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
635 $self->{'import_xml'}->{'indexer'}->{$b};
642 Convert strings from C<conf/normalize/*.xml> encoding into application
643 specific encoding (optinally specified using C<code_page> to C<new>
646 my $text = $n->_x('normalize text string');
648 This is a stub so that other modules doesn't have to implement it.
660 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
662 =head1 COPYRIGHT & LICENSE
664 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
666 This program is free software; you can redistribute it and/or modify it
667 under the same terms as Perl itself.
671 1; # End of WebPAC::DB