1 package WebPAC::Normalize;
9 WebPAC::Normalize - data mungling for normalisation
17 our $VERSION = '0.01';
21 This package contains code that mungle data to produce normalized format.
23 It contains several assumptions:
29 format of fields is defined using C<v123^a> notation for repeatable fields
30 or C<s123^a> for single (or first) value, where C<123> is field number and
35 source data records (C<$rec>) have unique identifiers in field C<000>
39 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
40 perl code that is evaluated before producing output (value of field will be
41 interpolated before that)
45 optional C<filter{filter_name}> at B<begining of format> will apply perl
46 code defined as code ref on format after field substitution to producing
51 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
55 at end, optional C<format>s rules are resolved. Format rules are similar to
56 C<sprintf> and can also contain C<lookup{...}> which is performed after
57 values are inserted in format.
61 This also describes order in which transformations are applied (eval,
62 filter, lookup, format) which is important to undestand when deciding how to
63 solve your data mungling and normalisation process.
72 Create new normalisation object
74 my $n = new WebPAC::Normalize::Something(
76 'filter_name_1' => sub {
82 lookup_regex => $lookup->regex,
85 Parametar C<filter> defines user supplied snippets of perl code which can
86 be use with C<filter{...}> notation.
88 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
98 $self ? return $self : return undef;
102 =head2 data_structure
104 Create in-memory data structure which represents normalized layout from
105 C<conf/normalize/*.xml>.
107 This structures are used to produce output.
109 my @ds = $webpac->data_structure($rec);
111 B<Note: historical oddity follows>
113 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
114 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
115 C<< <headline> >> tag.
122 my $log = $self->_get_logger();
125 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
130 my @ds = $self->{'db'}->get_ds($rec);
134 undef $self->{'currnet_filename'};
135 undef $self->{'headline'};
138 if ($self->{tags_by_order}) {
139 @sorted_tags = @{$self->{tags_by_order}};
141 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
142 $self->{tags_by_order} = \@sorted_tags;
147 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
149 foreach my $field (@sorted_tags) {
153 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
155 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
156 my $format = $tag->{'value'} || $tag->{'content'};
158 $log->debug("format: $format");
161 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
162 @v = $self->fill_in_to_arr($rec,$format);
164 @v = $self->parse_to_arr($rec,$format);
168 if ($tag->{'sort'}) {
169 @v = $self->sort_arr(@v);
173 if ($tag->{'format_name'}) {
174 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
177 if ($field eq 'filename') {
178 $self->{'current_filename'} = join('',@v);
179 $log->debug("filename: ",$self->{'current_filename'});
180 } elsif ($field eq 'headline') {
181 $self->{'headline'} .= join('',@v);
182 $log->debug("headline: ",$self->{'headline'});
183 next; # don't return headline in data_structure!
186 # delimiter will join repeatable fields
187 if ($tag->{'delimiter'}) {
188 @v = ( join($tag->{'delimiter'}, @v) );
192 my @types = qw(display swish);
193 # override by type attribute
194 @types = ( $tag->{'type'} ) if ($tag->{'type'});
196 foreach my $type (@types) {
197 # append to previous line?
198 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
199 if ($tag->{'append'}) {
201 # I will delimit appended part with
203 my $d = $tag->{'delimiter'};
207 my $last = pop @{$row->{$type}};
208 $d = "" if (! $last);
209 $last .= $d . join($d, @v);
210 push @{$row->{$type}}, $last;
213 push @{$row->{$type}}, @v;
221 $row->{'tag'} = $field;
223 # TODO: name_sigular, name_plural
224 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
225 $row->{'name'} = $name ? $self->_x($name) : $field;
227 # post-sort all values in field
228 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
229 $log->warn("sort at field tag not implemented");
234 $log->debug("row $field: ",sub { Dumper($row) });
239 $self->{'db'}->put_gs(
241 current_filename => $self->{'current_filename'},
242 headline => $self->{'headline'},
243 ) if ($self->{'db'});
251 Perform smart parsing of string, skipping delimiters for fields which aren't
252 defined. It can also eval code in format starting with C<eval{...}> and
253 return output or nothing depending on eval code.
255 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
262 my ($rec, $format_utf8, $i) = @_;
264 return if (! $format_utf8);
266 my $log = $self->_get_logger();
268 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
272 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
276 $log->debug("format: $format");
279 # remove eval{...} from beginning
280 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
283 # remove filter{...} from beginning
284 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
289 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
292 $prefix ||= $del if ($all_found == 0);
296 $r = 0 if (lc("$2") eq 's');
299 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
304 $all_found += $found;
308 return if (! $all_found);
310 my $out = join('',@out);
313 # add rest of format (suffix)
316 # add prefix if not there
317 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
319 $log->debug("result: $out");
323 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
324 $log->debug("about to eval{$eval} format: $out");
325 return if (! $self->_eval($eval));
328 if ($filter_name && $self->{'filter'}->{$filter_name}) {
329 $log->debug("about to filter{$filter_name} format: $out");
330 $out = $self->{'filter'}->{$filter_name}->($out);
331 return unless(defined($out));
332 $log->debug("filter result: $out");
340 Similar to C<parse>, but returns array of all repeatable fields
342 my @arr = $webpac->parse_to_arr($rec,'v250^a');
349 my ($rec, $format_utf8) = @_;
351 my $log = $self->_get_logger();
353 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
354 return if (! $format_utf8);
359 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
363 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
371 Workhourse of all: takes record from in-memory structure of database and
372 strings with placeholders and returns string or array of with substituted
375 my $text = $webpac->fill_in($rec,'v250^a');
377 Optional argument is ordinal number for repeatable fields. By default,
378 it's assume to be first repeatable field (fields are perl array, so first
380 Following example will read second value from repeatable field.
382 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
384 This function B<does not> perform parsing of format to inteligenty skip
385 delimiters before fields which aren't used.
387 This method will automatically decode UTF-8 string to local code page
395 my $log = $self->_get_logger();
397 my $rec = shift || $log->logconfess("need data record");
398 my $format = shift || $log->logconfess("need format to parse");
399 # iteration (for repeatable fields)
402 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
404 # FIXME remove for speedup?
405 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
407 if (utf8::is_utf8($format)) {
408 $format = $self->_x($format);
414 # remove eval{...} from beginning
415 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
418 # remove filter{...} from beginning
419 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
421 # do actual replacement of placeholders
423 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
424 # non-repeatable fields
425 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
428 $log->debug("format: $format");
430 my $eval = $self->fill_in($rec,$eval_code,$i);
431 return if (! $self->_eval($eval));
433 if ($filter_name && $self->{'filter'}->{$filter_name}) {
434 $log->debug("filter '$filter_name' for $format");
435 $format = $self->{'filter'}->{$filter_name}->($format);
436 return unless(defined($format));
437 $log->debug("filter result: $format");
439 # do we have lookups?
440 if ($self->{'lookup'}) {
441 return $self->lookup($format);
451 =head2 fill_in_to_arr
453 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
454 for fields which have lookups, so they shouldn't be parsed but rather
457 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
464 my ($rec, $format_utf8) = @_;
466 my $log = $self->_get_logger();
468 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
469 return if (! $format_utf8);
474 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
478 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
486 Returns value from record.
488 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
491 record reference C<$rec>,
493 optional subfiled C<$sf>,
494 index for repeatable values C<$i>.
496 Optinal variable C<$found> will be incremeted if there
499 Returns value or empty string.
506 my ($rec,$f,$sf,$i,$found) = @_;
509 return '' if (! $$rec->{$f}->[$i]);
511 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
512 $$found++ if (defined($$found));
513 return $$rec->{$f}->[$i]->{$sf};
514 } elsif ($$rec->{$f}->[$i]) {
515 $$found++ if (defined($$found));
516 # it still might have subfield, just
517 # not specified, so we'll dump all
518 if ($$rec->{$f}->[$i] =~ /HASH/o) {
520 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
521 $out .= $$rec->{$f}->[$i]->{$k}." ";
525 return $$rec->{$f}->[$i];
536 Apply format specified in tag with C<format_name="name"> and
537 C<format_delimiter=";;">.
539 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
541 Formats can contain C<lookup{...}> if you need them.
548 my ($name,$delimiter,$data) = @_;
550 my $log = $self->_get_logger();
552 if (! $self->{'import_xml'}->{'format'}->{$name}) {
553 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
557 $log->warn("no delimiter for format $name") if (! $delimiter);
559 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
561 my @data = split(/\Q$delimiter\E/, $data);
563 my $out = sprintf($format, @data);
564 $log->debug("using format $name [$format] on $data to produce: $out");
566 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
567 return $self->lookup($out);
576 Sort array ignoring case and html in data
578 my @sorted = $webpac->sort_arr(@unsorted);
585 my $log = $self->_get_logger();
587 # FIXME add Schwartzian Transformation?
594 $log->debug("sorted values: ",sub { join(", ",@sorted) });
600 =head1 INTERNAL METHODS
602 =head2 _sort_by_order
604 Sort xml tags data structure accoding to C<order=""> attribute.
611 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
612 $self->{'import_xml'}->{'indexer'}->{$a};
613 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
614 $self->{'import_xml'}->{'indexer'}->{$b};
621 Convert strings from C<conf/normalize/*.xml> encoding into application
622 specific encoding (optinally specified using C<code_page> to C<new>
625 my $text = $n->_x('normalize text string');
627 This is a stub so that other modules doesn't have to implement it.
639 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
641 =head1 COPYRIGHT & LICENSE
643 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
645 This program is free software; you can redistribute it and/or modify it
646 under the same terms as Perl itself.
650 1; # End of WebPAC::DB