1 package WebPAC::Normalize;
7 use base 'WebPAC::Common';
12 WebPAC::Normalize - data mungling for normalisation
20 our $VERSION = '0.08';
24 This package contains code that mungle data to produce normalized format.
26 It contains several assumptions:
32 format of fields is defined using C<v123^a> notation for repeatable fields
33 or C<s123^a> for single (or first) value, where C<123> is field number and
38 source data records (C<$rec>) have unique identifiers in field C<000>
42 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
43 perl code that is evaluated before producing output (value of field will be
44 interpolated before that)
48 optional C<filter{filter_name}> at B<begining of format> will apply perl
49 code defined as code ref on format after field substitution to producing
52 There is one built-in filter called C<regex> which can be use like this:
54 filter{regex(s/foo/bar/)}
58 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
62 at end, optional C<format>s rules are resolved. Format rules are similar to
63 C<sprintf> and can also contain C<lookup{...}> which is performed after
64 values are inserted in format.
68 This also describes order in which transformations are applied (eval,
69 filter, lookup, format) which is important to undestand when deciding how to
70 solve your data mungling and normalisation process.
79 Create new normalisation object
81 my $n = new WebPAC::Normalize::Something(
83 'filter_name_1' => sub {
89 lookup_regex => $lookup->regex,
90 lookup => $lookup_obj,
94 Parametar C<filter> defines user supplied snippets of perl code which can
95 be use with C<filter{...}> notation.
97 C<prefix> is used to form filename for database record (to support multiple
98 source files which are joined in one database).
100 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
101 in structures. If you pass this parametar, you must also pass C<lookup>
102 which is C<WebPAC::Lookup> object.
109 bless($self, $class);
111 my $r = $self->{'lookup_regex'} ? 1 : 0;
112 my $l = $self->{'lookup'} ? 1 : 0;
114 my $log = $self->_get_logger();
116 # those two must be in pair
117 if ( ($r & $l) != ($r || $l) ) {
118 my $log = $self->_get_logger();
119 $log->logdie("lookup_regex and lookup must be in pair");
122 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
124 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
126 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
128 if (! $self->{filter} || ! $self->{filter}->{regex}) {
129 $log->debug("adding built-in filter regex");
130 $self->{filter}->{regex} = sub {
131 my ($val, $regex) = @_;
132 eval "\$val =~ $regex";
137 $self ? return $self : return undef;
141 =head2 data_structure
143 Create in-memory data structure which represents normalized layout from
144 C<conf/normalize/*.xml>.
146 This structures are used to produce output.
148 my $ds = $webpac->data_structure($rec);
155 my $log = $self->_get_logger();
158 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
160 $log->debug("data_structure rec = ", sub { Dumper($rec) });
162 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
164 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
169 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
170 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
172 $log->debug("cache miss, creating");
176 if ($self->{tags_by_order}) {
177 @sorted_tags = @{$self->{tags_by_order}};
179 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
180 $self->{tags_by_order} = \@sorted_tags;
185 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
187 foreach my $field (@sorted_tags) {
191 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
193 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
196 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
197 $format = $tag->{'value'} || $tag->{'content'};
200 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201 @v = $self->_rec_to_arr($rec,$format,'fill_in');
203 @v = $self->_rec_to_arr($rec,$format,'parse');
206 $log->debug("$field <",$self->{tag},"> format: $format no values");
209 $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
212 if ($tag->{'sort'}) {
213 @v = $self->sort_arr(@v);
217 if ($tag->{'format_name'}) {
218 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
221 # delimiter will join repeatable fields
222 if ($tag->{'delimiter'}) {
223 @v = ( join($tag->{'delimiter'}, @v) );
227 my @types = qw(display search);
228 # override by type attribute
229 @types = ( $tag->{'type'} ) if ($tag->{'type'});
231 foreach my $type (@types) {
232 # append to previous line?
233 $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
234 if ($tag->{'append'}) {
236 # I will delimit appended part with
238 my $d = $tag->{'delimiter'};
242 my $last = pop @{$row->{$type}};
243 $d = "" if (! $last);
244 $last .= $d . join($d, @v);
245 push @{$row->{$type}}, $last;
248 push @{$row->{$type}}, @v;
256 $row->{'tag'} = $field;
258 # TODO: name_sigular, name_plural
259 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
260 my $row_name = $name ? $self->_x($name) : $field;
262 # post-sort all values in field
263 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
264 $log->warn("sort at field tag not implemented");
267 $ds->{$row_name} = $row;
269 $log->debug("row $field: ",sub { Dumper($row) });
274 $self->{'db'}->save_ds(
277 prefix => $self->{prefix},
278 ) if ($self->{'db'});
280 $log->debug("ds: ", sub { Dumper($ds) });
282 $log->logconfess("data structure returned is not array any more!") if wantarray;
290 Perform smart parsing of string, skipping delimiters for fields which aren't
291 defined. It can also eval code in format starting with C<eval{...}> and
292 return output or nothing depending on eval code.
294 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
296 Filters are implemented here. While simple form of filters looks like this:
298 filter{name_of_filter}
300 but, filters can also have variable number of parametars like this:
302 filter{name_of_filter(param,param,param)}
311 my ($rec, $format_utf8, $i, $rec_size) = @_;
313 return if (! $format_utf8);
315 my $log = $self->_get_logger();
317 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
321 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
325 $log->debug("format: $format [$i]");
328 # remove eval{...} from beginning
329 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
332 # remove filter{...} from beginning
333 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
335 # did we found any (att all) field from format in row?
337 # prefix before first field which we preserve it $found_any
342 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
345 $prefix = $del if ($f_step == 1);
347 my $fld_type = lc($2);
351 if ($fld_type eq 's') {
352 if ($found_any->{'v'}) {
360 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
363 $found_any->{$fld_type} += $found;
365 # we will skip delimiter before first occurence of field!
366 push @out, $del unless($found_any->{$fld_type} == 1);
372 # test if any fields found?
373 return if (! $found_any->{'v'} && ! $found_any->{'s'});
375 my $out = join('',@out);
378 # add rest of format (suffix)
381 # add prefix if not there
382 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
384 $log->debug("result: $out");
388 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
389 $log->debug("about to eval{$eval} format: $out");
390 return if (! $self->_eval($eval));
395 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
396 @filter_args = split(/,/, $2);
398 if ($self->{'filter'}->{$filter_name}) {
399 $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
400 unshift @filter_args, $out;
401 $out = $self->{'filter'}->{$filter_name}->(@filter_args);
402 return unless(defined($out));
403 $log->debug("filter result: $out");
404 } elsif (! $warn_once->{$filter_name}) {
405 $log->warn("trying to use undefined filter $filter_name");
406 $warn_once->{$filter_name}++;
415 Workhourse of all: takes record from in-memory structure of database and
416 strings with placeholders and returns string or array of with substituted
419 my $text = $webpac->fill_in($rec,'v250^a');
421 Optional argument is ordinal number for repeatable fields. By default,
422 it's assume to be first repeatable field (fields are perl array, so first
424 Following example will read second value from repeatable field.
426 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
428 This function B<does not> perform parsing of format to inteligenty skip
429 delimiters before fields which aren't used.
431 This method will automatically decode UTF-8 string to local code page
434 There is optional parametar C<$record_size> which can be used to get sizes of
435 all C<field^subfield> combinations in this format.
437 my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
444 my $log = $self->_get_logger();
446 my ($rec,$format,$i,$rec_size) = @_;
448 $log->logconfess("need data record") unless ($rec);
449 $log->logconfess("need format to parse") unless($format);
451 # iteration (for repeatable fields)
454 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
456 # FIXME remove for speedup?
457 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
459 if (utf8::is_utf8($format)) {
460 $format = $self->_x($format);
467 # remove eval{...} from beginning
468 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
471 # remove filter{...} from beginning
472 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
476 no warnings 'uninitialized';
478 # do actual replacement of placeholders
480 if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
484 # non-repeatable fields
485 if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
486 return if ($i > 0 && $just_single);
491 $log->debug("format: $format");
493 my $eval = $self->fill_in($rec,$eval_code,$i);
494 return if (! $self->_eval($eval));
496 if ($filter_name && $self->{'filter'}->{$filter_name}) {
497 $log->debug("filter '$filter_name' for $format");
498 $format = $self->{'filter'}->{$filter_name}->($format);
499 return unless(defined($format));
500 $log->debug("filter result: $format");
502 # do we have lookups?
503 if ($self->{'lookup'}) {
504 if ($self->{'lookup'}->can('lookup')) {
505 my @lookup = $self->{lookup}->lookup($format);
506 $log->debug("lookup $format", join(", ", @lookup));
509 $log->warn("Have lookup object but can't invoke lookup method");
522 Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
523 for fields which have lookups, so they shouldn't be parsed but rather
524 C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
526 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
533 my ($rec, $format_utf8, $code) = @_;
535 my $log = $self->_get_logger();
537 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
538 return if (! $format_utf8);
540 $log->debug("using $code on $format_utf8");
548 my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
550 foreach my $f (keys %{ $rec_size }) {
551 $max = $rec_size->{$f} if ($rec_size->{$f} > $max);
553 $log->debug("max set to $max");
559 push @arr, '' if ($max > $i);
563 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
571 Returns value from record.
573 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
575 Required arguments are:
593 index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
597 optional variable that will be incremeted if preset
601 hash to hold maximum occurances of C<field^subfield> combinations
602 (which can be accessed using keys in same format)
606 Returns value or empty string, updates C<$found> and C<rec_size>
614 my ($rec,$f,$sf,$i,$found,$cache) = @_;
616 return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
618 if (defined($$cache)) {
619 $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
622 return '' unless ($$rec->{$f}->[$i]);
627 $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
628 return $$rec->{$f}->[$i]->{$sf};
630 $$found++ if (defined($$found));
631 # it still might have subfields, just
632 # not specified, so we'll dump some debug info
633 if ($$rec->{$f}->[$i] =~ /HASH/o) {
635 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
636 $out .= '$' . $k .':' . $$rec->{$f}->[$i]->{$k}." ";
640 return $$rec->{$f}->[$i];
649 Apply format specified in tag with C<format_name="name"> and
650 C<format_delimiter=";;">.
652 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
654 Formats can contain C<lookup{...}> if you need them.
661 my ($name,$delimiter,$data) = @_;
663 my $log = $self->_get_logger();
665 if (! $self->{'import_xml'}->{'format'}->{$name}) {
666 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
670 $log->warn("no delimiter for format $name") if (! $delimiter);
672 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
674 my @data = split(/\Q$delimiter\E/, $data);
676 my $out = sprintf($format, @data);
677 $log->debug("using format $name [$format] on $data to produce: $out");
679 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
680 return $self->{'lookup'}->lookup($out);
689 Sort array ignoring case and html in data
691 my @sorted = $webpac->sort_arr(@unsorted);
698 my $log = $self->_get_logger();
700 # FIXME add Schwartzian Transformation?
707 $log->debug("sorted values: ",sub { join(", ",@sorted) });
713 =head1 INTERNAL METHODS
715 =head2 _sort_by_order
717 Sort xml tags data structure accoding to C<order=""> attribute.
724 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
725 $self->{'import_xml'}->{'indexer'}->{$a};
726 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
727 $self->{'import_xml'}->{'indexer'}->{$b};
734 Convert strings from C<conf/normalize/*.xml> encoding into application
735 specific encoding (optinally specified using C<code_page> to C<new>
738 my $text = $n->_x('normalize text string');
740 This is a stub so that other modules doesn't have to implement it.
752 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
754 =head1 COPYRIGHT & LICENSE
756 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
758 This program is free software; you can redistribute it and/or modify it
759 under the same terms as Perl itself.
763 1; # End of WebPAC::Normalize