This module defines common functions, and is used as base for other, more
specific modules.
- my $webpac = new WebPAC::Common(
- filter => {
- 'filter_name_1' => sub {
- # filter code
- return length($_);
- }, ...
- },
- }
-
=head1 FUNCTIONS
-=head2 fill_in
-
-Workhourse of all: takes record from in-memory structure of database and
-strings with placeholders and returns string or array of with substituted
-values from record.
-
- my $text = $webpac->fill_in($rec,'v250^a');
-
-Optional argument is ordinal number for repeatable fields. By default,
-it's assume to be first repeatable field (fields are perl array, so first
-element is 0).
-Following example will read second value from repeatable field.
-
- my $text = $webpac->fill_in($rec,'Title: v250^a',1);
-
-This function B<does not> perform parsing of format to inteligenty skip
-delimiters before fields which aren't used.
-
-This method will automatically decode UTF-8 string to local code page
-if needed.
-
-=cut
-
-sub fill_in {
- my $self = shift;
-
- my $log = $self->_get_logger();
-
- my $rec = shift || $log->logconfess("need data record");
- my $format = shift || $log->logconfess("need format to parse");
- # iteration (for repeatable fields)
- my $i = shift || 0;
-
- $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
-
- # FIXME remove for speedup?
- $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
-
- if (utf8::is_utf8($format)) {
- $format = $self->_x($format);
- }
-
- my $found = 0;
-
- my $eval_code;
- # remove eval{...} from beginning
- $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
-
- my $filter_name;
- # remove filter{...} from beginning
- $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
-
- # do actual replacement of placeholders
- # repeatable fields
- $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
- # non-repeatable fields
- $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
-
- if ($found) {
- $log->debug("format: $format");
- if ($eval_code) {
- my $eval = $self->fill_in($rec,$eval_code,$i);
- return if (! $self->_eval($eval));
- }
- if ($filter_name && $self->{'filter'}->{$filter_name}) {
- $log->debug("filter '$filter_name' for $format");
- $format = $self->{'filter'}->{$filter_name}->($format);
- return unless(defined($format));
- $log->debug("filter result: $format");
- }
- # do we have lookups?
- if ($self->{'lookup'}) {
- return $self->lookup($format);
- } else {
- return $format;
- }
- } else {
- return;
- }
-}
-
-
-=head2 get_data
-
-Returns value from record.
-
- my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
-
-Arguments are:
-record reference C<$rec>,
-field C<$f>,
-optional subfiled C<$sf>,
-index for repeatable values C<$i>.
-
-Optinal variable C<$found> will be incremeted if there
-is field.
-
-Returns value or empty string.
-
-=cut
-
-sub get_data {
- my $self = shift;
-
- my ($rec,$f,$sf,$i,$found) = @_;
-
- if ($$rec->{$f}) {
- return '' if (! $$rec->{$f}->[$i]);
- no strict 'refs';
- if ($sf && $$rec->{$f}->[$i]->{$sf}) {
- $$found++ if (defined($$found));
- return $$rec->{$f}->[$i]->{$sf};
- } elsif ($$rec->{$f}->[$i]) {
- $$found++ if (defined($$found));
- # it still might have subfield, just
- # not specified, so we'll dump all
- if ($$rec->{$f}->[$i] =~ /HASH/o) {
- my $out;
- foreach my $k (keys %{$$rec->{$f}->[$i]}) {
- $out .= $$rec->{$f}->[$i]->{$k}." ";
- }
- return $out;
- } else {
- return $$rec->{$f}->[$i];
- }
- }
- } else {
- return '';
- }
-}
-
-
=head2 progress_bar
Draw progress bar on STDERR.
=head1 NAME
-WebPAC::Normalize - normalisation of source file
+WebPAC::Normalize - data mungling for normalisation
=head1 VERSION
=head1 SYNOPSIS
-This package contains code that could be helpful in implementing different
-normalisation front-ends.
+This package contains code that mungle data to produce normalized format.
+
+It contains several assumptions:
+
+=over
+
+=item *
+
+format of fields is defined using C<v123^a> notation for repeatable fields
+or C<s123^a> for single (or first) value, where C<123> is field number and
+C<a> is subfield.
+
+=item *
+
+source data records (C<$rec>) have unique identifiers in field C<000>
+
+=item *
+
+optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
+perl code that is evaluated before producing output (value of field will be
+interpolated before that)
+
+=item *
+
+optional C<filter{filter_name}> at B<begining of format> will apply perl
+code defined as code ref on format after field substitution to producing
+output
+
+=item *
+
+optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
+
+=item *
+
+at end, optional C<format>s rules are resolved. Format rules are similar to
+C<sprintf> and can also contain C<lookup{...}> which is performed after
+values are inserted in format.
+
+=back
+
+This also describes order in which transformations are applied (eval,
+filter, lookup, format) which is important to undestand when deciding how to
+solve your data mungling and normalisation process.
+
+
+
=head1 FUNCTIONS
Create new normalisation object
my $n = new WebPAC::Normalize::Something(
+ filter => {
+ 'filter_name_1' => sub {
+ # filter code
+ return length($_);
+ }, ...
+ },
cache_data_structure => './cache/ds/',
lookup_regex => $lookup->regex,
);
+Parametar C<filter> defines user supplied snippets of perl code which can
+be use with C<filter{...}> notation.
+
Optional parameter C<cache_data_structure> defines path to directory
in which cache file for C<data_structure> call will be created.
}
-=head2 apply_format
-
-Apply format specified in tag with C<format_name="name"> and
-C<format_delimiter=";;">.
-
- my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
-
-Formats can contain C<lookup{...}> if you need them.
-
-=cut
-
-sub apply_format {
- my $self = shift;
-
- my ($name,$delimiter,$data) = @_;
-
- my $log = $self->_get_logger();
-
- if (! $self->{'import_xml'}->{'format'}->{$name}) {
- $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
- return $data;
- }
-
- $log->warn("no delimiter for format $name") if (! $delimiter);
-
- my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
-
- my @data = split(/\Q$delimiter\E/, $data);
-
- my $out = sprintf($format, @data);
- $log->debug("using format $name [$format] on $data to produce: $out");
-
- if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
- return $self->lookup($out);
- } else {
- return $out;
- }
-
-}
-
=head2 parse
Perform smart parsing of string, skipping delimiters for fields which aren't
return @arr;
}
+
+=head2 fill_in
+
+Workhourse of all: takes record from in-memory structure of database and
+strings with placeholders and returns string or array of with substituted
+values from record.
+
+ my $text = $webpac->fill_in($rec,'v250^a');
+
+Optional argument is ordinal number for repeatable fields. By default,
+it's assume to be first repeatable field (fields are perl array, so first
+element is 0).
+Following example will read second value from repeatable field.
+
+ my $text = $webpac->fill_in($rec,'Title: v250^a',1);
+
+This function B<does not> perform parsing of format to inteligenty skip
+delimiters before fields which aren't used.
+
+This method will automatically decode UTF-8 string to local code page
+if needed.
+
+=cut
+
+sub fill_in {
+ my $self = shift;
+
+ my $log = $self->_get_logger();
+
+ my $rec = shift || $log->logconfess("need data record");
+ my $format = shift || $log->logconfess("need format to parse");
+ # iteration (for repeatable fields)
+ my $i = shift || 0;
+
+ $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
+
+ # FIXME remove for speedup?
+ $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+ if (utf8::is_utf8($format)) {
+ $format = $self->_x($format);
+ }
+
+ my $found = 0;
+
+ my $eval_code;
+ # remove eval{...} from beginning
+ $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+
+ my $filter_name;
+ # remove filter{...} from beginning
+ $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+
+ # do actual replacement of placeholders
+ # repeatable fields
+ $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
+ # non-repeatable fields
+ $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
+
+ if ($found) {
+ $log->debug("format: $format");
+ if ($eval_code) {
+ my $eval = $self->fill_in($rec,$eval_code,$i);
+ return if (! $self->_eval($eval));
+ }
+ if ($filter_name && $self->{'filter'}->{$filter_name}) {
+ $log->debug("filter '$filter_name' for $format");
+ $format = $self->{'filter'}->{$filter_name}->($format);
+ return unless(defined($format));
+ $log->debug("filter result: $format");
+ }
+ # do we have lookups?
+ if ($self->{'lookup'}) {
+ return $self->lookup($format);
+ } else {
+ return $format;
+ }
+ } else {
+ return;
+ }
+}
+
+
=head2 fill_in_to_arr
Similar to C<fill_in>, but returns array of all repeatable fields. Usable
return @arr;
}
+
+=head2 get_data
+
+Returns value from record.
+
+ my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
+
+Arguments are:
+record reference C<$rec>,
+field C<$f>,
+optional subfiled C<$sf>,
+index for repeatable values C<$i>.
+
+Optinal variable C<$found> will be incremeted if there
+is field.
+
+Returns value or empty string.
+
+=cut
+
+sub get_data {
+ my $self = shift;
+
+ my ($rec,$f,$sf,$i,$found) = @_;
+
+ if ($$rec->{$f}) {
+ return '' if (! $$rec->{$f}->[$i]);
+ no strict 'refs';
+ if ($sf && $$rec->{$f}->[$i]->{$sf}) {
+ $$found++ if (defined($$found));
+ return $$rec->{$f}->[$i]->{$sf};
+ } elsif ($$rec->{$f}->[$i]) {
+ $$found++ if (defined($$found));
+ # it still might have subfield, just
+ # not specified, so we'll dump all
+ if ($$rec->{$f}->[$i] =~ /HASH/o) {
+ my $out;
+ foreach my $k (keys %{$$rec->{$f}->[$i]}) {
+ $out .= $$rec->{$f}->[$i]->{$k}." ";
+ }
+ return $out;
+ } else {
+ return $$rec->{$f}->[$i];
+ }
+ }
+ } else {
+ return '';
+ }
+}
+
+
+=head2 apply_format
+
+Apply format specified in tag with C<format_name="name"> and
+C<format_delimiter=";;">.
+
+ my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+
+Formats can contain C<lookup{...}> if you need them.
+
+=cut
+
+sub apply_format {
+ my $self = shift;
+
+ my ($name,$delimiter,$data) = @_;
+
+ my $log = $self->_get_logger();
+
+ if (! $self->{'import_xml'}->{'format'}->{$name}) {
+ $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+ return $data;
+ }
+
+ $log->warn("no delimiter for format $name") if (! $delimiter);
+
+ my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+
+ my @data = split(/\Q$delimiter\E/, $data);
+
+ my $out = sprintf($format, @data);
+ $log->debug("using format $name [$format] on $data to produce: $out");
+
+ if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
+ return $self->lookup($out);
+ } else {
+ return $out;
+ }
+
+}
+
=head2 sort_arr
Sort array ignoring case and html in data
}
+=head1 INTERNAL METHODS
+
=head2 _sort_by_order
Sort xml tags data structure accoding to C<order=""> attribute.
=head2 _x
-Convert strings from C<conf/normalize> encoding into application specific
-(optinally specified using C<code_page> to C<new> constructor.
+Convert strings from C<conf/normalize/*.xml> encoding into application
+specific encoding (optinally specified using C<code_page> to C<new>
+constructor).
my $text = $n->_x('normalize text string');