-package WebPAC::Normalise::XML;
+package WebPAC::Normalize::XML;
use warnings;
use strict;
use base qw/WebPAC::Common/;
use Storable;
+use XML::Simple;
+use Data::Dumper;
=head1 NAME
-WebPAC::Normalise::XML - apply XML normalisaton rules
+WebPAC::Normalize::XML - apply XML normalisaton rules
=head1 VERSION
This module uses C<conf/normalize/*.xml> files to perform normalisation
from input records
- use WebPAC::Normalise::XML;
-
- my $foo = WebPAC::Normalise::XML->new();
- ...
-
=cut
-# mapping between data type and tag which specify
-# format in XML file
-my %type2tag = (
- 'isis' => 'isis',
-# 'excel' => 'column',
-# 'marc' => 'marc',
-# 'feed' => 'feed'
-);
-
-
-=head1 EXPORT
-
-A list of functions that can be exported. You can delete this section
-if you don't export anything, such as for a purely object-oriented module.
-
=head1 FUNCTIONS
=head2 new
-Create new instance of WebPAC using configuration specified by C<config_file>.
+Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
+parse it.
my $n = new WebPAC::Normalize::XML(
+ tag => 'isis',
+ xml_file => '/path/to/conf/normalize/isis.xml',
cache_data_structure => './cache/ds/',
+ lookup_regex => $lookup->regex,
}
+C<tag> defines tag to use within C<xml_file>
+
+C<xml_file> defines path to normalize XML.
+
Optional parameter C<cache_data_structure> defines path to directory
in which cache file for C<data_structure> call will be created.
+Recommended parametar C<lookup_regex> specify ...
+
=cut
sub new {
$self->setup_cache_dir( $self->{'cache_data_structure'} );
- return $self;
-}
-
-=head2 open_import_xml
-
-Read file from C<import_xml/> directory and parse it.
-
- $webpac->open_import_xml(type => 'isis');
-
-=cut
-
-sub open_import_xml {
- my $self = shift;
-
my $log = $self->_get_logger();
- my $arg = {@_};
- $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
-
- $self->{'type'} = $arg->{'type'};
-
- my $type_base = $arg->{'type'};
- $type_base =~ s/_.*$//g;
+ foreach my $req (qw/tag xml_file/) {
+ $log->logconfess("need argument $req") unless $self->{$req};
+ }
- $self->{'tag'} = $type2tag{$type_base};
+ my $f =
- $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
+ my $xml_file = $self->{'xml_file'};
- my $f = "./import_xml/".$self->{'type'}.".xml";
- $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
+ $log->info("using $xml_file tag <",$self->{'tag'},">");
- $log->info("reading '$f'");
+ $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);
- $self->{'import_xml_file'} = $f;
+ $self->{'import_xml_file'} = $xml_file;
$self->{'import_xml'} = XMLin($f,
- ForceArray => [ $self->{'tag'}, 'config', 'format' ],
+ ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ],
);
$log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
+ return $self;
}
=head2 setup_cache_dir
=head2 data_structure
-Create in-memory data structure which represents layout from C<import_xml>.
-It is used later to produce output.
+Create in-memory data structure which represents normalized layout from
+C<conf/normalize/*.xml>.
+
+This structures are used to produce output.
my @ds = $webpac->data_structure($rec);
-This method will also set C<$webpac->{'currnet_filename'}> if there is
-<filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
-<headline> tag.
+B<Note: historical oddity follows>
+
+This method will also set C<< $webpac->{'currnet_filename'} >> if there is
+C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
+C<< <headline> >> tag.
=cut
$log->debug("format: $format");
my @v;
- # FIXME this is a cludge!
- if ($format =~ /$WebPAC::Lookup::LOOKUP_REGEX/o) {
+ if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
@v = $self->fill_in_to_arr($rec,$format);
} else {
@v = $self->parse_to_arr($rec,$format);
}
+=head2 apply_format
+
+Apply format specified in tag with C<format_name="name"> and
+C<format_delimiter=";;">.
+
+ my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+
+Formats can contain C<lookup{...}> if you need them.
+
+=cut
+
+sub apply_format {
+ my $self = shift;
+
+ my ($name,$delimiter,$data) = @_;
+
+ my $log = $self->_get_logger();
+
+ if (! $self->{'import_xml'}->{'format'}->{$name}) {
+ $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+ return $data;
+ }
+
+ $log->warn("no delimiter for format $name") if (! $delimiter);
+
+ my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+
+ my @data = split(/\Q$delimiter\E/, $data);
+
+ my $out = sprintf($format, @data);
+ $log->debug("using format $name [$format] on $data to produce: $out");
+
+ if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
+ return $self->lookup($out);
+ } else {
+ return $out;
+ }
+
+}
+
=head1 AUTHOR
=cut
-1; # End of WebPAC::Normalise::XML
+1; # End of WebPAC::Normalize::XML