improvements to WebPAC::Normalize::XML

[webpac2] / lib / WebPAC / Normalize / XML.pm
diff --git a/lib/WebPAC/Normalize/XML.pm b/lib/WebPAC/Normalize/XML.pm

index f8cc5fd..24c1c1d 100644 (file)
--- a/lib/WebPAC/Normalize/XML.pm
+++ b/lib/WebPAC/Normalize/XML.pm
@@ -1,14 +1,16 @@
-package WebPAC::Normalise::XML;
+package WebPAC::Normalize::XML;
  
  use warnings;
  use strict;
  
  use base qw/WebPAC::Common/;
  use Storable;
+use XML::Simple;
+use Data::Dumper;
  
  =head1 NAME
  
-WebPAC::Normalise::XML - apply XML normalisaton rules
+WebPAC::Normalize::XML - apply XML normalisaton rules
  
  =head1 VERSION
  
@@ -23,41 +25,31 @@ our $VERSION = '0.01';
  This module uses C<conf/normalize/*.xml> files to perform normalisation
  from input records
  
-    use WebPAC::Normalise::XML;
-
-    my $foo = WebPAC::Normalise::XML->new();
-    ...
-
  =cut
  
-# mapping between data type and tag which specify
-# format in XML file
-my %type2tag = (
-       'isis' => 'isis',
-#      'excel' => 'column',
-#      'marc' => 'marc',
-#      'feed' => 'feed'
-);
-
-
-=head1 EXPORT
-
-A list of functions that can be exported.  You can delete this section
-if you don't export anything, such as for a purely object-oriented module.
-
  =head1 FUNCTIONS
  
  =head2 new
  
-Create new instance of WebPAC using configuration specified by C<config_file>.
+Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
+parse it.
  
   my $n = new WebPAC::Normalize::XML(
+       tag => 'isis',
+       xml_file => '/path/to/conf/normalize/isis.xml',
         cache_data_structure => './cache/ds/',
+       lookup_regex => $lookup->regex,
   }
  
+C<tag> defines tag to use within C<xml_file>
+
+C<xml_file> defines path to normalize XML.
+
  Optional parameter C<cache_data_structure> defines path to directory
  in which cache file for C<data_structure> call will be created.
  
+Recommended parametar C<lookup_regex> specify ... 
+
  =cut
  
  sub new {
@@ -67,47 +59,29 @@ sub new {
  
         $self->setup_cache_dir( $self->{'cache_data_structure'} );
  
-       return $self;
-}
-
-=head2 open_import_xml
-
-Read file from C<import_xml/> directory and parse it.
-
- $webpac->open_import_xml(type => 'isis');
-
-=cut
-
-sub open_import_xml {
-       my $self = shift;
-
         my $log = $self->_get_logger();
  
-       my $arg = {@_};
-       $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
-
-       $self->{'type'} = $arg->{'type'};
-
-       my $type_base = $arg->{'type'};
-       $type_base =~ s/_.*$//g;
+       foreach my $req (qw/tag xml_file/) {
+               $log->logconfess("need argument $req") unless $self->{$req};
+       }
  
-       $self->{'tag'} = $type2tag{$type_base};
+       my $f = 
  
-       $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
+       my $xml_file = $self->{'xml_file'};
  
-       my $f = "./import_xml/".$self->{'type'}.".xml";
-       $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
+       $log->info("using $xml_file tag <",$self->{'tag'},">");
  
-       $log->info("reading '$f'");
+       $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);
  
-       $self->{'import_xml_file'} = $f;
+       $self->{'import_xml_file'} = $xml_file;
  
         $self->{'import_xml'} = XMLin($f,
-               ForceArray => [ $self->{'tag'}, 'config', 'format' ],
+               ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ],
         );
  
         $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
  
+       return $self;
  }
  
  =head2 setup_cache_dir
@@ -153,14 +127,18 @@ sub setup_cache_dir {
  
  =head2 data_structure
  
-Create in-memory data structure which represents layout from C<import_xml>.
-It is used later to produce output.
+Create in-memory data structure which represents normalized layout from
+C<conf/normalize/*.xml>.
+
+This structures are used to produce output.
  
   my @ds = $webpac->data_structure($rec);
  
-This method will also set C<$webpac->{'currnet_filename'}> if there is
-<filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
-<headline> tag.
+B<Note: historical oddity follows>
+
+This method will also set C<< $webpac->{'currnet_filename'} >> if there is
+C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
+C<< <headline> >> tag.
  
  =cut
  
@@ -232,8 +210,7 @@ sub data_structure {
                         $log->debug("format: $format");
  
                         my @v;
-                       # FIXME this is a cludge!
-                       if ($format =~ /$WebPAC::Lookup::LOOKUP_REGEX/o) {
+                       if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                 @v = $self->fill_in_to_arr($rec,$format);
                         } else {
                                 @v = $self->parse_to_arr($rec,$format);
@@ -324,6 +301,46 @@ sub data_structure {
  
  }
  
+=head2 apply_format
+
+Apply format specified in tag with C<format_name="name"> and
+C<format_delimiter=";;">.
+
+ my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+
+Formats can contain C<lookup{...}> if you need them.
+
+=cut
+
+sub apply_format {
+       my $self = shift;
+
+       my ($name,$delimiter,$data) = @_;
+
+       my $log = $self->_get_logger();
+
+       if (! $self->{'import_xml'}->{'format'}->{$name}) {
+               $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+               return $data;
+       }
+
+       $log->warn("no delimiter for format $name") if (! $delimiter);
+
+       my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+
+       my @data = split(/\Q$delimiter\E/, $data);
+
+       my $out = sprintf($format, @data);
+       $log->debug("using format $name [$format] on $data to produce: $out");
+
+       if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
+               return $self->lookup($out);
+       } else {
+               return $out;
+       }
+
+}
+
  
  =head1 AUTHOR
  
@@ -338,4 +355,4 @@ under the same terms as Perl itself.
  
  =cut
  
-1; # End of WebPAC::Normalise::XML
+1; # End of WebPAC::Normalize::XML