From 888eabffdc605c2ec8692afca45883d15a89e57e Mon Sep 17 00:00:00 2001
From: Dobrica Pavlinusic <dpavlin@rot13.org>
Date: Sat, 16 Jul 2005 22:57:26 +0000
Subject: [PATCH] improvements to WebPAC::Normalize::XML

git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@12 07558da8-63fa-0310-ba24-9fe276d99e06
---
 Makefile.PL                 |   1 +
 lib/WebPAC/Lookup.pm        |  32 +++++++--
 lib/WebPAC/Normalize/XML.pm | 135 ++++++++++++++++++++----------------
 t/1-lookup.t                |   5 +-
 t/3-normalize-xml.t         |  26 +++++++
 5 files changed, 134 insertions(+), 65 deletions(-)
 create mode 100755 t/3-normalize-xml.t

diff --git a/Makefile.PL b/Makefile.PL
index 7eab3f3..d2d2e51 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -18,6 +18,7 @@ WriteMakefile(
 	'Text::Iconv' => 0,
 	'Storable' => 0,
 	'DBM::Deep' => 0,
+	'XML::Simple' => 0,
     },
     dist                => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', },
     clean               => { FILES => 'WebPAC-* pod2html Makefile tags' },
diff --git a/lib/WebPAC/Lookup.pm b/lib/WebPAC/Lookup.pm
index b74760d..02d73d7 100644
--- a/lib/WebPAC/Lookup.pm
+++ b/lib/WebPAC/Lookup.pm
@@ -9,9 +9,6 @@ use base qw/WebPAC::Common/;
 use File::Slurp;
 use Data::Dumper;
 
-my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
-my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
-
 =head1 NAME
 
 WebPAC::Lookup - simple normalisation plugin to produce lookup
@@ -54,6 +51,8 @@ Create new lookup object.
 
   my $lookup = new WebPAC::Lookup(
   	lookup_file => '/path/to/conf/lookup/lookup.pm',
+	is_lookup_regex => 'lookup{[^\{\}]+}';
+	save_lookup_regex => 'lookup{([^\{\}]+)}';
   );
 
 =cut
@@ -77,6 +76,15 @@ sub new {
 
 	$log->logconfess("lookup config file isn't ARRAY", sub { Dumper( $self->{'lookup_def'} ) }) if ($self->{'lookup_def'} !~ /ARRAY/o);
 
+	$self->{'is_lookup_regex'} ||= 'lookup{[^\{\}]+}';
+        $self->{'save_lookup_regex'} ||= 'lookup{([^\{\}]+)}';
+
+
+	$self->{'LOOKUP_REGEX'} = qr/$self->{'is_lookup_regex'}/;
+	$self->{'LOOKUP_REGEX_SAVE'} = qr/$self->{'save_lookup_regex'}/;
+
+	$log->debug("regexps: ", $self->{'LOOKUP_REGEX'}, " ", $self->{'LOOKUP_REGEX_SAVE'});
+
 	$self ? return $self : return undef;
 }
 
@@ -144,14 +152,14 @@ sub lookup {
 
 	my $tmp = shift || $log->logconfess("need format");
 
-	if ($tmp =~ /$LOOKUP_REGEX/o) {
+	if ($tmp =~ $self->{'LOOKUP_REGEX'}) {
 		my @in = ( $tmp );
 
 		$log->debug("lookup for: ",$tmp);
 
 		my @out;
 		while (my $f = shift @in) {
-			if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
+			if ($f =~ $self->{'LOOKUP_REGEX_SAVE'}) {
 				my $k = $1;
 				if ($self->{'lookup'}->{$k}) {
 					foreach my $nv (@{$self->{'lookup'}->{$k}}) {
@@ -173,6 +181,20 @@ sub lookup {
 	}
 }
 
+=head2 regex
+
+Returns precompiled regex for lookup format.
+
+ if ($foo =~ $lookup->reges) { ... }
+
+=cut
+
+sub regex {
+	my $self = shift;
+
+	return $self->{'LOOKUP_REGEX'};
+}
+
 =head1 AUTHOR
 
 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
diff --git a/lib/WebPAC/Normalize/XML.pm b/lib/WebPAC/Normalize/XML.pm
index f8cc5fd..24c1c1d 100644
--- a/lib/WebPAC/Normalize/XML.pm
+++ b/lib/WebPAC/Normalize/XML.pm
@@ -1,14 +1,16 @@
-package WebPAC::Normalise::XML;
+package WebPAC::Normalize::XML;
 
 use warnings;
 use strict;
 
 use base qw/WebPAC::Common/;
 use Storable;
+use XML::Simple;
+use Data::Dumper;
 
 =head1 NAME
 
-WebPAC::Normalise::XML - apply XML normalisaton rules
+WebPAC::Normalize::XML - apply XML normalisaton rules
 
 =head1 VERSION
 
@@ -23,41 +25,31 @@ our $VERSION = '0.01';
 This module uses C<conf/normalize/*.xml> files to perform normalisation
 from input records
 
-    use WebPAC::Normalise::XML;
-
-    my $foo = WebPAC::Normalise::XML->new();
-    ...
-
 =cut
 
-# mapping between data type and tag which specify
-# format in XML file
-my %type2tag = (
-	'isis' => 'isis',
-#	'excel' => 'column',
-#	'marc' => 'marc',
-#	'feed' => 'feed'
-);
-
-
-=head1 EXPORT
-
-A list of functions that can be exported.  You can delete this section
-if you don't export anything, such as for a purely object-oriented module.
-
 =head1 FUNCTIONS
 
 =head2 new
 
-Create new instance of WebPAC using configuration specified by C<config_file>.
+Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
+parse it.
 
  my $n = new WebPAC::Normalize::XML(
+ 	tag => 'isis',
+ 	xml_file => '/path/to/conf/normalize/isis.xml',
 	cache_data_structure => './cache/ds/',
+	lookup_regex => $lookup->regex,
  }
 
+C<tag> defines tag to use within C<xml_file>
+
+C<xml_file> defines path to normalize XML.
+
 Optional parameter C<cache_data_structure> defines path to directory
 in which cache file for C<data_structure> call will be created.
 
+Recommended parametar C<lookup_regex> specify ... 
+
 =cut
 
 sub new {
@@ -67,47 +59,29 @@ sub new {
 
 	$self->setup_cache_dir( $self->{'cache_data_structure'} );
 
-	return $self;
-}
-
-=head2 open_import_xml
-
-Read file from C<import_xml/> directory and parse it.
-
- $webpac->open_import_xml(type => 'isis');
-
-=cut
-
-sub open_import_xml {
-	my $self = shift;
-
 	my $log = $self->_get_logger();
 
-	my $arg = {@_};
-	$log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
-
-	$self->{'type'} = $arg->{'type'};
-
-	my $type_base = $arg->{'type'};
-	$type_base =~ s/_.*$//g;
+	foreach my $req (qw/tag xml_file/) {
+		$log->logconfess("need argument $req") unless $self->{$req};
+	}
 
-	$self->{'tag'} = $type2tag{$type_base};
+	my $f = 
 
-	$log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
+	my $xml_file = $self->{'xml_file'};
 
-	my $f = "./import_xml/".$self->{'type'}.".xml";
-	$log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
+	$log->info("using $xml_file tag <",$self->{'tag'},">");
 
-	$log->info("reading '$f'");
+	$log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);
 
-	$self->{'import_xml_file'} = $f;
+	$self->{'import_xml_file'} = $xml_file;
 
 	$self->{'import_xml'} = XMLin($f,
-		ForceArray => [ $self->{'tag'}, 'config', 'format' ],
+		ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ],
 	);
 
 	$log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
 
+	return $self;
 }
 
 =head2 setup_cache_dir
@@ -153,14 +127,18 @@ sub setup_cache_dir {
 
 =head2 data_structure
 
-Create in-memory data structure which represents layout from C<import_xml>.
-It is used later to produce output.
+Create in-memory data structure which represents normalized layout from
+C<conf/normalize/*.xml>.
+
+This structures are used to produce output.
 
  my @ds = $webpac->data_structure($rec);
 
-This method will also set C<$webpac->{'currnet_filename'}> if there is
-<filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
-<headline> tag.
+B<Note: historical oddity follows>
+
+This method will also set C<< $webpac->{'currnet_filename'} >> if there is
+C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
+C<< <headline> >> tag.
 
 =cut
 
@@ -232,8 +210,7 @@ sub data_structure {
 			$log->debug("format: $format");
 
 			my @v;
-			# FIXME this is a cludge!
-			if ($format =~ /$WebPAC::Lookup::LOOKUP_REGEX/o) {
+			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
 				@v = $self->fill_in_to_arr($rec,$format);
 			} else {
 				@v = $self->parse_to_arr($rec,$format);
@@ -324,6 +301,46 @@ sub data_structure {
 
 }
 
+=head2 apply_format
+
+Apply format specified in tag with C<format_name="name"> and
+C<format_delimiter=";;">.
+
+ my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+
+Formats can contain C<lookup{...}> if you need them.
+
+=cut
+
+sub apply_format {
+	my $self = shift;
+
+	my ($name,$delimiter,$data) = @_;
+
+	my $log = $self->_get_logger();
+
+	if (! $self->{'import_xml'}->{'format'}->{$name}) {
+		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+		return $data;
+	}
+
+	$log->warn("no delimiter for format $name") if (! $delimiter);
+
+	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+
+	my @data = split(/\Q$delimiter\E/, $data);
+
+	my $out = sprintf($format, @data);
+	$log->debug("using format $name [$format] on $data to produce: $out");
+
+	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
+		return $self->lookup($out);
+	} else {
+		return $out;
+	}
+
+}
+
 
 =head1 AUTHOR
 
@@ -338,4 +355,4 @@ under the same terms as Perl itself.
 
 =cut
 
-1; # End of WebPAC::Normalise::XML
+1; # End of WebPAC::Normalize::XML
diff --git a/t/1-lookup.t b/t/1-lookup.t
index aa2dbbc..cd7b4f2 100755
--- a/t/1-lookup.t
+++ b/t/1-lookup.t
@@ -1,7 +1,7 @@
 #!/usr/bin/perl -w
 
 use strict;
-use Test::More tests => 15;
+use Test::More tests => 16;
 use Test::Exception;
 use blib;
 
@@ -20,6 +20,9 @@ ok(my $lookup = new WebPAC::Lookup(
 	},
 ), "new");
 
+ok(my $regex = $lookup->regex, "regex");
+diag "regex: $regex";
+
 my $rec = {
 	'000' => [ '001' ],
 	'800' => [ 'foo' ],
diff --git a/t/3-normalize-xml.t b/t/3-normalize-xml.t
new file mode 100755
index 0000000..85f7bb0
--- /dev/null
+++ b/t/3-normalize-xml.t
@@ -0,0 +1,26 @@
+#!/usr/bin/perl -w
+
+use Test::More tests => 6;
+use Test::Exception;
+use Cwd qw/abs_path/;
+use blib;
+use strict;
+
+BEGIN {
+use_ok( 'WebPAC::Normalize::XML' );
+}
+
+ok(my $abs_path = abs_path($0), "abs_path");
+$abs_path =~ s#/[^/]*$#/#;
+diag "abs_path: $abs_path";
+
+throws_ok { new WebPAC::Normalize::XML() } qr/tag/, "new without tag";
+throws_ok { new WebPAC::Normalize::XML( tag => 'isis' ) } qr/xml_file/, "new without xml_file";
+throws_ok { new WebPAC::Normalize::XML( tag => 'isis', xml_file => 'foo' ) } qr/file.*doesn't exist/, "new with invalid xml_file";
+ok(my $isis = new WebPAC::Normalize::XML(
+	tag => 'isis',
+	xml_file => "$abs_path../conf/normalize/isis.xml",
+	debug => 1,
+), "new");
+
+
-- 
2.20.1