From 888eabffdc605c2ec8692afca45883d15a89e57e Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Sat, 16 Jul 2005 22:57:26 +0000 Subject: [PATCH] improvements to WebPAC::Normalize::XML git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@12 07558da8-63fa-0310-ba24-9fe276d99e06 --- Makefile.PL | 1 + lib/WebPAC/Lookup.pm | 32 +++++++-- lib/WebPAC/Normalize/XML.pm | 135 ++++++++++++++++++++---------------- t/1-lookup.t | 5 +- t/3-normalize-xml.t | 26 +++++++ 5 files changed, 134 insertions(+), 65 deletions(-) create mode 100755 t/3-normalize-xml.t diff --git a/Makefile.PL b/Makefile.PL index 7eab3f3..d2d2e51 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -18,6 +18,7 @@ WriteMakefile( 'Text::Iconv' => 0, 'Storable' => 0, 'DBM::Deep' => 0, + 'XML::Simple' => 0, }, dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', }, clean => { FILES => 'WebPAC-* pod2html Makefile tags' }, diff --git a/lib/WebPAC/Lookup.pm b/lib/WebPAC/Lookup.pm index b74760d..02d73d7 100644 --- a/lib/WebPAC/Lookup.pm +++ b/lib/WebPAC/Lookup.pm @@ -9,9 +9,6 @@ use base qw/WebPAC::Common/; use File::Slurp; use Data::Dumper; -my $LOOKUP_REGEX = 'lookup{[^\{\}]+}'; -my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}'; - =head1 NAME WebPAC::Lookup - simple normalisation plugin to produce lookup @@ -54,6 +51,8 @@ Create new lookup object. my $lookup = new WebPAC::Lookup( lookup_file => '/path/to/conf/lookup/lookup.pm', + is_lookup_regex => 'lookup{[^\{\}]+}'; + save_lookup_regex => 'lookup{([^\{\}]+)}'; ); =cut @@ -77,6 +76,15 @@ sub new { $log->logconfess("lookup config file isn't ARRAY", sub { Dumper( $self->{'lookup_def'} ) }) if ($self->{'lookup_def'} !~ /ARRAY/o); + $self->{'is_lookup_regex'} ||= 'lookup{[^\{\}]+}'; + $self->{'save_lookup_regex'} ||= 'lookup{([^\{\}]+)}'; + + + $self->{'LOOKUP_REGEX'} = qr/$self->{'is_lookup_regex'}/; + $self->{'LOOKUP_REGEX_SAVE'} = qr/$self->{'save_lookup_regex'}/; + + $log->debug("regexps: ", $self->{'LOOKUP_REGEX'}, " ", $self->{'LOOKUP_REGEX_SAVE'}); + $self ? return $self : return undef; } @@ -144,14 +152,14 @@ sub lookup { my $tmp = shift || $log->logconfess("need format"); - if ($tmp =~ /$LOOKUP_REGEX/o) { + if ($tmp =~ $self->{'LOOKUP_REGEX'}) { my @in = ( $tmp ); $log->debug("lookup for: ",$tmp); my @out; while (my $f = shift @in) { - if ($f =~ /$LOOKUP_REGEX_SAVE/o) { + if ($f =~ $self->{'LOOKUP_REGEX_SAVE'}) { my $k = $1; if ($self->{'lookup'}->{$k}) { foreach my $nv (@{$self->{'lookup'}->{$k}}) { @@ -173,6 +181,20 @@ sub lookup { } } +=head2 regex + +Returns precompiled regex for lookup format. + + if ($foo =~ $lookup->reges) { ... } + +=cut + +sub regex { + my $self = shift; + + return $self->{'LOOKUP_REGEX'}; +} + =head1 AUTHOR Dobrica Pavlinusic, C<< >> diff --git a/lib/WebPAC/Normalize/XML.pm b/lib/WebPAC/Normalize/XML.pm index f8cc5fd..24c1c1d 100644 --- a/lib/WebPAC/Normalize/XML.pm +++ b/lib/WebPAC/Normalize/XML.pm @@ -1,14 +1,16 @@ -package WebPAC::Normalise::XML; +package WebPAC::Normalize::XML; use warnings; use strict; use base qw/WebPAC::Common/; use Storable; +use XML::Simple; +use Data::Dumper; =head1 NAME -WebPAC::Normalise::XML - apply XML normalisaton rules +WebPAC::Normalize::XML - apply XML normalisaton rules =head1 VERSION @@ -23,41 +25,31 @@ our $VERSION = '0.01'; This module uses C files to perform normalisation from input records - use WebPAC::Normalise::XML; - - my $foo = WebPAC::Normalise::XML->new(); - ... - =cut -# mapping between data type and tag which specify -# format in XML file -my %type2tag = ( - 'isis' => 'isis', -# 'excel' => 'column', -# 'marc' => 'marc', -# 'feed' => 'feed' -); - - -=head1 EXPORT - -A list of functions that can be exported. You can delete this section -if you don't export anything, such as for a purely object-oriented module. - =head1 FUNCTIONS =head2 new -Create new instance of WebPAC using configuration specified by C. +Read normalisation rules defined using XML from C and +parse it. my $n = new WebPAC::Normalize::XML( + tag => 'isis', + xml_file => '/path/to/conf/normalize/isis.xml', cache_data_structure => './cache/ds/', + lookup_regex => $lookup->regex, } +C defines tag to use within C + +C defines path to normalize XML. + Optional parameter C defines path to directory in which cache file for C call will be created. +Recommended parametar C specify ... + =cut sub new { @@ -67,47 +59,29 @@ sub new { $self->setup_cache_dir( $self->{'cache_data_structure'} ); - return $self; -} - -=head2 open_import_xml - -Read file from C directory and parse it. - - $webpac->open_import_xml(type => 'isis'); - -=cut - -sub open_import_xml { - my $self = shift; - my $log = $self->_get_logger(); - my $arg = {@_}; - $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'}); - - $self->{'type'} = $arg->{'type'}; - - my $type_base = $arg->{'type'}; - $type_base =~ s/_.*$//g; + foreach my $req (qw/tag xml_file/) { + $log->logconfess("need argument $req") unless $self->{$req}; + } - $self->{'tag'} = $type2tag{$type_base}; + my $f = - $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">"); + my $xml_file = $self->{'xml_file'}; - my $f = "./import_xml/".$self->{'type'}.".xml"; - $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f"); + $log->info("using $xml_file tag <",$self->{'tag'},">"); - $log->info("reading '$f'"); + $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file); - $self->{'import_xml_file'} = $f; + $self->{'import_xml_file'} = $xml_file; $self->{'import_xml'} = XMLin($f, - ForceArray => [ $self->{'tag'}, 'config', 'format' ], + ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ], ); $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }); + return $self; } =head2 setup_cache_dir @@ -153,14 +127,18 @@ sub setup_cache_dir { =head2 data_structure -Create in-memory data structure which represents layout from C. -It is used later to produce output. +Create in-memory data structure which represents normalized layout from +C. + +This structures are used to produce output. my @ds = $webpac->data_structure($rec); -This method will also set C<$webpac->{'currnet_filename'}> if there is - tag in C and C<$webpac->{'headline'}> if there is - tag. +B + +This method will also set C<< $webpac->{'currnet_filename'} >> if there is +C<< >> tag and C<< $webpac->{'headline'} >> if there is +C<< >> tag. =cut @@ -232,8 +210,7 @@ sub data_structure { $log->debug("format: $format"); my @v; - # FIXME this is a cludge! - if ($format =~ /$WebPAC::Lookup::LOOKUP_REGEX/o) { + if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) { @v = $self->fill_in_to_arr($rec,$format); } else { @v = $self->parse_to_arr($rec,$format); @@ -324,6 +301,46 @@ sub data_structure { } +=head2 apply_format + +Apply format specified in tag with C and +C. + + my $text = $webpac->apply_format($format_name,$format_delimiter,$data); + +Formats can contain C if you need them. + +=cut + +sub apply_format { + my $self = shift; + + my ($name,$delimiter,$data) = @_; + + my $log = $self->_get_logger(); + + if (! $self->{'import_xml'}->{'format'}->{$name}) { + $log->warn(" is not defined in ",$self->{'import_xml_file'}); + return $data; + } + + $log->warn("no delimiter for format $name") if (! $delimiter); + + my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); + + my @data = split(/\Q$delimiter\E/, $data); + + my $out = sprintf($format, @data); + $log->debug("using format $name [$format] on $data to produce: $out"); + + if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) { + return $self->lookup($out); + } else { + return $out; + } + +} + =head1 AUTHOR @@ -338,4 +355,4 @@ under the same terms as Perl itself. =cut -1; # End of WebPAC::Normalise::XML +1; # End of WebPAC::Normalize::XML diff --git a/t/1-lookup.t b/t/1-lookup.t index aa2dbbc..cd7b4f2 100755 --- a/t/1-lookup.t +++ b/t/1-lookup.t @@ -1,7 +1,7 @@ #!/usr/bin/perl -w use strict; -use Test::More tests => 15; +use Test::More tests => 16; use Test::Exception; use blib; @@ -20,6 +20,9 @@ ok(my $lookup = new WebPAC::Lookup( }, ), "new"); +ok(my $regex = $lookup->regex, "regex"); +diag "regex: $regex"; + my $rec = { '000' => [ '001' ], '800' => [ 'foo' ], diff --git a/t/3-normalize-xml.t b/t/3-normalize-xml.t new file mode 100755 index 0000000..85f7bb0 --- /dev/null +++ b/t/3-normalize-xml.t @@ -0,0 +1,26 @@ +#!/usr/bin/perl -w + +use Test::More tests => 6; +use Test::Exception; +use Cwd qw/abs_path/; +use blib; +use strict; + +BEGIN { +use_ok( 'WebPAC::Normalize::XML' ); +} + +ok(my $abs_path = abs_path($0), "abs_path"); +$abs_path =~ s#/[^/]*$#/#; +diag "abs_path: $abs_path"; + +throws_ok { new WebPAC::Normalize::XML() } qr/tag/, "new without tag"; +throws_ok { new WebPAC::Normalize::XML( tag => 'isis' ) } qr/xml_file/, "new without xml_file"; +throws_ok { new WebPAC::Normalize::XML( tag => 'isis', xml_file => 'foo' ) } qr/file.*doesn't exist/, "new with invalid xml_file"; +ok(my $isis = new WebPAC::Normalize::XML( + tag => 'isis', + xml_file => "$abs_path../conf/normalize/isis.xml", + debug => 1, +), "new"); + + -- 2.20.1