r11727@llin: dpavlin | 2005-12-16 19:41:08 +0100
author Dobrica Pavlinusic <dpavlin@rot13.org>
Fri, 16 Dec 2005 14:40:55 +0000 (14:40 +0000)
committer Dobrica Pavlinusic <dpavlin@rot13.org>
Fri, 16 Dec 2005 14:40:55 +0000 (14:40 +0000)
 added filter{regex(s/foo/bar/)} [2.00_5]

git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@260 07558da8-63fa-0310-ba24-9fe276d99e06

TODO
conf/normalize/isis_ffzg.xml
lib/WebPAC.pm
lib/WebPAC/Normalize.pm
t/3-normalize-xml.t
t/data/normalize.xml

diff --git a/TODO b/TODO
index b43ff84..0c2eac4 100644 (file)
--- a/TODO
+++ b/TODO
@@ -2,6 +2,8 @@
 + support multiple inputs to single database [2.00_1]
 + lookups now works [2.00_3]
 + create links to other databases [2.00_4]
++ add regexp filter [2.00_5]
+- add more input formats (MARC::Fast and others)
 - delete unused files in database directories
 - scoring for various fields in input/*.xml
 - write pure perl Search::HyperEstraier
index 811bdae..7acc4bd 100644 (file)
@@ -10,8 +10,8 @@
 >
 
 <!--
-       <isis type="swish|display|index">_pre_000x_sep_000x_sep_000x_post_</isis>
-       <config type="swish|display">name of var from config file</config>
+       <isis type="search|display|index">_pre_000x_sep_000x_sep_000x_post_</isis>
+       <config type="search|display">name of var from config file</config>
 -->
 
 
@@ -51,7 +51,7 @@
        </MFN>
 
        <ISBN>
-               <isis type="display">filter{isn_swish}v10</isis>
+               <isis type="display">v10</isis>
        </ISBN>
 
        <ISSN>
@@ -59,7 +59,7 @@
        </ISSN>
 
        <IdentificationNumbers name="ISN">
-               <isis type="swish">filter{isn_swish}v10 v11</isis>
+               <isis type="search">v10 v11</isis>
        </IdentificationNumbers>
        
        <Language>
@@ -68,7 +68,7 @@
        </Language>
        
        <TitleProper>
-               <isis>v200^a</isis>
+               <isis>filter{regex(s/&lt;[^>]*>//)}v200^a</isis>
        </TitleProper>
 
        <Subtitle>
        </UncontrolledTerms>
        
        <UDC_All>
-               <isis type="swish">v675</isis>
+               <isis type="search">v675</isis>
        </UDC_All>
        
        <UDC>
        <type name="Vrsta građe: " order="100">
                <!-- added via xml tag in .conf -->
                <config type="display">materialtype</config>
-               <config type="swish">material_code</config>
-               <isis type="swish">v200^e</isis>
+               <config type="search">material_code</config>
+               <isis type="search">v200^e</isis>
        </type>
        
        <form name="Format: " order="101">
                <!-- added via xml tag in .conf -->
-               <config type="swish">material_form</config>
+               <config type="search">material_form</config>
        </form>
        
        <library name="Knjižnica: " order="110">
                        <delimiter>, </delimiter>
                        <value>library_url</value>
                </config>
-               <config type="swish">library_code</config>
+               <config type="search">library_code</config>
        </library>
 
     </indexer>
index abab314..09f32e6 100644 (file)
@@ -13,7 +13,7 @@ Version 2.00
 
 =cut
 
-our $VERSION = '2.00_4';
+our $VERSION = '2.00_5';
 
 =head1 SYNOPSIS
 
index f65d82f..1a8f4ac 100644 (file)
@@ -11,11 +11,11 @@ WebPAC::Normalize - data mungling for normalisation
 
 =head1 VERSION
 
-Version 0.05
+Version 0.06
 
 =cut
 
-our $VERSION = '0.05';
+our $VERSION = '0.06';
 
 =head1 SYNOPSIS
 
@@ -47,6 +47,10 @@ optional C<filter{filter_name}> at B<begining of format> will apply perl
 code defined as code ref on format after field substitution to producing
 output
 
+There is one built-in filter called C<regex> which can be used like this:
+
+  filter{regex(s/foo/bar/)}
+
 =item *
 
 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
@@ -119,6 +123,15 @@ sub new {
 
        $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
 
+       if ($self->{filter} && ! $self->{filter}->{regex}) {
+               $log->debug("adding built-in filter regex");
+               $self->{filter}->{regex} = sub {
+                       my ($val, $regex) = @_;
+                       eval "\$val =~ $regex";
+                       return $val;
+               };
+       }
+
        $self ? return $self : return undef;
 }
 
@@ -278,6 +291,14 @@ return output or nothing depending on eval code.
 
  my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
 
+Filters are implemented here. The simple form of a filter looks like this:
+
+  filter{name_of_filter}
+
+Filters can also take a variable number of parameters, like this:
+
+  filter{name_of_filter(param,param,param)}
+
 =cut
 
 sub parse {
@@ -349,11 +370,20 @@ sub parse {
                return if (! $self->_eval($eval));
        }
        
-       if ($filter_name && $self->{'filter'}->{$filter_name}) {
-               $log->debug("about to filter{$filter_name} format: $out");
-               $out = $self->{'filter'}->{$filter_name}->($out);
-               return unless(defined($out));
-               $log->debug("filter result: $out");
+       if ($filter_name) {
+               my @filter_args;
+               if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
+                       @filter_args = split(/,/, $2);
+               }
+               if ($self->{'filter'}->{$filter_name}) {
+                       $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
+                       unshift @filter_args, $out;
+                       $out = $self->{'filter'}->{$filter_name}->(@filter_args);
+                       return unless(defined($out));
+                       $log->debug("filter result: $out");
+               } else {
+                       $log->warn("trying to use undefined filter $filter_name");
+               }
        }
 
        return $out;
index 9743bbe..77dafc6 100755 (executable)
@@ -18,7 +18,16 @@ diag "abs_path: $abs_path";
 throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup";
 throws_ok { new WebPAC::Normalize::XML( lookup => 'bar' ) } qr/pair/, "lookup without lookup_regex";
 
-ok(my $n = new WebPAC::Normalize::XML( debug => 0 ), "new");
+ok(my $n = new WebPAC::Normalize::XML(
+       debug => 1,
+       filter => {
+               regex => sub {
+                       my ($val, $regex) = @_;
+                       eval "\$val =~ $regex";
+                       return $val;
+               },
+       },
+), "new");
 
 throws_ok { $n->open() } qr/tag/, "open without tag";
 throws_ok { $n->open( tag => 'isis' ) } qr/xml_file/, "open without xml_file";
@@ -131,5 +140,5 @@ foreach my $fld (keys %$rec) {
 
 ok(my $ds = $n->data_structure( $rec ), "data_structure");
 
-diag Dumper($rec, $ds);
+#diag Dumper($rec, $ds);
 
index 8e6485b..407db51 100644 (file)
@@ -1,18 +1,6 @@
 <?xml version="1.0" encoding="ISO-8859-2"?>
 <config>
     <indexer>
-       <headline>
-               <isis type="display">v200^a : v200^e / v200^f, v210^d.</isis>
-               <config >
-                       <delimiter> (</delimiter>
-                       <value>materialtype</value>
-               </config>
-               <config >
-                       <delimiter>) ---> Lokacija: </delimiter>
-                       <value>library</value>
-               </config>
-       </headline>
-
        <filename>
                <isis type="filename">out/v000.html</isis>      
        </filename>
@@ -75,7 +63,7 @@
        </NamePublisher>
 
        <DatePublication> 
-               <isis>v210^d</isis>
+               <isis>filter{regex(s/cop.\s+//)}v210^d</isis>
        </DatePublication>
 
        <PhysicalDescription>