From 1abdcaf9bd665b1a6f5972348cc4784e57a7e549 Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Fri, 16 Dec 2005 14:40:55 +0000 Subject: [PATCH] r11727@llin: dpavlin | 2005-12-16 19:41:08 +0100 added filter{regex(s/foo/bar/)} [2.00_5] git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@260 07558da8-63fa-0310-ba24-9fe276d99e06 --- TODO | 2 ++ conf/normalize/isis_ffzg.xml | 20 ++++++++-------- lib/WebPAC.pm | 2 +- lib/WebPAC/Normalize.pm | 44 ++++++++++++++++++++++++++++++------ t/3-normalize-xml.t | 13 +++++++++-- t/data/normalize.xml | 14 +----------- 6 files changed, 62 insertions(+), 33 deletions(-) diff --git a/TODO b/TODO index b43ff84..0c2eac4 100644 --- a/TODO +++ b/TODO @@ -2,6 +2,8 @@ + support multiple inputs to single database [2.00_1] + lookups now works [2.00_3] + create links to other databases [2.00_4] ++ add regexp filter [2.00_5] +- add more input formats (MARC::Fast and others) - delete unused files in database directories - scoring for various fields in input/*.xml - write pure perl Search::HyperEstraier diff --git a/conf/normalize/isis_ffzg.xml b/conf/normalize/isis_ffzg.xml index 811bdae..7acc4bd 100644 --- a/conf/normalize/isis_ffzg.xml +++ b/conf/normalize/isis_ffzg.xml @@ -10,8 +10,8 @@ > @@ -51,7 +51,7 @@ - filter{isn_swish}v10 + v10 @@ -59,7 +59,7 @@ - filter{isn_swish}v10 v11 + v10 v11 @@ -68,7 +68,7 @@ - v200^a + filter{regex(s/<[^>]*>/)}v200^a @@ -188,7 +188,7 @@ - v675 + v675 @@ -258,13 +258,13 @@ materialtype - material_code - v200^e + material_code + v200^e
- material_form + material_form
@@ -274,7 +274,7 @@ , library_url - library_code + library_code diff --git a/lib/WebPAC.pm b/lib/WebPAC.pm index abab314..09f32e6 100644 --- a/lib/WebPAC.pm +++ b/lib/WebPAC.pm @@ -13,7 +13,7 @@ Version 2.00 =cut -our $VERSION = '2.00_4'; +our $VERSION = '2.00_5'; =head1 SYNOPSIS diff --git a/lib/WebPAC/Normalize.pm b/lib/WebPAC/Normalize.pm index f65d82f..1a8f4ac 100644 --- a/lib/WebPAC/Normalize.pm +++ b/lib/WebPAC/Normalize.pm @@ -11,11 +11,11 @@ WebPAC::Normalize - data mungling for normalisation =head1 VERSION -Version 0.05 +Version 0.06 =cut -our $VERSION = '0.05'; +our $VERSION = '0.06'; =head1 SYNOPSIS @@ -47,6 +47,10 @@ optional C at B will apply perl code defined as code ref on format after field substitution to producing output +There is one built-in filter called C which can be use like this: + + filter{regex(s/foo/bar/)} + =item * optional C will be then performed. See C. @@ -119,6 +123,15 @@ sub new { $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l); + if ($self->{filter} && ! $self->{filter}->{regex}) { + $log->debug("adding built-in filter regex"); + $self->{filter}->{regex} = sub { + my ($val, $regex) = @_; + eval "\$val =~ $regex"; + return $val; + }; + } + $self ? return $self : return undef; } @@ -278,6 +291,14 @@ return output or nothing depending on eval code. my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i); +Filters are implemented here. While simple form of filters looks like this: + + filter{name_of_filter} + +but, filters can also have variable number of parametars like this: + + filter{name_of_filter(param,param,param)} + =cut sub parse { @@ -349,11 +370,20 @@ sub parse { return if (! $self->_eval($eval)); } - if ($filter_name && $self->{'filter'}->{$filter_name}) { - $log->debug("about to filter{$filter_name} format: $out"); - $out = $self->{'filter'}->{$filter_name}->($out); - return unless(defined($out)); - $log->debug("filter result: $out"); + if ($filter_name) { + my @filter_args; + if ($filter_name =~ s/(\w+)\((.*)\)/$1/) { + @filter_args = split(/,/, $2); + } + if ($self->{'filter'}->{$filter_name}) { + $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args)); + unshift @filter_args, $out; + $out = $self->{'filter'}->{$filter_name}->(@filter_args); + return unless(defined($out)); + $log->debug("filter result: $out"); + } else { + $log->warn("trying to use undefined filter $filter_name"); + } } return $out; diff --git a/t/3-normalize-xml.t b/t/3-normalize-xml.t index 9743bbe..77dafc6 100755 --- a/t/3-normalize-xml.t +++ b/t/3-normalize-xml.t @@ -18,7 +18,16 @@ diag "abs_path: $abs_path"; throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup"; throws_ok { new WebPAC::Normalize::XML( lookup => 'bar' ) } qr/pair/, "lookup without lookup_regex"; -ok(my $n = new WebPAC::Normalize::XML( debug => 0 ), "new"); +ok(my $n = new WebPAC::Normalize::XML( + debug => 1, + filter => { + regex => sub { + my ($val, $regex) = @_; + eval "\$val =~ $regex"; + return $val; + }, + }, +), "new"); throws_ok { $n->open() } qr/tag/, "open without tag"; throws_ok { $n->open( tag => 'isis' ) } qr/xml_file/, "open without xml_file"; @@ -131,5 +140,5 @@ foreach my $fld (keys %$rec) { ok(my $ds = $n->data_structure( $rec ), "data_structure"); -diag Dumper($rec, $ds); +#diag Dumper($rec, $ds); diff --git a/t/data/normalize.xml b/t/data/normalize.xml index 8e6485b..407db51 100644 --- a/t/data/normalize.xml +++ b/t/data/normalize.xml @@ -1,18 +1,6 @@ - - v200^a : v200^e / v200^f, v210^d. - - ( - materialtype - - - ) ---> Lokacija: - library - - - out/v000.html @@ -75,7 +63,7 @@ - v210^d + filter{regex(s/cop.\s+//)}v210^d -- 2.20.1