From c180bbdaeef230d7bfe27b336f7e857dc0aa76df Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Sat, 22 Feb 2003 23:49:22 +0000 Subject: [PATCH] add filter="name" for fields (to correct strange input data or make variations for indexing) git-svn-id: file:///home/dpavlin/private/svn/webpac/trunk@20 13eb9ef6-21d5-0310-b721-a9d68796d827 --- all2xml.pl | 56 ++++++++++++++++++++++++++++++++++--------- doc/formating_xml.txt | 38 +++++++++++++++++++++++++++++ filter/isn_swish.pm | 16 +++++++++++++ 3 files changed, 99 insertions(+), 11 deletions(-) create mode 100644 doc/formating_xml.txt create mode 100755 filter/isn_swish.pm diff --git a/all2xml.pl b/all2xml.pl index ba1fe0f..d49096d 100755 --- a/all2xml.pl +++ b/all2xml.pl @@ -78,25 +78,53 @@ sub isis2xml { ($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index"); #print STDERR "## s: $s d: $d i: $i ## $format ##\n"; + # FIX: this is ugly, UGLY, cludge: string is returned + # in UTF8 encoding, but as if source charset + # is ISO8859-1 and not some other. This breaks other + # character encodings, so we convert it first + # back to ISO8859-1 $format = $cludge_codepage->convert($format); + my ($swish,$display) = parse_format($format,$row); #print STDERR "s: $swish\nd: $display\n" if ($swish); #print STDERR "swish: $swish<-- display: $display<--\n"; - # FIX: this is ugly, UGLY, cludge: OpenIsis return - # UTF8 encoding of strings, but as if source charset - # is ISO8859-1 and not some other. 
This breaks our - # isis character encoding, so we convert it first - # back to ISO8859-1 (which can actually be different - # encoding in isis) - $swish_data .= $swish if ($s && $swish); - $display_data .= $display if ($d && $display); + # filter="name" ; filter this field through + # filter/[name].pm (NOTE(review): filter files define a plain "sub filter" and require loads a file only once, so if different filter names are mixed in one run the last loaded one stays active - confirm only one filter is configured) + my $filter; + if ($x->{filter}) { + $filter = "filter/".$x->{filter}.".pm"; + require $filter; + } + # type="swish" ; field for swish + if ($s && $swish) { + if ($filter) { + $swish_data .= join(" ",&filter($swish)); + } else { + $swish_data .= $swish if ($s && $swish); + } + } - # insert into index + # type="display" ; field for display + if ($d && $display) { + if ($filter) { + $display_data .= join(" ",&filter($display)); + } else { + $display_data .= $display if ($d && $display); + } + } + + # type="index" ; insert into index if ($i && $display) { my $index_data = $index_codepage->convert($display) || $display; - $index->insert($field, $index_data, $db_dir); + if ($filter) { + foreach my $d (&filter($index_data)) { + $index->insert($field, $d, $db_dir); + } + } else { + $index->insert($field, $index_data, $db_dir); + } } } @@ -117,8 +145,10 @@ sub isis2xml { $field_name = $config->{indexer}->{$field}->{name_singular}."#-#"; } elsif ($config->{indexer}->{$field}->{name_plural}) { $field_name = $config->{indexer}->{$field}->{name_plural}."#-#"; - } else { + } elsif ($config->{indexer}->{$field}->{name}) { $field_name = $config->{indexer}->{$field}->{name}."#-#"; + } else { + print STDERR "WARNING: field '$field' doesn't have 'name' attribute!\n"; } if ($field_name) { $html .= $xml_codepage->convert($field_name); @@ -128,6 +158,10 @@ sub isis2xml { } if ($swish_data) { my $i = Text::Iconv->new($config->{isis_codepage},'ISO8859-2'); + # remove extra spaces + $swish_data =~ s/ +/ /g; + $swish_data =~ s/ +$//g; + $swish_data = $i->convert($swish_data); $xml .= xmlify($field."_swish",unac_string('ISO8859-2',$swish_data)); #$swish_data = 
$isis_codepage->convert($swish_data)."##" || $swish_data; diff --git a/doc/formating_xml.txt b/doc/formating_xml.txt new file mode 100644 index 0000000..3d41539 --- /dev/null +++ b/doc/formating_xml.txt @@ -0,0 +1,38 @@ + + +_pre_000x_sep_000x_sep_000x_post_ + +format: + upper [from_char[,to_char]] + + makes field UPPERCASE + default: whole field + + + upper_w [word_nr[,...]] + + make words in field UPPERCASE + default: first word + + substr from[,len] + + returns substring starting at character 'from' with length 'len' + default: len - rest of the string + + skip2nr + skip all alphanumeric characters and return just + numbers after them. To output "1992." from "cop. 1992." + + initial [word_nr[,...]] + + make word(s) into initials (upper case first char and + append dot after it) + default: first word + + +filter: + name of filter which is stored in filter/[name].pm diff --git a/filter/isn_swish.pm b/filter/isn_swish.pm new file mode 100755 index 0000000..a2202bd --- /dev/null +++ b/filter/isn_swish.pm @@ -0,0 +1,16 @@ +#!/usr/bin/perl -w + +# example filter: returns each ISBN/ISSN passed in twice, as given and with hyphens removed + +sub filter { + my @out; + foreach my $nr1 (@_) { + push @out,$nr1; # save original + my $nr2 = $nr1; + $nr2 =~ s/\-//g; + push @out,$nr2; # save version without hyphens + } + return @out; +} + +1; -- 2.20.1