add filter="name" for fields (to correct strange input data or make variations
authorDobrica Pavlinusic <dpavlin@rot13.org>
Sat, 22 Feb 2003 23:49:22 +0000 (23:49 +0000)
committerDobrica Pavlinusic <dpavlin@rot13.org>
Sat, 22 Feb 2003 23:49:22 +0000 (23:49 +0000)
for indexing)

git-svn-id: file:///home/dpavlin/private/svn/webpac/trunk@20 13eb9ef6-21d5-0310-b721-a9d68796d827

all2xml.pl
doc/formating_xml.txt [new file with mode: 0644]
filter/isn_swish.pm [new file with mode: 0755]

index ba1fe0f..d49096d 100755 (executable)
@@ -78,25 +78,53 @@ sub isis2xml {
                        ($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index");
 #print STDERR "## s: $s d: $d i: $i ## $format ##\n";  
 
+                       # FIX: this is an ugly, UGLY cludge: the string is returned
+                       # in UTF8 encoding, but as if the source charset
+                       # were ISO8859-1 and not some other. This breaks other
+                       # character encodings, so we convert it first
+                       # back to ISO8859-1
                        $format = $cludge_codepage->convert($format);
+
                        my ($swish,$display) = parse_format($format,$row);
 #print STDERR "s: $swish\nd: $display\n" if ($swish);
 
 #print STDERR "swish: $swish<-- display: $display<--\n";
-                       # FIX: this is ugly, UGLY, cludge: OpenIsis return
-                       # UTF8 encoding of strings, but as if source charset
-                       # is ISO8859-1 and not some other. This breaks our
-                       # isis character encoding, so we convert it first
-                       # back to ISO8859-1 (which can actually be different
-                       # encoding in isis)
 
-                       $swish_data .= $swish if ($s && $swish);
-                       $display_data .= $display if ($d && $display);
+                       # filter="name" ; filter this field through
+                       # filter/[name].pm
+                       my $filter;
+                       if ($x->{filter}) {
+                               $filter = "filter/".$x->{filter}.".pm";
+                               require $filter;
+                       }
+                       # type="swish" ; field for swish
+                       if ($s && $swish) {
+                               if ($filter) {
+                                       $swish_data .= join(" ",&filter($swish));
+                               } else {
+                                       $swish_data .= $swish if ($s && $swish);
+                               }
+                       }
 
-                       # insert into index
+                       # type="display" ; field for display
+                       if ($d && $display) {
+                               if ($filter) {
+                                       $display_data .= join(" ",&filter($display));
+                               } else {
+                                       $display_data .= $display if ($s && $display);
+                               }
+                       }
+                                       
+                       # type="index" ; insert into index
                        if ($i && $display) {
                                my $index_data = $index_codepage->convert($display) || $display;
-                               $index->insert($field, $index_data, $db_dir);
+                               if ($filter) {
+                                       foreach my $d (&filter($index_data)) {
+                                               $index->insert($field, $d, $db_dir);
+                                       }
+                               } else {
+                                       $index->insert($field, $index_data, $db_dir);
+                               }
                        }
                }
 
@@ -117,8 +145,10 @@ sub isis2xml {
                                        $field_name = $config->{indexer}->{$field}->{name_singular}."#-#";
                                } elsif ($config->{indexer}->{$field}->{name_plural}) {
                                        $field_name = $config->{indexer}->{$field}->{name_plural}."#-#";
-                               } else {
+                               } elsif ($config->{indexer}->{$field}->{name}) {
                                        $field_name = $config->{indexer}->{$field}->{name}."#-#";
+                               } else {
+                                       print STDERR "WARNING: field '$field' doesn't have 'name' attribute!";
                                }
                                if ($field_name) {
                                        $html .= $xml_codepage->convert($field_name);
@@ -128,6 +158,10 @@ sub isis2xml {
                }
                if ($swish_data) {
                        my $i = Text::Iconv->new($config->{isis_codepage},'ISO8859-2');
+                       # remove extra spaces
+                       $swish_data =~ s/ +/ /g;
+                       $swish_data =~ s/ +$//g;
+
                        $swish_data = $i->convert($swish_data);
                        $xml .= xmlify($field."_swish",unac_string('ISO8859-2',$swish_data));
                        #$swish_data = $isis_codepage->convert($swish_data)."##" || $swish_data;
diff --git a/doc/formating_xml.txt b/doc/formating_xml.txt
new file mode 100644 (file)
index 0000000..3d41539
--- /dev/null
@@ -0,0 +1,38 @@
+
+
+<isis type="swish|display|index"
+       append="1"
+       format="one of formatting functions below"
+       filter="name"
+>_pre_000x_sep_000x_sep_000x_post_</isis>
+
+format:
+       upper [from_char[,to_char]]
+
+               makes field UPPERCASE
+               default: whole field
+
+
+       upper_w [word_nr[,...]]
+
+               make words in field UPPERCASE
+               default: first word
+
+       substr from[,len]
+
+               returns the substring that starts at character "from" and is "len" characters long
+               default: len - rest of the string
+       
+       skip2nr
+               skip all alphanumeric characters and return just
+               the numbers after them, e.g. to output "1992." from "cop. 1992."
+
+       initial [word_nr[,...]]
+       
+               make word(s) into initials (upper case first char and
+               append dot after it)
+               default: first word
+
+
+filter:
+       name of the filter, which is stored in filter/[name].pm
diff --git a/filter/isn_swish.pm b/filter/isn_swish.pm
new file mode 100755 (executable)
index 0000000..a2202bd
--- /dev/null
@@ -0,0 +1,16 @@
+#!/usr/bin/perl -w
+
+# example filter: for each ISBN or ISSN return both the value as given and a version without hyphens
+
+sub filter { # takes a list of number strings; returns a list with two entries per argument
+       my @out;
+       foreach my $nr1 (@_) {
+               push @out,$nr1; # first entry: the value exactly as passed in
+               my $nr2 = $nr1; # work on a copy, since $nr1 aliases the caller's argument
+               $nr2 =~ s/\-//g; # strip every "-" from the copy
+               push @out,$nr2; # second entry: hyphen-free copy (pushed even if identical to the original)
+       }
+       return @out; # order: original, stripped, original, stripped, ...
+}
+
+1;