filter fix && optimisation
[webpac] / all2xml.pl
index 8b6b29d..ab0474a 100755 (executable)
@@ -8,6 +8,7 @@ use XML::Simple;
 use Text::Unaccent 1.02;       # 1.01 won't compile on my platform,
 use Text::Iconv;
 use Config::IniFiles;
+use Encode;
 
 $|=1;
 
@@ -33,19 +34,23 @@ getopts('d:m:qs', \%opts);
 
 my $db_dir;
 
-#die "usage: $0 -d [database_dir] -m [database1,database2] " if (! %opts);
+Text::Iconv->raise_error(1);     # Conversion errors raise exceptions
 
-#print Dumper($config->{indexer});
-#print "-" x 70,"\n";
+# This is the encoding of all files on disk, including the import_xml/*.xml
+# and filter/*.pm files! It is also used to store strings internally in perl!
+my $codepage = 'ISO-8859-2';
 
-Text::Iconv->raise_error(1);     # Conversion errors raise exceptions
+my $utf2cp = Text::Iconv->new('UTF-8',$codepage);
+# x(): re-encode a UTF-8 string taken from the XML files into $codepage
+sub x {
+       return $utf2cp->convert(shift);
+}
+
+# decode isis import codepage
+my $isis2cp;
 
-#my $isis_codepage = Text::Iconv->new($config->{isis_codepage},'UTF8');
-#my $index_codepage = Text::Iconv->new($config->{isis_codepage},$config->{index_codepage});
-my $isis_codepage;
-my $index_codepage;
-my $cludge_codepage = Text::Iconv->new('UTF8','ISO8859-1');
-my $xml_codepage;
+# outgoing xml must be in UTF-8
+my $cp2utf = Text::Iconv->new($codepage,'UTF-8');
 
 sub isis2xml {
 
@@ -62,46 +67,104 @@ sub isis2xml {
 
        my %field_usage;        # counter for usage of each field
 
-       foreach my $field (keys %{$config->{indexer}}) {
+       # sort subroutine using order="" attribute
+       sub by_order {
+               # a missing order counts as 0 (such fields sort first); this
+               # keeps the comparison transitive, so the result is predictable
+               my $order_a = $config->{indexer}->{$a}->{order} || 0;
+               my $order_b = $config->{indexer}->{$b}->{order} || 0;
+               return $order_a <=> $order_b;
+       }
+
+       foreach my $field (sort by_order keys %{$config->{indexer}}) {
+
+               $field=x($field);
 
                $field_usage{$field}++;
 
                my $swish_data = "";
                my $display_data = "";
-               my $index_data = "";
+               my $line_delimiter;
+
+               my ($swish,$display);
 
                foreach my $x (@{$config->{indexer}->{$field}->{isis}}) {
 
-                       my $format = $x->{content};
+                       my $format = x($x->{content});
+                       my $delimiter = x($x->{delimiter}) || ' ';
+
+                       my $isis_i = 0;         # isis repeatable offset
+
                        my ($s,$d,$i) = (1,1,0);        # swish, display default
                        $s = 0 if (lc($x->{type}) eq "display");
                        $d = 0 if (lc($x->{type}) eq "swish");
                        ($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index");
-#print STDERR "## s: $s d: $d i: $i ## $format ##\n";  
-
-                       $format = $cludge_codepage->convert($format);
-                       my ($swish,$display) = parse_format($format,$row);
-#print STDERR "s: $swish\nd: $display\n" if ($swish);
-
-#print STDERR "swish: $swish<-- display: $display<--\n";
-                       # FIX: this is ugly, UGLY, cludge: OpenIsis return
-                       # UTF8 encoding of strings, but as if source charset
-                       # is ISO8859-1 and not some other. This breaks our
-                       # isis character encoding, so we convert it first
-                       # back to ISO8859-1 (which can actually be different
-                       # encoding in isis)
-
-                       $swish_data .= $swish if ($s && $swish);
-                       $display_data .= $display if ($d && $display);
-                       $index_data .= $display if ($i && $display);
+
+                       # what will separate last line from this one?
+                       if ($display_data && $x->{append} && $x->{append} eq "1") {
+                               $line_delimiter = ' ';
+                       } elsif ($display_data) {
+                               $line_delimiter = '<br/>';
+                       }
+
+                       # init vars so that we go into while...
+                       ($swish,$display) = (1,1);
+
+                       while ($swish || $display) {
+                               ($swish,$display) = parse_format($format,$row,$isis_i++,$isis2cp);
+
+                               # filter="name" ; filter this field through
+                               # filter/[name].pm
+                               my $filter = $x->{filter};
+                               if ($filter) {
+                                       require "filter/".$filter.".pm";
+                               }
+                               # type="swish" ; field for swish
+                               if ($s && $swish) {
+                                       if ($filter) {
+                                               no strict 'refs';
+                                               $swish_data .= join(" ",&$filter($swish));
+print STDERR "#### $swish_data\n";
+                                       } else {
+                                               $swish_data .= $swish;
+                                       }
+                               }
+
+                               # type="display" ; field for display
+                               if ($d && $display) {
+                                       if ($line_delimiter && $display_data) {
+                                               $display_data .= $line_delimiter;
+                                               undef $line_delimiter;
+                                       }
+                                       if ($filter) {
+                                               no strict 'refs';
+                                               $display_data .= join($delimiter,&$filter($display));
+                                       } else {
+                                               if ($display_data) {
+                                                       $display_data .= $delimiter.$display;
+                                               } else {
+                                                       $display_data .= $display;
+                                               }
+                                       }
+                               }
+                                               
+                               # type="index" ; insert into index
+                               if ($i && $display) {
+                                       my $index_data = $display;
+                                       if ($filter) {
+                                               no strict 'refs';
+                                               foreach my $d (&$filter($index_data)) {
+                                                       $index->insert($field, $d, $db_dir);
+                                               }
+                                       } else {
+                                               $index->insert($field, $index_data, $db_dir);
+                                       }
+                               }
+                       }
                }
 
 
-#print STDERR "s_d: $swish_data\nd_d: $display_data\n" if ($swish_data);
                if ($display_data) {
-                       $display_data = $isis_codepage->convert($display_data) || die "Can't convert '$display_data' !";
-                       # FIX: this is removed and replaced by html tag.
-                       #$xml .= xmlify($field."_display", $display_data);
 
                        if ($field eq "headline") {
                                $xml .= xmlify("headline", $display_data);
@@ -113,29 +176,26 @@ sub isis2xml {
                                        $field_name = $config->{indexer}->{$field}->{name_singular}."#-#";
                                } elsif ($config->{indexer}->{$field}->{name_plural}) {
                                        $field_name = $config->{indexer}->{$field}->{name_plural}."#-#";
-                               } else {
+                               } elsif ($config->{indexer}->{$field}->{name}) {
                                        $field_name = $config->{indexer}->{$field}->{name}."#-#";
+                               } else {
+                                       print STDERR "WARNING: field '$field' doesn't have 'name' attribute!";
                                }
                                if ($field_name) {
-                                       $html .= $xml_codepage->convert($field_name);
+                                       $html .= x($field_name);
                                }
                                $html .= $display_data."###\n";
                        }
                }
                if ($swish_data) {
-                       my $i = Text::Iconv->new($config->{isis_codepage},'ISO8859-2');
-                       $swish_data = $i->convert($swish_data);
-                       $xml .= xmlify($field."_swish",unac_string('ISO8859-2',$swish_data));
-                       #$swish_data = $isis_codepage->convert($swish_data)."##" || $swish_data;
-                       #$xml .= xmlify($field."_swish",unac_string($config->{isis_codepage},$swish_data));
-               }
+                       # remove extra spaces
+                       $swish_data =~ s/ +/ /g;
+                       $swish_data =~ s/ +$//g;
 
-               # index
-               if ($index_data && $index_data ne "") {
-                       $index_data = $index_codepage->convert($index_data) || $index_data;
-                       $index->insert($field, $index_data, $db_dir);
+                       $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data));
                }
 
+
        }
 
        # dump formatted output in <html>
@@ -144,7 +204,6 @@ sub isis2xml {
        }
        
        if ($xml) {
-#print STDERR "x: $xml\n";
                $xml .= $add_xml if ($add_xml);
                return "<xml>\n$xml</xml>\n";
        } else {
@@ -158,15 +217,13 @@ my $cfg = new Config::IniFiles( -file => $config_file );
 
 foreach my $database ($cfg->Sections) {
 
-       my $isis_db = $cfg -> val($database, 'isis_db');
-       my $type = $cfg -> val($database, 'type');
-       my $add_xml = $cfg -> val($database, 'xml');
+       my $isis_db = $cfg -> val($database, 'isis_db') || die "$database doesn't have 'isis_db' defined!";
+       my $type = $cfg -> val($database, 'type') || die "$database doesn't have 'type' defined";
+       my $add_xml = $cfg -> val($database, 'xml');    # optional
 
-       # read configuration for this type
        $config=XMLin("./import_xml/$type.xml", forcearray => [ 'isis' ], forcecontent => 1);
-       $isis_codepage = Text::Iconv->new($config->{isis_codepage},'UTF8');
-       $index_codepage = Text::Iconv->new($config->{isis_codepage},$config->{index_codepage});
-       $xml_codepage = Text::Iconv->new($cfg->val($database,'xml_codepage'),'UTF8');
+
+       $isis2cp = Text::Iconv->new($config->{isis_codepage},$codepage);
 
        my $db = OpenIsis::open( $isis_db );
        if (0) {
@@ -189,7 +246,6 @@ foreach my $database ($cfg->Sections) {
        for (my $row_id = 1; $row_id <= $max_rowid; $row_id++ ) {
                my $row = OpenIsis::read( $db, $row_id );
                if ($row && $row->{mfn}) {
-#print STDERR "mfn: ",$row->{mfn},"\n";
                        # output current process indicator
                        my $p = int($row->{mfn} * 100 / $max_rowid);
                        if ($p != $last_p) {
@@ -197,8 +253,8 @@ foreach my $database ($cfg->Sections) {
                                $last_p = $p;
                        }
 
-                       if (my $xml = isis2xml($row,$add_xml)) {
-#print STDERR "--ret-->$xml\n";
+                       if (my $xml = $cp2utf->convert(isis2xml($row,$add_xml))) {
+                               use bytes;      # as opposed to chars
                                print "Path-Name: $path#".int($row->{mfn})."\n";
                                print "Content-Length: ".(length($xml)+1)."\n";
                                print "Document-Type: XML\n\n$xml\n";