Added type="swish_exact" to save data into swish index with boundaries
[webpac] / all2xml.pl
index cb31a28..de6fe0f 100755 (executable)
@@ -96,6 +96,7 @@ sub data2xml {
                $field_usage{$field}++;
 
                my $swish_data = "";
+               my $swish_exact_data = "";
                my $display_data = "";
                my $line_delimiter;
 
@@ -109,9 +110,10 @@ sub data2xml {
 
                        my $repeat_off = 0;             # repeatable offset
 
-                       my ($s,$d,$i) = (1,1,0);        # swish, display default
+                       my ($s,$se,$d,$i) = (1,0,1,0);  # swish, display default
                        $s = 0 if (lc($x->{type}) eq "display");
                        $d = 0 if (lc($x->{type}) eq "swish");
+                       $se = 1 if (lc($x->{type}) eq "swish_exact");
                        ($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index");
 
                        # what will separate last line from this one?
@@ -128,6 +130,32 @@ sub data2xml {
                        my @index_data;
                        my $index_filter;
 
+                       # Apply <format name="..."> from config to $data (tag $x);
+                       # returns $data unchanged if the tag has no format_name.
+                       sub mkformat {
+                               my $x = shift || die "mkformat needs tag reference";
+                               my $data = shift || return;
+                               my $format_name = x($x->{format_name}) || return $data;
+                               my $fmt = x($config->{format}->{$format_name}->{content}) || die "<format name=\"$format_name\"> is not defined!";
+                               my $format_delimiter = x($x->{format_delimiter});
+
+                               # format_delimiter is used as a regex by split
+                               my @data = ($data);
+                               @data = split(/$format_delimiter/,$data) if ($format_delimiter);
+
+                               # count %s placeholders; list assignment in scalar
+                               # context yields a real 0 when there are none
+                               my $nr = () = $fmt =~ /%s/g;
+                               if (scalar(@data) == $nr) {
+                                       return sprintf($fmt,@data);
+                               }
+
+                               # field count mismatch: complain and fall back
+                               # to the unformatted data
+                               print STDERR "mkformat: [$data] can't be split on [$format_delimiter] to $nr fields!\n";
+                               return $data;
+                       }
+
                        # while because of repeatable fields
                        while ($swish || $display) {
                                ($swish,$display) = parse_format($type, $format,$row,$repeat_off++,$import2cp);
@@ -143,12 +171,16 @@ sub data2xml {
                                        require "filter/".$filter.".pm";
                                }
                                # type="swish" ; field for swish
-                               if ($s && $swish) {
-                                       if ($filter) {
+                               if ($swish) {
+                                       if ($filter && ($s || $se)) {
                                                no strict 'refs';
-                                               $swish_data .= join(" ",&$filter($swish));
+                                               my $tmp = join(" ",&$filter($swish)) if ($s || $se);
+                                               $swish_data .= $tmp if ($s);
+                                               $swish_exact_data .= $tmp if ($se);
+
                                        } else {
-                                               $swish_data .= $swish;
+                                               $swish_data .= $swish if ($s);
+                                               $swish_exact_data .= $swish if ($se);
                                        }
                                }
 
@@ -160,12 +192,16 @@ sub data2xml {
                                        }
                                        if ($filter) {
                                                no strict 'refs';
-                                               $display_data .= join($delimiter,&$filter($display));
+                                               if ($display_data) {
+                                                       $display_data .= $delimiter.join($delimiter,mkformat($x,&$filter($display)));
+                                               } else {
+                                                       $display_data = join($delimiter,mkformat($x,&$filter($display)));
+                                               }
                                        } else {
                                                if ($display_data) {
-                                                       $display_data .= $delimiter.$display;
+                                                       $display_data .= $delimiter.mkformat($x,$display);
                                                } else {
-                                                       $display_data .= $display;
+                                                       $display_data = mkformat($x,$display);
                                                }
                                        }
                                }
@@ -181,8 +217,8 @@ sub data2xml {
                        if (@index_data) {
                                if ($index_filter) {
                                        no strict 'refs';
-                                       foreach my $d (&$index_filter(@index_data)) {
-                                               $index->insert($field, $d, $path);
+                                       foreach my $d (@index_data) {
+                                               $index->insert($field, &$index_filter($d), $path);
                                        }
                                } else {
                                        foreach my $d (@index_data) {
@@ -201,6 +237,8 @@ sub data2xml {
                        my ($s,$d,$i) = (1,1,0);        # swish, display default
                        $s = 0 if (lc($x->{type}) eq "display");
                        $d = 0 if (lc($x->{type}) eq "swish");
+                       # no support for swish exact in config.
+                       # IMHO, it's useless
                        ($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index");
 
                        if ($val) {
@@ -243,6 +281,15 @@ sub data2xml {
                        $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data));
                }
 
+               if ($swish_exact_data) {
+                       $swish_exact_data =~ s/ +/ /g;
+                       $swish_exact_data =~ s/ +$//g;
+
+                       # add delimiters before and after word.
+                       # That is required to produce exact match
+                       $xml .= xmlify($field."_swish_exact", unac_string($codepage,'xxbxx '.$swish_exact_data.' xxexx'));
+               }
+
 
        }
 
@@ -289,7 +336,7 @@ print STDERR "reading ./import_xml/$type.xml\n";
        my $type_base = $type;
        $type_base =~ s/_.+$//g;
 
-       $config=XMLin("./import_xml/$type.xml", forcearray => [ $type2tag{$type_base}, 'config' ], forcecontent => 1);
+       $config=XMLin("./import_xml/$type.xml", forcearray => [ $type2tag{$type_base}, 'config', 'format' ], forcecontent => 1);
 
        # output current progress indicator
        my $last_p = 0;
@@ -328,6 +375,34 @@ print STDERR "using: $type...\n";
                $import2cp = Text::Iconv->new($config->{isis_codepage},$codepage);
                my $db = OpenIsis::open( $isis_db );
 
+               # Remove the OpenIsis .TXT and .PTR files of a database (whether
+               # or not they are zero length) and re-open the database.
+               # Returns the result of OpenIsis::open (callers test it for -1).
+               sub check_txt_db {
+                       my $isis_db = shift || die "need isis database name";
+                       foreach my $ext (qw(TXT PTR)) {
+                               my $file = $isis_db.".".$ext;
+                               next unless (-e $file);
+                               print STDERR "WARNING: removing $file OpenIsis database...\n";
+                               # "or", not "||": "||" would bind to the file name and
+                               # an unlink failure would never be reported
+                               unlink($file) or warn "FATAL: unlink error on '$file': $!";
+                       }
+
+                       # always return the result of a fresh open, so callers never
+                       # store a non-handle value when no file had to be removed
+                       return OpenIsis::open( $isis_db );
+               }
+
+               # EOF error
+               if ($db == -1) {
+                       $db = check_txt_db($isis_db);
+                       if ($db == -1) {
+                               print STDERR "FATAL: OpenIsis can't open zero size file $isis_db\n";
+                               next;
+                       }
+               }
+
                # OpenIsis::ERR_BADF 
                if ($db == -4) {
                        print STDERR "FATAL: OpenIsis can't find file $isis_db\n";
@@ -343,6 +418,13 @@ print STDERR "using: $type...\n";
 
                my $max_rowid = OpenIsis::maxRowid( $db );
 
+               # if 0 records, try to erase isis .txt database
+               if ($max_rowid == 0) {
+                       # force removal of database
+                       $db = check_txt_db($isis_db);
+                       $max_rowid = OpenIsis::maxRowid( $db );
+               }
+
                print STDERR "Reading database: $isis_db [$max_rowid rows]\n";
 
                my $path = $database;
@@ -364,6 +446,10 @@ print STDERR "using: $type...\n";
                                }
                        }
                }
+               # for this to work with current version of OpenIsis (0.9.0)
+               # you might need my patch from
+               # http://www.rot13.org/~dpavlin/projects/openisis-0.9.0-perl_close.diff
+               OpenIsis::close($db);
                print STDERR "\n";
 
        } elsif ($type_base eq "excel") {