updated branches to HEAD
authorDobrica Pavlinusic <dpavlin@rot13.org>
Sun, 18 Apr 2004 01:03:27 +0000 (01:03 +0000)
committerDobrica Pavlinusic <dpavlin@rot13.org>
Sun, 18 Apr 2004 01:03:27 +0000 (01:03 +0000)
git-svn-id: file:///home/dpavlin/private/svn/webpac/branches/hidra@321 13eb9ef6-21d5-0310-b721-a9d68796d827

.cvsignore [deleted file]
Makefile
WebPac.pm
all2xml.pl
filter/croascii.pm
filter/unac_string_croatian.pm [deleted file]
global.conf
my_unac_string.pm [new file with mode: 0644]

diff --git a/.cvsignore b/.cvsignore
deleted file mode 100644 (file)
index 287c099..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-out.xml
-run.sh
-swish_isis.index*
-foo*
-bar*
-log
index 6adede1..f13addd 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -8,10 +8,10 @@ profile:
 ver=`date +%Y%m%d`
 
 dist:
-       rcs2log -h rot13.org > ChangeLog
+       svn log -v > ChangeLog
        rm -Rf webpac-$(ver)
        mkdir webpac-$(ver)
-       cvs-files.pl | cpio -pvd webpac-$(ver)/
+       svn ls -R | cpio -pvd webpac-$(ver)/
        tar cfvz ../webpac-$(ver).tar.gz webpac-$(ver)/
        rm -Rf webpac-$(ver)
 
index 8139173..c187dee 100644 (file)
--- a/WebPac.pm
+++ b/WebPac.pm
@@ -28,20 +28,24 @@ my $MAX_HITS = $cfg_global->val('webpac', 'max_hits') || 0;
 my $ON_PAGE =$cfg_global->val('webpac', 'on_page') || 10;
 my $MIN_WILDCARD =$cfg_global->val('webpac', 'min_wildcard') || 1;
 my $TEMPLATE =$cfg_global->val('webpac', 'template');
-my $UNAC_FILTER =$cfg_global->val('global', 'unac_filter');
+my $UNAC_FILTER =$cfg_global->val('global', 'my_unac_filter');
 my $BASE_PATH =$cfg_global->val('webpac', 'base_path');
 # for pager
 my $pages_per_set = $cfg_global->val('webpac', 'pages_per_set') || 10;
 
+Text::Iconv->raise_error(0);     # Conversion errors raise exceptions
+
+my $from_utf8 = Text::Iconv->new('UTF8', $CHARSET);
 
 if ($UNAC_FILTER) {
        require $UNAC_FILTER;
+} else {
+       sub WebPac::my_unac_string {
+               my ($charset, $string) = (@_);
+               return $string;
+       }
 }
 
-Text::Iconv->raise_error(0);     # Conversion errors raise exceptions
-
-my $from_utf8 = Text::Iconv->new('UTF8', $CHARSET);
-
 # use path from cgi script to support templates in subdirs
 sub url_ex {
        my $q = shift || die "suff2file needs CGI object!";
@@ -255,7 +259,7 @@ sub show_results_list {
                while (my $search = shift @param_vals) {
                        my $s;
                        # remove accents
-                       $search = unac_string($CHARSET,$search);
+                       $search = my_unac_string($CHARSET,$search);
                        while ($search =~ s/\s*("[^"]+")\s*/ /) {
                                $s .= "$1 ";
                        }
index 860560b..11e49d6 100755 (executable)
@@ -5,7 +5,6 @@ use OpenIsis;
 use Getopt::Std;
 use Data::Dumper;
 use XML::Simple;
-use Text::Unaccent 1.02;       # 1.01 won't compile on my platform,
 use Text::Iconv;
 use Config::IniFiles;
 use Encode;
@@ -17,7 +16,7 @@ $|=1;
 
 my $config_file = $0;
 $config_file =~ s/\.pl$/.conf/;
-$config_file = $ARGV[0] if (-f $ARGV[0]);
+$config_file = $ARGV[0] if ($ARGV[0] && -f $ARGV[0]);
 die "FATAL: can't find configuration file '$config_file'" if (! -e $config_file);
 
 my $config;
@@ -526,7 +525,7 @@ sub data2xml {
                                        $swish_data =~ s/ +/ /g;
                                        $swish_data =~ s/ +$//g;
 
-                                       $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data));
+                                       $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data));
                                }
 
                                my $swish_exact_data = $cache->{swish_exact_data}->{$field}->[$page];
@@ -536,7 +535,7 @@ sub data2xml {
 
                                        # add delimiters before and after word.
                                        # That is required to produce exact match
-                                       $xml .= xmlify($field."_swish_exact", unac_string($codepage,$swish_exact_data));
+                                       $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data));
                                }
                                
                                my $idel = $cache->{index_delimiter}->{$field};
@@ -569,7 +568,7 @@ sub data2xml {
                                $swish_data =~ s/ +/ /g;
                                $swish_data =~ s/ +$//g;
 
-                               $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data));
+                               $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data));
                        }
 
                        if ($swish_exact_data) {
@@ -578,7 +577,7 @@ sub data2xml {
 
                                # add delimiters before and after word.
                                # That is required to produce exact match
-                               $xml .= xmlify($field."_swish_exact", unac_string($codepage,$swish_exact_data));
+                               $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data));
                        }
                }
        }
@@ -615,9 +614,18 @@ $index = new index_DBI(
 
 my $show_progress = $cfg_global->val('global', 'show_progress');
 
-my $unac_filter = $cfg_global->val('global', 'unac_filter');
-if ($unac_filter) {
-       require $unac_filter;
+my $my_unac_filter = $cfg_global->val('global', 'my_unac_filter');
+if ($my_unac_filter) {
+       print STDERR "using $my_unac_filter to filter characters for search\n";
+       require $my_unac_filter;
+} else {
+       print STDERR "### fallback to default my_unac_string!\n";
+       eval q{
+       sub main::my_unac_string($$) {
+               my ($charset, $string) = (@_);
+               return $string;
+       }
+       };
 }
 
 foreach my $database ($cfg->Sections) {
index 28f5124..bc6d904 100644 (file)
@@ -5,7 +5,7 @@ sub croascii {
        my $out = "";
        foreach (@_) {
 #              tr/^~]}\|[{@`/ÈèÆæÐ𩹮¾/;      # B1.002:1982
-               tr/^\~]}\|[{@\`/ÈèÆæðй©¾®/;    # Crolist croascii
+               tr/^~]}\\|[{@`/ÈèÆæðЩ¹®¾/;     # Crolist croascii
                # Crolist alternative encoding
                s/ÏC/È/g;
                s/Ïc/è/g;
diff --git a/filter/unac_string_croatian.pm b/filter/unac_string_croatian.pm
deleted file mode 100644 (file)
index b6d549b..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-# Alternative implementation for unac_string which supports charasters in
-# Croatian language which isn't really accented (ð) but needs to be coverted
-# to unaccented equivalent (d)
-
-sub unac_string($$) {
-       my $charset = shift || return;
-       my $string = shift || return;
-#      $string = Text::Unaccent::unac_string($charset,$string);
-#      $string =~ tr/ðÐ/dD/;
-       $string =~ tr/èæ¾¹ðÈÆ®©Ð/cczsdCCZSD/;
-       return $string;
-}
-
-1;
index 3411bad..6abd96c 100644 (file)
        # display progress bar indicator (default is no)
        show_progress=1
 
-       # optional alternative Text::Unaccent filter
-       unac_filter = /data/webpac-hidra/filter/unac_string_croatian.pm
+       # Filter characters before feeding them to swish. If you don't use
+       # this file, implementation will fall-back to passing through
+       # original charset, and if you have anything other than plain
+       # 7-bit ascii in your data, your words will end-up splitted in
+       # index on 8-bit characters and you won't be able to find them!
+       my_unac_filter = /data/webpac/my_unac_string.pm
 
 [webpac]
        # path to template html files
diff --git a/my_unac_string.pm b/my_unac_string.pm
new file mode 100644 (file)
index 0000000..3c16afa
--- /dev/null
@@ -0,0 +1,15 @@
+# Alternative implementation for unac_string which supports charasters in
+# Croatian language which isn't really accented (ð) but needs to be coverted
+# to unaccented equivalent (d)
+
+use Text::Unaccent 1.02;       # 1.01 won't compile on my platform,
+
+sub my_unac_string($$) {
+       my $charset = shift || return;
+       my $string = shift || return;
+       $string = unac_string($charset,$string);
+       $string =~ tr/ðÐ/dD/;
+       return $string;
+}
+
+1;