updated branches to HEAD

author Dobrica Pavlinusic <dpavlin@rot13.org>

Sun, 18 Apr 2004 01:03:27 +0000 (01:03 +0000)

committer Dobrica Pavlinusic <dpavlin@rot13.org>

Sun, 18 Apr 2004 01:03:27 +0000 (01:03 +0000)
author Dobrica Pavlinusic <dpavlin@rot13.org>
Sun, 18 Apr 2004 01:03:27 +0000 (01:03 +0000)
committer Dobrica Pavlinusic <dpavlin@rot13.org>
Sun, 18 Apr 2004 01:03:27 +0000 (01:03 +0000)
diff --git a/.cvsignore b/.cvsignore

deleted file mode 100644 (file)

index 287c099..0000000
--- a/.cvsignore
+++ /dev/null
@@ -1,6 +0,0 @@
-out.xml
-run.sh
-swish_isis.index*
-foo*
-bar*
-log
diff --git a/Makefile b/Makefile

index 6adede1..f13addd 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -8,10 +8,10 @@ profile:
  ver=`date +%Y%m%d`
  
  dist:
-       rcs2log -h rot13.org > ChangeLog
+       svn log -v > ChangeLog
         rm -Rf webpac-$(ver)
         mkdir webpac-$(ver)
-       cvs-files.pl | cpio -pvd webpac-$(ver)/
+       svn ls -R | cpio -pvd webpac-$(ver)/
         tar cfvz ../webpac-$(ver).tar.gz webpac-$(ver)/
         rm -Rf webpac-$(ver)
  
diff --git a/WebPac.pm b/WebPac.pm

index 8139173..c187dee 100644 (file)
--- a/WebPac.pm
+++ b/WebPac.pm
@@ -28,20 +28,24 @@ my $MAX_HITS = $cfg_global->val('webpac', 'max_hits') || 0;
  my $ON_PAGE =$cfg_global->val('webpac', 'on_page') || 10;
  my $MIN_WILDCARD =$cfg_global->val('webpac', 'min_wildcard') || 1;
  my $TEMPLATE =$cfg_global->val('webpac', 'template');
-my $UNAC_FILTER =$cfg_global->val('global', 'unac_filter');
+my $UNAC_FILTER =$cfg_global->val('global', 'my_unac_filter');
  my $BASE_PATH =$cfg_global->val('webpac', 'base_path');
  # for pager
  my $pages_per_set = $cfg_global->val('webpac', 'pages_per_set') || 10;
  
+Text::Iconv->raise_error(0);     # Conversion errors do not raise exceptions
+
+my $from_utf8 = Text::Iconv->new('UTF8', $CHARSET);
  
  if ($UNAC_FILTER) {
         require $UNAC_FILTER;
+} else {
+       sub WebPac::my_unac_string {
+               my ($charset, $string) = (@_);
+               return $string;
+       }
  }
  
-Text::Iconv->raise_error(0);     # Conversion errors raise exceptions
-
-my $from_utf8 = Text::Iconv->new('UTF8', $CHARSET);
-
  # use path from cgi script to support templates in subdirs
  sub url_ex {
         my $q = shift || die "suff2file needs CGI object!";
@@ -255,7 +259,7 @@ sub show_results_list {
                 while (my $search = shift @param_vals) {
                         my $s;
                         # remove accents
-                       $search = unac_string($CHARSET,$search);
+                       $search = my_unac_string($CHARSET,$search);
                         while ($search =~ s/\s*("[^"]+")\s*/ /) {
                                 $s .= "$1 ";
                         }
diff --git a/all2xml.pl b/all2xml.pl

index 860560b..11e49d6 100755 (executable)
--- a/all2xml.pl
+++ b/all2xml.pl
@@ -5,7 +5,6 @@ use OpenIsis;
  use Getopt::Std;
  use Data::Dumper;
  use XML::Simple;
-use Text::Unaccent 1.02;       # 1.01 won't compile on my platform,
  use Text::Iconv;
  use Config::IniFiles;
  use Encode;
@@ -17,7 +16,7 @@ $|=1;
  
  my $config_file = $0;
  $config_file =~ s/\.pl$/.conf/;
-$config_file = $ARGV[0] if (-f $ARGV[0]);
+$config_file = $ARGV[0] if ($ARGV[0] && -f $ARGV[0]);
  die "FATAL: can't find configuration file '$config_file'" if (! -e $config_file);
  
  my $config;
@@ -526,7 +525,7 @@ sub data2xml {
                                         $swish_data =~ s/ +/ /g;
                                         $swish_data =~ s/ +$//g;
  
-                                       $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data));
+                                       $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data));
                                 }
  
                                 my $swish_exact_data = $cache->{swish_exact_data}->{$field}->[$page];
@@ -536,7 +535,7 @@ sub data2xml {
  
                                         # add delimiters before and after word.
                                         # That is required to produce exact match
-                                       $xml .= xmlify($field."_swish_exact", unac_string($codepage,$swish_exact_data));
+                                       $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data));
                                 }
                                 
                                 my $idel = $cache->{index_delimiter}->{$field};
@@ -569,7 +568,7 @@ sub data2xml {
                                 $swish_data =~ s/ +/ /g;
                                 $swish_data =~ s/ +$//g;
  
-                               $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data));
+                               $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data));
                         }
  
                         if ($swish_exact_data) {
@@ -578,7 +577,7 @@ sub data2xml {
  
                                 # add delimiters before and after word.
                                 # That is required to produce exact match
-                               $xml .= xmlify($field."_swish_exact", unac_string($codepage,$swish_exact_data));
+                               $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data));
                         }
                 }
         }
@@ -615,9 +614,18 @@ $index = new index_DBI(
  
  my $show_progress = $cfg_global->val('global', 'show_progress');
  
-my $unac_filter = $cfg_global->val('global', 'unac_filter');
-if ($unac_filter) {
-       require $unac_filter;
+my $my_unac_filter = $cfg_global->val('global', 'my_unac_filter');
+if ($my_unac_filter) {
+       print STDERR "using $my_unac_filter to filter characters for search\n";
+       require $my_unac_filter;
+} else {
+       print STDERR "### fallback to default my_unac_string!\n";
+       eval q{
+       sub main::my_unac_string($$) {
+               my ($charset, $string) = (@_);
+               return $string;
+       }
+       };
  }
  
  foreach my $database ($cfg->Sections) {
diff --git a/filter/croascii.pm b/filter/croascii.pm

index 28f5124..bc6d904 100644 (file)
--- a/filter/croascii.pm
+++ b/filter/croascii.pm
@@ -5,7 +5,7 @@ sub croascii {
         my $out = "";
         foreach (@_) {
  #              tr/^~]}\|[{@`/ÈèÆæÐð©¹®¾/;      # B1.002:1982
-               tr/^\~]}\|[{@\`/ÈèÆæðÐ¹©¾®/;    # Crolist croascii
+               tr/^~]}\\|[{@`/ÈèÆæðÐ©¹®¾/;     # Crolist croascii
                 # Crolist alternative encoding
                 s/ÏC/È/g;
                 s/Ïc/è/g;
diff --git a/filter/unac_string_croatian.pm b/filter/unac_string_croatian.pm

deleted file mode 100644 (file)

index b6d549b..0000000
--- a/filter/unac_string_croatian.pm
+++ /dev/null
@@ -1,14 +0,0 @@
-# Alternative implementation for unac_string which supports charasters in
-# Croatian language which isn't really accented (ð) but needs to be coverted
-# to unaccented equivalent (d)
-
-sub unac_string($$) {
-       my $charset = shift || return;
-       my $string = shift || return;
-#      $string = Text::Unaccent::unac_string($charset,$string);
-#      $string =~ tr/ðÐ/dD/;
-       $string =~ tr/èæ¾¹ðÈÆ®©Ð/cczsdCCZSD/;
-       return $string;
-}
-
-1;
diff --git a/global.conf b/global.conf

index 3411bad..6abd96c 100644 (file)
--- a/global.conf
+++ b/global.conf
@@ -13,8 +13,12 @@
         # display progress bar indicator (default is no)
         show_progress=1
  
-       # optional alternative Text::Unaccent filter
-       unac_filter = /data/webpac-hidra/filter/unac_string_croatian.pm
+       # Filter characters before feeding them to swish. If you don't set
+       # this option, the implementation falls back to passing the original
+       # charset through unchanged, and if your data contains anything other
+       # than plain 7-bit ASCII, your words will end up split in the index
+       # at 8-bit characters and you won't be able to find them!
+       my_unac_filter = /data/webpac/my_unac_string.pm
  
  [webpac]
         # path to template html files
diff --git a/my_unac_string.pm b/my_unac_string.pm

new file mode 100644 (file)

index 0000000..3c16afa
--- /dev/null
+++ b/my_unac_string.pm
@@ -0,0 +1,15 @@
+# Alternative implementation of unac_string which supports characters in the
+# Croatian language which aren't really accented (ð) but need to be converted
+# to their unaccented equivalent (d)
+
+use Text::Unaccent 1.02;       # 1.01 won't compile on my platform,
+
+sub my_unac_string($$) {
+       my $charset = shift || return;
+       my $string = shift || return;
+       $string = unac_string($charset,$string);
+       $string =~ tr/ðÐ/dD/;
+       return $string;
+}
+
+1;
author	Dobrica Pavlinusic <dpavlin@rot13.org>
	Sun, 18 Apr 2004 01:03:27 +0000 (01:03 +0000)
committer	Dobrica Pavlinusic <dpavlin@rot13.org>
	Sun, 18 Apr 2004 01:03:27 +0000 (01:03 +0000)
.cvsignore	[deleted file]	patch \| blob \| history
Makefile		patch \| blob \| history
WebPac.pm		patch \| blob \| history
all2xml.pl		patch \| blob \| history
filter/croascii.pm		patch \| blob \| history
filter/unac_string_croatian.pm	[deleted file]	patch \| blob \| history
global.conf		patch \| blob \| history
my_unac_string.pm	[new file with mode: 0644]	patch \| blob