From: Dobrica Pavlinusic <dpavlin@rot13.org>
Date: Sun, 18 Apr 2004 01:03:27 +0000 (+0000)
Subject: updated branches to HEAD
X-Git-Url: http://git.rot13.org/?a=commitdiff_plain;ds=sidebyside;h=562d6eb78d3e36e0b9373f1c2a4f6c403d17f584;p=webpac

updated branches to HEAD


git-svn-id: file:///home/dpavlin/private/svn/webpac/branches/hidra@321 13eb9ef6-21d5-0310-b721-a9d68796d827
---

diff --git a/.cvsignore b/.cvsignore
deleted file mode 100644
index 287c099..0000000
--- a/.cvsignore
+++ /dev/null
@@ -1,6 +0,0 @@
-out.xml
-run.sh
-swish_isis.index*
-foo*
-bar*
-log
diff --git a/Makefile b/Makefile
index 6adede1..f13addd 100644
--- a/Makefile
+++ b/Makefile
@@ -8,10 +8,10 @@ profile:
 ver=`date +%Y%m%d`
 
 dist:
-	rcs2log -h rot13.org > ChangeLog
+	svn log -v > ChangeLog
 	rm -Rf webpac-$(ver)
 	mkdir webpac-$(ver)
-	cvs-files.pl | cpio -pvd webpac-$(ver)/
+	svn ls -R | cpio -pvd webpac-$(ver)/
 	tar cfvz ../webpac-$(ver).tar.gz webpac-$(ver)/
 	rm -Rf webpac-$(ver)
 
diff --git a/WebPac.pm b/WebPac.pm
index 8139173..c187dee 100644
--- a/WebPac.pm
+++ b/WebPac.pm
@@ -28,20 +28,24 @@ my $MAX_HITS = $cfg_global->val('webpac', 'max_hits') || 0;
 my $ON_PAGE =$cfg_global->val('webpac', 'on_page') || 10;
 my $MIN_WILDCARD =$cfg_global->val('webpac', 'min_wildcard') || 1;
 my $TEMPLATE =$cfg_global->val('webpac', 'template');
-my $UNAC_FILTER =$cfg_global->val('global', 'unac_filter');
+my $UNAC_FILTER =$cfg_global->val('global', 'my_unac_filter');
 my $BASE_PATH =$cfg_global->val('webpac', 'base_path');
 # for pager
 my $pages_per_set = $cfg_global->val('webpac', 'pages_per_set') || 10;
 
+Text::Iconv->raise_error(0);     # Conversion errors do not raise exceptions
+
+my $from_utf8 = Text::Iconv->new('UTF8', $CHARSET);
 
 if ($UNAC_FILTER) {
 	require $UNAC_FILTER;
+} else {
+	sub WebPac::my_unac_string {
+		my ($charset, $string) = (@_);
+		return $string;
+	}
 }
 
-Text::Iconv->raise_error(0);     # Conversion errors raise exceptions
-
-my $from_utf8 = Text::Iconv->new('UTF8', $CHARSET);
-
 # use path from cgi script to support templates in subdirs
 sub url_ex {
 	my $q = shift || die "suff2file needs CGI object!";
@@ -255,7 +259,7 @@ sub show_results_list {
 		while (my $search = shift @param_vals) {
 			my $s;
 			# remove accents
-			$search = unac_string($CHARSET,$search);
+			$search = my_unac_string($CHARSET,$search);
 			while ($search =~ s/\s*("[^"]+")\s*/ /) {
 				$s .= "$1 ";
 			}
diff --git a/all2xml.pl b/all2xml.pl
index 860560b..11e49d6 100755
--- a/all2xml.pl
+++ b/all2xml.pl
@@ -5,7 +5,6 @@ use OpenIsis;
 use Getopt::Std;
 use Data::Dumper;
 use XML::Simple;
-use Text::Unaccent 1.02;	# 1.01 won't compile on my platform,
 use Text::Iconv;
 use Config::IniFiles;
 use Encode;
@@ -17,7 +16,7 @@ $|=1;
 
 my $config_file = $0;
 $config_file =~ s/\.pl$/.conf/;
-$config_file = $ARGV[0] if (-f $ARGV[0]);
+$config_file = $ARGV[0] if ($ARGV[0] && -f $ARGV[0]);
 die "FATAL: can't find configuration file '$config_file'" if (! -e $config_file);
 
 my $config;
@@ -526,7 +525,7 @@ sub data2xml {
 					$swish_data =~ s/ +/ /g;
 					$swish_data =~ s/ +$//g;
 
-					$xml .= xmlify($field."_swish", unac_string($codepage,$swish_data));
+					$xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data));
 				}
 
 				my $swish_exact_data = $cache->{swish_exact_data}->{$field}->[$page];
@@ -536,7 +535,7 @@ sub data2xml {
 
 					# add delimiters before and after word.
 					# That is required to produce exact match
-					$xml .= xmlify($field."_swish_exact", unac_string($codepage,$swish_exact_data));
+					$xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data));
 				}
 				
 				my $idel = $cache->{index_delimiter}->{$field};
@@ -569,7 +568,7 @@ sub data2xml {
 				$swish_data =~ s/ +/ /g;
 				$swish_data =~ s/ +$//g;
 
-				$xml .= xmlify($field."_swish", unac_string($codepage,$swish_data));
+				$xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data));
 			}
 
 			if ($swish_exact_data) {
@@ -578,7 +577,7 @@ sub data2xml {
 
 				# add delimiters before and after word.
 				# That is required to produce exact match
-				$xml .= xmlify($field."_swish_exact", unac_string($codepage,$swish_exact_data));
+				$xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data));
 			}
 		}
 	}
@@ -615,9 +614,18 @@ $index = new index_DBI(
 
 my $show_progress = $cfg_global->val('global', 'show_progress');
 
-my $unac_filter = $cfg_global->val('global', 'unac_filter');
-if ($unac_filter) {
-	require $unac_filter;
+my $my_unac_filter = $cfg_global->val('global', 'my_unac_filter');
+if ($my_unac_filter) {
+	print STDERR "using $my_unac_filter to filter characters for search\n";
+	require $my_unac_filter;
+} else {
+	print STDERR "### fallback to default my_unac_string!\n";
+	eval q{
+	sub main::my_unac_string($$) {
+		my ($charset, $string) = (@_);
+		return $string;
+	}
+	};
 }
 
 foreach my $database ($cfg->Sections) {
diff --git a/filter/croascii.pm b/filter/croascii.pm
index 28f5124..bc6d904 100644
--- a/filter/croascii.pm
+++ b/filter/croascii.pm
@@ -5,7 +5,7 @@ sub croascii {
 	my $out = "";
 	foreach (@_) {
 #		tr/^~]}\|[{@`/ÈèÆæÐð©¹®¾/;	# B1.002:1982
-		tr/^\~]}\|[{@\`/ÈèÆæðÐ¹©¾®/;	# Crolist croascii
+		tr/^~]}\\|[{@`/ÈèÆæðÐ©¹®¾/;	# Crolist croascii
 		# Crolist alternative encoding
 		s/ÏC/È/g;
 		s/Ïc/è/g;
diff --git a/filter/unac_string_croatian.pm b/filter/unac_string_croatian.pm
deleted file mode 100644
index b6d549b..0000000
--- a/filter/unac_string_croatian.pm
+++ /dev/null
@@ -1,14 +0,0 @@
-# Alternative implementation for unac_string which supports charasters in
-# Croatian language which isn't really accented (ð) but needs to be coverted
-# to unaccented equivalent (d)
-
-sub unac_string($$) {
-	my $charset = shift || return;
-	my $string = shift || return;
-#	$string = Text::Unaccent::unac_string($charset,$string);
-#	$string =~ tr/ðÐ/dD/;
-	$string =~ tr/èæ¾¹ðÈÆ®©Ð/cczsdCCZSD/;
-	return $string;
-}
-
-1;
diff --git a/global.conf b/global.conf
index 3411bad..6abd96c 100644
--- a/global.conf
+++ b/global.conf
@@ -13,8 +13,12 @@
 	# display progress bar indicator (default is no)
 	show_progress=1
 
-	# optional alternative Text::Unaccent filter
-	unac_filter = /data/webpac-hidra/filter/unac_string_croatian.pm
+	# Filter characters before feeding them to swish. If you don't set
+	# this option, the implementation will fall back to passing the
+	# original string through unchanged; if your data contains anything
+	# other than plain 7-bit ASCII, your words will end up split in the
+	# index on 8-bit characters and you won't be able to find them!
+	my_unac_filter = /data/webpac/my_unac_string.pm
 
 [webpac]
 	# path to template html files
diff --git a/my_unac_string.pm b/my_unac_string.pm
new file mode 100644
index 0000000..3c16afa
--- /dev/null
+++ b/my_unac_string.pm
@@ -0,0 +1,15 @@
+# Alternative implementation of unac_string which supports characters in
+# the Croatian language that aren't really accented (ð) but need to be
+# converted to their unaccented equivalent (d)
+
+use Text::Unaccent 1.02;	# 1.01 won't compile on my platform,
+
+sub my_unac_string($$) {
+	my $charset = shift || return;
+	my $string = shift || return;
+	$string = unac_string($charset,$string);
+	$string =~ tr/ðÐ/dD/;
+	return $string;
+}
+
+1;