From 863c1e3e82d2b9b5b309a8df57799ec0ab145d3f Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Sun, 18 Apr 2004 00:57:39 +0000 Subject: [PATCH] implement my_unac_string function, and my_unac_filter option in global.conf which you *REALLY* want to use if you don't have only clean 7-bit characters in your data git-svn-id: file:///home/dpavlin/private/svn/webpac/trunk@320 13eb9ef6-21d5-0310-b721-a9d68796d827 --- WebPac.pm | 16 +++++++----- all2xml.pl | 26 ++++++++++++------- global.conf | 8 ++++-- ...ac_string_croatian.pm => my_unac_string.pm | 9 ++++--- 4 files changed, 38 insertions(+), 21 deletions(-) rename filter/unac_string_croatian.pm => my_unac_string.pm (63%) diff --git a/WebPac.pm b/WebPac.pm index 8139173..c187dee 100644 --- a/WebPac.pm +++ b/WebPac.pm @@ -28,20 +28,24 @@ my $MAX_HITS = $cfg_global->val('webpac', 'max_hits') || 0; my $ON_PAGE =$cfg_global->val('webpac', 'on_page') || 10; my $MIN_WILDCARD =$cfg_global->val('webpac', 'min_wildcard') || 1; my $TEMPLATE =$cfg_global->val('webpac', 'template'); -my $UNAC_FILTER =$cfg_global->val('global', 'unac_filter'); +my $UNAC_FILTER =$cfg_global->val('global', 'my_unac_filter'); my $BASE_PATH =$cfg_global->val('webpac', 'base_path'); # for pager my $pages_per_set = $cfg_global->val('webpac', 'pages_per_set') || 10; +Text::Iconv->raise_error(0); # Conversion errors raise exceptions + +my $from_utf8 = Text::Iconv->new('UTF8', $CHARSET); if ($UNAC_FILTER) { require $UNAC_FILTER; +} else { + sub WebPac::my_unac_string { + my ($charset, $string) = (@_); + return $string; + } } -Text::Iconv->raise_error(0); # Conversion errors raise exceptions - -my $from_utf8 = Text::Iconv->new('UTF8', $CHARSET); - # use path from cgi script to support templates in subdirs sub url_ex { my $q = shift || die "suff2file needs CGI object!"; @@ -255,7 +259,7 @@ sub show_results_list { while (my $search = shift @param_vals) { my $s; # remove accents - $search = unac_string($CHARSET,$search); + $search = my_unac_string($CHARSET,$search); while ($search =~ s/\s*("[^"]+")\s*/ /) { $s .= "$1 "; } diff --git a/all2xml.pl b/all2xml.pl index 860560b..11e49d6 100755 --- a/all2xml.pl +++ b/all2xml.pl @@ -5,7 +5,6 @@ use OpenIsis; use Getopt::Std; use Data::Dumper; use XML::Simple; -use Text::Unaccent 1.02; # 1.01 won't compile on my platform, use Text::Iconv; use Config::IniFiles; use Encode; @@ -17,7 +16,7 @@ $|=1; my $config_file = $0; $config_file =~ s/\.pl$/.conf/; -$config_file = $ARGV[0] if (-f $ARGV[0]); +$config_file = $ARGV[0] if ($ARGV[0] && -f $ARGV[0]); die "FATAL: can't find configuration file '$config_file'" if (! -e $config_file); my $config; @@ -526,7 +525,7 @@ sub data2xml { $swish_data =~ s/ +/ /g; $swish_data =~ s/ +$//g; - $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data)); + $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data)); } my $swish_exact_data = $cache->{swish_exact_data}->{$field}->[$page]; @@ -536,7 +535,7 @@ sub data2xml { # add delimiters before and after word. # That is required to produce exact match - $xml .= xmlify($field."_swish_exact", unac_string($codepage,$swish_exact_data)); + $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data)); } my $idel = $cache->{index_delimiter}->{$field}; @@ -569,7 +568,7 @@ sub data2xml { $swish_data =~ s/ +/ /g; $swish_data =~ s/ +$//g; - $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data)); + $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data)); } if ($swish_exact_data) { @@ -578,7 +577,7 @@ sub data2xml { # add delimiters before and after word. # That is required to produce exact match - $xml .= xmlify($field."_swish_exact", unac_string($codepage,$swish_exact_data)); + $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data)); } } } @@ -615,9 +614,18 @@ $index = new index_DBI( my $show_progress = $cfg_global->val('global', 'show_progress'); -my $unac_filter = $cfg_global->val('global', 'unac_filter'); -if ($unac_filter) { - require $unac_filter; +my $my_unac_filter = $cfg_global->val('global', 'my_unac_filter'); +if ($my_unac_filter) { + print STDERR "using $my_unac_filter to filter characters for search\n"; + require $my_unac_filter; +} else { + print STDERR "### fallback to default my_unac_string!\n"; + eval q{ + sub main::my_unac_string($$) { + my ($charset, $string) = (@_); + return $string; + } + }; } foreach my $database ($cfg->Sections) { diff --git a/global.conf b/global.conf index 48dffe1..619400e 100644 --- a/global.conf +++ b/global.conf @@ -14,8 +14,12 @@ # display progress bar indicator (default is no) show_progress=1 - # optional alternative Text::Unaccent filter - unac_filter = /data/webpac/filter/unac_string_croatian.pm + # Filter characters before feeding them to swish. If you don't use + # this file, implementation will fall-back to passing through + # original charset, and if you have anything other than plain + # 7-bit ascii in your data, your words will end-up splitted in + # index on 8-bit characters and you won't be able to find them! + my_unac_filter = /data/webpac/my_unac_string.pm [webpac] # path to template html files diff --git a/filter/unac_string_croatian.pm b/my_unac_string.pm similarity index 63% rename from filter/unac_string_croatian.pm rename to my_unac_string.pm index b6d549b..3c16afa 100644 --- a/filter/unac_string_croatian.pm +++ b/my_unac_string.pm @@ -2,12 +2,13 @@ # Croatian language which isn't really accented (ð) but needs to be coverted # to unaccented equivalent (d) -sub unac_string($$) { +use Text::Unaccent 1.02; # 1.01 won't compile on my platform, + +sub my_unac_string($$) { my $charset = shift || return; my $string = shift || return; -# $string = Text::Unaccent::unac_string($charset,$string); -# $string =~ tr/ðÐ/dD/; - $string =~ tr/èæ¾¹ðÈÆ®©Ð/cczsdCCZSD/; + $string = unac_string($charset,$string); + $string =~ tr/ðÐ/dD/; return $string; } -- 2.20.1