From: Dobrica Pavlinusic Date: Sun, 18 Apr 2004 01:03:27 +0000 (+0000) Subject: updated branches to HEAD X-Git-Url: http://git.rot13.org/?a=commitdiff_plain;ds=sidebyside;h=562d6eb78d3e36e0b9373f1c2a4f6c403d17f584;p=webpac updated branches to HEAD git-svn-id: file:///home/dpavlin/private/svn/webpac/branches/hidra@321 13eb9ef6-21d5-0310-b721-a9d68796d827 --- diff --git a/.cvsignore b/.cvsignore deleted file mode 100644 index 287c099..0000000 --- a/.cvsignore +++ /dev/null @@ -1,6 +0,0 @@ -out.xml -run.sh -swish_isis.index* -foo* -bar* -log diff --git a/Makefile b/Makefile index 6adede1..f13addd 100644 --- a/Makefile +++ b/Makefile @@ -8,10 +8,10 @@ profile: ver=`date +%Y%m%d` dist: - rcs2log -h rot13.org > ChangeLog + svn log -v > ChangeLog rm -Rf webpac-$(ver) mkdir webpac-$(ver) - cvs-files.pl | cpio -pvd webpac-$(ver)/ + svn ls -R | cpio -pvd webpac-$(ver)/ tar cfvz ../webpac-$(ver).tar.gz webpac-$(ver)/ rm -Rf webpac-$(ver) diff --git a/WebPac.pm b/WebPac.pm index 8139173..c187dee 100644 --- a/WebPac.pm +++ b/WebPac.pm @@ -28,20 +28,24 @@ my $MAX_HITS = $cfg_global->val('webpac', 'max_hits') || 0; my $ON_PAGE =$cfg_global->val('webpac', 'on_page') || 10; my $MIN_WILDCARD =$cfg_global->val('webpac', 'min_wildcard') || 1; my $TEMPLATE =$cfg_global->val('webpac', 'template'); -my $UNAC_FILTER =$cfg_global->val('global', 'unac_filter'); +my $UNAC_FILTER =$cfg_global->val('global', 'my_unac_filter'); my $BASE_PATH =$cfg_global->val('webpac', 'base_path'); # for pager my $pages_per_set = $cfg_global->val('webpac', 'pages_per_set') || 10; +Text::Iconv->raise_error(0); # Conversion errors raise exceptions + +my $from_utf8 = Text::Iconv->new('UTF8', $CHARSET); if ($UNAC_FILTER) { require $UNAC_FILTER; +} else { + sub WebPac::my_unac_string { + my ($charset, $string) = (@_); + return $string; + } } -Text::Iconv->raise_error(0); # Conversion errors raise exceptions - -my $from_utf8 = Text::Iconv->new('UTF8', $CHARSET); - # use path from cgi script to support templates in 
subdirs sub url_ex { my $q = shift || die "suff2file needs CGI object!"; @@ -255,7 +259,7 @@ sub show_results_list { while (my $search = shift @param_vals) { my $s; # remove accents - $search = unac_string($CHARSET,$search); + $search = my_unac_string($CHARSET,$search); while ($search =~ s/\s*("[^"]+")\s*/ /) { $s .= "$1 "; } diff --git a/all2xml.pl b/all2xml.pl index 860560b..11e49d6 100755 --- a/all2xml.pl +++ b/all2xml.pl @@ -5,7 +5,6 @@ use OpenIsis; use Getopt::Std; use Data::Dumper; use XML::Simple; -use Text::Unaccent 1.02; # 1.01 won't compile on my platform, use Text::Iconv; use Config::IniFiles; use Encode; @@ -17,7 +16,7 @@ $|=1; my $config_file = $0; $config_file =~ s/\.pl$/.conf/; -$config_file = $ARGV[0] if (-f $ARGV[0]); +$config_file = $ARGV[0] if ($ARGV[0] && -f $ARGV[0]); die "FATAL: can't find configuration file '$config_file'" if (! -e $config_file); my $config; @@ -526,7 +525,7 @@ sub data2xml { $swish_data =~ s/ +/ /g; $swish_data =~ s/ +$//g; - $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data)); + $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data)); } my $swish_exact_data = $cache->{swish_exact_data}->{$field}->[$page]; @@ -536,7 +535,7 @@ sub data2xml { # add delimiters before and after word. # That is required to produce exact match - $xml .= xmlify($field."_swish_exact", unac_string($codepage,$swish_exact_data)); + $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data)); } my $idel = $cache->{index_delimiter}->{$field}; @@ -569,7 +568,7 @@ sub data2xml { $swish_data =~ s/ +/ /g; $swish_data =~ s/ +$//g; - $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data)); + $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data)); } if ($swish_exact_data) { @@ -578,7 +577,7 @@ sub data2xml { # add delimiters before and after word. 
# That is required to produce exact match - $xml .= xmlify($field."_swish_exact", unac_string($codepage,$swish_exact_data)); + $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data)); } } } @@ -615,9 +614,18 @@ $index = new index_DBI( my $show_progress = $cfg_global->val('global', 'show_progress'); -my $unac_filter = $cfg_global->val('global', 'unac_filter'); -if ($unac_filter) { - require $unac_filter; +my $my_unac_filter = $cfg_global->val('global', 'my_unac_filter'); +if ($my_unac_filter) { + print STDERR "using $my_unac_filter to filter characters for search\n"; + require $my_unac_filter; +} else { + print STDERR "### fallback to default my_unac_string!\n"; + eval q{ + sub main::my_unac_string($$) { + my ($charset, $string) = (@_); + return $string; + } + }; } foreach my $database ($cfg->Sections) { diff --git a/filter/croascii.pm b/filter/croascii.pm index 28f5124..bc6d904 100644 --- a/filter/croascii.pm +++ b/filter/croascii.pm @@ -5,7 +5,7 @@ sub croascii { my $out = ""; foreach (@_) { # tr/^~]}\|[{@`/ÈèÆæÐ𩹮¾/; # B1.002:1982 - tr/^\~]}\|[{@\`/ÈèÆæðй©¾®/; # Crolist croascii + tr/^~]}\\|[{@`/ÈèÆæðЩ¹®¾/; # Crolist croascii # Crolist alternative encoding s/ÏC/È/g; s/Ïc/è/g; diff --git a/filter/unac_string_croatian.pm b/filter/unac_string_croatian.pm deleted file mode 100644 index b6d549b..0000000 --- a/filter/unac_string_croatian.pm +++ /dev/null @@ -1,14 +0,0 @@ -# Alternative implementation for unac_string which supports charasters in -# Croatian language which isn't really accented (ð) but needs to be coverted -# to unaccented equivalent (d) - -sub unac_string($$) { - my $charset = shift || return; - my $string = shift || return; -# $string = Text::Unaccent::unac_string($charset,$string); -# $string =~ tr/ðÐ/dD/; - $string =~ tr/èæ¾¹ðÈÆ®©Ð/cczsdCCZSD/; - return $string; -} - -1; diff --git a/global.conf b/global.conf index 3411bad..6abd96c 100644 --- a/global.conf +++ b/global.conf @@ -13,8 +13,12 @@ # display progress bar indicator 
(default is no) show_progress=1 - # optional alternative Text::Unaccent filter - unac_filter = /data/webpac-hidra/filter/unac_string_croatian.pm + # Filter characters before feeding them to swish. If you don't use + # this file, implementation will fall back to passing through + # original charset, and if you have anything other than plain + # 7-bit ascii in your data, your words will end up split in + # index on 8-bit characters and you won't be able to find them! + my_unac_filter = /data/webpac/my_unac_string.pm [webpac] # path to template html files diff --git a/my_unac_string.pm b/my_unac_string.pm new file mode 100644 index 0000000..3c16afa --- /dev/null +++ b/my_unac_string.pm @@ -0,0 +1,15 @@ +# Alternative implementation for unac_string which supports characters in +# the Croatian language which aren't really accented (ð) but need to be converted +# to an unaccented equivalent (d) + +use Text::Unaccent 1.02; # 1.01 won't compile on my platform, + +sub my_unac_string($$) { + my $charset = shift || return; + my $string = shift || return; + $string = unac_string($charset,$string); + $string =~ tr/ðÐ/dD/; + return $string; +} + +1;