From 345a5fb8bb16876309c38b790f47cb55cc10e0eb Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Thu, 16 Jan 2003 17:35:54 +0000 Subject: [PATCH] bunch of changes: make design more modular, implement index (partial implementation) and other small and big changes git-svn-id: file:///home/dpavlin/private/svn/webpac/trunk@10 13eb9ef6-21d5-0310-b721-a9d68796d827 --- CPAN.modules | 6 ++ TODO | 13 +++- WebPac.pm | 12 +--- all2xml.pl | 157 ++++++++++++++++++++---------------------------- httpd.conf | 1 + index_DBI.pm | 108 +++++++++++++++++++++++++++++++++ isis2xml.xml | 61 +++++++++++-------- isis_sf.pm | 25 ++++++++ parse_format.pm | 51 ++++++++++++++++ swish_isis.conf | 3 + xmlify.pm | 20 ++++++ 11 files changed, 328 insertions(+), 129 deletions(-) create mode 100644 index_DBI.pm create mode 100644 isis_sf.pm create mode 100644 parse_format.pm create mode 100644 xmlify.pm diff --git a/CPAN.modules b/CPAN.modules index a2b270c..1ce5b0d 100644 --- a/CPAN.modules +++ b/CPAN.modules @@ -9,3 +9,9 @@ XML::Writer other perl modules which are used: Text::Unaccent from http://www.senga.org/unac/ + +Modules which are used: +CGI::Application +HTML::Pager +HTML::Template +HTML::FillInForm diff --git a/TODO b/TODO index 505d89c..66d9b93 100644 --- a/TODO +++ b/TODO @@ -4,13 +4,22 @@ This that has to be done (or fixed) * isis2xml.pl - create swish configuration file "on the fly" -- implement "posudba" +- implement Makefile.PL and install modules in "proper" places. + +- implement circulation + +- make swish to search case sensitive and insesitive depending on input + string: if it has capital letters search case sensitive + +- index should show some records before and after matched term. --------------------------------------------------------------------------- General implementation notes: -- +- automatic crawlers should keep local copy (DBI) and e-mail publications + which are added (and generate report for users (last month, 6 months) + using cgi) --------------------------------------------------------------------------- diff --git a/WebPac.pm b/WebPac.pm index 843519c..23ab460 100644 --- a/WebPac.pm +++ b/WebPac.pm @@ -6,15 +6,14 @@ use strict; use HTML::Pager; use HTML::FillInForm; use SWISH; -use Unicode::String qw(utf8 utf16); -require Unicode::Map8; +use Unicode::MapUTF8 qw(to_utf8 from_utf8 utf8_supported_charset); use DBI; # configuration options # FIX: they really should go in configuration file! my $TEMPLATE_PATH = '/data/webpac/template_html'; my $CHARSET = 'ISO-8859-2'; -my $SWISH = '/data/swish/swish-e'; +my $SWISH = '/usr/local/bin/swish-e'; my $INDEX = '/data/webpac/index/isis.index'; my $MAX_HITS = 500; my $ON_PAGE = 10; @@ -91,9 +90,6 @@ sub show_results_list { my $tmpl = $self->load_tmpl('results.html'); - my $l2_map = Unicode::Map8->new($CHARSET) || die; - my $us = Unicode::String->new(); - # call swish my $sh = SWISH->connect('Fork', prog => $SWISH, @@ -102,12 +98,10 @@ sub show_results_list { results => sub { my ($sh,$hit) = @_; - $us->utf8($hit->swishtitle); - push @swish_results, { nr => ($#swish_results + 2), path => $hit->swishdocpath, - title => $l2_map->to8($us->utf16), + title => to_utf8({ -string => $hit->swishtitle, -charset => $CHARSET }), rank => $hit->swishrank }; # my @fields = $hit->field_names; diff --git a/all2xml.pl b/all2xml.pl index 5bbc307..6e414a8 100755 --- a/all2xml.pl +++ b/all2xml.pl @@ -6,16 +6,18 @@ use Getopt::Std; use Data::Dumper; use XML::Simple; use Text::Unaccent 1.02; # 1.01 won't compile on my platform, -require Unicode::Map8; -use DBI; +use Text::Iconv; -my $config=XMLin(undef, forcearray => [ 'isis' ], forcecontent => 1); -my $dbh = DBI->connect("DBI:Pg:dbname=webpac","","") || die $DBI::errstr; # FIX -# FIX; select relname from pg_class where relname like 'index_%' ; -$dbh->begin_work || die $dbh->errstr(); -$dbh->do("delete from index_author") || die $dbh->errstr(); -$dbh->do("delete from index_title") || die $dbh->errstr(); +$|=1; + +my $config; + +$config=XMLin(undef, forcearray => [ 'isis' ], forcecontent => 1); + +use index_DBI; # there is no other, right now ;-) + +my $index = new index_DBI(); # open index my %opts; @@ -34,110 +36,79 @@ my $db_dir = $opts{d} || "ps"; # FIX #print Dumper($config->{indexer}); #print "-" x 70,"\n"; -# how to convert isis code page to UTF8? -my $isis_map = Unicode::Map8->new($config->{isis_codepage}) || die; +Text::Iconv->raise_error(1); # Conversion errors raise exceptions + +my $isis_codepage = Text::Iconv->new($config->{isis_codepage},'UTF8'); +my $index_codepage = Text::Iconv->new($config->{isis_codepage},$config->{index_codepage}); +my $cludge_codepage = Text::Iconv->new('UTF8','ISO8859-1'); sub isis2xml { + use xmlify; + my $row = shift @_; my $xml; - $xml->{db_dir} = [ $db_dir ]; # FIX remove? - - sub isis_sf { - my $row = shift @_; - my $isis_id = shift @_; - my $subfield = shift @_; - if ($row->{$isis_id}->[0]) { - my $sf = OpenIsis::subfields($row->{$isis_id}->[0]); - if (! defined $subfield || length($subfield) == 0) { - # subfield list undef, empty or no defined subfields for this record - my $all_sf = $row->{$isis_id}->[0]; - $all_sf =~ s/\^./ /g; nuke definirions - return $all_sf; - } elsif ($sf->{$subfield}) { - return $sf->{$subfield}; - } - } - } + $xml .= xmlify('db_dir',$db_dir); # FIX remove? + + use parse_format; + foreach my $field (keys %{$config->{indexer}}) { - my $display_data = ""; my $swish_data = ""; + my $display_data = ""; my $index_data = ""; foreach my $x (@{$config->{indexer}->{$field}->{isis}}) { - my $display_tmp = ""; - my $swish_tmp = ""; - my $index_tmp = ""; - my $format = $x->{content}; - my $s = 1; # swish only - my $d = 1; # display only - my $i = 0; # index only + my ($s,$d,$i) = (1,1,0); # swish, display default $s = 0 if (lc($x->{type}) eq "display"); $d = 0 if (lc($x->{type}) eq "swish"); ($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index"); #print STDERR "## s: $s d: $d i: $i ## $format ##\n"; - # parse format - my $prefix = ""; - if ($format =~ s/^([^\d]+)//) { - $prefix = $1; - } - while ($format) { - if ($format =~ s/^(\d\d\d)(\w?)//) { - my $isis_tmp = isis_sf($row,$1,$2); - if ($isis_tmp) { -# $display_tmp .= $prefix . "/$1/$2/".$isis_tmp if ($d); - $display_tmp .= $prefix . $isis_tmp if ($d); - $swish_tmp .= $isis_tmp." " if ($s); - $index_tmp .= $prefix . $isis_tmp if ($i); -#print STDERR " $isis_tmp <--\n"; - } - $prefix = ""; - } elsif ($format =~ s/^([^\d]+)//) { - $prefix = $1; - } else { - print STDERR "WARNING: unparsed format '$format'\n"; - last; - }; - } - # add suffix - $display_tmp .= $prefix if ($display_tmp); - $index_tmp .= $prefix if ($index_tmp); -# $display_data .= $display_tmp if ($display_tmp ne ""); -# $swish_data .= $swish_tmp if ($swish_tmp ne ""); - $display_data .= $display_tmp; - $swish_data .= $swish_tmp; - $index_data .= $index_tmp; + $format = $cludge_codepage->convert($format); + my ($swish,$display) = parse_format($format,$row); +#print STDERR "s: $swish\nd: $display\n" if ($swish); + +#print STDERR "swish: $swish<-- display: $display<--\n"; + # FIX: this is ugly, UGLY, cludge: OpenIsis return + # UTF8 encoding of strings, but as if source charset + # is ISO8859-1 and not some other. This breaks our + # isis character encoding, so we convert it first + # back to ISO8859-1 (which can actually be different + # encoding in isis) + + $swish_data .= $swish if ($s && $swish); + $display_data .= $display if ($d && $display); + $index_data .= $display if ($i && $display); + } +#print STDERR "s_d: $swish_data\nd_d: $display_data\n" if ($swish_data); + if ($display_data) { + $display_data = $isis_codepage->convert($display_data)."##" || $display_data; + $xml .= xmlify($field."_display", $display_data); + } + if ($swish_data) { + my $i = Text::Iconv->new($config->{isis_codepage},'ISO8859-2'); + $swish_data = $i->convert($swish_data); + $xml .= xmlify($field."_swish",unac_string('ISO8859-2',$swish_data)); + #$swish_data = $isis_codepage->convert($swish_data)."##" || $swish_data; + #$xml .= xmlify($field."_swish",unac_string($config->{isis_codepage},$swish_data)); } -#print "--display:$display_data\n--swish:$swish_data\n"; - #$xml->{$field."_display"} = $isis_map->tou($display_data)->utf8 if ($display_data); - #$xml->{$field."_swish"} = unac_string($config->{isis_codepage},$swish_data) if ($swish_data); - $xml->{$field."_display" } = [ $isis_map->tou($display_data)->utf8 ] if ($display_data); - $xml->{$field."_swish"} = [ unac_string($config->{isis_codepage},$swish_data) ] if ($swish_data); # index if ($index_data && $index_data ne "") { - my $sql = "select $field from index_$field where upper($field)=upper(?)"; - my $sth = $dbh->prepare($sql) || die $dbh->errstr(); - $sth->execute($index_data) || die "SQL: $sql; ".$dbh->errstr(); -#print STDERR "--->$index_data<---\n"; - if (! $sth->fetchrow_hashref) { - my $sql = "insert into index_$field values (?)"; - my $sth = $dbh->prepare($sql) || die $dbh->errstr(); -#print STDERR "$sql: $index_data$xml\n"; + print "Path-Name: $path#".int($row->{mfn})."\n"; + print "Content-Length: ".(length($xml)+1)."\n"; + print "Document-Type: XML\n\n$xml\n"; } } } print STDERR "\n"; } -$dbh->commit || die $dbh->errstr(); +# call this to commit index +$index->close; 1; __END__ diff --git a/httpd.conf b/httpd.conf index 6d7c4b5..2acb0ed 100644 --- a/httpd.conf +++ b/httpd.conf @@ -1,6 +1,7 @@ # make swish indexing interface available Alias /webpac /data/webpac/public_html +AddHandler cgi-script .cgi DirectoryIndex index.cgi Options ExecCGI FollowSymLinks Indexes diff --git a/index_DBI.pm b/index_DBI.pm new file mode 100644 index 0000000..d24f130 --- /dev/null +++ b/index_DBI.pm @@ -0,0 +1,108 @@ +# +# this file implements index functions using DBI +# + +package index_DBI; +use strict qw(vars); +use vars qw($Count); + +use DBI; + +my %Table; # index tables which where visited in this run + +sub new { + my $class = shift; + my $self = {}; + bless($self, $class); + + # FIX: config params + $self->{dbh} = DBI->connect("DBI:Pg:dbname=webpac","","") || die $DBI::errstr; + # begin transaction + $self->{dbh}->begin_work || die $self->{dbh}->errstr(); + + $Count++; + + return $self; +} + +sub delete_and_create { + my $self = shift; + + my $field = shift; + + $self->{dbh}->commit; + $self->{dbh}->begin_work; + + my $sql = "select count(*) from $field"; + my $sth = $self->{dbh}->prepare($sql) || die $self->{dbh}->errstr(); +# FIX: this is not a good way to check if table exists! + if (1 || $sth->execute() && $sth->fetchrow_hashref) { + my $sql = "drop table $field"; +# my $sth = $self->{dbh}->do($sql) || die "SQL: $sql ".$self->{dbh}->errstr(); + } + $sql = "create table $field ( + item varchar(255), + ident varchar(255), + count int, + primary key (item,ident) + )"; +# $sth = $self->{dbh}->do($sql) || die "SQL: $sql ".$self->{dbh}->errstr(); + $sth = $self->{dbh}->do($sql) || warn "SQL: $sql ".$self->{dbh}->errstr(); + + $self->{dbh}->commit; + $self->{dbh}->begin_work; +} + +sub insert { + my $self = shift; + + my $field = shift; + my $index_data = shift; + my $ident = shift; # e.g. Library ID + + if (! $index_data) { + print STDERR "\$index->insert() -- no value to insert\n"; + return; + } + + if (! $Table{$field}) { + $self->delete_and_create($field); + } + $Table{$field}++; + + my $sql = "select item from $field where upper(item)=upper(?)"; + my $sth = $self->{dbh}->prepare($sql) || die $self->{dbh}->errstr(); + $sth->execute($index_data) || die "SQL: $sql; ".$self->{dbh}->errstr(); + if (! $sth->fetchrow_hashref) { + my $sql = "insert into $field (item,ident,count) values (?,?,?)"; + my $sth = $self->{dbh}->prepare($sql) || die $self->{dbh}->errstr(); + $sth->execute($index_data,$ident,1) || die "SQL: $sql; ".$self->{dbh}->errstr(); +#print STDERR "in index: $index_data\n"; + } else { + my $sql = "update $field set count = count + 1 where item = ? and ident = ?"; + my $sth = $self->{dbh}->prepare($sql) || die $self->{dbh}->errstr(); + $sth->execute($index_data,$ident) || die "SQL: $sql; ".$self->{dbh}->errstr(); + } +} + +sub close { + my $self = shift; + + if ($self->{dbh}) { + $self->{dbh}->commit || die $self->{dbh}->errstr(); + $self->{dbh}->disconnect; + undef $self->{dbh}; + } +} + +END { + $Count--; + print STDERR "index_DBI fatal error: \$index->close() not called... $Count references left!\n" if ($Count != 0); + # FIX: debug output + print STDERR "usage\ttable\n"; + foreach (keys %Table) { + print STDERR $Table{$_},"\t$_\n"; + } +} + +1; diff --git a/isis2xml.xml b/isis2xml.xml index c0cc837..f251265 100644 --- a/isis2xml.xml +++ b/isis2xml.xml @@ -1,11 +1,15 @@ + + + + <br>230v : 230a. - 250. - 260<br>ISBN 290 <br>231v : 231a. - 251. - 261<br>ISBN 291 <br>232v : 232a. - 252. - 262<br>ISBN 292 <br>233v : 233a. - 253. - 263<br>ISBN 293 - 270 - 271 - 272 - 273 - - + + + + 801a + 801a + + diff --git a/isis_sf.pm b/isis_sf.pm new file mode 100644 index 0000000..554bcc6 --- /dev/null +++ b/isis_sf.pm @@ -0,0 +1,25 @@ +# +# isis_sf($isis_row,'isis_field'[,'subfield']) +# +# e.g. isis_sf($row,'700','a') +# +sub isis_sf { + my $row = shift @_; + my $isis_id = shift @_; + my $subfield = shift @_; + + if ($row->{$isis_id}->[0]) { + my $sf = OpenIsis::subfields($row->{$isis_id}->[0]); + if (! defined $subfield || length($subfield) == 0) { + # subfield list undef, empty or no defined subfields for this record + my $all_sf = $row->{$isis_id}->[0]; + $all_sf =~ s/\^./ /g; nuke definirions + return $all_sf; + } elsif ($sf->{$subfield}) { + return $sf->{$subfield}; + } + } +} + +1; + diff --git a/parse_format.pm b/parse_format.pm new file mode 100644 index 0000000..6dadcf6 --- /dev/null +++ b/parse_format.pm @@ -0,0 +1,51 @@ +#------------------------------------------------------------- +# +# parse_format('format',$isis_row); +# + +use isis_sf; + +sub parse_format { + my $format = shift; + my $row = shift; + + my $out; + my $out_swish; + + my $prefix = ""; + if ($format =~ s/^([^\d]+)//) { + $prefix = "pre: $1"; + } + + my $display; + my $swish; + + while ($format) { +#print STDERR "#### $format\n"; + if ($format =~ s/^(\d\d\d)(\w?)//) { + my $isis_tmp = isis_sf($row,$1,$2); + if ($isis_tmp) { + $display .= $prefix . $isis_tmp; + $swish .= $isis_tmp." "; + } + $prefix = ""; + } elsif ($format =~ s/^([^\d]+)(\d{0,3})/$2/) { + $prefix .= $1; + } elsif ($format =~ s/^([^\d]+\d{0,2})//) { + $prefix .= $1; + } elsif ($format =~ s/^(\d{1,2})//) { + $prefix .= $1; + } else { + print STDERR "unparsed format: $format\n"; + $prefix .= $format; + $format = ""; + } + } + # add suffix + $display .= $prefix if ($display); + + return ($swish,$display); +} + +#------------------------------------------------------------- +1; diff --git a/swish_isis.conf b/swish_isis.conf index e4de493..10c7701 100644 --- a/swish_isis.conf +++ b/swish_isis.conf @@ -6,6 +6,9 @@ # -S prog IndexDir ./isis2xml.pl + +#IndexDir /bin/cat +#SwishProgParameters foo.xml # # debug, pass parametears #SwishProgParameters -l 2002 diff --git a/xmlify.pm b/xmlify.pm new file mode 100644 index 0000000..3b7a163 --- /dev/null +++ b/xmlify.pm @@ -0,0 +1,20 @@ +# +# small function that creates one xml tag +# + +# Escape <, >, & and ", and to produce valid XML +my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"'); +my $escape_re = join '|' => keys %escape; + +sub xmlify { + my $tag = shift; + my $out; + + $out .= " <$tag>"; + foreach my $v (@_) { + $v =~ s/($escape_re)/$escape{$1}/g; + $out .= $v; + } + $out .= "\n"; + return $out; +} -- 2.20.1