From: Dobrica Pavlinusic Date: Thu, 23 Apr 2009 11:14:24 +0000 (+0000) Subject: r1798@llin: dpavlin | 2009-04-23 13:14:23 +0200 X-Git-Url: http://git.rot13.org/?p=webpac2;a=commitdiff_plain;h=f326b07b309a0e305204bf977c5b52c3bcab86d8 r1798@llin: dpavlin | 2009-04-23 13:14:23 +0200 start support for swish-e using WebPAC::Output::SWISH git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@1145 07558da8-63fa-0310-ba24-9fe276d99e06 --- diff --git a/Makefile.PL b/Makefile.PL index a789bad..4831746 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -67,9 +67,13 @@ features( 'Spreadsheet::WriteExcel' => 2.14, ], 'WebPAC::Output::KinoSearch' => [ - -default => 1, + -default => 0, 'KinoSearch::Simple', ], + 'WebPAC::Output::SWISH' => [ + -default => 1, + 'SWISH::API', + ], 'Parallel execution (probably broken)' => [ -default => 0, 'Proc::Queue', diff --git a/lib/WebPAC/Output/SWISH.pm b/lib/WebPAC/Output/SWISH.pm new file mode 100644 index 0000000..69c1fc7 --- /dev/null +++ b/lib/WebPAC/Output/SWISH.pm @@ -0,0 +1,219 @@ +package WebPAC::Output::SWISH; + +use warnings; +use strict; + +use base qw/WebPAC::Common/; + +use File::Path qw/make_path/; +use Data::Dump qw/dump/; +use YAML; + +=head1 NAME + +WebPAC::Output::SWISH - Create swish-e full text index + +=cut + +our $VERSION = '0.01'; + +=head1 SYNOPSIS + +Create full text index using swish-e indexer from data with +type C. + +=head1 FUNCTIONS + +=head2 new + + my $out = new WebPAC::Output::SWISH( + database => 'demo', + ); + +Options are: + +=over 4 + +=item database + +name of database from which data comes + +=back + +Name of database will be used to form URI of documents in index. + +=cut + +our $dir = 'var/swish'; + +sub new { + my $class = shift; + my $self = {@_}; + bless($self, $class); + + my $log = $self->_get_logger; + + #$log->debug("self: ", sub { dump($self) }); + + my $database = $self->{database} || $log->logdie("need database"); + + my $path = "$dir/$database.conf"; + + open(my $conf, '>', $path) || die "can't open $path: $!"; + + print $conf <<"DEFAULT_SWISH_CONF"; +# swish-e config file for $database + +IndexDir stdin + +# input file definition +DefaultContents XML* + +# indexed metatags +MetaNames xml swishdocpath + + +#XMLClassAttributes type +UndefinedMetaTags auto +UndefinedXMLAttributes auto + +IndexFile $dir/$database + +# Croatian ISO-8859-2 characters to unaccented equivalents +TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz + + +# disable output +ParserWarnLevel 0 +IndexReport 1 + +DEFAULT_SWISH_CONF + + close($conf) || die "can't write config $path: $!"; + + $self->{_swish_index_path} = "$dir/$database"; + + my $swish = "swish-e -S prog -c $path"; + open( $self->{_swish_fh}, '-|', $swish ) || die "can't open pipe to $swish: $!"; + + $self->{stats} = {}; + + $self ? return $self : return undef; +} + +=head2 + + my $path = $out->index_path; + +=cut + +sub index_path { $_[0]->{_swish_index_path} }; + +=head2 add + +Adds one entry to database. + + $out->add( + id => 42, + ds => $ds, + type => 'display', + text => 'optional text from which snippet is created', + ); + +This function will create entries in index using following URI format: + + C + +Each tag in C with specified C will create one +attribute and corresponding hidden text (used for search). + +=cut + +my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"'); +my $escape_re = join '|' => keys %escape; + +sub add { + my $self = shift; + + my $args = {@_}; + + my $log = $self->_get_logger; + + my $database = $self->{'database'} || $log->logconfess('no database in $self'); + $log->logconfess('need db in object') unless ($self->{'db'}); + + foreach my $p (qw/id ds type/) { + $log->logdie("need $p") unless ($args->{$p}); + } + + my $type = $args->{'type'}; + my $id = $args->{'id'}; + + my $uri = "$database/$id"; + $log->debug("creating $uri"); + + # filter all tags which have type defined + my @tags = grep { + ref($args->{'ds'}->{$_}) eq 'HASH' && defined( $args->{'ds'}->{$_}->{$type} ) + } keys %{ $args->{'ds'} }; + + $log->debug("tags = ", join(",", @tags)); + + return unless (@tags); + + my $xml = qq{}; + + foreach my $tag (@tags) { + + my $vals = join(" ", @{ $args->{'ds'}->{$tag}->{$type} }); + + next if ! $vals; + + $vals =~ s/($escape_re)/$escape{$1}/gs; + $xml .= qq{<$tag>}; + + $self->{stats}->{attr}->{$tag}++; + } + + $xml .= qq{\n}; + + my $len = length($xml); + + my $fh = $self->{_swish_fh}; + + print $fh "Path-Name: $uri\nContent-Length: $len\nDocument-Type: XML\n\n$xml"; +# || die "can't add $uri: $@\n$xml\n"; + + return 1; +} + +=head2 finish + +Dump attributes used on disk + +=cut + +sub finish { + my $self = shift; + my $log = $self->_get_logger(); + + my $path = $dir . '/' . $self->{database} . '.yaml'; + YAML::DumpFile( $path, $self->{stats} ); + $log->info("created $path ", -s $path, " bytes"); + $log->debug( dump( $self->{stats} ) ); +} + +=head1 AUTHOR + +Dobrica Pavlinusic, C<< >> + +=head1 COPYRIGHT & LICENSE + +Copyright 2004-2009 Dobrica Pavlinusic, All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it +under the same terms as Perl itself. + +=cut + +1; diff --git a/t/5-output-swish.t b/t/5-output-swish.t new file mode 100755 index 0000000..2747732 --- /dev/null +++ b/t/5-output-swish.t @@ -0,0 +1,75 @@ +#!/usr/bin/perl -w + +use strict; +use blib; + +use Test::More tests => 35; + +BEGIN { +use_ok( 'WebPAC::Test' ); +use_ok( 'WebPAC::Output::SWISH' ); +use_ok( 'SWISH::API' ); +} + +my $path = "$abs_path/kino/"; + +ok(my $out = new WebPAC::Output::SWISH( + database => 'test', + %LOG +), "new"); + +my $ds = { + 'Source' => { + 'name' => 'Izvor: ', + 'search' => [ 'tko zna' ] + }, + 'ID' => { + 'search' => 'id', + }, + 'Array' => { + 'search' => [ qw/a1 a2 s3 a4 a5/ ], + }, + 'foo' => { + 'search' => [ 'foo' ], + }, +}; + +throws_ok { $out->add( ) } qr/need id/, 'add without params'; +throws_ok { $out->add( 42 ) } qr/need ds/, 'add without ds'; + +ok( $out->add( 42, $ds ), 'add 42' ); + +my @strange = ( qw/èajðinica odma¹æivanje ¾abokreèina ¹uma/ ); + +ok( $out->add( 99, { foo => { search => [ @strange ] } } ), 'add 99' ); + +ok( $out->add( 100, { foo => { search => [ qw/foo bar baz/ ] } } ), 'add 100' ); + +ok( $out->finish, 'finish' ); + +sub test_search { + my ( $query_string, $expected_hits ) = @_; + + my $swish = SWISH::API->new( $out->index_path ); + $swish->abort_last_error if $swish->Error; + + my $results = $swish->search( $query_string ); + + my $total_hits = $results->hits; + + ok( $total_hits, "search '$query_string'" ); + + diag "Total hits: $total_hits\n" if $debug; + + cmp_ok( $total_hits, '==', $expected_hits, 'total_hits' ); + + while ( my $hit = $results->next_result ) { + diag dump($hit) if $debug; + } + +} + +test_search( 'foo', 2 ); + +test_search( $_, 1 ) foreach @strange; +