1 package WebPAC::Output::SWISH;
8 use base qw/WebPAC::Common Class::Accessor/;
9 __PACKAGE__->mk_accessors(qw(
17 use File::Path qw/mkpath/;
18 use Data::Dump qw/dump/;
21 #use Encode qw/encode encode_utf8 is_utf8/;
22 use Text::Unaccent::PurePerl qw/unac_string/;
27 WebPAC::Output::SWISH - Create swish-e full text index
31 our $VERSION = '0.01';
35 Create full text index using swish-e indexer from data with
42 my $out = new WebPAC::Output::SWISH({
52 name of database from which data comes
56 Name of database will be used to form URI of documents in index.
60 our $dir = 'var/swish';
65 my $log = $self->_get_logger;
67 my $database = $self->database || $log->logdie("need database");
69 mkpath $dir if ! -e $dir;
71 my $path = "$dir/$database.conf";
73 open(my $conf, '>', $path) || die "can't open $path: $!";
75 print $conf <<"DEFAULT_SWISH_CONF";
76 # swish-e config file for $database
80 # input file definition
84 MetaNames xml swishdocpath
87 #XMLClassAttributes type
88 UndefinedMetaTags auto
89 UndefinedXMLAttributes auto
91 IndexFile $dir/$database
93 # Croatian ISO-8859-2 characters to unaccented equivalents
94 #TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
96 # store data into index
105 close($conf) || die "can't write config $path: $!";
107 $self->index_path( "$dir/$database" );
109 my $swish = "swish-e -S prog -c $path";
110 open( $self->{_swish_fh}, '|-', $swish ) || die "can't open pipe to $swish: $!";
112 $log->info( "created $path ", -s $path, " bytes for ", $self->index_path );
116 $self ? return $self : return undef;
121 my $path = $out->index_path;
125 $out->add( 42, $ds );
129 my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"');
130 my $escape_re = join '|' => keys %escape;
133 my ($self,$id,$ds) = @_;
135 die "need input" unless $self->input;
137 my $log = $self->_get_logger;
138 $log->debug("id: $id ds = ",sub { dump($ds) });
140 my $database = $self->database || $log->logconfess('no database in $self');
142 my $uri = $self->database . '/' . $self->input . "/$id";
143 $log->debug("creating $uri");
145 # filter all tags which have type defined
146 my $type = $self->type || 'search';
148 ref($ds->{$_}) eq 'HASH' && defined( $ds->{$_}->{$type} )
151 $log->debug("tags = ", join(",", @tags));
153 return unless (@tags);
158 foreach ( 'database', 'input' ) {
159 $xml .= "<$_><![CDATA[" . $self->$_ . "]]></$_>";
160 $data->{$_} = $self->$_;
163 foreach my $tag (@tags) {
165 my $r = ref $ds->{$tag}->{$type};
166 die "tag $tag type $type not ARRAY but '$r' = ",dump( $ds->{$tag}->{$type} ) unless $r eq 'ARRAY';
168 my $vals = join(" ", @{ $ds->{$tag}->{$type} });
172 $vals =~ s/($escape_re)/$escape{$1}/gs;
173 $data->{$tag} = $vals;
174 $vals = unac_string( $vals );
176 # BW & EW are our markers for tag boundry
177 $xml .= qq{<$tag><![CDATA[BW $vals EW]]></$tag>};
178 # $xml .= qq{<!-- } . is_utf8( $vals ) . qq{!>};
180 $self->{stats}->{attr}->{$tag}++;
181 $self->{stats}->{input}->{ $self->input }->{$tag}++;
185 # serialize to JSON instead of YAML because we will loose whitespace
186 $data = to_json($data, {utf8=>1});
187 $xml .= qq{<data><![CDATA[$data]]></data>};
189 $xml .= qq{</all>\n};
191 # $xml = encode('utf-8', $xml);
194 my $len = length($xml);
196 my $fh = $self->{_swish_fh} || die "_swish_fh missing";
198 print $fh "Path-Name: $uri\nContent-Length: $len\nDocument-Type: XML\n\n$xml" or
199 die "can't add $uri: $@\n$xml";
208 Dump attributes used on disk
214 my $log = $self->_get_logger();
216 my $path = $dir . '/' . $self->{database} . '.yaml';
217 YAML::DumpFile( $path, $self->{stats} );
218 $log->info("created $path ", -s $path, " bytes");
219 $log->debug( dump( $self->{stats} ) );
221 close( $self->{_swish_fh} ) || die "can't close index ", $self->index_path, ": $!";
226 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
228 =head1 COPYRIGHT & LICENSE
230 Copyright 2004-2009 Dobrica Pavlinusic, All Rights Reserved.
232 This program is free software; you can redistribute it and/or modify it
233 under the same terms as Perl itself.