1 package WebPAC::Output::KinoSearch;
6 use base qw/WebPAC::Common/;
8 use KinoSearch::InvIndexer;
9 use KinoSearch::Analysis::PolyAnalyzer;
10 use Encode qw/from_to/;
11 use Data::Dump qw/dump/;
16 WebPAC::Output::KinoSearch - Create KinoSearch full text index
24 our $VERSION = '0.03';
28 Create full text index using KinoSearch index from data with
37 my $est = new WebPAC::Output::KinoSearch(
38 index_path => '/path/to/invindex',
39 fields => [ qw/name of all fields used/ ],
41 label => 'node label',
42 encoding => 'iso-8859-2',
52 path to KinoSearch index to use
56 name of all fields used in this index
60 name of database from which data comes
64 label for node (optional)
68 character encoding of C<data_structure> if it's different from C<ISO-8859-2>
69 (and it probably is). This encoding will be converted to C<UTF-8> for
81 my $log = $self->_get_logger;
83 #$log->debug("self: ", sub { dump($self) });
85 foreach my $p (qw/index_path fields database/) {
86 $log->logdie("need $p") unless ($self->{$p});
89 $log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');
91 $self->{encoding} ||= 'ISO-8859-2';
93 $self->{index_path} .= '/' . $self->{database};
95 $self->{clean} = 1 if (! -e $self->{index_path} . '/segments');
97 $log->info("using", $self->{clean} ? ' new' : '', " index $self->{index_path} with encoding $self->{encoding}");
99 my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
101 $self->{invindex} = KinoSearch::InvIndexer->new(
102 invindex => $self->{index_path},
103 create => $self->{clean},
104 analyzer => $analyzer,
107 my $fields_path = $self->{index_path} . '/fields.storable';
108 $fields_path =~ s#//#/#g;
109 if (-e $fields_path) {
110 $self->{fields} = retrieve($fields_path) ||
111 $log->warn("can't open $fields_path: $!");
113 $log->error("This will be dummy run since no fields statistics are found!");
114 $log->error("You will have to re-run indexing to get search results!");
115 $self->{dummy_run} = 1;
117 $self->{fields_path} = $fields_path;
119 foreach my $f (@{ $self->{fields} }) {
120 $self->{invindex}->spec_field(
129 $self ? return $self : return undef;
135 Adds one entry to database.
141 text => 'optional text from which snippet is created',
144 This function will create entries in the index using the following URI format:
146 C<file:///type/database%20name/000>
148 Each tag in C<data_structure> with specified C<type> will create one
149 attribute and corresponding hidden text (used for search).
158 my $log = $self->_get_logger;
160 my $database = $self->{'database'} || $log->logconfess('no database in $self');
161 $log->logconfess('need invindex in object') unless ($self->{'invindex'});
163 foreach my $p (qw/id ds type/) {
164 $log->logdie("need $p") unless ($args->{$p});
167 my $type = $args->{'type'};
168 my $id = $args->{'id'};
170 my $uri = "file:///$type/$database/$id";
171 $log->debug("creating $uri");
173 my $doc = $self->{invindex}->new_doc( $uri ) || $log->logdie("can't create new_doc( $uri )");
175 sub _add_value($$$$$) {
176 my ($self,$log,$doc,$n,$v) = @_;
179 $self->{value_usage}->{$n}++;
180 return if ($self->{dummy_run});
182 eval { $doc->set_value($n, $self->convert($v) ) };
183 $log->warn("can't insert: $n = $v") if ($@);
186 _add_value($self,$log,$doc, 'uri', $uri);
188 $log->debug("ds = ", sub { dump($args->{'ds'}) } );
190 # filter all tags which have type defined
192 ref($args->{'ds'}->{$_}) eq 'HASH' && defined( $args->{'ds'}->{$_}->{$type} )
193 } keys %{ $args->{'ds'} };
195 $log->debug("tags = ", join(",", @tags));
197 return unless (@tags);
199 foreach my $tag (@tags) {
201 my $vals = join(" ", @{ $args->{'ds'}->{$tag}->{$type} });
205 $vals = $self->convert( $vals ) or
206 $log->logdie("can't convert '$vals' to UTF-8");
208 _add_value($self, $log, $doc, $tag, $vals );
211 if (my $text = $args->{'text'}) {
212 _add_value($self, $log, $doc, 'bodytext', $text );
215 #$log->debug("adding ", sub { $doc->dump_draft } );
216 $self->{invindex}->add_doc($doc) || $log->warn("can't add document $uri");
232 my $log = $self->_get_logger();
234 $log->info("finish index writing to disk");
235 $self->{invindex}->finish;
237 $log->info("writing value usage file");
239 # add fields from last run
240 map { $self->{value_usage}->{$_}++ } @{ $self->{fields} };
242 my @fields = keys %{ $self->{value_usage} };
243 store \@fields, $self->{fields_path} ||
244 $log->warn("can't write $self->{fields_path}: $!");
250 my $utf8_string = $self->convert('string in codepage');
257 my $text = shift || return;
258 from_to($text, $self->{encoding}, 'UTF-8');
264 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
266 =head1 COPYRIGHT & LICENSE
268 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
270 This program is free software; you can redistribute it and/or modify it
271 under the same terms as Perl itself.
275 1; # End of WebPAC::Output::KinoSearch