added header_first to WebPAC::Input::CSV
[webpac2] / lib / WebPAC / Output / KinoSearch.pm
1 package WebPAC::Output::KinoSearch;
2
3 use warnings;
4 use strict;
5
6 use base qw/WebPAC::Common WebPAC::Output Class::Accessor/;
7 __PACKAGE__->mk_accessors(qw(
8         path
9         database
10         input
11         encoding
12         clean
13
14         index
15 ));
16
17 use KinoSearch::Simple;
18 use File::Path;
19 use Encode qw/decode/;
20 use Data::Dump qw/dump/;
21 use Storable;
22
23 =head1 NAME
24
25 WebPAC::Output::KinoSearch - Create KinoSearch full text index
26
27 =head1 VERSION
28
29 Version 0.05
30
31 =cut
32
33 our $VERSION = '0.05';
34
35 =head1 SYNOPSIS
36
37 Create full text index using KinoSearch index from data with
38 type C<search>.
39
40 =head1 FUNCTIONS
41
42 =head2 new
43
44 Open KinoSearch index
45
46  my $out = new WebPAC::Output::KinoSearch({
47         path => '/path/to/invindex',
48         database => 'demo',
49         encoding => 'iso-8859-2',
50         clean => 1,
51  });
52
53 Options are:
54
55 =over 4
56
57 =item path
58
59 path to KinoSearch index to use
60
61 =item database
62
63 name of database from which data comes
64
65 =item encoding
66
67 character encoding of C<data_structure> if it's differenet than C<ISO-8859-2>
68 (and it probably is). This encoding will be converted to C<UTF-8> for
69 index.
70
71 =back
72
73 =head2 init
74
75   $out->init;
76
77 =cut
78
79 sub init {
80         my $self = shift;
81
82         my $log = $self->_get_logger;
83
84         #$log->debug("self: ", sub { dump($self) });
85
86         foreach my $p (qw/path database/) {
87                 $log->logdie("need $p") unless ($self->$p);
88         }
89
90 #       $log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');
91
92         $self->encoding( 'ISO-8859-2' ) unless $self->encoding;
93
94         ## FIXME we shouldn't re-create whole KinoSearch index every time!
95 #       $self->clean( 1 );
96
97         if ( ! -e $self->path ) {
98                 mkpath $self->path || $log->logdie("can't create ", $self->path,": $!");
99                 $log->info("created ", $self->path);
100         } elsif ( $self->clean ) {
101                 $log->info("removing existing ", $self->path);
102                 rmtree $self->path || $log->logdie("can't remove ", $self->path,": $!");
103                 mkpath $self->path || $log->logdie("can't create ", $self->path,": $!");
104         }
105
106         my $path = $self->path . '/' . $self->database;
107
108         $log->info("using index $path with encoding ", $self->encoding);
109
110         my $index = KinoSearch::Simple->new(
111                 path => $path,
112                 language => 'en',
113         );
114
115         $log->logdie("can't open $path: $!") unless $index;
116
117         $self->index( $index );
118
119 }
120
121
122 =head2 add
123
124 Adds one entry
125
126   $out->add( 42, $ds );
127
128 =cut
129
130 sub add {
131         my $self = shift;
132
133         my ( $id, $ds ) = @_;
134
135         my $log = $self->_get_logger;
136         $log->logdie("need id") unless defined $id;
137         $log->logdie("need ds") unless $ds;
138
139         my $hash = $self->ds_to_hash( $ds, 'search' ) || return;
140
141         $hash->{id}       ||= $id;
142         $hash->{database} ||= $self->database;
143         $hash->{input}    ||= $self->input;
144
145         foreach my $f ( keys %$hash ) {
146                 if ( ref($hash->{$f}) eq 'ARRAY' ) {
147                         $hash->{$f} = join(' <*> ', @{ $hash->{$f} });
148                 }
149 #               $hash->{$f} = decode( $self->encoding, $hash->{$f} );
150                 $self->{field_count}->{$f}++;
151         }
152
153         $log->debug("add( $id, ", sub { dump($ds) }," ) => ", sub { dump( $hash ) });
154
155         $self->index->add_doc( $hash );
156
157         $self->{count}++;
158
159         return 1;
160 }
161
162 =head2 finish
163
164 Close index
165
166  $out->finish;
167
168 =cut
169
170 sub finish {
171         my $self = shift;
172
173         my $log = $self->_get_logger();
174
175         $log->info("indexed ", $self->{count}, " records");
176
177         $log->debug("field usage: ", dump( $self->{field_count} ));
178
179 }
180
181 =head1 AUTHOR
182
183 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
184
185 =head1 COPYRIGHT & LICENSE
186
187 Copyright 2005-2007 Dobrica Pavlinusic, All Rights Reserved.
188
189 This program is free software; you can redistribute it and/or modify it
190 under the same terms as Perl itself.
191
192 =cut
193
194 1; # End of WebPAC::Output::Estraier