implement numeric padding
[BackupPC.git] / lib / BackupPC / Search / KinoSearch.pm
1 package BackupPC::Search::KinoSearch;
2 use warnings;
3 use strict;
4
5 use KinoSearch::Index::Indexer;
6 use KinoSearch::Plan::Schema;
7 use KinoSearch::Analysis::PolyAnalyzer;
8 use KinoSearch::Plan::FullTextType;
9 use KinoSearch::Search::IndexSearcher;
10 use Data::Dump qw(dump);
11
12 # my $tokenizer = KinoSearch::Analysis::Tokenizer->new( pattern => '\\w' );
13
# Constructor.
#
# Arguments: a flat key/value configuration list. Only KinoPath (the index
# directory) is read here; the call dies with "no KinoPath" when it is
# missing or false.
#
# The object records the index path and, in first_time_indexing, whether
# that directory was absent at construction time.
sub new {
        my ( $class, %Conf ) = @_;

        my $index_path = $Conf{KinoPath} || die "no KinoPath";

        my %state = (
                index               => $index_path,
                first_time_indexing => ! -d $index_path,
        );
        my $self = bless \%state, $class;

        warn "# ",dump($self);

        return $self;
}
27
# Lazily builds and caches the KinoSearch indexer for $self->{index}.
#
# The first call defines the schema, stores the list of zero-padded columns
# in $self->{numeric_padding} (read by add_doc) and creates the
# KinoSearch::Index::Indexer with create => 1. Later calls return the cached
# object from $self->{_indexer}.
sub indexer {
        my $self = shift;
        return $self->{_indexer} if defined $self->{_indexer};

        my $schema = KinoSearch::Plan::Schema->new;

        my $case_folder  = KinoSearch::Analysis::CaseFolder->new;
        my $tokenizer    = KinoSearch::Analysis::Tokenizer->new;
        my $polyanalyzer = KinoSearch::Analysis::PolyAnalyzer->new(
                analyzers => [ $case_folder, $tokenizer ],
        );

        my $ft_type = KinoSearch::Plan::FullTextType->new(
                analyzer => $polyanalyzer,
        );
        my $string_type = KinoSearch::Plan::StringType->new; # non-tokenized
        my $sort_type = KinoSearch::Plan::StringType->new( sortable => 1 ); # non-tokenized

        # Numeric columns stored as zero-padded strings by add_doc.
        # Numeric columns deliberately NOT padded: fid shareid type backupnum
        $self->{numeric_padding} = [ qw/
                backup_date
                date
                size
        / ];

        # numeric, exact match only
        $schema->spec_field( name => $_, type => $string_type ) foreach ( qw/
                fid
                shareid
                type
        / );

        # non-tokenized strings
        $schema->spec_field( name => $_, type => $string_type ) foreach ( qw/
                _uri
                hname
        /);

        # sortable
        $schema->spec_field( name => $_, type => $sort_type ) foreach (qw/
                sname
                filepath
        /);

        # sortable numeric; backup_date belongs here because search() runs a
        # RangeQuery on it and RangeQuery is documented to need a sortable field.
        # NOTE(review): an index built while backup_date was non-sortable may
        # have to be rebuilt -- confirm before deploying against existing data.
        $schema->spec_field( name => $_, type => $sort_type ) foreach (qw/
                backup_date
                backupnum
                date
                size
        /);

        # tokenized magic columns for infix search
        $schema->spec_field( name => '_file_path_split', type => $ft_type );

        my $indexer = KinoSearch::Index::Indexer->new(
                schema => $schema,
                index  => $self->{index},
                create => 1,
        );

        warn "# created indexer";

        return $self->{_indexer} = $indexer;
}
104
our $searcher;

# Returns the KinoSearch::Search::IndexSearcher for $self->{index}, creating
# it on first use and caching it in $self->{_searcher}.
sub searcher {
        my ($self) = @_;

        return $self->{_searcher} ||= KinoSearch::Search::IndexSearcher->new(
                index => $self->{index},
        );
}
112
# Reports how many indexed documents carry the _uri built from $row.
#
# $row is a hash reference providing hname, sname, backupnum and filepath.
# Returns 0 without touching the index when the index directory did not
# exist at construction time (first_time_indexing), otherwise the total hit
# count. Hit counts are tallied under $self->{stats}{exists}, the same stats
# hash that commit() dumps.
sub exists {
        my ($self,$row) = @_;

        return 0 if $self->{first_time_indexing};

        # must stay identical to the _uri composed in add_doc
        my $uri = $row->{hname} . ':' . $row->{sname} . '#' . $row->{backupnum} . ' ' . $row->{filepath};

        # _uri is a non-tokenized field and the value contains spaces, ':' and
        # '#', so look it up as one exact term rather than handing a query
        # string to the parser.
        my $query = KinoSearch::Search::TermQuery->new( field => '_uri', term => $uri );
        my $total = $self->searcher->hits( query => $query )->total_hits;

        $self->{stats}->{exists}->{ $total }++;

        return $total;
}
126
# Adds one file row to the index.
#
# $row is a hash reference with at least hname, sname, backupnum and
# filepath. It is modified in place: _uri and _file_path_split are added and
# every column listed in $self->{numeric_padding} is replaced by its
# zero-padded 11-digit form. Returns whatever the indexer's add_doc returns.
sub add_doc {
        my ($self,$row) = @_;

        # indexer() is what fills $self->{numeric_padding}; fetch it before the
        # padding loop so the first document added is padded like all others.
        my $indexer = $self->indexer;

        $row->{_uri} = $row->{hname} . ':' . $row->{sname} . '#' . $row->{backupnum} . ' ' . $row->{filepath};

        # one character per token so the full-text field supports infix search
        my $path = $row->{filepath};
        $path =~ s/(.)/$1 /g; # XXX our tokenize
        $row->{_file_path_split} = $path;

        $self->{stats}->{add_doc}++;

        foreach my $col ( @{ $self->{numeric_padding} || [] } ) {
                $row->{$col} = sprintf "%011d", $row->{$col};
        }

        warn "XXX ",dump($row) if $ENV{DEBUG};

        return $indexer->add_doc( $row );
}
146
# Commits pending documents through the cached indexer, then logs the
# counters accumulated in $self->{stats}.
sub commit {
        my ($self) = @_;

        my $indexer = $self->indexer;
        $indexer->commit;

        warn "# commit index ", dump($self->{stats});
}
152
# Builds the argument hash for a range query on $field.
#
# A bound is used (inclusively) only when its value is true. Returns a hash
# reference with field plus lower_term/include_lower and/or
# upper_term/include_upper, or undef when neither bound is set.
sub _field_lower_upper_term {
        my ( $field, $lower, $upper ) = @_;

        my %bounds;
        @bounds{ qw( lower_term include_lower ) } = ( $lower, 1 ) if $lower;
        @bounds{ qw( upper_term include_upper ) } = ( $upper, 1 ) if $upper;

        my $range = %bounds ? { %bounds, field => $field } : undef;

        warn "# $field $lower - $upper ",dump($range);

        return $range;
}
168
# Runs a search and returns ( total_hits, \@hits ).
#
# Positional arguments:
#   $offset, $on_page         - paging window handed to hits() as offset/num_wanted
#   $sort                     - "<field>_<suffix>"; a suffix of "a" sorts ascending,
#                               anything else descending; empty means sort by score
#   $q                        - text matched as a phrase against _file_path_split;
#                               when it contains ':' it is handed to the searcher
#                               as a raw query string instead (filters below are
#                               then not applied)
#   $shareid                  - optional exact-match filter on shareid
#   $backup_from, $backup_to  - optional inclusive range on backup_date
#   $files_from, $files_to    - optional inclusive range on date
#
# Returns (0, []) when nothing matches; the second element is always an
# array reference.
sub search {
        my ( $self, $offset, $on_page, $sort, $q, $shareid, $backup_from, $backup_to, $files_from, $files_to ) = @_;

        warn "# search $offset/$on_page [$q] shareid: $shareid backup: $backup_from - $backup_to files: $files_from - $files_to";

        my $sort_field = defined $sort ? (split(/_/,$sort,2))[0] : undef;

        my $rule = $sort_field
                ? KinoSearch::Search::SortRule->new( field => $sort_field, reverse => ( $sort =~ m/_a$/ ? 0 : 1 ) )
                : KinoSearch::Search::SortRule->new( type => 'score' );
        my $sort_spec = KinoSearch::Search::SortSpec->new( rules => [ $rule ] );

        # same one-character-per-token transform add_doc applies to filepath
        my $split = defined $q ? $q : '';
        $split =~ s/(.)/$1 /g; # _file_path_split

        my $query_parser = KinoSearch::Search::QueryParser->new(
                schema => $self->searcher->get_schema,
                fields => ['_file_path_split'],
        );
        my $query = $query_parser->parse( '"' . $split . '"' );

        # backup_date and date are indexed as zero-padded 11-digit strings (see
        # add_doc), so purely numeric range terms need the same padding to
        # compare correctly as strings. False or non-numeric values pass through
        # untouched, keeping "no bound" semantics intact.
        my $pad = sub {
                my $term = shift;
                return $term unless $term && $term =~ m/\A\d+\z/;
                return sprintf "%011d", $term;
        };

        my @and_query;

        if ( $shareid ) {
                push @and_query, KinoSearch::Search::TermQuery->new( field => 'shareid', term => $shareid );
        }

        if ( my $range = _field_lower_upper_term( 'backup_date', $pad->($backup_from), $pad->($backup_to) ) ) {
                push @and_query, KinoSearch::Search::RangeQuery->new( %$range );
        }
        if ( my $range = _field_lower_upper_term( 'date', $pad->($files_from), $pad->($files_to) ) ) {
                push @and_query, KinoSearch::Search::RangeQuery->new( %$range );
        }

        if ( @and_query ) {
                push @and_query, $query;
                $query = KinoSearch::Search::ANDQuery->new( children => [ @and_query ] );
        }

        # test $q itself; a bare m/:/ would match against $_ instead
        my $raw_query = defined $q && $q =~ m/:/;

        my $hits = $self->searcher->hits(
                query => ( $raw_query ? $q : $query ),
                offset => $offset,
                num_wanted => $on_page,
                sort_spec => $sort_spec,
        );

        my $total = $hits->total_hits;

        warn "# ", $total, " hits for $q\n";

        return (0,[]) if $total == 0;

        # start from an empty list so a page past the last hit still yields an
        # array reference
        my $results = [];
        while ( my $hit = $hits->next ) {
                warn "## hit = ",dump($hit) if $ENV{DEBUG};
                push @$results, $hit;
        }

        return ( $total, $results );
}
231
232 1;