X-Git-Url: http://git.rot13.org/?p=BackupPC.git;a=blobdiff_plain;f=lib%2FBackupPC%2FSearch%2FKinoSearch.pm;h=d0e77130d3b069af1afde3f2a4a0bc30a90460f1;hp=fd982af6b24385faeb2a17a4b0870852a0ffe0d6;hb=c2148616f886530652ead505e5781c1cf94199b5;hpb=8b30b38f18d3a1cfb15323793e9a14af45168cc4 diff --git a/lib/BackupPC/Search/KinoSearch.pm b/lib/BackupPC/Search/KinoSearch.pm index fd982af..d0e7713 100644 --- a/lib/BackupPC/Search/KinoSearch.pm +++ b/lib/BackupPC/Search/KinoSearch.pm @@ -11,8 +11,30 @@ use Data::Dump qw(dump); # my $tokenizer = KinoSearch::Analysis::Tokenizer->new( pattern => '\\w' ); +# numeric_padding values are used in range search, and muse be sortable + sub new { - my ( $class ) = @_; + my $class = shift @_; + my %Conf = @_; + + my $index_path = $Conf{KinoPath} || die "no KinoPath"; + + my $self = bless { + index => $index_path, + first_time_indexing => ! -d $index_path, + numeric_padding => [ qw( + backup_date + date + ) ], + + }, $class; + #warn "# ",dump($self); + return $self; +} + +sub indexer { + my $self = shift; + return $self->{_indexer} if defined $self->{_indexer}; my $schema = KinoSearch::Plan::Schema->new; @@ -28,72 +50,184 @@ sub new { ); my $blob_type = KinoSearch::Plan::BlobType->new( stored => 1 ); my $string_type = KinoSearch::Plan::StringType->new; # non-tokenized - my $num_type = KinoSearch::Plan::Int64Type->new; + my $num_type = KinoSearch::Plan::Int64Type->new( sortable => 1 ); + my $sort_type = KinoSearch::Plan::StringType->new( sortable => 1 ); # non-tokenized + # numeric $schema->spec_field( name => $_, type => $string_type ) foreach ( qw/ - backup_date - backupnum - date fid shareid - size type / ); + # non-tokenized strings $schema->spec_field( name => $_, type => $string_type ) foreach ( qw/ - _uri filepath hname sname + _uri + hname /); -# $schema->spec_field( name => '_doc', type => $blob_type ); + # sortable + $schema->spec_field( name => $_, type => $sort_type ) foreach (qw/ + sname + filepath + /); - my $index_path = '/tmp/kinosearch'; # FIXME + # sortable numeric + $schema->spec_field( name => $_, type => $sort_type ) foreach (qw/ + backupnum + backup_date + date + size + /); + + # tokenized magic columns for infix search + $schema->spec_field( name => '_file_path_split', type => $ft_type ); + +# $schema->spec_field( name => '_doc', type => $blob_type ); my $indexer = KinoSearch::Index::Indexer->new( schema => $schema, - index => $index_path, + index => $self->{index}, create => 1, ); - warn "# using $index_path"; + #warn "# created indexer"; - $indexer->commit; # make sure that index exists + return $self->{_indexer} = $indexer; - my $self = bless { - indexer => $indexer, - searcher => KinoSearch::Search::IndexSearcher->new( - index => $index_path, - ), +}; - }, $class; - return $self; +our $searcher; +sub searcher { + my $self = shift; + return $self->{_searcher} if $self->{_searcher}; + $self->{_searcher} = + KinoSearch::Search::IndexSearcher->new( index => $self->{index} ) } sub exists { my ($self,$row) = @_; - return 0; # FIXME + return 0 if $self->{first_time_indexing}; my $uri = $row->{hname} . ':' . $row->{sname} . '#' . $row->{backupnum} . ' ' . $row->{filepath}; - warn "# exists $uri"; - my $hits = $self->{searcher}->hits( query => "_uri:$uri" ); + my $hits = $self->searcher->hits( query => "_uri:$uri" ); + + + $self->{stat}->{exists}->{ $hits->total_hits }++; + return $hits->total_hits; } +sub _numeric_padding { sprintf "%010d", $_[0] } # pad up to 32bit number (timestamp) + sub add_doc { my ($self,$row) = @_; - warn "XXX ",dump($row); - $row->{_uri} = $row->{hname} . ':' . $row->{sname} . '#' . $row->{backupnum} . ' ' . $row->{filepath}; + my $path = $row->{filepath}; + $path =~ s/(.)/$1 /g; # XXX our tokenize + $row->{_file_path_split} = $path; + + $self->{stats}->{add_doc}++; - $self->{indexer}->add_doc( $row ); + foreach my $col ( @{ $self->{numeric_padding} } ) { + $row->{$col} = _numeric_padding $row->{$col}; + } + + warn "XXX ",dump($row) if $ENV{DEBUG}; + + $self->indexer->add_doc( $row ); } sub commit { my $self = shift; - $self->{indexer}->commit; - warn "# commit index"; + $self->indexer->commit; + print STDERR "[commit]"; +} + +sub _field_lower_upper_term { + my ( $self, $field, $l, $u ) = @_; + my $numeric_padding = grep { /^$field$/ } @{ $self->{numeric_padding} }; + my $range; + if ( $l ) { + $range->{lower_term} = $numeric_padding ? _numeric_padding $l : $l; + $range->{include_lower} = 1; + } + if ( $u ) { + $range->{upper_term} = $numeric_padding ? _numeric_padding $u : $u; + $range->{include_upper} = 1; + } + if ( $range ) { + $range->{field} = $field; + + #warn "# $field $l - $u numeric_padding:$numeric_padding ",dump($range); + } + return $range; +} + +sub search { + my ( $self, $offset, $on_page, $sort, $q, $shareid, $backup_from, $backup_to, $files_from, $files_to ) = @_; + + warn "# search $offset/$on_page [$q] shareid: $shareid backup: $backup_from - $backup_to files: $files_from - $files_to"; + + my $sort_field = (split(/_/,$sort,2))[0]; + + my $rules = [ KinoSearch::Search::SortRule->new( type => 'score' ) ]; + $rules->[0] = KinoSearch::Search::SortRule->new( field => $sort_field, reverse => $sort =~ m/_a$/ ? 0 : 1 ) if $sort_field; + + my $sort_spec = KinoSearch::Search::SortSpec->new( rules => $rules ); + + my $split = $q; + $split =~ s/(.)/$1 /g; # _file_path_split + my $split_query = KinoSearch::Search::TermQuery->new( field => '_file_path_split', term => $split ); +#warn "XXX ",dump($split_query); + + + my $query_parser = KinoSearch::Search::QueryParser->new( + schema => $self->searcher->get_schema, + fields => ['_file_path_split'], + ); + my $query = $query_parser->parse( '"' . $split . '"' ); + + my @and_query; + + if ( $shareid ) { + push @and_query, KinoSearch::Search::TermQuery->new( field => 'shareid', term => $shareid ); + } + + if ( my $range = $self->_field_lower_upper_term( 'backup_date', $backup_from, $backup_to ) ) { + push @and_query, KinoSearch::Search::RangeQuery->new( %$range ); + } + if ( my $range = $self->_field_lower_upper_term( 'date', $files_from, $files_to ) ) { + push @and_query, KinoSearch::Search::RangeQuery->new( %$range ); + } + + if ( @and_query ) { + push @and_query, $query; + $query = KinoSearch::Search::ANDQuery->new( children => [ @and_query ] ); + } + + my $hits = $self->searcher->hits( + query => m/:/ ? $q : $query, + offset => $offset, + num_wanted => $on_page, + sort_spec => $sort_spec, + ); + + + warn "# ", $hits->total_hits, " hits for $q\n"; + + return (0,[]) if $hits->total_hits == 0; + + my $results; + while ( my $hit = $hits->next ) { + warn "## hit = ",dump($hit) if $ENV{DEBUG}; + push @$results, $hit; + } + + return ( $hits->total_hits, $results ); } 1;