don't dump hits to STDERR without DEBUG env variable

[BackupPC.git] / lib / BackupPC / Search / KinoSearch.pm
diff --git a/lib/BackupPC/Search/KinoSearch.pm b/lib/BackupPC/Search/KinoSearch.pm

index d7f7868..91f488d 100644 (file)
--- a/lib/BackupPC/Search/KinoSearch.pm
+++ b/lib/BackupPC/Search/KinoSearch.pm
@@ -12,83 +12,162 @@ use Data::Dump qw(dump);
  # my $tokenizer = KinoSearch::Analysis::Tokenizer->new( pattern => '\\w' );
  
  sub new {
-       my ( $class ) = @_;
+       my $class = shift @_;
+       my %Conf = @_;
+
+       my $index_path = $Conf{KinoPath} || die "no KinoPath";
+
+       my $self = bless {
+               index => $index_path,
+               first_time_indexing => ! -d $index_path,
+       }, $class;
+       warn "# ",dump($self);
+       return $self;
+}
+
+sub indexer { # build (once) and return the KinoSearch indexer for $self->{index}
+       my $self = shift;
+       return $self->{_indexer} if defined $self->{_indexer}; # reuse the one cached by an earlier call
  
         my $schema = KinoSearch::Plan::Schema->new;
+       # analyzer chain for full-text fields: CaseFolder followed by Tokenizer
+
+       my $case_folder  = KinoSearch::Analysis::CaseFolder->new;
+       my $tokenizer    = KinoSearch::Analysis::Tokenizer->new;
         my $polyanalyzer = KinoSearch::Analysis::PolyAnalyzer->new(
-           language => 'en',
+               analyzers => [ $case_folder, $tokenizer ], 
         );
+
         my $ft_type = KinoSearch::Plan::FullTextType->new(
             analyzer => $polyanalyzer,
         );
         my $blob_type = KinoSearch::Plan::BlobType->new( stored => 1 );
         my $string_type = KinoSearch::Plan::StringType->new; # non-tokenized
-       my $num_type = KinoSearch::Plan::Int64Type->new;
+       my $num_type = KinoSearch::Plan::Int64Type->new( sortable => 1 ); # NOTE(review): not used by any spec_field below
+       my $sort_type = KinoSearch::Plan::StringType->new( sortable => 1 ); # non-tokenized
  
+       # numeric values, but declared with the plain $string_type
         $schema->spec_field( name => $_, type => $string_type ) foreach ( qw/
                 backup_date
-               backupnum
-               date
                 fid
                 shareid
-               size
                 type
         / );
  
+       # non-tokenized strings
         $schema->spec_field( name => $_, type => $string_type ) foreach ( qw/
-               _uri filepath hname sname
+               _uri
+               hname
         /);
  
-#      $schema->spec_field( name => '_doc', type => $blob_type );
+       # sortable
+       $schema->spec_field( name => $_, type => $sort_type ) foreach (qw/
+               sname
+               filepath
+       /);
  
-       my $index_path = '/tmp/kinosearch'; # FIXME
+       # sortable numeric -- NOTE(review): uses $sort_type (a StringType), not $num_type; confirm the sort order is as intended
+       $schema->spec_field( name => $_, type => $sort_type ) foreach (qw/
+               backupnum
+               date
+               size
+       /);
+
+       # tokenized magic columns for infix search
+       $schema->spec_field( name => '_file_path_split', type => $ft_type );
+
+#      $schema->spec_field( name => '_doc', type => $blob_type );
  
         my $indexer = KinoSearch::Index::Indexer->new(
                 schema => $schema,
-               index  => $index_path,
+               index  => $self->{index}, # path given to new() as KinoPath
                 create => 1,
         );
  
-       warn "# using $index_path";
+       warn "# created indexer";
  
-       $indexer->commit; # make sure that index exists
+       return $self->{_indexer} = $indexer; # cache for later calls
  
-       my $self = bless {
-               indexer => $indexer,
-               searcher => KinoSearch::Search::IndexSearcher->new(
-                       index => $index_path,
-               ),
+};
  
-       }, $class;
-       return $self;
+our $searcher; # NOTE(review): package variable, not referenced in the visible code
+sub searcher { # create on first use, then reuse, the IndexSearcher for $self->{index}
+       my $self = shift;
+       return $self->{_searcher} if $self->{_searcher}; # already created
+       $self->{_searcher} = # last statement, so its value is what the sub returns
+       KinoSearch::Search::IndexSearcher->new( index => $self->{index} )
  }
  
  sub exists {
         my ($self,$row) = @_;
  
-       return 0; # FIXME
+       return 0 if $self->{first_time_indexing};
  
         my $uri = $row->{hname} . ':' . $row->{sname} . '#' . $row->{backupnum} . ' ' . $row->{filepath};
-       warn "# exists $uri";
-       my $hits = $self->{searcher}->hits( query => "_uri:$uri" );
+       my $hits = $self->searcher->hits( query => "_uri:$uri" );
+
+
+       $self->{stat}->{exists}->{ $hits->total_hits }++;
+
         return $hits->total_hits;
  }
  
  sub add_doc {
         my ($self,$row) = @_;
  
-       warn "XXX ",dump($row);
-
         $row->{_uri} = $row->{hname} . ':' . $row->{sname} . '#' . $row->{backupnum} . ' ' . $row->{filepath};
+       my $path = $row->{filepath}; # work on a copy; filepath itself is left unchanged
+       $path =~ s/(.)/$1 /g; # XXX our tokenize: put a space after every character (newlines excepted)
+       $row->{_file_path_split} = $path; # search() splits the query text the same way
+       # from here on the caller's $row hash carries _uri and _file_path_split as extra keys
+       warn "XXX ",dump($row) if $ENV{DEBUG};
  
-       $self->{indexer}->add_doc( $row );
+       $self->{stats}->{add_doc}++; # counter included in the dump printed by commit()
+
+       $self->indexer->add_doc( $row ); # indexer() creates the indexer on first use
  
  }
  
  sub commit {
         my $self = shift;
-       $self->{indexer}->commit;
-       warn "# commit index";
+       $self->indexer->commit;
+       warn "# commit index ", dump($self->{stats});
+}
+
+sub search {
+       # Search the index and return ( $total_hits, \@hits ).
+       #   $offset, $on_page - paging: passed on as offset and num_wanted
+       #   $sort             - the text before the first "_" names the sort
+       #                       field; a value ending in "_a" sorts unreversed,
+       #                       anything else reversed; with no field the hits
+       #                       are ordered by score
+       #   $q                - text to look for
+       #   $shareid, $backup_from, $backup_to, $files_from, $files_to
+       #                     - only logged below, not applied to the query
+       my ( $self, $offset, $on_page, $sort, $q, $shareid, $backup_from, $backup_to, $files_from, $files_to ) = @_;
+
+       warn "# search $offset/$on_page [$q] shareid: $shareid backup: $backup_from - $backup_to files: $files_from - $files_to";
+
+       my $sort_field = (split(/_/,$sort,2))[0];
+
+       my $rules = [ KinoSearch::Search::SortRule->new( type => 'score' ) ];
+       $rules->[0] = KinoSearch::Search::SortRule->new( field => $sort_field, reverse => $sort =~ m/_a$/ ? 0 : 1 ) if $sort_field;
+
+       my $sort_spec = KinoSearch::Search::SortSpec->new( rules => $rules );
+
+       # split the query the same way add_doc() splits filepath for the
+       # _file_path_split field, then quote it so it is searched as a phrase
+       my $split = $q;
+       $split =~ s/(.)/$1 /g; # _file_path_split
+       $split = qq{"$split"}; # exact ordering
+       my $hits = $self->searcher->hits(
+               query => $split,
+               offset => $offset,
+               num_wanted => $on_page,
+               sort_spec => $sort_spec,
+       );
+
+
+       warn "# ", $hits->total_hits, " hits for $q\n";
+
+       return (0,[]) if $hits->total_hits == 0;
+
+       # start with an empty list so callers always get an array ref, even
+       # when there are hits in total but none on the requested page
+       my $results = [];
+       while ( my $hit = $hits->next ) {
+               warn "## hit = ",dump($hit) if $ENV{DEBUG};
+               push @$results, $hit;
+       }
+
+       return ( $hits->total_hits, $results );
  }
  
  1;