update HyperEstraier index in chunks of EST_CHUNK (default is 10000) to
author dpavlin <dpavlin@8392b6e1-25fa-0310-8288-cc32f8e212ea>
Tue, 30 Aug 2005 14:19:54 +0000 (14:19 +0000)
committer dpavlin <dpavlin@8392b6e1-25fa-0310-8288-cc32f8e212ea>
Tue, 30 Aug 2005 14:19:54 +0000 (14:19 +0000)
reduce memory usage with huge backup sets (because PostgreSQL tries to load
whole result set in memory).

git-svn-id: svn+ssh://llin/home/dpavlin/private/svn/BackupPC/trunk@98 8392b6e1-25fa-0310-8288-cc32f8e212ea

Makefile
bin/BackupPC_updatedb

index 8ac90aa..3886bb2 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -30,7 +30,7 @@ update:
        sudo -u backuppc /data/backuppc/bin/BackupPC_updatedb
 
 index: test
-       test -d /tmp/casket && sudo rm -Rf /tmp/casket || true
+       sudo rm -Rf /data/backuppc/data/casket || true
        sudo -u backuppc /data/backuppc/bin/BackupPC_updatedb -i
 
 xls:
index a3d0dff..405d5cc 100755 (executable)
--- a/bin/BackupPC_updatedb
+++ b/bin/BackupPC_updatedb
@@ -13,7 +13,7 @@ use File::Pid;
 use POSIX qw/strftime/;
 
 use constant BPC_FTYPE_DIR => 5;
-use constant EST_SYNC_EVERY => 10000;
+use constant EST_CHUNK => 10000;
 
 my $debug = 0;
 $|=1;
@@ -44,6 +44,9 @@ my $user = $Conf{SearchUser} || '';
 my $index_path = $Conf{HyperEstraierIndex};
 $index_path = $TopDir . '/' . $index_path;
 $index_path =~ s#//#/#g;
+if ($index_path) {
+       use HyperEstraier;
+}
 
 
 my $dbh = DBI->connect($dsn, $user, "", { RaiseError => 1, AutoCommit => 0 });
@@ -103,129 +106,115 @@ sub hest_update {
 
        my ($host_id, $share_id, $num) = @_;
 
-       print curr_time," updating HyperEstraier: select files";
+       print curr_time," updating HyperEstraier:";
 
        my $t = time();
 
-       my $where = '';
-       if ($host_id && $share_id && $num) {
-               $where = qq{
-               WHERE
-                       hosts.id = ? AND
-                       shares.id = ? AND
-                       files.backupnum = ?
-               };
-       }
-
-       my $sth = $dbh->prepare(qq{
-               SELECT
-                       files.id                        AS fid,
-                       hosts.name                      AS hname,
-                       shares.name                     AS sname,
-                       -- shares.share                 AS sharename,
-                       files.backupnum                 AS backupnum,
-                       -- files.name                   AS filename,
-                       files.path                      AS filepath,
-                       files.date                      AS date,
-                       files.type                      AS type,
-                       files.size                      AS size,
-                       files.shareid                   AS shareid,
-                       backups.date                    AS backup_date
-               FROM files 
-                       INNER JOIN shares       ON files.shareID=shares.ID
-                       INNER JOIN hosts        ON hosts.ID = shares.hostID
-                       INNER JOIN backups      ON backups.num = files.backupNum and backups.hostID = hosts.ID AND backups.shareID = shares.ID
-               $where
-       });
-
-       $sth->execute(@_);
-       my $results = $sth->rows;
+       my $offset = 0;
+       my $added = 0;
 
-       if ($results == 0) {
-               print " - no files, skipping\n";
-               return;
-       }
+       print " opening index $index_path";
+       $hest_db = HyperEstraier::Database->new();
+       $hest_db->open($index_path, $HyperEstraier::Database::DBWRITER | $HyperEstraier::Database::DBCREAT);
 
-       my $dot = int($results / 15) || 1;
+       my $results = 0;
 
-       print " $results ($dot/#)";
+       do {
 
-       sub fmt_date {
-               my $t = shift || return;
-               my $iso = BackupPC::Lib::timeStamp($t);
-               $iso =~ s/\s/T/;
-               return $iso;
-       }
+               my $where = '';
+               if ($host_id && $share_id && $num) {
+                       $where = qq{
+                       WHERE
+                               hosts.id = ? AND
+                               shares.id = ? AND
+                               files.backupnum = ?
+                       };
+               }
 
-       my $max = int($results / $dot);
+               my $limit = sprintf('LIMIT '.EST_CHUNK.' OFFSET %d', $offset);
+
+               my $sth = $dbh->prepare(qq{
+                       SELECT
+                               files.id                        AS fid,
+                               hosts.name                      AS hname,
+                               shares.name                     AS sname,
+                               -- shares.share                 AS sharename,
+                               files.backupnum                 AS backupnum,
+                               -- files.name                   AS filename,
+                               files.path                      AS filepath,
+                               files.date                      AS date,
+                               files.type                      AS type,
+                               files.size                      AS size,
+                               files.shareid                   AS shareid,
+                               backups.date                    AS backup_date
+                       FROM files 
+                               INNER JOIN shares       ON files.shareID=shares.ID
+                               INNER JOIN hosts        ON hosts.ID = shares.hostID
+                               INNER JOIN backups      ON backups.num = files.backupNum and backups.hostID = hosts.ID AND backups.shareID = shares.ID
+                       $where
+                       $limit
+               });
+
+               $sth->execute(@_);
+               $results = $sth->rows;
+
+               if ($results == 0) {
+                       print " - no more files\n";
+                       last;
+               }
 
-       print ", opening index $index_path...";
-       use HyperEstraier;
-       my $db = HyperEstraier::Database->new();
+               sub fmt_date {
+                       my $t = shift || return;
+                       my $iso = BackupPC::Lib::timeStamp($t);
+                       $iso =~ s/\s/T/;
+                       return $iso;
+               }
 
-#      unless ($hest_db) {
-#              print " open reader";
-#              $hest_db = HyperEstraier::Database->new();
-#
-#      }
+               while (my $row = $sth->fetchrow_hashref()) {
 
+                       my $fid = $row->{'fid'} || die "no fid?";
+                       my $uri = 'file:///' . $fid;
 
-       $db->open($index_path, $HyperEstraier::Database::DBWRITER | $HyperEstraier::Database::DBCREAT);
+                       my $id = $hest_db->uri_to_id($uri);
+                       next unless ($id == -1);
 
-       my $added = 0;
+                       # create a document object 
+                       my $doc = HyperEstraier::Document->new;
 
-       while (my $row = $sth->fetchrow_hashref()) {
+                       # add attributes to the document object 
+                       $doc->add_attr('@uri', $uri);
 
-               my $fid = $row->{'fid'} || die "no fid?";
-               my $uri = 'file:///' . $fid;
+                       foreach my $c (@{ $sth->{NAME} }) {
+                               $doc->add_attr($c, $row->{$c}) if ($row->{$c});
+                       }
 
-               my $id = $db->uri_to_id($uri);
-               next unless ($id == -1);
+                       #$doc->add_attr('@cdate', fmt_date($row->{'date'}));
 
-               # create a document object 
-               my $doc = HyperEstraier::Document->new;
+                       # add the body text to the document object 
+                       my $path = $row->{'filepath'};
+                       $doc->add_text($path);
+                       $path =~ s/(.)/$1 /g;
+                       $doc->add_hidden_text($path);
 
-               # add attributes to the document object 
-               $doc->add_attr('@uri', $uri);
+                       print STDERR $doc->dump_draft,"\n" if ($debug > 1);
 
-               foreach my $c (@{ $sth->{NAME} }) {
-                       $doc->add_attr($c, $row->{$c}) if ($row->{$c});
+                       # register the document object to the database
+                       $hest_db->put_doc($doc, $HyperEstraier::Database::PDCLEAN);
+                       $added++;
                }
 
-               #$doc->add_attr('@cdate', fmt_date($row->{'date'}));
-
-               # add the body text to the document object 
-               my $path = $row->{'filepath'};
-               $doc->add_text($path);
-               $path =~ s/(.)/$1 /g;
-               $doc->add_hidden_text($path);
-
-               print STDERR $doc->dump_draft,"\n" if ($debug > 1);
-
-               # register the document object to the database
-               $db->put_doc($doc, $HyperEstraier::Database::PDCLEAN);
-
-               $added++;
-               if ($added % $dot == 0) {
-                       print "$max ";
-                       $max--;
-               }
+               print " $added";
+               $hest_db->sync();
 
-               if ($added % EST_SYNC_EVERY == 0) {
-                       print "sync ";
-                       $db->sync();
-               }
+               $offset += EST_CHUNK;
 
-       }
+       } while ($results == EST_CHUNK);
 
-       print "sync $added new files";
-       $db->sync();
        print ", close";
-       $db->close();
+       $hest_db->close();
 
        my $dur = (time() - $t) || 1;
-       printf(" [%.2f/s new %.2f/s dur: %s]\n",
-               ( $results / $dur ),
+       printf(" [%.2f/s dur: %s]\n",
                ( $added / $dur ),
                fmt_time($dur)
        );