$|=1; # flushes output
-# limit for database dumping
my $directory;
+my $nosanitize;
my $skip_export;
my $keep_export;
my $reset;
my $want_help;
my $as_xml;
my $process_zebraqueue;
+my $do_not_clear_zebraqueue;
+my $verbose_logging;
+my $zebraidx_log_opt = " -v none,fatal,warn ";
my $result = GetOptions(
'd:s' => \$directory,
'reset' => \$reset,
's' => \$skip_export,
'k' => \$keep_export,
+ 'nosanitize' => \$nosanitize,
'b' => \$biblios,
'noxml' => \$noxml,
'w' => \$noshadow,
'a' => \$authorities,
'h|help' => \$want_help,
'x' => \$as_xml,
+ 'y' => \$do_not_clear_zebraqueue,
'z' => \$process_zebraqueue,
+ 'v' => \$verbose_logging,
);
die $msg;
}
+# -nosanitize writes the stored MARCXML out unchanged, so it only makes sense
+# when records are exported as XML (-x). Reject -nosanitize on its own.
+if ( $nosanitize and !$as_xml ) {
+    my $msg = "Cannot specify -nosanitize without -x\n";
+    $msg .= "Please do '$0 --help' to see usage.\n";
+    die $msg;
+}
+
if ($process_zebraqueue and ($skip_export or $reset)) {
my $msg = "Cannot specify -r or -s if -z is specified\n";
$msg .= "Please do '$0 --help' to see usage.\n";
die $msg;
}
+# -y (leave zebraqueue alone after a batch run) and -z (process zebraqueue)
+# are mutually exclusive.
+die "Cannot specify both -y and -z\n"
+    . "Please do '$0 --help' to see usage.\n"
+    if $process_zebraqueue and $do_not_clear_zebraqueue;
+
if ($noshadow) {
$noshadow = ' -n ';
}
+
+# The default $zebraidx_log_opt passes zebraidx its own "-v" option with a
+# restricted set of log levels, which keeps zebraidx quiet. Asking this script
+# for verbose output (-v) therefore means passing zebraidx no log option at all.
+$zebraidx_log_opt = '' if $verbose_logging;
+
my $use_tempdir = 0;
unless ($directory) {
$use_tempdir = 1;
my ($biblionumbertagfield,$biblionumbertagsubfield) = &GetMarcFromKohaField("biblio.biblionumber","");
my ($biblioitemnumbertagfield,$biblioitemnumbertagsubfield) = &GetMarcFromKohaField("biblioitems.biblioitemnumber","");
-print "Zebra configuration information\n";
-print "================================\n";
-print "Zebra biblio directory = $biblioserverdir\n";
-print "Zebra authorities directory = $authorityserverdir\n";
-print "Koha directory = $kohadir\n";
-print "BIBLIONUMBER in : $biblionumbertagfield\$$biblionumbertagsubfield\n";
-print "BIBLIOITEMNUMBER in : $biblioitemnumbertagfield\$$biblioitemnumbertagsubfield\n";
-print "================================\n";
+# Summarise the Zebra/Koha configuration in use; shown on verbose runs only.
+if ($verbose_logging) {
+    print for (
+        "Zebra configuration information\n",
+        "================================\n",
+        "Zebra biblio directory = $biblioserverdir\n",
+        "Zebra authorities directory = $authorityserverdir\n",
+        "Koha directory = $kohadir\n",
+        "BIBLIONUMBER in : $biblionumbertagfield\$$biblionumbertagsubfield\n",
+        "BIBLIOITEMNUMBER in : $biblioitemnumbertagfield\$$biblioitemnumbertagsubfield\n",
+        "================================\n",
+    );
+}
if ($do_munge) {
munge_config();
}
-$dbh->{AutoCommit} = 0; # don't autocommit - want a consistent view of the zebraqueue table
-
if ($authorities) {
- #
- # exporting authorities
- #
- my $num_authorities_exported = 0;
- my $num_authorities_deleted = 0;
- if ($skip_export) {
- print "====================\n";
- print "SKIPPING authorities export\n";
- print "====================\n";
- } else {
- print "====================\n";
- print "exporting authorities\n";
- print "====================\n";
- mkdir "$directory" unless (-d $directory);
- mkdir "$directory/authorities" unless (-d "$directory/authorities");
- if ($process_zebraqueue) {
- my $sth = select_zebraqueue_records('authority', 'deleted');
- mkdir "$directory/del_authorities" unless (-d "$directory/del_authorities");
- $num_authorities_deleted = generate_deleted_marc_records('authority', $sth, "$directory/del_authorities", $as_xml);
- mark_zebraqueue_done('authority', 'deleted');
- $sth = select_zebraqueue_records('authority', 'updated');
- mkdir "$directory/upd_authorities" unless (-d "$directory/upd_authorities");
- $num_authorities_exported = export_marc_records('authority', $sth, "$directory/upd_authorities", $as_xml, $noxml);
- mark_zebraqueue_done('authority', 'updated');
- } else {
- my $sth = select_all_authorities();
- $num_authorities_exported = export_marc_records('authority', $sth, "$directory/authorities", $as_xml, $noxml);
- }
- }
-
- #
- # and reindexing everything
- #
- print "====================\n";
- print "REINDEXING zebra\n";
- print "====================\n";
- my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
- if ($process_zebraqueue) {
- do_indexing('authority', 'delete', "$directory/del_authorities", $reset, $noshadow, $record_fmt)
- if $num_authorities_deleted;
- do_indexing('authority', 'update', "$directory/upd_authorities", $reset, $noshadow, $record_fmt)
- if $num_authorities_exported;
- } else {
- do_indexing('authority', 'update', "$directory/authorities", $reset, $noshadow, $record_fmt)
- if $num_authorities_exported;
- }
-
+ index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt);
} else {
- print "skipping authorities\n";
+ print "skipping authorities\n" if ( $verbose_logging );
}
-$dbh->commit(); # commit changes to zebraqueue, if any
-
-#################################################################################################################
-# BIBLIOS
-#################################################################################################################
-
if ($biblios) {
- #
- # exporting biblios
- #
- my $num_biblios_exported = 0;
- my $num_biblios_deleted = 0;
- if ($skip_export) {
- print "====================\n";
- print "SKIPPING biblio export\n";
- print "====================\n";
- } else {
- print "====================\n";
- print "exporting biblios\n";
- print "====================\n";
- mkdir "$directory" unless (-d $directory);
- if ($process_zebraqueue) {
- my $sth = select_zebraqueue_records('biblio', 'deleted');
- mkdir "$directory/del_biblios" unless (-d "$directory/del_biblios");
- $num_biblios_deleted = generate_deleted_marc_records('biblio', $sth, "$directory/del_biblios", $as_xml);
- mark_zebraqueue_done('biblio', 'deleted');
- $sth = select_zebraqueue_records('biblio', 'updated');
- mkdir "$directory/upd_biblios" unless (-d "$directory/upd_biblios");
- $num_biblios_exported = export_marc_records('biblio', $sth, "$directory/upd_biblios", $as_xml, $noxml);
- mark_zebraqueue_done('biblio', 'updated');
- } else {
- mkdir "$directory/biblios" unless (-d "$directory/biblios");
- my $sth = select_all_biblios();
- $num_biblios_exported = export_marc_records('biblio', $sth, "$directory/biblios", $as_xml, $noxml);
- }
- }
-
- #
- # and reindexing everything
- #
- print "====================\n";
- print "REINDEXING zebra\n";
- print "====================\n";
- my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
- if ($process_zebraqueue) {
- do_indexing('biblio', 'delete', "$directory/del_biblios", $reset, $noshadow, $record_fmt)
- if $num_biblios_deleted;
- do_indexing('biblio', 'update', "$directory/upd_biblios", $reset, $noshadow, $record_fmt)
- if $num_biblios_exported;
- } else {
- do_indexing('biblio', 'update', "$directory/biblios", $reset, $noshadow, $record_fmt)
- if $num_biblios_exported;
- }
+ index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt);
} else {
- print "skipping biblios\n";
+ print "skipping biblios\n" if ( $verbose_logging );
}
-$dbh->commit(); # commit changes to zebraqueue, if any
-print "====================\n";
-print "CLEANING\n";
-print "====================\n";
+# Banner announcing the clean-up phase; shown on verbose runs only.
+if ($verbose_logging) {
+    print for "====================\n", "CLEANING\n", "====================\n";
+}
if ($keep_export) {
print "NOTHING cleaned : the export $directory has been kept.\n";
print "You can re-run this script with the -s ";
}
}
+# Export one record type to disk (unless the export is skipped) and then feed
+# the exported files to zebraidx.
+#
+# Parameters, in order:
+#   $record_type             - record type; callers pass 'biblio' or 'authority'.
+#                              Also used to name the export sub-directories.
+#   $directory               - base directory holding the export.
+#   $skip_export             - true to reuse an existing export instead of
+#                              dumping the records again.
+#   $process_zebraqueue      - true to export only the records queued in
+#                              zebraqueue (deletions and updates) rather than
+#                              every record.
+#   $as_xml                  - true to index as 'marcxml', false as 'iso2709'.
+#   $noxml                   - handed on to the export helpers.
+#   $nosanitize              - handed on to the full-export helper.
+#   $do_not_clear_zebraqueue - true to leave zebraqueue untouched after a
+#                              full export.
+#   $verbose_logging         - true to print progress banners.
+#   $zebraidx_log_opt        - logging option string handed on to do_indexing.
+#
+# The file-level $reset and $noshadow are read directly; they are not
+# parameters.
+sub index_records {
+    my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt) = @_;
+
+    my $num_records_exported = 0;
+    my $num_records_deleted = 0;
+    if ($skip_export) {
+        # Skipping the export must not depend on verbosity: only the banner
+        # is optional. Testing both flags together would make a quiet run
+        # with -s fall through to the export branch below.
+        if ($verbose_logging) {
+            print "====================\n";
+            print "SKIPPING $record_type export\n";
+            print "====================\n";
+        }
+    } else {
+        if ( $verbose_logging ) {
+            print "====================\n";
+            print "exporting $record_type\n";
+            print "====================\n";
+        }
+        mkdir "$directory" unless (-d $directory);
+        mkdir "$directory/$record_type" unless (-d "$directory/$record_type");
+        if ($process_zebraqueue) {
+            # Queued deletions first, then queued updates; each batch is
+            # marked done as soon as its export file has been written.
+            my $entries = select_zebraqueue_records($record_type, 'deleted');
+            mkdir "$directory/del_$record_type" unless (-d "$directory/del_$record_type");
+            $num_records_deleted = generate_deleted_marc_records($record_type, $entries, "$directory/del_$record_type", $as_xml);
+            mark_zebraqueue_batch_done($entries);
+            $entries = select_zebraqueue_records($record_type, 'updated');
+            mkdir "$directory/upd_$record_type" unless (-d "$directory/upd_$record_type");
+            $num_records_exported = export_marc_records_from_list($record_type,
+                $entries, "$directory/upd_$record_type", $as_xml, $noxml);
+            mark_zebraqueue_batch_done($entries);
+        } else {
+            my $sth = select_all_records($record_type);
+            $num_records_exported = export_marc_records_from_sth($record_type, $sth, "$directory/$record_type", $as_xml, $noxml, $nosanitize);
+            unless ($do_not_clear_zebraqueue) {
+                mark_all_zebraqueue_done($record_type);
+            }
+        }
+    }
+
+    #
+    # and reindexing everything
+    #
+    if ( $verbose_logging ) {
+        print "====================\n";
+        print "REINDEXING zebra\n";
+        print "====================\n";
+    }
+    my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
+    if ($process_zebraqueue) {
+        do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+            if $num_records_deleted;
+        do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+            if $num_records_exported;
+    } else {
+        # With a skipped export nothing was counted, so index whatever is
+        # already present in the export directory.
+        do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+            if ($num_records_exported or $skip_export);
+    }
+}
+
sub select_zebraqueue_records {
my ($record_type, $update_type) = @_;
my $server = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
my $op = ($update_type eq 'deleted') ? 'recordDelete' : 'specialUpdate';
- my $sth = $dbh->prepare("SELECT DISTINCT biblio_auth_number
+ my $sth = $dbh->prepare("SELECT id, biblio_auth_number
FROM zebraqueue
WHERE server = ?
AND operation = ?
- AND done = 0");
+ AND done = 0
+ ORDER BY id DESC");
$sth->execute($server, $op);
- return $sth;
+ my $entries = $sth->fetchall_arrayref({});
}
-sub mark_zebraqueue_done {
- my ($record_type, $update_type) = @_;
+sub mark_all_zebraqueue_done {
+ my ($record_type) = @_;
my $server = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
- my $op = ($update_type eq 'deleted') ? 'recordDelete' : 'specialUpdate';
- if ($op eq 'recordDelete') {
- my $sth = $dbh->prepare("UPDATE zebraqueue SET done = 1
- WHERE id IN (
- SELECT id FROM (
- SELECT z1.id
- FROM zebraqueue z1
- JOIN zebraqueue z2 ON z2.biblio_auth_number = z1.biblio_auth_number
- WHERE z1.done = 0
- AND z1.server = ?
- AND z2.done = 0
- AND z2.server = ?
- AND z1.operation = ?
- ) d2
- )
- ");
- $sth->execute($server, $server, $op); # if we've deleted a record, any prior specialUpdates are void
- } else {
- my $sth = $dbh->prepare("UPDATE zebraqueue SET done = 1
- WHERE server = ?
- AND operation = ?
- AND done = 0");
- $sth->execute($server, $op);
+ my $sth = $dbh->prepare("UPDATE zebraqueue SET done = 1
+ WHERE server = ?
+ AND done = 0");
+ $sth->execute($server);
+}
+
+# Flag a batch of zebraqueue rows as done.
+#
+# $entries is an array reference of hash references; only each entry's "id"
+# is read. The updates run on the file-level $dbh inside one transaction, and
+# AutoCommit is switched back on afterwards.
+sub mark_zebraqueue_batch_done {
+    my ($entries) = @_;
+
+    $dbh->{AutoCommit} = 0;
+    my $sth = $dbh->prepare("UPDATE zebraqueue SET done = 1 WHERE id = ?");
+    foreach my $id (map { $_->{id} } @$entries) {
+        $sth->execute($id);
+    }
+    # Commit after the updates so the batch is persisted explicitly rather
+    # than as a side effect of re-enabling AutoCommit.
+    $dbh->commit();
+    $dbh->{AutoCommit} = 1;
+}
+
+# Pick the "select everything" helper for $record_type: 'biblio' uses
+# select_all_biblios(), any other value uses select_all_authorities().
+# Returns whatever the chosen helper returns.
+sub select_all_records {
+    my ($record_type) = @_;
+    return select_all_biblios() if $record_type eq 'biblio';
+    return select_all_authorities();
}
sub select_all_authorities {
- my $sth = $dbh->prepare("select authid from auth_header");
+ my $sth = $dbh->prepare("SELECT authid FROM auth_header");
$sth->execute();
return $sth;
}
return $sth;
}
-sub export_marc_records {
- my ($record_type, $sth, $directory, $as_xml, $noxml) = @_;
+sub export_marc_records_from_sth {
+ my ($record_type, $sth, $directory, $as_xml, $noxml, $nosanitize) = @_;
my $num_exported = 0;
open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
my $i = 0;
while (my ($record_number) = $sth->fetchrow_array) {
- print ".";
- print "\r$i" unless ($i++ %100);
+ print "." if ( $verbose_logging );
+ print "\r$i" unless ($i++ %100 or !$verbose_logging);
+ if ( $nosanitize ) {
+ my $marcxml = $record_type eq 'biblio'
+ ? GetXmlBiblio( $record_number )
+ : GetAuthorityXML( $record_number );
+ if ( $marcxml ) {
+ print OUT $marcxml if $marcxml;
+ $num_exported++;
+ }
+ next;
+ }
my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
if (defined $marc) {
# FIXME - when more than one record is exported and $as_xml is true,
$num_exported++;
}
}
- print "\nRecords exported: $num_exported\n";
+ print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+ close OUT;
+ return $num_exported;
+}
+
+# Write the records named in a list of queue entries to
+# "$directory/exported_records" and return the number written.
+#
+# $entries is an array reference of hash references; only the
+# "biblio_auth_number" of each entry is read, and a number that occurs more
+# than once is exported a single time. Records for which
+# get_corrected_marc_record() returns undef are not written or counted.
+# Progress output depends on the file-level $verbose_logging.
+sub export_marc_records_from_list {
+    my ($record_type, $entries, $directory, $as_xml, $noxml) = @_;
+
+    my $num_exported = 0;
+    open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
+
+    # Unique record numbers, in the order they first appear in the list.
+    my %already_listed;
+    my @record_numbers = grep { !$already_listed{$_}++ }
+                         map  { $_->{biblio_auth_number} } @$entries;
+
+    my $progress = 0;
+    foreach my $record_number (@record_numbers) {
+        print "." if $verbose_logging;
+        my $on_hundred = ( $progress++ % 100 == 0 );
+        print "\r$progress" if $on_hundred and $verbose_logging;
+
+        my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
+        next unless defined $marc;
+
+        # FIXME - when more than one record is exported and $as_xml is true,
+        # the output file is not valid XML - it is just several <record>
+        # elements in a row with no single root element. zebraidx doesn't
+        # seem to care, at least with the GRS-1 filter. It does care with
+        # the DOM filter, which requires valid XML file(s).
+        if ($as_xml) {
+            print OUT $marc->as_xml_record();
+        } else {
+            print OUT $marc->as_usmarc();
+        }
+        $num_exported++;
+    }
+    print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
close OUT;
return $num_exported;
}
sub generate_deleted_marc_records {
- my ($record_type, $sth, $directory, $as_xml) = @_;
+ my ($record_type, $entries, $directory, $as_xml) = @_;
my $num_exported = 0;
open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
my $i = 0;
- while (my ($record_number) = $sth->fetchrow_array) {
- print "\r$i" unless ($i++ %100);
- print ".";
+ foreach my $record_number (map { $_->{biblio_auth_number} } @$entries ) {
+ print "\r$i" unless ($i++ %100 or !$verbose_logging);
+ print "." if ( $verbose_logging );
my $marc = MARC::Record->new();
if ($record_type eq 'biblio') {
print OUT ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc();
$num_exported++;
}
- print "\nRecords exported: $num_exported\n";
+ print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
close OUT;
return $num_exported;
$fetch_sth->execute($record_number);
if (my ($blob) = $fetch_sth->fetchrow_array) {
$marc = MARC::Record->new_from_usmarc($blob);
+ $fetch_sth->finish();
} else {
- warn "failed to retrieve biblio $record_number";
+ return; # failure to find a bib is not a problem -
+ # a delete could have been done before
+ # trying to process a record update
}
- $fetch_sth->finish();
} else {
eval { $marc = GetMarcBiblio($record_number); };
if ($@) {
- warn "failed to retrieve biblio $record_number";
+ # here we do warn since catching an exception
+ # means that the bib was found but failed
+ # to be parsed
+ warn "error retrieving biblio $record_number";
return;
}
}
} else {
eval { $marc = GetAuthority($record_number); };
if ($@) {
- warn "failed to retrieve authority $record_number";
+ warn "error retrieving authority $record_number";
return;
}
}
# FIXME - it is essential to ensure that the biblionumber is present,
# otherwise, Zebra will choke on the record. However, this
# logic belongs in the relevant C4::Biblio APIs.
- my ($marc, $biblionumber) = @_;
+ my $marc = shift;
+ my $biblionumber = shift;
my $biblioitemnumber;
if (@_) {
$biblioitemnumber = shift;
}
sub do_indexing {
- my ($record_type, $op, $record_dir, $reset_index, $noshadow, $record_format) = @_;
+ my ($record_type, $op, $record_dir, $reset_index, $noshadow, $record_format, $zebraidx_log_opt) = @_;
my $zebra_server = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
my $zebra_db_name = ($record_type eq 'biblio') ? 'biblios' : 'authorities';
my $zebra_config = C4::Context->zebraconfig($zebra_server)->{'config'};
my $zebra_db_dir = C4::Context->zebraconfig($zebra_server)->{'directory'};
- system("zebraidx -c $zebra_config -g $record_format -d $zebra_db_name init") if $reset_index;
- system("zebraidx -c $zebra_config $noshadow -g $record_format -d $zebra_db_name $op $record_dir");
- system("zebraidx -c $zebra_config -g $record_format -d $zebra_db_name commit") unless $noshadow;
+ system("zebraidx -c $zebra_config $zebraidx_log_opt -g $record_format -d $zebra_db_name init") if $reset_index;
+ system("zebraidx -c $zebra_config $zebraidx_log_opt $noshadow -g $record_format -d $zebra_db_name $op $record_dir");
+ system("zebraidx -c $zebra_config $zebraidx_log_opt -g $record_format -d $zebra_db_name commit") unless $noshadow;
}
-x export and index as xml instead of is02709 (biblios only).
use this if you might have records > 99,999 chars,
+ -nosanitize export biblio/authority records directly from the DB marcxml
+ field without sanitizing them. This speeds up the
+ dump process but could fail if the DB contains badly
+ encoded records. Works only with -x.
+
-w skip shadow indexing for this batch
+ -y do NOT clear zebraqueue after indexing; normally,
+ after doing batch indexing, zebraqueue should be
+ marked done for the affected record type(s) so that
+ a running zebraqueue_daemon doesn't try to reindex
+ the same records - specify -y to override this.
+ Cannot be used with -z.
+
+ -v increase the amount of logging. Normally only
+ warnings and errors from the indexing are shown.
+
-munge-config Deprecated option to try
to fix Zebra config files.
--help or -h show this message.
#
my $created_dir_or_file = 0;
if ($authorities) {
- print "====================\n";
- print "checking directories & files for authorities\n";
- print "====================\n";
+ if ( $verbose_logging ) {
+ print "====================\n";
+ print "checking directories & files for authorities\n";
+ print "====================\n";
+ }
unless (-d "$authorityserverdir") {
system("mkdir -p $authorityserverdir");
print "Info: created $authorityserverdir\n";
}
if ($biblios) {
- print "====================\n";
- print "checking directories & files for biblios\n";
- print "====================\n";
-
+ if ( $verbose_logging ) {
+ print "====================\n";
+ print "checking directories & files for biblios\n";
+ print "====================\n";
+ }
+
#
# BIBLIOS : creating directory structure
#