With this patch, rebuild_zebra can re-index a whole Koha DB
quickly:
rebuild_zebra -r -b -nosanitize
Biblio (authority) records are dumped directly to a file
from the marcxml field without being transformed into
MARC::Record objects and corrected.
DOCUMENTATION:
rebuild_zebra.pl new parameter:
-nosanitize export biblio/authority records directly from the DB marcxml
field without sanitizing records. It speeds up the
dump process but could fail if the DB contains badly
encoded records. Currently works only with -x and -b
Signed-off-by: Galen Charlton <galen.charlton@liblime.com>
$|=1; # flushes output
my $directory;
$|=1; # flushes output
my $directory;
my $skip_export;
my $keep_export;
my $reset;
my $skip_export;
my $keep_export;
my $reset;
'reset' => \$reset,
's' => \$skip_export,
'k' => \$keep_export,
'reset' => \$reset,
's' => \$skip_export,
'k' => \$keep_export,
+ 'nosanitize' => \$nosanitize,
'b' => \$biblios,
'noxml' => \$noxml,
'w' => \$noshadow,
'b' => \$biblios,
'noxml' => \$noxml,
'w' => \$noshadow,
+if ( !$as_xml and $nosanitize ) {
+ my $msg = "Cannot specify both -no_xml and -nosanitize\n";
+ $msg .= "Please do '$0 --help' to see usage.\n";
+ die $msg;
+}
+
if ($process_zebraqueue and ($skip_export or $reset)) {
my $msg = "Cannot specify -r or -s if -z is specified\n";
$msg .= "Please do '$0 --help' to see usage.\n";
if ($process_zebraqueue and ($skip_export or $reset)) {
my $msg = "Cannot specify -r or -s if -z is specified\n";
$msg .= "Please do '$0 --help' to see usage.\n";
- index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt);
+ index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt);
} else {
print "skipping authorities\n" if ( $verbose_logging );
}
if ($biblios) {
} else {
print "skipping authorities\n" if ( $verbose_logging );
}
if ($biblios) {
- index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt);
+ index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt);
} else {
print "skipping biblios\n" if ( $verbose_logging );
}
} else {
print "skipping biblios\n" if ( $verbose_logging );
}
- my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt) = @_;
+ my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt) = @_;
my $num_records_exported = 0;
my $num_records_deleted = 0;
my $num_records_exported = 0;
my $num_records_deleted = 0;
mark_zebraqueue_batch_done($entries);
} else {
my $sth = select_all_records($record_type);
mark_zebraqueue_batch_done($entries);
} else {
my $sth = select_all_records($record_type);
- $num_records_exported = export_marc_records_from_sth($record_type, $sth, "$directory/$record_type", $as_xml, $noxml);
+ $num_records_exported = export_marc_records_from_sth($record_type, $sth, "$directory/$record_type", $as_xml, $noxml, $nosanitize);
unless ($do_not_clear_zebraqueue) {
mark_all_zebraqueue_done($record_type);
}
unless ($do_not_clear_zebraqueue) {
mark_all_zebraqueue_done($record_type);
}
}
sub export_marc_records_from_sth {
}
sub export_marc_records_from_sth {
- my ($record_type, $sth, $directory, $as_xml, $noxml) = @_;
+ my ($record_type, $sth, $directory, $as_xml, $noxml, $nosanitize) = @_;
my $num_exported = 0;
open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
my $num_exported = 0;
open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
while (my ($record_number) = $sth->fetchrow_array) {
print "." if ( $verbose_logging );
print "\r$i" unless ($i++ %100 or !$verbose_logging);
while (my ($record_number) = $sth->fetchrow_array) {
print "." if ( $verbose_logging );
print "\r$i" unless ($i++ %100 or !$verbose_logging);
+ if ( $nosanitize ) {
+ my $marcxml = $record_type eq 'biblio'
+ ? GetXmlBiblio( $record_number )
+ : GetAuthorityXML( $record_number );
+ if ( $marcxml ) {
+ print OUT $marcxml if $marcxml;
+ $num_exported++;
+ }
+ next;
+ }
my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
if (defined $marc) {
# FIXME - when more than one record is exported and $as_xml is true,
my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
if (defined $marc) {
# FIXME - when more than one record is exported and $as_xml is true,
-x export and index as xml instead of is02709 (biblios only).
use this if you might have records > 99,999 chars,
-x export and index as xml instead of is02709 (biblios only).
use this if you might have records > 99,999 chars,
+ -nosanitize export biblio/authority records directly from DB marcxml
+ field without sanitizing records. It speed up
+ dump process but could fail if DB contains badly
+ encoded records. Works only with -x,
+
-w skip shadow indexing for this batch
-y do NOT clear zebraqueue after indexing; normally,
-w skip shadow indexing for this batch
-y do NOT clear zebraqueue after indexing; normally,