#!/usr/bin/perl
use strict;
+#use warnings; FIXME - Bug 2505
use C4::Context;
use Getopt::Long;
use File::Path;
use C4::Biblio;
use C4::AuthoritiesMarc;
+use C4::Items;
#
# script that checks zebradir structure & create directories & mandatory files if needed
#
$|=1; # flushes output
-
+# If the cron job starts us in an unreadable dir, we will break without
+# this.
+chdir $ENV{HOME} if (!(-r '.'));
my $directory;
+my $nosanitize;
my $skip_export;
my $keep_export;
my $reset;
my $as_xml;
my $process_zebraqueue;
my $do_not_clear_zebraqueue;
+my $length;
+my $where;
+my $offset;
+my $verbose_logging = 0;
+my $zebraidx_log_opt = " -v none,fatal,warn ";
my $result = GetOptions(
'd:s' => \$directory,
- 'reset' => \$reset,
+ 'r|reset' => \$reset,
's' => \$skip_export,
'k' => \$keep_export,
+ 'nosanitize' => \$nosanitize,
'b' => \$biblios,
'noxml' => \$noxml,
'w' => \$noshadow,
'x' => \$as_xml,
'y' => \$do_not_clear_zebraqueue,
'z' => \$process_zebraqueue,
+ 'where:s' => \$where,
+ 'length:i' => \$length,
+ 'offset:i' => \$offset,
+ 'v+' => \$verbose_logging,
);
die $msg;
}
+# -nosanitize exports the stored MARCXML directly, so it is only accepted
+# together with -x (export as XML); refuse it otherwise.
+if ( !$as_xml and $nosanitize ) {
+ my $msg = "Cannot specify -nosanitize without -x\n";
+ $msg .= "Please do '$0 --help' to see usage.\n";
+ die $msg;
+}
+
if ($process_zebraqueue and ($skip_export or $reset)) {
my $msg = "Cannot specify -r or -s if -z is specified\n";
$msg .= "Please do '$0 --help' to see usage.\n";
if ($noshadow) {
$noshadow = ' -n ';
}
+
+# -v is for verbose, which seems backwards here because of how logging is set
+# on the CLI of zebraidx. It works this way. The default is to not log much
+if ($verbose_logging >= 2) {
+ $zebraidx_log_opt = '-v none,fatal,warn,all';
+}
+
my $use_tempdir = 0;
unless ($directory) {
$use_tempdir = 1;
my ($biblionumbertagfield,$biblionumbertagsubfield) = &GetMarcFromKohaField("biblio.biblionumber","");
my ($biblioitemnumbertagfield,$biblioitemnumbertagsubfield) = &GetMarcFromKohaField("biblioitems.biblioitemnumber","");
-print "Zebra configuration information\n";
-print "================================\n";
-print "Zebra biblio directory = $biblioserverdir\n";
-print "Zebra authorities directory = $authorityserverdir\n";
-print "Koha directory = $kohadir\n";
-print "BIBLIONUMBER in : $biblionumbertagfield\$$biblionumbertagsubfield\n";
-print "BIBLIOITEMNUMBER in : $biblioitemnumbertagfield\$$biblioitemnumbertagsubfield\n";
-print "================================\n";
+if ( $verbose_logging ) {
+ print "Zebra configuration information\n";
+ print "================================\n";
+ print "Zebra biblio directory = $biblioserverdir\n";
+ print "Zebra authorities directory = $authorityserverdir\n";
+ print "Koha directory = $kohadir\n";
+ print "BIBLIONUMBER in : $biblionumbertagfield\$$biblionumbertagsubfield\n";
+ print "BIBLIOITEMNUMBER in : $biblioitemnumbertagfield\$$biblioitemnumbertagsubfield\n";
+ print "================================\n";
+}
if ($do_munge) {
munge_config();
}
if ($authorities) {
- index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $do_not_clear_zebraqueue);
+ index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $authorityserverdir);
} else {
- print "skipping authorities\n";
+ print "skipping authorities\n" if ( $verbose_logging );
}
if ($biblios) {
- index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $do_not_clear_zebraqueue);
+ index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $biblioserverdir);
} else {
- print "skipping biblios\n";
+ print "skipping biblios\n" if ( $verbose_logging );
}
-print "====================\n";
-print "CLEANING\n";
-print "====================\n";
+if ( $verbose_logging ) {
+ print "====================\n";
+ print "CLEANING\n";
+ print "====================\n";
+}
if ($keep_export) {
print "NOTHING cleaned : the export $directory has been kept.\n";
print "You can re-run this script with the -s ";
}
}
+# Make sure the directory tree Zebra needs exists under the given path.
+# Creates the base directory and its key/register/shadow/tmp subdirectories
+# when they are missing, printing a notice for each one it had to create,
+# and dies if a directory cannot be created.
+# Returns true if anything had to be created, false otherwise.
+sub check_zebra_dirs {
+ my ($base) = shift() . '/';
+ my $needed_repairing = 0;
+ my @dirs = ( '', 'key', 'register', 'shadow', 'tmp' );
+ foreach my $dir (@dirs) {
+ my $bdir = $base . $dir;
+ if (! -d $bdir) {
+ $needed_repairing = 1;
+ # Must be 'or', not '||': '||' binds to $bdir, so a failed mkdir
+ # would never reach the die.
+ mkdir $bdir or die "Unable to create '$bdir': $!\n";
+ print "$0: needed to create '$bdir'\n";
+ }
+ }
+ return $needed_repairing;
+} # ---------- end of subroutine check_zebra_dirs ----------
+
+# Export the records of one type (unless -s was given) and feed them to
+# zebraidx. With -z only the records queued in zebraqueue are exported,
+# deletions first and then updates; otherwise every record is exported.
+# check_zebra_dirs() is consulted first, and a repaired directory tree
+# forces an index reset.
sub index_records {
- my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $do_not_clear_zebraqueue) = @_;
+ my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $server_dir) = @_;
my $num_records_exported = 0;
- my $num_records_deleted = 0;
- if ($skip_export) {
+ my $records_deleted;
+ my $need_reset = check_zebra_dirs($server_dir);
+ if ($need_reset) {
+ print "$0: found broken zebra server directories: forcing a rebuild\n";
+ $reset = 1;
+ }
+ if ($skip_export) {
+ # Only the banner depends on verbosity. Testing both flags in one
+ # condition made a quiet run with -s fall into the export branch.
+ if ($verbose_logging) {
print "====================\n";
print "SKIPPING $record_type export\n";
print "====================\n";
+ }
} else {
- print "====================\n";
- print "exporting $record_type\n";
- print "====================\n";
+ if ( $verbose_logging ) {
+ print "====================\n";
+ print "exporting $record_type\n";
+ print "====================\n";
+ }
mkdir "$directory" unless (-d $directory);
mkdir "$directory/$record_type" unless (-d "$directory/$record_type");
if ($process_zebraqueue) {
my $entries = select_zebraqueue_records($record_type, 'deleted');
mkdir "$directory/del_$record_type" unless (-d "$directory/del_$record_type");
- $num_records_deleted = generate_deleted_marc_records($record_type, $entries, "$directory/del_$record_type", $as_xml);
+ $records_deleted = generate_deleted_marc_records($record_type, $entries, "$directory/del_$record_type", $as_xml);
mark_zebraqueue_batch_done($entries);
$entries = select_zebraqueue_records($record_type, 'updated');
mkdir "$directory/upd_$record_type" unless (-d "$directory/upd_$record_type");
$num_records_exported = export_marc_records_from_list($record_type,
- $entries, "$directory/upd_$record_type", $as_xml, $noxml);
+ $entries, "$directory/upd_$record_type", $as_xml, $noxml, $records_deleted);
mark_zebraqueue_batch_done($entries);
} else {
my $sth = select_all_records($record_type);
- $num_records_exported = export_marc_records_from_sth($record_type, $sth, "$directory/$record_type", $as_xml, $noxml);
+ $num_records_exported = export_marc_records_from_sth($record_type, $sth, "$directory/$record_type", $as_xml, $noxml, $nosanitize);
unless ($do_not_clear_zebraqueue) {
mark_all_zebraqueue_done($record_type);
}
#
# and reindexing everything
#
- print "====================\n";
- print "REINDEXING zebra\n";
- print "====================\n";
+ if ( $verbose_logging ) {
+ print "====================\n";
+ print "REINDEXING zebra\n";
+ print "====================\n";
+ }
my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
if ($process_zebraqueue) {
- do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt)
- if $num_records_deleted;
- do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt)
+ do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+ if %$records_deleted;
+ do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
if $num_records_exported;
} else {
- do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt)
- if $num_records_exported;
+ do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+ if ($num_records_exported or $skip_export);
}
}
+
sub select_zebraqueue_records {
my ($record_type, $update_type) = @_;
}
sub select_all_authorities {
- my $sth = $dbh->prepare("SELECT authid FROM auth_header");
+ my $strsth=qq{SELECT authid FROM auth_header}; # base query: every authority id
+ $strsth.=qq{ WHERE $where } if ($where); # --where text is spliced into the SQL verbatim
+ $strsth.=qq{ LIMIT $length } if ($length && !$offset); # --length alone: plain row limit
+ $strsth.=qq{ LIMIT $offset,$length } if ($length && $offset); # both given: "LIMIT offset,count" form
+ my $sth = $dbh->prepare($strsth); # returns the executed handle; the caller fetches the ids
$sth->execute();
return $sth;
}
+# Prepare and execute the query listing the biblionumbers to export, honouring
+# the --where, --length and --offset options. Returns the executed statement
+# handle. --offset is only applied together with --length.
sub select_all_biblios {
- my $sth = $dbh->prepare("SELECT biblionumber FROM biblioitems ORDER BY biblionumber");
+ my $strsth = qq{ SELECT biblionumber FROM biblioitems };
+ $strsth.=qq{ WHERE $where } if ($where);
+ $strsth.=qq{ LIMIT $length } if ($length && !$offset);
+ # Require --length as well, as select_all_authorities does: --offset on its
+ # own used to produce the invalid SQL "LIMIT <offset>,".
+ $strsth.=qq{ LIMIT $offset,$length } if ($length && $offset);
+ my $sth = $dbh->prepare($strsth);
$sth->execute();
return $sth;
}
+# Write every record whose number is returned by $sth to
+# "$directory/exported_records" and return how many were written.
+# With $nosanitize the stored MARCXML is written as fetched (for biblios the
+# item fields are spliced in); otherwise each record goes through
+# get_corrected_marc_record() and is serialised as XML or ISO2709 depending
+# on $as_xml. Dies if the export file cannot be opened.
sub export_marc_records_from_sth {
- my ($record_type, $sth, $directory, $as_xml, $noxml) = @_;
+ my ($record_type, $sth, $directory, $as_xml, $noxml, $nosanitize) = @_;
my $num_exported = 0;
- open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
+ my $export_file = "$directory/exported_records";
+ open my $fh, '>:encoding(UTF-8)', $export_file or die "Cannot open '$export_file' for writing: $!";
my $i = 0;
+ my ( $itemtag, $itemsubfield ) = GetMarcFromKohaField("items.itemnumber",'');
while (my ($record_number) = $sth->fetchrow_array) {
- print ".";
- print "\r$i" unless ($i++ %100);
+ print "." if ( $verbose_logging );
+ print "\r$i" unless ($i++ %100 or !$verbose_logging);
+ if ( $nosanitize ) {
+ my $marcxml = $record_type eq 'biblio'
+ ? GetXmlBiblio( $record_number )
+ : GetAuthorityXML( $record_number );
+ if ($record_type eq 'biblio'){
+ my @items = GetItemsInfo($record_number);
+ if (@items){
+ # Scratch record that only collects the item fields.
+ my $items_record = MARC::Record->new;
+ $items_record->encoding('UTF-8');
+ my @item_fields;
+ foreach my $item (@items){
+ my $item_marc = Item2Marc($item, $record_number);
+ push @item_fields, $item_marc->field($itemtag);
+ }
+ $items_record->insert_fields_ordered(@item_fields);
+ my $itemsxml = $items_record->as_xml_record();
+ # Drop the last 10 characters of the biblio XML (presumably its
+ # closing "</record>\n" - TODO confirm) and append everything
+ # that follows the leader of the items-only record.
+ $marcxml =
+ substr($marcxml, 0, length($marcxml)-10) .
+ substr($itemsxml, index($itemsxml, "</leader>\n", 0) + 10);
+ }
+ }
+ if ( $marcxml ) {
+ print {$fh} $marcxml;
+ $num_exported++;
+ }
+ next;
+ }
my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
if (defined $marc) {
# FIXME - when more than one record is exported and $as_xml is true,
# strung together with no single root element. zebraidx doesn't seem
# to care, though, at least if you're using the GRS-1 filter. It does
# care if you're using the DOM filter, which requires valid XML file(s).
- print OUT ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc();
- $num_exported++;
+ eval {
+ print {$fh} ($as_xml) ? $marc->as_xml_record(C4::Context->preference('marcflavour')) : $marc->as_usmarc();
+ $num_exported++;
+ };
+ if ($@) {
+ warn "Error exporting record $record_number ($record_type) ".($noxml ? "not XML" : "XML");
+ }
}
}
- print "\nRecords exported: $num_exported\n";
- close OUT;
+ print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+ # Buffered write errors only surface at close; report them.
+ close $fh or warn "Error closing '$export_file': $!\n";
return $num_exported;
}
+# Write the records named in @$entries (zebraqueue rows, keyed by
+# biblio_auth_number) to "$directory/exported_records" and return how many
+# were written. Each record number is exported once; numbers present in the
+# optional $records_deleted hashref are skipped. Dies if the export file
+# cannot be opened.
sub export_marc_records_from_list {
- my ($record_type, $entries, $directory, $as_xml, $noxml) = @_;
+ my ($record_type, $entries, $directory, $as_xml, $noxml, $records_deleted) = @_;
my $num_exported = 0;
- open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
+ my $export_file = "$directory/exported_records";
+ open my $fh, '>:encoding(UTF-8)', $export_file or die "Cannot open '$export_file' for writing: $!";
my $i = 0;
- my %found = ();
+
+ # Skip any deleted records. We check for this anyway, but this reduces error spam
+ # Tolerate callers that do not pass $records_deleted.
+ my %found = %{ $records_deleted || {} };
foreach my $record_number ( map { $_->{biblio_auth_number} }
grep { !$found{ $_->{biblio_auth_number} }++ }
@$entries ) {
- print ".";
- print "\r$i" unless ($i++ %100);
+ print "." if ( $verbose_logging );
+ print "\r$i" unless ($i++ %100 or !$verbose_logging);
my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
if (defined $marc) {
# FIXME - when more than one record is exported and $as_xml is true,
# strung together with no single root element. zebraidx doesn't seem
# to care, though, at least if you're using the GRS-1 filter. It does
# care if you're using the DOM filter, which requires valid XML file(s).
- print OUT ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc();
+ print {$fh} ($as_xml) ? $marc->as_xml_record(C4::Context->preference('marcflavour')) : $marc->as_usmarc();
$num_exported++;
}
}
- print "\nRecords exported: $num_exported\n";
- close OUT;
+ print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+ # Buffered write errors only surface at close; report them.
+ close $fh or warn "Error closing '$export_file': $!\n";
return $num_exported;
}
+# Write one MARC record per entry of @$entries to
+# "$directory/exported_records" (XML when $as_xml is set, ISO2709 otherwise)
+# and return a hashref whose keys are the record numbers written.
+# Dies if the export file cannot be opened.
+# NOTE(review): part of the loop body lies between patch hunks and is not
+# visible here; the unchanged lines are left exactly as they were.
sub generate_deleted_marc_records {
my ($record_type, $entries, $directory, $as_xml) = @_;
- my $num_exported = 0;
- open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
+ my $records_deleted = {};
+ my $export_file = "$directory/exported_records";
+ open my $fh, '>:encoding(UTF-8)', $export_file or die "Cannot open '$export_file' for writing: $!";
my $i = 0;
foreach my $record_number (map { $_->{biblio_auth_number} } @$entries ) {
- print "\r$i" unless ($i++ %100);
- print ".";
+ print "\r$i" unless ($i++ %100 or !$verbose_logging);
+ print "." if ( $verbose_logging );
my $marc = MARC::Record->new();
if ($record_type eq 'biblio') {
fix_unimarc_100($marc);
}
- print OUT ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc();
- $num_exported++;
+ print {$fh} ($as_xml) ? $marc->as_xml_record(C4::Context->preference("marcflavour")) : $marc->as_usmarc();
+
+ $records_deleted->{$record_number} = 1;
}
- print "\nRecords exported: $num_exported\n";
- close OUT;
- return $num_exported;
+ print "\nRecords exported: $i\n" if ( $verbose_logging );
+ # Buffered write errors only surface at close; report them.
+ close $fh or warn "Error closing '$export_file': $!\n";
+ return $records_deleted;
}
if (defined $marc) {
fix_leader($marc);
- if ($record_type eq 'biblio') {
- my $succeeded = fix_biblio_ids($marc, $record_number);
- return unless $succeeded;
- } else {
+ if ($record_type eq 'authority') {
fix_authority_id($marc, $record_number);
}
if (C4::Context->preference("marcflavour") eq "UNIMARC") {
$fetch_sth->execute($record_number);
if (my ($blob) = $fetch_sth->fetchrow_array) {
$marc = MARC::Record->new_from_usmarc($blob);
- $fetch_sth->finish();
- } else {
- return; # failure to find a bib is not a problem -
- # a delete could have been done before
- # trying to process a record update
+ unless ($marc) {
+ warn "error creating MARC::Record from $blob";
+ }
}
+ # failure to find a bib is not a problem -
+ # a delete could have been done before
+ # trying to process a record update
+
+ $fetch_sth->finish();
+ return unless $marc;
} else {
- eval { $marc = GetMarcBiblio($record_number); };
- if ($@) {
+ eval { $marc = GetMarcBiblio($record_number, 1); };
+ if ($@ || !$marc) {
# here we do warn since catching an exception
# means that the bib was found but failed
# to be parsed
}
+# Run zebraidx for one record type: an optional "init" when $reset_index is
+# set, then the requested $op on $record_dir, then a "commit" unless shadow
+# indexing is disabled through $noshadow. $zebraidx_log_opt is passed through
+# to every invocation. A command that exits non-zero is reported with warn.
sub do_indexing {
- my ($record_type, $op, $record_dir, $reset_index, $noshadow, $record_format) = @_;
+ my ($record_type, $op, $record_dir, $reset_index, $noshadow, $record_format, $zebraidx_log_opt) = @_;
my $zebra_server = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
my $zebra_db_name = ($record_type eq 'biblio') ? 'biblios' : 'authorities';
my $zebra_config = C4::Context->zebraconfig($zebra_server)->{'config'};
my $zebra_db_dir = C4::Context->zebraconfig($zebra_server)->{'directory'};
- system("zebraidx -c $zebra_config -g $record_format -d $zebra_db_name init") if $reset_index;
- system("zebraidx -c $zebra_config $noshadow -g $record_format -d $zebra_db_name $op $record_dir");
- system("zebraidx -c $zebra_config -g $record_format -d $zebra_db_name commit") unless $noshadow;
+ # The exit status of zebraidx used to be ignored, so a failed run passed
+ # silently; report it instead.
+ my $run_zebraidx = sub {
+ my ($command) = @_;
+ system($command) == 0
+ or warn "$0: command failed (\$? = $?): $command\n";
+ };
+ $run_zebraidx->("zebraidx -c $zebra_config $zebraidx_log_opt -g $record_format -d $zebra_db_name init") if $reset_index;
+ $run_zebraidx->("zebraidx -c $zebra_config $zebraidx_log_opt $noshadow -g $record_format -d $zebra_db_name $op $record_dir");
+ $run_zebraidx->("zebraidx -c $zebra_config $zebraidx_log_opt -g $record_format -d $zebra_db_name commit") unless $noshadow;
}
-x export and index as xml instead of is02709 (biblios only).
use this if you might have records > 99,999 chars,
+ -nosanitize export biblio/authority records directly from DB marcxml
+ field without sanitizing records. It speed up
+ dump process but could fail if DB contains badly
+ encoded records. Works only with -x,
+
-w skip shadow indexing for this batch
-y do NOT clear zebraqueue after indexing; normally,
the same records - specify -y to override this.
Cannot be used with -z.
- -munge-config Deprecated option to try
+ -v increase the amount of logging. Normally only
+ warnings and errors from the indexing are shown.
+ Use log level 2 (-v -v) to include all Zebra logs.
+
+ --length 1234 how many biblio you want to export
+ --offset 1243 offset you want to start to
+ example: --offset 500 --length=500 will result in a LIMIT 500,500 (exporting 500 records, skipping the first 500)
+ note that the numbers are NOT related to biblionumber, that's the intended behaviour.
+ --where let you specify a WHERE query, like itemtype='BOOK'
+ or something like that
+
+ --munge-config Deprecated option to try
to fix Zebra config files.
--help or -h show this message.
_USAGE_
#
my $created_dir_or_file = 0;
if ($authorities) {
- print "====================\n";
- print "checking directories & files for authorities\n";
- print "====================\n";
+ if ( $verbose_logging ) {
+ print "====================\n";
+ print "checking directories & files for authorities\n";
+ print "====================\n";
+ }
unless (-d "$authorityserverdir") {
system("mkdir -p $authorityserverdir");
print "Info: created $authorityserverdir\n";
# AUTHORITIES : copying mandatory files
#
unless (-f C4::Context->zebraconfig('authorityserver')->{config}) {
- open ZD,">:utf8 ",C4::Context->zebraconfig('authorityserver')->{config};
- print ZD "
+ open my $zd, '>:encoding(UTF-8)' ,C4::Context->zebraconfig('authorityserver')->{config};
+ print {$zd} "
# generated by KOHA/misc/migration_tools/rebuild_zebra.pl
profilePath:\${srcdir:-.}:$authorityserverdir/tab/:$tabdir/tab/:\${srcdir:-.}/tab/
}
if ($biblios) {
- print "====================\n";
- print "checking directories & files for biblios\n";
- print "====================\n";
-
+ if ( $verbose_logging ) {
+ print "====================\n";
+ print "checking directories & files for biblios\n";
+ print "====================\n";
+ }
+
#
# BIBLIOS : creating directory structure
#
# BIBLIOS : copying mandatory files
#
unless (-f C4::Context->zebraconfig('biblioserver')->{config}) {
- open ZD,">:utf8 ",C4::Context->zebraconfig('biblioserver')->{config};
- print ZD "
+ open my $zd, '>:encoding(UTF-8)', C4::Context->zebraconfig('biblioserver')->{config};
+ print {$zd} "
# generated by KOHA/misc/migrtion_tools/rebuild_zebra.pl
profilePath:\${srcdir:-.}:$biblioserverdir/tab/:$tabdir/tab/:\${srcdir:-.}/tab/