X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=misc%2Fmigration_tools%2Frebuild_zebra.pl;h=dbbb14010f78b6cc72a57285b562499cb011bb08;hb=3ce542ce2668770a5669d2fb98b666f42a5bd388;hp=239d953a47c7b9a5c17920748469d3d1c407e983;hpb=c190d93a12c2741b8d4539b7bee175257da815c8;p=koha.git diff --git a/misc/migration_tools/rebuild_zebra.pl b/misc/migration_tools/rebuild_zebra.pl index 239d953a47..dbbb14010f 100755 --- a/misc/migration_tools/rebuild_zebra.pl +++ b/misc/migration_tools/rebuild_zebra.pl @@ -9,6 +9,7 @@ use File::Temp qw/ tempdir /; use File::Path; use C4::Biblio; use C4::AuthoritiesMarc; +use C4::Items; # # script that checks zebradir structure & create directories & mandatory files if needed @@ -33,11 +34,14 @@ my $want_help; my $as_xml; my $process_zebraqueue; my $do_not_clear_zebraqueue; -my $verbose_logging; +my $length; +my $where; +my $offset; +my $verbose_logging = 0; my $zebraidx_log_opt = " -v none,fatal,warn "; my $result = GetOptions( 'd:s' => \$directory, - 'reset' => \$reset, + 'r|reset' => \$reset, 's' => \$skip_export, 'k' => \$keep_export, 'nosanitize' => \$nosanitize, @@ -50,7 +54,10 @@ my $result = GetOptions( 'x' => \$as_xml, 'y' => \$do_not_clear_zebraqueue, 'z' => \$process_zebraqueue, - 'v' => \$verbose_logging, + 'where:s' => \$where, + 'length:i' => \$length, + 'offset:i' => \$offset, + 'v+' => \$verbose_logging, ); @@ -65,12 +72,6 @@ if (not $biblios and not $authorities) { die $msg; } -if ($authorities and $as_xml) { - my $msg = "Cannot specify both -a and -x\n"; - $msg .= "Please do '$0 --help' to see usage.\n"; - die $msg; -} - if ( !$as_xml and $nosanitize ) { my $msg = "Cannot specify both -no_xml and -nosanitize\n"; $msg .= "Please do '$0 --help' to see usage.\n"; @@ -95,8 +96,8 @@ if ($noshadow) { # -v is for verbose, which seems backwards here because of how logging is set # on the CLI of zebraidx. It works this way. The default is to not log much -if ($verbose_logging) { - $zebraidx_log_opt = ''; +if ($verbose_logging >= 2) { + $zebraidx_log_opt = '-v none,fatal,warn,all'; } my $use_tempdir = 0; @@ -110,6 +111,9 @@ my $biblioserverdir = C4::Context->zebraconfig('biblioserver')->{directory}; my $authorityserverdir = C4::Context->zebraconfig('authorityserver')->{directory}; my $kohadir = C4::Context->config('intranetdir'); +my $bib_index_mode = C4::Context->config('zebra_bib_index_mode') || 'grs1'; +my $auth_index_mode = C4::Context->config('zebra_auth_index_mode') || 'dom'; + my $dbh = C4::Context->dbh; my ($biblionumbertagfield,$biblionumbertagsubfield) = &GetMarcFromKohaField("biblio.biblionumber",""); my ($biblioitemnumbertagfield,$biblioitemnumbertagsubfield) = &GetMarcFromKohaField("biblioitems.biblioitemnumber",""); @@ -130,13 +134,13 @@ if ($do_munge) { } if ($authorities) { - index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt); + index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $authorityserverdir); } else { print "skipping authorities\n" if ( $verbose_logging ); } if ($biblios) { - index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt); + index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $biblioserverdir); } else { print "skipping biblios\n" if ( $verbose_logging ); } @@ -168,11 +172,34 @@ if ($keep_export) { } } +# This checks to see if the zebra directories exist under the provided path. +# If they don't, then zebra is likely to spit the dummy. This returns true +# if the directories had to be created, false otherwise. +sub check_zebra_dirs { + my ($base) = shift() . '/'; + my $needed_repairing = 0; + my @dirs = ( '', 'key', 'register', 'shadow', 'tmp' ); + foreach my $dir (@dirs) { + my $bdir = $base . $dir; + if (! -d $bdir) { + $needed_repairing = 1; + mkdir $bdir || die "Unable to create '$bdir': $!\n"; + print "$0: needed to create '$bdir'\n"; + } + } + return $needed_repairing; +} # ---------- end of subroutine check_zebra_dirs ---------- + sub index_records { - my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt) = @_; + my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $server_dir) = @_; my $num_records_exported = 0; - my $num_records_deleted = 0; + my $records_deleted; + my $need_reset = check_zebra_dirs($server_dir); + if ($need_reset) { + print "$0: found broken zebra server directories: forcing a rebuild\n"; + $reset = 1; + } if ($skip_export && $verbose_logging) { print "====================\n"; print "SKIPPING $record_type export\n"; @@ -188,12 +215,12 @@ sub index_records { if ($process_zebraqueue) { my $entries = select_zebraqueue_records($record_type, 'deleted'); mkdir "$directory/del_$record_type" unless (-d "$directory/del_$record_type"); - $num_records_deleted = generate_deleted_marc_records($record_type, $entries, "$directory/del_$record_type", $as_xml); + $records_deleted = generate_deleted_marc_records($record_type, $entries, "$directory/del_$record_type", $as_xml); mark_zebraqueue_batch_done($entries); $entries = select_zebraqueue_records($record_type, 'updated'); mkdir "$directory/upd_$record_type" unless (-d "$directory/upd_$record_type"); $num_records_exported = export_marc_records_from_list($record_type, - $entries, "$directory/upd_$record_type", $as_xml, $noxml); + $entries, "$directory/upd_$record_type", $as_xml, $noxml, $records_deleted); mark_zebraqueue_batch_done($entries); } else { my $sth = select_all_records($record_type); @@ -215,7 +242,7 @@ sub index_records { my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ; if ($process_zebraqueue) { do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt) - if $num_records_deleted; + if %$records_deleted; do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt) if $num_records_exported; } else { @@ -224,6 +251,7 @@ sub index_records { } } + sub select_zebraqueue_records { my ($record_type, $update_type) = @_; @@ -269,23 +297,47 @@ sub select_all_records { } sub select_all_authorities { - my $sth = $dbh->prepare("SELECT authid FROM auth_header"); + my $strsth=qq{SELECT authid FROM auth_header}; + $strsth.=qq{ WHERE $where } if ($where); + $strsth.=qq{ LIMIT $length } if ($length && !$offset); + $strsth.=qq{ LIMIT $offset,$length } if ($length && $offset); + my $sth = $dbh->prepare($strsth); $sth->execute(); return $sth; } sub select_all_biblios { - my $sth = $dbh->prepare("SELECT biblionumber FROM biblioitems ORDER BY biblionumber"); + my $strsth = qq{ SELECT biblionumber FROM biblioitems }; + $strsth.=qq{ WHERE $where } if ($where); + $strsth.=qq{ LIMIT $length } if ($length && !$offset); + $strsth.=qq{ LIMIT $offset,$length } if ($offset); + my $sth = $dbh->prepare($strsth); $sth->execute(); return $sth; } +sub include_xml_wrapper { + my $as_xml = shift; + my $record_type = shift; + + return 0 unless $as_xml; + return 1 if $record_type eq 'biblio' and $bib_index_mode eq 'dom'; + return 1 if $record_type eq 'authority' and $auth_index_mode eq 'dom'; + return 0; + +} + sub export_marc_records_from_sth { my ($record_type, $sth, $directory, $as_xml, $noxml, $nosanitize) = @_; my $num_exported = 0; - open (OUT, ">:utf8 ", "$directory/exported_records") or die $!; + open my $fh, '>:encoding(UTF-8) ', "$directory/exported_records" or die $!; + if (include_xml_wrapper($as_xml, $record_type)) { + # include XML declaration and root element + print {$fh} ''; + } my $i = 0; + my ( $itemtag, $itemsubfield ) = GetMarcFromKohaField("items.itemnumber",''); while (my ($record_number) = $sth->fetchrow_array) { print "." if ( $verbose_logging ); print "\r$i" unless ($i++ %100 or !$verbose_logging); @@ -293,35 +345,66 @@ sub export_marc_records_from_sth { my $marcxml = $record_type eq 'biblio' ? GetXmlBiblio( $record_number ) : GetAuthorityXML( $record_number ); + if ($record_type eq 'biblio'){ + my @items = GetItemsInfo($record_number); + if (@items){ + my $record = MARC::Record->new; + $record->encoding('UTF-8'); + my @itemsrecord; + foreach my $item (@items){ + my $record = Item2Marc($item, $record_number); + push @itemsrecord, $record->field($itemtag); + } + $record->insert_fields_ordered(@itemsrecord); + my $itemsxml = $record->as_xml_record(); + $marcxml = + substr($marcxml, 0, length($marcxml)-10) . + substr($itemsxml, index($itemsxml, "\n", 0) + 10); + } + } if ( $marcxml ) { - print OUT $marcxml if $marcxml; + print {$fh} $marcxml if $marcxml; $num_exported++; } next; } my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml); if (defined $marc) { - # FIXME - when more than one record is exported and $as_xml is true, - # the output file is not valid XML - it's just multiple elements - # strung together with no single root element. zebraidx doesn't seem - # to care, though, at least if you're using the GRS-1 filter. It does - # care if you're using the DOM filter, which requires valid XML file(s). - print OUT ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc(); - $num_exported++; + eval { + my $rec; + if ($as_xml) { + $rec = $marc->as_xml_record(C4::Context->preference('marcflavour')); + $rec =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!; + } else { + $rec = $marc->as_usmarc(); + } + print {$fh} $rec; + $num_exported++; + }; + if ($@) { + warn "Error exporting record $record_number ($record_type) ".($noxml ? "not XML" : "XML"); + } } } print "\nRecords exported: $num_exported\n" if ( $verbose_logging ); - close OUT; + print {$fh} '' if (include_xml_wrapper($as_xml, $record_type)); + close $fh; return $num_exported; } sub export_marc_records_from_list { - my ($record_type, $entries, $directory, $as_xml, $noxml) = @_; + my ($record_type, $entries, $directory, $as_xml, $noxml, $records_deleted) = @_; my $num_exported = 0; - open (OUT, ">:utf8 ", "$directory/exported_records") or die $!; + open my $fh, '>:encoding(UTF-8)', "$directory/exported_records" or die $!; + if (include_xml_wrapper($as_xml, $record_type)) { + # include XML declaration and root element + print {$fh} ''; + } my $i = 0; - my %found = (); + + # Skip any deleted records. We check for this anyway, but this reduces error spam + my %found = %$records_deleted; foreach my $record_number ( map { $_->{biblio_auth_number} } grep { !$found{ $_->{biblio_auth_number} }++ } @$entries ) { @@ -329,25 +412,38 @@ sub export_marc_records_from_list { print "\r$i" unless ($i++ %100 or !$verbose_logging); my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml); if (defined $marc) { - # FIXME - when more than one record is exported and $as_xml is true, - # the output file is not valid XML - it's just multiple elements - # strung together with no single root element. zebraidx doesn't seem - # to care, though, at least if you're using the GRS-1 filter. It does - # care if you're using the DOM filter, which requires valid XML file(s). - print OUT ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc(); + eval { + my $rec; + if ($as_xml) { + $rec = $marc->as_xml_record(C4::Context->preference('marcflavour')); + $rec =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!; + } else { + $rec = $marc->as_usmarc(); + } + print {$fh} $rec; + $num_exported++; + }; + if ($@) { + warn "Error exporting record $record_number ($record_type) ".($noxml ? "not XML" : "XML"); + } $num_exported++; } } print "\nRecords exported: $num_exported\n" if ( $verbose_logging ); - close OUT; + print {$fh} '' if (include_xml_wrapper($as_xml, $record_type)); + close $fh; return $num_exported; } sub generate_deleted_marc_records { my ($record_type, $entries, $directory, $as_xml) = @_; - my $num_exported = 0; - open (OUT, ">:utf8 ", "$directory/exported_records") or die $!; + my $records_deleted = {}; + open my $fh, '>:encoding(UTF-8)', "$directory/exported_records" or die $!; + if (include_xml_wrapper($as_xml, $record_type)) { + # include XML declaration and root element + print {$fh} ''; + } my $i = 0; foreach my $record_number (map { $_->{biblio_auth_number} } @$entries ) { print "\r$i" unless ($i++ %100 or !$verbose_logging); @@ -363,12 +459,21 @@ sub generate_deleted_marc_records { fix_unimarc_100($marc); } - print OUT ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc(); - $num_exported++; + my $rec; + if ($as_xml) { + $rec = $marc->as_xml_record(C4::Context->preference('marcflavour')); + $rec =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!; + } else { + $rec = $marc->as_usmarc(); + } + print {$fh} $rec; + + $records_deleted->{$record_number} = 1; } - print "\nRecords exported: $num_exported\n" if ( $verbose_logging ); - close OUT; - return $num_exported; + print "\nRecords exported: $i\n" if ( $verbose_logging ); + print {$fh} '' if (include_xml_wrapper($as_xml, $record_type)); + close $fh; + return $records_deleted; } @@ -380,10 +485,7 @@ sub get_corrected_marc_record { if (defined $marc) { fix_leader($marc); - if ($record_type eq 'biblio') { - my $succeeded = fix_biblio_ids($marc, $record_number); - return unless $succeeded; - } else { + if ($record_type eq 'authority') { fix_authority_id($marc, $record_number); } if (C4::Context->preference("marcflavour") eq "UNIMARC") { @@ -404,15 +506,19 @@ sub get_raw_marc_record { $fetch_sth->execute($record_number); if (my ($blob) = $fetch_sth->fetchrow_array) { $marc = MARC::Record->new_from_usmarc($blob); - $fetch_sth->finish(); - } else { - return; # failure to find a bib is not a problem - - # a delete could have been done before - # trying to process a record update + unless ($marc) { + warn "error creating MARC::Record from $blob"; + } } + # failure to find a bib is not a problem - + # a delete could have been done before + # trying to process a record update + + $fetch_sth->finish(); + return unless $marc; } else { - eval { $marc = GetMarcBiblio($record_number); }; - if ($@) { + eval { $marc = GetMarcBiblio($record_number, 1); }; + if ($@ || !$marc) { # here we do warn since catching an exception # means that the bib was found but failed # to be parsed @@ -496,7 +602,7 @@ sub fix_unimarc_100 { my $marc = shift; my $string; - if ( length($marc->subfield( 100, "a" )) == 35 ) { + if ( length($marc->subfield( 100, "a" )) == 36 ) { $string = $marc->subfield( 100, "a" ); my $f100 = $marc->field(100); $marc->delete_field($f100); @@ -507,7 +613,7 @@ sub fix_unimarc_100 { $string = sprintf( "%-*s", 35, $string ); } substr( $string, 22, 6, "frey50" ); - unless ( length($marc->subfield( 100, "a" )) == 35 ) { + unless ( length($marc->subfield( 100, "a" )) == 36 ) { $marc->delete_field($marc->field(100)); $marc->insert_grouped_field(MARC::Field->new( 100, "", "", "a" => $string )); } @@ -585,8 +691,16 @@ Parameters: -v increase the amount of logging. Normally only warnings and errors from the indexing are shown. + Use log level 2 (-v -v) to include all Zebra logs. + + --length 1234 how many biblio you want to export + --offset 1243 offset you want to start to + example: --offset 500 --length=500 will result in a LIMIT 500,1000 (exporting 1000 records, starting by the 500th one) + note that the numbers are NOT related to biblionumber, that's the intended behaviour. + --where let you specify a WHERE query, like itemtype='BOOK' + or something like that - -munge-config Deprecated option to try + --munge-config Deprecated option to try to fix Zebra config files. --help or -h show this message. _USAGE_ @@ -771,8 +885,8 @@ if ($authorities) { # AUTHORITIES : copying mandatory files # unless (-f C4::Context->zebraconfig('authorityserver')->{config}) { - open ZD,">:utf8 ",C4::Context->zebraconfig('authorityserver')->{config}; - print ZD " + open my $zd, '>:encoding(UTF-8)' ,C4::Context->zebraconfig('authorityserver')->{config}; + print {$zd} " # generated by KOHA/misc/migration_tools/rebuild_zebra.pl profilePath:\${srcdir:-.}:$authorityserverdir/tab/:$tabdir/tab/:\${srcdir:-.}/tab/ @@ -916,8 +1030,8 @@ if ($biblios) { # BIBLIOS : copying mandatory files # unless (-f C4::Context->zebraconfig('biblioserver')->{config}) { - open ZD,">:utf8 ",C4::Context->zebraconfig('biblioserver')->{config}; - print ZD " + open my $zd, '>:encoding(UTF-8)', C4::Context->zebraconfig('biblioserver')->{config}; + print {$zd} " # generated by KOHA/misc/migrtion_tools/rebuild_zebra.pl profilePath:\${srcdir:-.}:$biblioserverdir/tab/:$tabdir/tab/:\${srcdir:-.}/tab/