bug 9496: improve error checking in rebuild_zebra.pl
author Galen Charlton <gmc@esilibrary.com>
Tue, 8 Jan 2013 00:12:57 +0000 (19:12 -0500)
committer Jared Camins-Esakov <jcamins@cpbibliography.com>
Fri, 22 Mar 2013 02:25:03 +0000 (22:25 -0400)
When using rebuild_zebra to index all records, skip over
bibliographic or authority records that don't come out
as valid XML.  Also, strip extraneous XML declarations when
using --nosanitize.

Test plans
----------
Note that both plans assume that DOM indexing is turned on.

Test plan #1
============

[1] Run rebuild_zebra.pl with the -x --nosanitize options.  Without
    the patch, zebraidx should terminate early and complain
    about invalid XML.
[2] With the patch, rebuild_zebra.pl should work without
    error.

Test plan #2
============
[1] Intentionally make a MARCXML record invalid, e.g., by running
    the following SQL:

    UPDATE biblioitems SET marcxml = CONCAT(marcxml, 'junk')
    WHERE biblionumber = 123;

[2] Run rebuild_zebra.pl -b -x -r
[3] Without the patch, only part of the database will be indexed.
[4] With the patch, rebuild_zebra.pl will not export the bad
    record and will give an error message saying so, but will
    successfully index the rest of the records.

Signed-off-by: Galen Charlton <gmc@esilibrary.com>
Signed-off-by: Larry Baerveldt <larry@bywatersolutions.com>
Signed-off-by: Mason James <mtj@kohaaloha.com>
Signed-off-by: Paul Poulain <paul.poulain@biblibre.com>
Signed-off-by: Jared Camins-Esakov <jcamins@cpbibliography.com>
misc/migration_tools/rebuild_zebra.pl

index 6c897ea..5e0d5f1 100755 (executable)
@@ -11,6 +11,7 @@ use C4::Biblio;
 use C4::AuthoritiesMarc;
 use C4::Items;
 use Koha::RecordProcessor;
+use XML::LibXML;
 
 # 
 # script that checks zebradir structure & create directories & mandatory files if needed
@@ -140,6 +141,8 @@ if ($do_munge) {
     munge_config();
 }
 
+my $tester = XML::LibXML->new();
+
 if ($authorities) {
     index_records('authority', $directory, $skip_export, $skip_index, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $authorityserverdir);
 } else {
@@ -377,8 +380,18 @@ sub export_marc_records_from_sth {
                         substr($itemsxml, index($itemsxml, "</leader>\n", 0) + 10);
                 }
             }
+            # extra test to ensure that result is valid XML; otherwise
+            # Zebra won't parse it in DOM mode
+            eval {
+                my $doc = $tester->parse_string($marcxml);
+            };
+            if ($@) {
+                warn "Error exporting record $record_number ($record_type): $@\n";
+                next;
+            }
             if ( $marcxml ) {
-                print {$fh} $marcxml if $marcxml;
+                $marcxml =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!;
+                print {$fh} $marcxml;
                 $num_exported++;
             }
             next;
@@ -389,6 +402,12 @@ sub export_marc_records_from_sth {
                 my $rec;
                 if ($as_xml) {
                     $rec = $marc->as_xml_record(C4::Context->preference('marcflavour'));
+                    eval {
+                        my $doc = $tester->parse_string($rec);
+                    };
+                    if ($@) {
+                        die "invalid XML: $@";
+                    }
                     $rec =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!;
                 } else {
                     $rec = $marc->as_usmarc();
@@ -397,7 +416,8 @@ sub export_marc_records_from_sth {
                 $num_exported++;
             };
             if ($@) {
-              warn "Error exporting record $record_number ($record_type) ".($noxml ? "not XML" : "XML");
+                warn "Error exporting record $record_number ($record_type) ".($noxml ? "not XML" : "XML");
+                warn "... specific error is $@" if $verbose_logging;
             }
         }
     }