Bug 8746 [Follow-up] Replace == by eq in string comparison

[koha.git] / misc / migration_tools / rebuild_zebra.pl
diff --git a/misc/migration_tools/rebuild_zebra.pl b/misc/migration_tools/rebuild_zebra.pl

index fbfd3af..c29ef76 100755 (executable)
--- a/misc/migration_tools/rebuild_zebra.pl
+++ b/misc/migration_tools/rebuild_zebra.pl
@@ -10,6 +10,8 @@ use File::Path;
  use C4::Biblio;
  use C4::AuthoritiesMarc;
  use C4::Items;
+use Koha::RecordProcessor;
+use XML::LibXML;
  
  # 
  # script that checks zebradir structure & create directories & mandatory files if needed
@@ -24,6 +26,7 @@ my $directory;
  my $nosanitize;
  my $skip_export;
  my $keep_export;
+my $skip_index;
  my $reset;
  my $biblios;
  my $authorities;
@@ -34,13 +37,17 @@ my $want_help;
  my $as_xml;
  my $process_zebraqueue;
  my $do_not_clear_zebraqueue;
-my $verbose_logging;
+my $length;
+my $where;
+my $offset;
+my $verbose_logging = 0;
  my $zebraidx_log_opt = " -v none,fatal,warn ";
  my $result = GetOptions(
      'd:s'           => \$directory,
      'r|reset'       => \$reset,
      's'             => \$skip_export,
      'k'             => \$keep_export,
+    'I|skip-index'    => \$skip_index,
      'nosanitize'    => \$nosanitize,
      'b'             => \$biblios,
      'noxml'         => \$noxml,
@@ -51,7 +58,10 @@ my $result = GetOptions(
         'x'                             => \$as_xml,
      'y'             => \$do_not_clear_zebraqueue,
      'z'             => \$process_zebraqueue,
-    'v'             => \$verbose_logging,
+    'where:s'        => \$where,
+    'length:i'        => \$length,
+    'offset:i'      => \$offset,
+    'v+'             => \$verbose_logging,
  );
  
  
@@ -66,12 +76,6 @@ if (not $biblios and not $authorities) {
      die $msg;
  }
  
-if ($authorities and $as_xml) {
-    my $msg = "Cannot specify both -a and -x\n";
-    $msg   .= "Please do '$0 --help' to see usage.\n";
-    die $msg;
-}
-
  if ( !$as_xml and $nosanitize ) {
      my $msg = "Cannot specify both -no_xml and -nosanitize\n";
      $msg   .= "Please do '$0 --help' to see usage.\n";
@@ -90,14 +94,18 @@ if ($process_zebraqueue and $do_not_clear_zebraqueue) {
      die $msg;
  }
  
+if ($reset) {
+    $noshadow = 1;
+}
+
  if ($noshadow) {
      $noshadow = ' -n ';
  }
  
  #  -v is for verbose, which seems backwards here because of how logging is set
  #    on the CLI of zebraidx.  It works this way.  The default is to not log much
-if ($verbose_logging) {
-    $zebraidx_log_opt = '';
+if ($verbose_logging >= 2) {
+    $zebraidx_log_opt = '-v none,fatal,warn,all';
  }
  
  my $use_tempdir = 0;
@@ -111,6 +119,9 @@ my $biblioserverdir = C4::Context->zebraconfig('biblioserver')->{directory};
  my $authorityserverdir = C4::Context->zebraconfig('authorityserver')->{directory};
  
  my $kohadir = C4::Context->config('intranetdir');
+my $bib_index_mode = C4::Context->config('zebra_bib_index_mode') || 'grs1';
+my $auth_index_mode = C4::Context->config('zebra_auth_index_mode') || 'dom';
+
  my $dbh = C4::Context->dbh;
  my ($biblionumbertagfield,$biblionumbertagsubfield) = &GetMarcFromKohaField("biblio.biblionumber","");
  my ($biblioitemnumbertagfield,$biblioitemnumbertagsubfield) = &GetMarcFromKohaField("biblioitems.biblioitemnumber","");
@@ -130,14 +141,16 @@ if ($do_munge) {
      munge_config();
  }
  
+my $tester = XML::LibXML->new();
+
  if ($authorities) {
-    index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $authorityserverdir);
+    index_records('authority', $directory, $skip_export, $skip_index, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $authorityserverdir);
  } else {
      print "skipping authorities\n" if ( $verbose_logging );
  }
  
  if ($biblios) {
-    index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $biblioserverdir);
+    index_records('biblio', $directory, $skip_export, $skip_index, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $biblioserverdir);
  } else {
      print "skipping biblios\n" if ( $verbose_logging );
  }
@@ -188,7 +201,7 @@ sub check_zebra_dirs {
  }      # ----------  end of subroutine check_zebra_dirs  ----------
  
  sub index_records {
-    my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $server_dir) = @_;
+    my ($record_type, $directory, $skip_export, $skip_index, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $server_dir) = @_;
  
      my $num_records_exported = 0;
      my $records_deleted;
@@ -227,24 +240,32 @@ sub index_records {
              }
          }
      }
-    
+
      #
      # and reindexing everything
      #
-    if ( $verbose_logging ) {
-        print "====================\n";
-        print "REINDEXING zebra\n";
-        print "====================\n";
-    }
-       my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
-    if ($process_zebraqueue) {
-        do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt) 
-            if %$records_deleted;
-        do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
-            if $num_records_exported;
+    if ($skip_index) {
+        if ($verbose_logging) {
+            print "====================\n";
+            print "SKIPPING $record_type indexing\n";
+            print "====================\n";
+        }
      } else {
-        do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
-            if ($num_records_exported or $skip_export);
+        if ( $verbose_logging ) {
+            print "====================\n";
+            print "REINDEXING zebra\n";
+            print "====================\n";
+        }
+        my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
+        if ($process_zebraqueue) {
+            do_indexing($record_type, 'adelete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+                if %$records_deleted;
+            do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+                if $num_records_exported;
+        } else {
+            do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+                if ($num_records_exported or $skip_export);
+        }
      }
  }
  
@@ -294,22 +315,45 @@ sub select_all_records {
  }
  
  sub select_all_authorities {
-    my $sth = $dbh->prepare("SELECT authid FROM auth_header");
+    my $strsth=qq{SELECT authid FROM auth_header};
+    $strsth.=qq{ WHERE $where } if ($where);
+    $strsth.=qq{ LIMIT $length } if ($length && !$offset);
+    $strsth.=qq{ LIMIT $offset,$length } if ($length && $offset);
+    my $sth = $dbh->prepare($strsth);
      $sth->execute();
      return $sth;
  }
  
  sub select_all_biblios {
-    my $sth = $dbh->prepare("SELECT biblionumber FROM biblioitems ORDER BY biblionumber");
+    my $strsth = qq{ SELECT biblionumber FROM biblioitems };
+    $strsth.=qq{ WHERE $where } if ($where);
+    $strsth.=qq{ LIMIT $length } if ($length && !$offset);
+    $strsth.=qq{ LIMIT $offset,$length } if ($offset);
+    my $sth = $dbh->prepare($strsth);
      $sth->execute();
      return $sth;
  }
  
+sub include_xml_wrapper {
+    # Returns 1 when the export must be wrapped in an XML declaration plus a
+    # <collection> root (XML output and the record type is indexed in 'dom' mode), else 0.
+    my ($as_xml, $record_type) = @_;
+    return 0 if !$as_xml;
+
+    my %index_mode_of = ( biblio => $bib_index_mode, authority => $auth_index_mode );
+    my $mode = $index_mode_of{$record_type};
+    return ( defined $mode && $mode eq 'dom' ) ? 1 : 0;
+}
+
  sub export_marc_records_from_sth {
      my ($record_type, $sth, $directory, $as_xml, $noxml, $nosanitize) = @_;
  
      my $num_exported = 0;
-    open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
+    open my $fh, '>:encoding(UTF-8) ', "$directory/exported_records" or die $!;
+    if (include_xml_wrapper($as_xml, $record_type)) {
+        # include XML declaration and root element
+        print {$fh} '<?xml version="1.0" encoding="UTF-8"?><collection>';
+    }
      my $i = 0;
      my ( $itemtag, $itemsubfield ) = GetMarcFromKohaField("items.itemnumber",'');
      while (my ($record_number) = $sth->fetchrow_array) {
@@ -336,30 +380,50 @@ sub export_marc_records_from_sth {
                          substr($itemsxml, index($itemsxml, "</leader>\n", 0) + 10);
                  }
              }
+            # extra test to ensure that result is valid XML; otherwise
+            # Zebra won't parse it in DOM mode
+            eval {
+                my $doc = $tester->parse_string($marcxml);
+            };
+            if ($@) {
+                warn "Error exporting record $record_number ($record_type): $@\n";
+                next;
+            }
              if ( $marcxml ) {
-                print OUT $marcxml if $marcxml;
+                $marcxml =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!;
+                print {$fh} $marcxml;
                  $num_exported++;
              }
              next;
          }
          my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
          if (defined $marc) {
-            # FIXME - when more than one record is exported and $as_xml is true,
-            # the output file is not valid XML - it's just multiple <record> elements
-            # strung together with no single root element.  zebraidx doesn't seem
-            # to care, though, at least if you're using the GRS-1 filter.  It does
-            # care if you're using the DOM filter, which requires valid XML file(s).
              eval {
-                print OUT ($as_xml) ? $marc->as_xml_record(C4::Context->preference('marcflavour')) : $marc->as_usmarc();
+                my $rec;
+                if ($as_xml) {
+                    $rec = $marc->as_xml_record(C4::Context->preference('marcflavour'));
+                    eval {
+                        my $doc = $tester->parse_string($rec);
+                    };
+                    if ($@) {
+                        die "invalid XML: $@";
+                    }
+                    $rec =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!;
+                } else {
+                    $rec = $marc->as_usmarc();
+                }
+                print {$fh} $rec;
                  $num_exported++;
              };
              if ($@) {
-              warn "Error exporting record $record_number ($record_type) ".($noxml ? "not XML" : "XML");
+                warn "Error exporting record $record_number ($record_type) ".($noxml ? "not XML" : "XML");
+                warn "... specific error is $@" if $verbose_logging;
              }
          }
      }
      print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
-    close OUT;
+    print {$fh} '</collection>' if (include_xml_wrapper($as_xml, $record_type));
+    close $fh;
      return $num_exported;
  }
  
@@ -367,7 +431,11 @@ sub export_marc_records_from_list {
      my ($record_type, $entries, $directory, $as_xml, $noxml, $records_deleted) = @_;
  
      my $num_exported = 0;
-    open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
+    open my $fh, '>:encoding(UTF-8)', "$directory/exported_records" or die $!;
+    if (include_xml_wrapper($as_xml, $record_type)) {
+        # include XML declaration and root element
+        print {$fh} '<?xml version="1.0" encoding="UTF-8"?><collection>';
+    }
      my $i = 0;
  
      # Skip any deleted records. We check for this anyway, but this reduces error spam
@@ -379,17 +447,25 @@ sub export_marc_records_from_list {
          print "\r$i" unless ($i++ %100 or !$verbose_logging);
          my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
          if (defined $marc) {
-            # FIXME - when more than one record is exported and $as_xml is true,
-            # the output file is not valid XML - it's just multiple <record> elements
-            # strung together with no single root element.  zebraidx doesn't seem
-            # to care, though, at least if you're using the GRS-1 filter.  It does
-            # care if you're using the DOM filter, which requires valid XML file(s).
-            print OUT ($as_xml) ? $marc->as_xml_record(C4::Context->preference('marcflavour')) : $marc->as_usmarc();
-            $num_exported++;
+            eval {
+                my $rec;
+                if ($as_xml) {
+                    $rec = $marc->as_xml_record(C4::Context->preference('marcflavour'));
+                    $rec =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!;
+                } else {
+                    $rec = $marc->as_usmarc();
+                }
+                print {$fh} $rec;
+                $num_exported++;
+            };
+            if ($@) {
+              warn "Error exporting record $record_number ($record_type) ".($noxml ? "not XML" : "XML");
+            }
          }
      }
      print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
-    close OUT;
+    print {$fh} '</collection>' if (include_xml_wrapper($as_xml, $record_type));
+    close $fh;
      return $num_exported;
  }
  
@@ -397,7 +473,11 @@ sub generate_deleted_marc_records {
      my ($record_type, $entries, $directory, $as_xml) = @_;
  
      my $records_deleted = {};
-    open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
+    open my $fh, '>:encoding(UTF-8)', "$directory/exported_records" or die $!;
+    if (include_xml_wrapper($as_xml, $record_type)) {
+        # include XML declaration and root element
+        print {$fh} '<?xml version="1.0" encoding="UTF-8"?><collection>';
+    }
      my $i = 0;
      foreach my $record_number (map { $_->{biblio_auth_number} } @$entries ) {
          print "\r$i" unless ($i++ %100 or !$verbose_logging);
@@ -413,12 +493,20 @@ sub generate_deleted_marc_records {
              fix_unimarc_100($marc);
          }
  
-        print OUT ($as_xml) ? $marc->as_xml_record(C4::Context->preference("marcflavour")) : $marc->as_usmarc();
+        my $rec;
+        if ($as_xml) {
+            $rec = $marc->as_xml_record(C4::Context->preference('marcflavour'));
+            $rec =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!;
+        } else {
+            $rec = $marc->as_usmarc();
+        }
+        print {$fh} $rec;
  
          $records_deleted->{$record_number} = 1;
      }
      print "\nRecords exported: $i\n" if ( $verbose_logging );
-    close OUT;
+    print {$fh} '</collection>' if (include_xml_wrapper($as_xml, $record_type));
+    close $fh;
      return $records_deleted;
      
  
@@ -433,6 +521,9 @@ sub get_corrected_marc_record {
          fix_leader($marc);
          if ($record_type eq 'authority') {
              fix_authority_id($marc, $record_number);
+        } elsif ($record_type eq 'biblio' && C4::Context->preference('IncludeSeeFromInSearches')) {
+            my $normalizer = Koha::RecordProcessor->new( { filters => 'EmbedSeeFromHeadings' } );
+            $marc = $normalizer->process($marc);
          }
          if (C4::Context->preference("marcflavour") eq "UNIMARC") {
              fix_unimarc_100($marc);
@@ -548,7 +639,7 @@ sub fix_unimarc_100 {
      my $marc = shift;
  
      my $string;
-    if ( length($marc->subfield( 100, "a" )) == 35 ) {
+    if ( length($marc->subfield( 100, "a" )) == 36 ) {
          $string = $marc->subfield( 100, "a" );
          my $f100 = $marc->field(100);
          $marc->delete_field($f100);
@@ -559,7 +650,7 @@ sub fix_unimarc_100 {
          $string = sprintf( "%-*s", 35, $string );
      }
      substr( $string, 22, 6, "frey50" );
-    unless ( length($marc->subfield( 100, "a" )) == 35 ) {
+    unless ( length($marc->subfield( 100, "a" )) == 36 ) {
          $marc->delete_field($marc->field(100));
          $marc->insert_grouped_field(MARC::Field->new( 100, "", "", "a" => $string ));
      }
@@ -584,9 +675,7 @@ sub print_usage {
  $0: reindex MARC bibs and/or authorities in Zebra.
  
  Use this batch job to reindex all biblio or authority
-records in your Koha database.  This job is useful
-only if you are using Zebra; if you are using the 'NoZebra'
-mode, this job should not be used.
+records in your Koha database.
  
  Parameters:
      -b                      index bibliographic records
@@ -599,7 +688,7 @@ Parameters:
                              or -s.
  
      -r                      clear Zebra index before
-                            adding records to index
+                            adding records to index. Implies -w.
  
      -d                      Temporary directory for indexing.
                              If not specified, one is automatically
@@ -637,8 +726,16 @@ Parameters:
  
      -v                      increase the amount of logging.  Normally only 
                              warnings and errors from the indexing are shown.
+                            Use log level 2 (-v -v) to include all Zebra logs.
+
+    --length   1234         how many records you want to export
+    --offset 1243           offset of the first record to export
+                                example: --offset 500 --length=500 will result in a LIMIT 500,500 (exporting 500 records, starting after the 500th one)
+                                note that the numbers are NOT related to biblionumber, that's the intended behaviour.
+    --where                 lets you specify a WHERE clause, like itemtype='BOOK'
+                            or something like that
  
-    -munge-config           Deprecated option to try
+    --munge-config          Deprecated option to try
                              to fix Zebra config files.
      --help or -h            show this message.
  _USAGE_
@@ -823,8 +920,8 @@ if ($authorities) {
      # AUTHORITIES : copying mandatory files
      #
      unless (-f C4::Context->zebraconfig('authorityserver')->{config}) {
-    open ZD,">:utf8 ",C4::Context->zebraconfig('authorityserver')->{config};
-    print ZD "
+    open my $zd, '>:encoding(UTF-8)' ,C4::Context->zebraconfig('authorityserver')->{config};
+    print {$zd} "
  # generated by KOHA/misc/migration_tools/rebuild_zebra.pl 
  profilePath:\${srcdir:-.}:$authorityserverdir/tab/:$tabdir/tab/:\${srcdir:-.}/tab/
  
@@ -968,8 +1065,8 @@ if ($biblios) {
      # BIBLIOS : copying mandatory files
      #
      unless (-f C4::Context->zebraconfig('biblioserver')->{config}) {
-    open ZD,">:utf8 ",C4::Context->zebraconfig('biblioserver')->{config};
-    print ZD "
+    open my $zd, '>:encoding(UTF-8)', C4::Context->zebraconfig('biblioserver')->{config};
+    print {$zd} "
  # generated by KOHA/misc/migrtion_tools/rebuild_zebra.pl 
  profilePath:\${srcdir:-.}:$biblioserverdir/tab/:$tabdir/tab/:\${srcdir:-.}/tab/