Bug 8176 [SIGNED-OFF] Assign an initial value to $sqlwhere

[koha.git] / misc / migration_tools / rebuild_zebra.pl
diff --git a/misc/migration_tools/rebuild_zebra.pl b/misc/migration_tools/rebuild_zebra.pl

index 38054c3..655b6d4 100755 (executable)
--- a/misc/migration_tools/rebuild_zebra.pl
+++ b/misc/migration_tools/rebuild_zebra.pl
@@ -1,6 +1,7 @@
  #!/usr/bin/perl
  
  use strict;
+#use warnings; FIXME - Bug 2505
  
  use C4::Context;
  use Getopt::Long;
@@ -8,6 +9,7 @@ use File::Temp qw/ tempdir /;
  use File::Path;
  use C4::Biblio;
  use C4::AuthoritiesMarc;
+use C4::Items;
  
  # 
  # script that checks zebradir structure & create directories & mandatory files if needed
@@ -15,8 +17,11 @@ use C4::AuthoritiesMarc;
  #
  
  $|=1; # flushes output
-
+# If the cron job starts us in an unreadable dir, we will break without
+# this.
+chdir $ENV{HOME} if (!(-r '.'));
  my $directory;
+my $nosanitize;
  my $skip_export;
  my $keep_export;
  my $reset;
@@ -29,11 +34,17 @@ my $want_help;
  my $as_xml;
  my $process_zebraqueue;
  my $do_not_clear_zebraqueue;
+my $length;
+my $where;
+my $offset;
+my $verbose_logging = 0;
+my $zebraidx_log_opt = " -v none,fatal,warn ";
  my $result = GetOptions(
      'd:s'           => \$directory,
-    'reset'         => \$reset,
+    'r|reset'       => \$reset,
      's'             => \$skip_export,
      'k'             => \$keep_export,
+    'nosanitize'    => \$nosanitize,
      'b'             => \$biblios,
      'noxml'         => \$noxml,
      'w'             => \$noshadow,
@@ -43,6 +54,10 @@ my $result = GetOptions(
         'x'                             => \$as_xml,
      'y'             => \$do_not_clear_zebraqueue,
      'z'             => \$process_zebraqueue,
+    'where:s'        => \$where,
+    'length:i'        => \$length,
+    'offset:i'      => \$offset,
+    'v+'             => \$verbose_logging,
  );
  
  
@@ -63,6 +78,12 @@ if ($authorities and $as_xml) {
      die $msg;
  }
  
+if ( !$as_xml and $nosanitize ) {    # -nosanitize writes raw marcxml, so it is only valid together with -x
+    my $msg = "Cannot specify -nosanitize without -x\n";
+    $msg   .= "Please do '$0 --help' to see usage.\n";
+    die $msg;
+}
+
  if ($process_zebraqueue and ($skip_export or $reset)) {
      my $msg = "Cannot specify -r or -s if -z is specified\n";
      $msg   .= "Please do '$0 --help' to see usage.\n";
@@ -78,6 +99,13 @@ if ($process_zebraqueue and $do_not_clear_zebraqueue) {
  if ($noshadow) {
      $noshadow = ' -n ';
  }
+
+#  -v is for verbose, which seems backwards here because of how logging is set
+#    on the CLI of zebraidx.  It works this way.  The default is to not log much
+if ($verbose_logging >= 2) {
+    $zebraidx_log_opt = '-v none,fatal,warn,all';
+}
+
  my $use_tempdir = 0;
  unless ($directory) {
      $use_tempdir = 1;
@@ -93,35 +121,39 @@ my $dbh = C4::Context->dbh;
  my ($biblionumbertagfield,$biblionumbertagsubfield) = &GetMarcFromKohaField("biblio.biblionumber","");
  my ($biblioitemnumbertagfield,$biblioitemnumbertagsubfield) = &GetMarcFromKohaField("biblioitems.biblioitemnumber","");
  
-print "Zebra configuration information\n";
-print "================================\n";
-print "Zebra biblio directory      = $biblioserverdir\n";
-print "Zebra authorities directory = $authorityserverdir\n";
-print "Koha directory              = $kohadir\n";
-print "BIBLIONUMBER in :     $biblionumbertagfield\$$biblionumbertagsubfield\n";
-print "BIBLIOITEMNUMBER in : $biblioitemnumbertagfield\$$biblioitemnumbertagsubfield\n";
-print "================================\n";
+if ( $verbose_logging ) {
+    print "Zebra configuration information\n";
+    print "================================\n";
+    print "Zebra biblio directory      = $biblioserverdir\n";
+    print "Zebra authorities directory = $authorityserverdir\n";
+    print "Koha directory              = $kohadir\n";
+    print "BIBLIONUMBER in :     $biblionumbertagfield\$$biblionumbertagsubfield\n";
+    print "BIBLIOITEMNUMBER in : $biblioitemnumbertagfield\$$biblioitemnumbertagsubfield\n";
+    print "================================\n";
+}
  
  if ($do_munge) {
      munge_config();
  }
  
  if ($authorities) {
-    index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $do_not_clear_zebraqueue);
+    index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $authorityserverdir);
  } else {
-    print "skipping authorities\n";
+    print "skipping authorities\n" if ( $verbose_logging );
  }
  
  if ($biblios) {
-    index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $do_not_clear_zebraqueue);
+    index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $biblioserverdir);
  } else {
-    print "skipping biblios\n";
+    print "skipping biblios\n" if ( $verbose_logging );
  }
  
  
-print "====================\n";
-print "CLEANING\n";
-print "====================\n";
+if ( $verbose_logging ) {
+    print "====================\n";
+    print "CLEANING\n";
+    print "====================\n";
+}
  if ($keep_export) {
      print "NOTHING cleaned : the export $directory has been kept.\n";
      print "You can re-run this script with the -s ";
@@ -143,34 +175,59 @@ if ($keep_export) {
      }
  }
  
+# Ensure the zebra directory layout ('', key, register, shadow, tmp) exists
+# under the given base path, creating whatever is missing (dies on failure).
+# Returns true if any directory had to be created, false otherwise.
+sub check_zebra_dirs {
+    my ($base) = shift() . '/';
+    my $needed_repairing = 0;
+    my @dirs = ( '', 'key', 'register', 'shadow', 'tmp' );
+    foreach my $dir (@dirs) {
+        my $bdir = $base . $dir;
+        next if -d $bdir;
+        $needed_repairing = 1;
+        # 'or', not '||': '||' binds to $bdir, so a failed mkdir never died.
+        mkdir $bdir or die "Unable to create '$bdir': $!\n";
+        print "$0: needed to create '$bdir'\n";
+    }
+    return $needed_repairing;
+}      # ----------  end of subroutine check_zebra_dirs  ----------
+
  sub index_records {
-    my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $do_not_clear_zebraqueue) = @_;
+    my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $server_dir) = @_;
  
      my $num_records_exported = 0;
-    my $num_records_deleted = 0;
-    if ($skip_export) {
+    my $records_deleted;
+    my $need_reset = check_zebra_dirs($server_dir);
+    if ($need_reset) {
+       print "$0: found broken zebra server directories: forcing a rebuild\n";
+       $reset = 1;
+    }
+    if ($skip_export && $verbose_logging) {
          print "====================\n";
          print "SKIPPING $record_type export\n";
          print "====================\n";
      } else {
-        print "====================\n";
-        print "exporting $record_type\n";
-        print "====================\n";
+        if ( $verbose_logging ) {
+            print "====================\n";
+            print "exporting $record_type\n";
+            print "====================\n";
+        }
          mkdir "$directory" unless (-d $directory);
          mkdir "$directory/$record_type" unless (-d "$directory/$record_type");
          if ($process_zebraqueue) {
              my $entries = select_zebraqueue_records($record_type, 'deleted');
              mkdir "$directory/del_$record_type" unless (-d "$directory/del_$record_type");
-            $num_records_deleted = generate_deleted_marc_records($record_type, $entries, "$directory/del_$record_type", $as_xml);
+            $records_deleted = generate_deleted_marc_records($record_type, $entries, "$directory/del_$record_type", $as_xml);
              mark_zebraqueue_batch_done($entries);
              $entries = select_zebraqueue_records($record_type, 'updated');
              mkdir "$directory/upd_$record_type" unless (-d "$directory/upd_$record_type");
              $num_records_exported = export_marc_records_from_list($record_type, 
-                                                                  $entries, "$directory/upd_$record_type", $as_xml, $noxml);
+                                                                  $entries, "$directory/upd_$record_type", $as_xml, $noxml, $records_deleted);
              mark_zebraqueue_batch_done($entries);
          } else {
              my $sth = select_all_records($record_type);
-            $num_records_exported = export_marc_records_from_sth($record_type, $sth, "$directory/$record_type", $as_xml, $noxml);
+            $num_records_exported = export_marc_records_from_sth($record_type, $sth, "$directory/$record_type", $as_xml, $noxml, $nosanitize);
              unless ($do_not_clear_zebraqueue) {
                  mark_all_zebraqueue_done($record_type);
              }
@@ -180,21 +237,24 @@ sub index_records {
      #
      # and reindexing everything
      #
-    print "====================\n";
-    print "REINDEXING zebra\n";
-    print "====================\n";
+    if ( $verbose_logging ) {
+        print "====================\n";
+        print "REINDEXING zebra\n";
+        print "====================\n";
+    }
         my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
      if ($process_zebraqueue) {
-        do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt) 
-            if $num_records_deleted;
-        do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt)
+        do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt) 
+            if %$records_deleted;
+        do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
              if $num_records_exported;
      } else {
-        do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt)
-            if $num_records_exported;
+        do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+            if ($num_records_exported or $skip_export);
      }
  }
  
+
  sub select_zebraqueue_records {
      my ($record_type, $update_type) = @_;
  
@@ -240,26 +300,62 @@ sub select_all_records {
  }
  
  sub select_all_authorities {
-    my $sth = $dbh->prepare("SELECT authid FROM auth_header");
+    my $strsth=qq{SELECT authid FROM auth_header};
+    $strsth.=qq{ WHERE $where } if ($where);
+    $strsth.=qq{ LIMIT $length } if ($length && !$offset);
+    $strsth.=qq{ LIMIT $offset,$length } if ($length && $offset);
+    my $sth = $dbh->prepare($strsth);
      $sth->execute();
      return $sth;
  }
  
  sub select_all_biblios {
-    my $sth = $dbh->prepare("SELECT biblionumber FROM biblioitems ORDER BY biblionumber");
+    my $strsth = qq{ SELECT biblionumber FROM biblioitems };
+    $strsth.=qq{ WHERE $where } if ($where);
+    $strsth.=qq{ LIMIT $length } if ($length && !$offset);
+    $strsth.=qq{ LIMIT $offset,$length } if ($offset);
+    my $sth = $dbh->prepare($strsth);
      $sth->execute();
      return $sth;
  }
  
  sub export_marc_records_from_sth {
-    my ($record_type, $sth, $directory, $as_xml, $noxml) = @_;
+    my ($record_type, $sth, $directory, $as_xml, $noxml, $nosanitize) = @_;
  
      my $num_exported = 0;
-    open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
+    open my $fh, '>:encoding(UTF-8) ', "$directory/exported_records" or die $!;
      my $i = 0;
+    my ( $itemtag, $itemsubfield ) = GetMarcFromKohaField("items.itemnumber",'');
      while (my ($record_number) = $sth->fetchrow_array) {
-        print ".";
-        print "\r$i" unless ($i++ %100);
+        print "." if ( $verbose_logging );
+        print "\r$i" unless ($i++ %100 or !$verbose_logging);
+        if ( $nosanitize ) {
+            my $marcxml = $record_type eq 'biblio'
+                          ? GetXmlBiblio( $record_number )
+                          : GetAuthorityXML( $record_number );
+            if ($record_type eq 'biblio'){
+                my @items = GetItemsInfo($record_number);
+                if (@items){
+                    my $record = MARC::Record->new;
+                    $record->encoding('UTF-8');
+                    my @itemsrecord;
+                    foreach my $item (@items){
+                        my $record = Item2Marc($item, $record_number);                        
+                        push @itemsrecord, $record->field($itemtag);
+                    }
+                    $record->insert_fields_ordered(@itemsrecord);
+                    my $itemsxml = $record->as_xml_record();
+                    $marcxml =
+                        substr($marcxml, 0, length($marcxml)-10) .
+                        substr($itemsxml, index($itemsxml, "</leader>\n", 0) + 10);
+                }
+            }
+            if ( $marcxml ) {
+                print {$fh} $marcxml if $marcxml;
+                $num_exported++;
+            }
+            next;
+        }
          my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
          if (defined $marc) {
              # FIXME - when more than one record is exported and $as_xml is true,
@@ -267,27 +363,34 @@ sub export_marc_records_from_sth {
              # strung together with no single root element.  zebraidx doesn't seem
              # to care, though, at least if you're using the GRS-1 filter.  It does
              # care if you're using the DOM filter, which requires valid XML file(s).
-            print OUT ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc();
-            $num_exported++;
+            eval {
+                print {$fh} ($as_xml) ? $marc->as_xml_record(C4::Context->preference('marcflavour')) : $marc->as_usmarc();
+                $num_exported++;
+            };
+            if ($@) {
+              warn "Error exporting record $record_number ($record_type) ".($noxml ? "not XML" : "XML");
+            }
          }
      }
-    print "\nRecords exported: $num_exported\n";
-    close OUT;
+    print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+    close $fh;
      return $num_exported;
  }
  
  sub export_marc_records_from_list {
-    my ($record_type, $entries, $directory, $as_xml, $noxml) = @_;
+    my ($record_type, $entries, $directory, $as_xml, $noxml, $records_deleted) = @_;
  
      my $num_exported = 0;
-    open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
+    open my $fh, '>:encoding(UTF-8)', "$directory/exported_records" or die $!;
      my $i = 0;
-    my %found = ();
+
+    # Skip any deleted records. We check for this anyway, but this reduces error spam
+    my %found = %$records_deleted;
      foreach my $record_number ( map { $_->{biblio_auth_number} }
                                  grep { !$found{ $_->{biblio_auth_number} }++ }
                                  @$entries ) {
-        print ".";
-        print "\r$i" unless ($i++ %100);
+        print "." if ( $verbose_logging );
+        print "\r$i" unless ($i++ %100 or !$verbose_logging);
          my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
          if (defined $marc) {
              # FIXME - when more than one record is exported and $as_xml is true,
@@ -295,24 +398,24 @@ sub export_marc_records_from_list {
              # strung together with no single root element.  zebraidx doesn't seem
              # to care, though, at least if you're using the GRS-1 filter.  It does
              # care if you're using the DOM filter, which requires valid XML file(s).
-            print OUT ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc();
+            print {$fh} ($as_xml) ? $marc->as_xml_record(C4::Context->preference('marcflavour')) : $marc->as_usmarc();
              $num_exported++;
          }
      }
-    print "\nRecords exported: $num_exported\n";
-    close OUT;
+    print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+    close $fh;
      return $num_exported;
  }
  
  sub generate_deleted_marc_records {
      my ($record_type, $entries, $directory, $as_xml) = @_;
  
-    my $num_exported = 0;
-    open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
+    my $records_deleted = {};
+    open my $fh, '>:encoding(UTF-8)', "$directory/exported_records" or die $!;
      my $i = 0;
      foreach my $record_number (map { $_->{biblio_auth_number} } @$entries ) {
-        print "\r$i" unless ($i++ %100);
-        print ".";
+        print "\r$i" unless ($i++ %100 or !$verbose_logging);
+        print "." if ( $verbose_logging );
  
          my $marc = MARC::Record->new();
          if ($record_type eq 'biblio') {
@@ -324,12 +427,13 @@ sub generate_deleted_marc_records {
              fix_unimarc_100($marc);
          }
  
-        print OUT ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc();
-        $num_exported++;
+        print {$fh} ($as_xml) ? $marc->as_xml_record(C4::Context->preference("marcflavour")) : $marc->as_usmarc();
+
+        $records_deleted->{$record_number} = 1;
      }
-    print "\nRecords exported: $num_exported\n";
-    close OUT;
-    return $num_exported;
+    print "\nRecords exported: $i\n" if ( $verbose_logging );
+    close $fh;
+    return $records_deleted;
      
  
  }
@@ -341,10 +445,7 @@ sub get_corrected_marc_record {
  
      if (defined $marc) {
          fix_leader($marc);
-        if ($record_type eq 'biblio') {
-            my $succeeded = fix_biblio_ids($marc, $record_number);
-            return unless $succeeded;
-        } else {
+        if ($record_type eq 'authority') {
              fix_authority_id($marc, $record_number);
          }
          if (C4::Context->preference("marcflavour") eq "UNIMARC") {
@@ -365,15 +466,19 @@ sub get_raw_marc_record {
              $fetch_sth->execute($record_number);
              if (my ($blob) = $fetch_sth->fetchrow_array) {
                  $marc = MARC::Record->new_from_usmarc($blob);
-                $fetch_sth->finish();
-            } else {
-                return; # failure to find a bib is not a problem -
-                        # a delete could have been done before
-                        # trying to process a record update
+                unless ($marc) {
+                    warn "error creating MARC::Record from $blob";
+                }
              }
+            # failure to find a bib is not a problem -
+            # a delete could have been done before
+            # trying to process a record update
+
+            $fetch_sth->finish();
+            return unless $marc;
          } else {
-            eval { $marc = GetMarcBiblio($record_number); };
-            if ($@) {
+            eval { $marc = GetMarcBiblio($record_number, 1); };
+            if ($@ || !$marc) {
                  # here we do warn since catching an exception
                  # means that the bib was found but failed
                  # to be parsed
@@ -475,16 +580,16 @@ sub fix_unimarc_100 {
  }
  
  sub do_indexing {
-    my ($record_type, $op, $record_dir, $reset_index, $noshadow, $record_format) = @_;
+    my ($record_type, $op, $record_dir, $reset_index, $noshadow, $record_format, $zebraidx_log_opt) = @_;
  
      my $zebra_server  = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
      my $zebra_db_name = ($record_type eq 'biblio') ? 'biblios' : 'authorities';
      my $zebra_config  = C4::Context->zebraconfig($zebra_server)->{'config'};
      my $zebra_db_dir  = C4::Context->zebraconfig($zebra_server)->{'directory'};
  
-    system("zebraidx -c $zebra_config -g $record_format -d $zebra_db_name init") if $reset_index;
-    system("zebraidx -c $zebra_config $noshadow -g $record_format -d $zebra_db_name $op $record_dir");
-    system("zebraidx -c $zebra_config -g $record_format -d $zebra_db_name commit") unless $noshadow;
+    system("zebraidx -c $zebra_config $zebraidx_log_opt -g $record_format -d $zebra_db_name init") if $reset_index;
+    system("zebraidx -c $zebra_config $zebraidx_log_opt $noshadow -g $record_format -d $zebra_db_name $op $record_dir");
+    system("zebraidx -c $zebra_config $zebraidx_log_opt -g $record_format -d $zebra_db_name commit") unless $noshadow;
  
  }
  
@@ -530,6 +635,11 @@ Parameters:
      -x                      export and index as xml instead of is02709 (biblios only).
                              use this if you might have records > 99,999 chars,
                                                         
+    -nosanitize             export biblio/authority records directly from DB marcxml
+                            field without sanitizing records. It speeds up the
+                            dump process but could fail if the DB contains badly
+                            encoded records. Works only with -x.
+
      -w                      skip shadow indexing for this batch
  
      -y                      do NOT clear zebraqueue after indexing; normally,
@@ -539,7 +649,18 @@ Parameters:
                              the same records - specify -y to override this.  
                              Cannot be used with -z.
  
-    -munge-config           Deprecated option to try
+    -v                      increase the amount of logging.  Normally only 
+                            warnings and errors from the indexing are shown.
+                            Use log level 2 (-v -v) to include all Zebra logs.
+
+    --length   1234         how many biblio you want to export
+    --offset 1243           offset you want to start to
+                                example: --offset 500 --length=500 will result in a LIMIT 500,500 (exporting 500 records, starting after the 500th one)
+                                note that the numbers are NOT related to biblionumber, that's the intended behaviour.
+    --where                 let you specify a WHERE query, like itemtype='BOOK'
+                            or something like that
+
+    --munge-config          Deprecated option to try
                              to fix Zebra config files.
      --help or -h            show this message.
  _USAGE_
@@ -630,9 +751,11 @@ print "Info: tab dir : $tabdir\n";
  #
  my $created_dir_or_file = 0;
  if ($authorities) {
-    print "====================\n";
-    print "checking directories & files for authorities\n";
-    print "====================\n";
+    if ( $verbose_logging ) {
+        print "====================\n";
+        print "checking directories & files for authorities\n";
+        print "====================\n";
+    }
      unless (-d "$authorityserverdir") {
          system("mkdir -p $authorityserverdir");
          print "Info: created $authorityserverdir\n";
@@ -722,8 +845,8 @@ if ($authorities) {
      # AUTHORITIES : copying mandatory files
      #
      unless (-f C4::Context->zebraconfig('authorityserver')->{config}) {
-    open ZD,">:utf8 ",C4::Context->zebraconfig('authorityserver')->{config};
-    print ZD "
+    open my $zd, '>:encoding(UTF-8)' ,C4::Context->zebraconfig('authorityserver')->{config};
+    print {$zd} "
  # generated by KOHA/misc/migration_tools/rebuild_zebra.pl 
  profilePath:\${srcdir:-.}:$authorityserverdir/tab/:$tabdir/tab/:\${srcdir:-.}/tab/
  
@@ -771,10 +894,12 @@ rank:rank-1
      
  }
  if ($biblios) {
-    print "====================\n";
-    print "checking directories & files for biblios\n";
-    print "====================\n";
-    
+    if ( $verbose_logging ) {
+        print "====================\n";
+        print "checking directories & files for biblios\n";
+        print "====================\n";
+    }
+
      #
      # BIBLIOS : creating directory structure
      #
@@ -865,8 +990,8 @@ if ($biblios) {
      # BIBLIOS : copying mandatory files
      #
      unless (-f C4::Context->zebraconfig('biblioserver')->{config}) {
-    open ZD,">:utf8 ",C4::Context->zebraconfig('biblioserver')->{config};
-    print ZD "
+    open my $zd, '>:encoding(UTF-8)', C4::Context->zebraconfig('biblioserver')->{config};
+    print {$zd} "
  # generated by KOHA/misc/migrtion_tools/rebuild_zebra.pl 
  profilePath:\${srcdir:-.}:$biblioserverdir/tab/:$tabdir/tab/:\${srcdir:-.}/tab/