Bug 20811: (RM follow-up) fix check for matching

[koha.git] / misc / migration_tools / rebuild_zebra_sliced.sh
diff --git a/misc/migration_tools/rebuild_zebra_sliced.sh b/misc/migration_tools/rebuild_zebra_sliced.sh

index 35752e0..1eeb551 100755 (executable)
--- a/misc/migration_tools/rebuild_zebra_sliced.sh
+++ b/misc/migration_tools/rebuild_zebra_sliced.sh
@@ -9,24 +9,72 @@ Index Koha records by chunks. It is useful when a record causes errors and
  stops the indexing process. With this script, if indexing of one chunk fails,
  that chunk is split into two or more chunks, and indexing continues on these chunks.
  rebuild_zebra.pl is called only once to export records. Splitting and indexing
-is handled by this script (using yaz-marcdump and zebraidx).
+is handled by this script (using zebraidx for indexing).
  
  Usage:
-$scriptname -t type -l X [-o X] [-s X] [-d /export/dir] [-L /log/dir] [-r] [-f]
+$scriptname [-t type] [-l X] [-o X] [-s X] [-d /export/dir] [-L /log/dir] [-r] [-f] [--reset-index]
  $scriptname -h
  
-    -o | --offset         Offset parameter of rebuild_zebra.pl
-    -l | --length         Length parameter of rebuild_zebra.pl
+    -o | --offset         Offset parameter of rebuild_zebra.pl.
+                          Default: $OFFSET
+    -l | --length         Length parameter of rebuild_zebra.pl. If omitted, the
+                          length is automatically calculated to index all
+                          records
      -s | --chunks-size    Initial chunk size (number of records indexed at once)
+                          Default: $CHUNKSSIZE
      -d | --export-dir     Where rebuild_zebra.pl will export data
+                          Default: $EXPORTDIR
+    -x | --exclude-export Do not export Biblios from Koha, but use the existing
+                          export-dir
      -L | --log-dir        Log directory
+                          Default: $LOGDIR
      -r | --remove-logs    Clean log directory before start
+                          Default: $RMLOGS
      -t | --type           Record type ('biblios' or 'authorities')
+                          Default: $TYPE
      -f | --force          Don't ask for confirmation before start
      -h | --help           Display this help message
+    --reset-index         Reset Zebra index for 'type'
  EOF
  }
  
+splitfile() {
+    local file=$1
+    local prefix=$2
+    local size=$3
+    local script='
+        my $indexmode = '"$INDEXMODE"';
+        my $prefix = '"\"$prefix\""';
+        my $size = '"$size"';
+        my ($i,$count) = (0,0);
+        open(my $fh, "<", '"\"$file\""');
+        open(my $out, ">", sprintf("$prefix%02d", $i));
+        my $closed = 0;
+        while (<$fh>) {
+            my $line = $_;
+            if ($closed) {
+                open($out, ">", sprintf("$prefix%02d", $i));
+                $closed = 0;
+                if ($indexmode eq "dom" && $line !~ /<collection>/) {
+                    print $out "<collection>";
+                }
+            }
+            print $out $line;
+            $count++ if ($line =~ m|^</record>|);
+            if ($count == $size) {
+                if ($indexmode eq "dom" && $line !~ m|</collection>|) {
+                    print $out "</collection>";
+                }
+                $count = 0;
+                $i++;
+                close($out);
+                $closed = 1;
+            }
+        }
+    '
+    $PERL -e "$script"
+}
+
  indexfile() {
      local file=$1
      local chunkssize=$2
@@ -37,23 +85,23 @@ indexfile() {
  
          local prefix="${file}_${chunkssize}_"
          echo "Splitting file in chunks of $chunkssize records"
-        YAZMARCDUMP_CMD="$YAZMARCDUMP -n -s $prefix -C $chunkssize $file"
-        $YAZMARCDUMP_CMD
+        splitfile $file $prefix $chunkssize
  
          dir=$(dirname $prefix)
          local files="$(find $dir -regex $prefix[0-9]+ | sort | tr '\n' ' ')"
          for chunkfile in $files; do
              echo "Indexing $chunkfile"
-            size=$($YAZMARCDUMP -p $chunkfile | grep '<!-- Record [0-9]\+ offset .* -->' | wc -l)
+            size=$(grep '^</record>' $chunkfile | wc -l)
              logfile="$LOGDIR/zebraidx.$(basename $chunkfile).log"
-            ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g iso2709 update $chunkfile"
+            ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g marcxml update $chunkfile"
              $ZEBRAIDX_CMD >$logfile 2>&1
              grep "Records: $size" $logfile >/dev/null 2>&1
              if [ $? -ne 0 ]; then
-                echo "Indexing failed. Split file and continue..."
+                echo "Indexing failed. See log file $logfile"
+                echo "Split file and continue..."
                  indexfile $chunkfile $(($chunkssize/2))
              else
-                ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g iso2709 commit"
+                ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g marcxml commit"
                  $ZEBRAIDX_CMD >> $logfile 2>&1
              fi
          done
@@ -64,11 +112,13 @@ OFFSET=0
  LENGTH=
  CHUNKSSIZE=10000
  EXPORTDIR=/tmp/rebuild/export
+EXCLUDEEXPORT=no
  LOGDIR=/tmp/rebuild/logs
  RMLOGS=no
  NOCONFIRM=no
  TYPE=biblios
  HELP=no
+RESETINDEX=no
  
  # Get parameters
  while [ $1 ]; do
@@ -93,6 +143,9 @@ while [ $1 ]; do
              shift
              LOGDIR=$1
              ;;
+        -x | --exclude-export )
+            EXCLUDEEXPORT=yes
+            ;;
          -r | --remove-logs )
              RMLOGS=yes
              ;;
@@ -103,9 +156,12 @@ while [ $1 ]; do
          -f | --force )
              NOCONFIRM=yes
              ;;
-        -h | --help)
+        -h | --help )
              HELP=yes
              ;;
+        --reset-index )
+            RESETINDEX=yes
+            ;;
          * )
              usage
              exit 1
@@ -118,34 +174,54 @@ if [ $HELP = "yes" ]; then
      exit 0
  fi
  
-if [ -z $LENGTH ]; then
-    echo "--length parameter is mandatory"
+if [ -z $KOHA_CONF ]; then
+    echo "KOHA_CONF is not set"
      exit 1
  fi
  
+if [ -z $PERL5LIB ]; then
+    echo "PERL5LIB is not set"
+    exit 1
+fi
+
+
  TYPESWITCH=
+SQLTABLE=
  case $TYPE in
      biblios )
          TYPESWITCH=-b
+        SQLTABLE="biblio"
          ;;
      authorities )
          TYPESWITCH=-a
+        SQLTABLE="auth_header"
          ;;
      * )
          echo "'$TYPE' is an unknown type. Defaulting to 'biblios'"
          TYPESWITCH=-b
          TYPE=biblios
+        SQLTABLE="biblio"
  esac
  
-ZEBRAIDX=`which zebraidx`
-if [ -z $ZEBRAIDX ]; then
-    echo "zebraidx not found"
+PERL=`which perl`
+if [ -z $PERL ]; then
+    echo "perl not found"
      exit 1
  fi
  
-YAZMARCDUMP=`which yaz-marcdump`
-if [ -z $YAZMARCDUMP ]; then
-    echo "yaz-marcdump not found"
+if [ -z $LENGTH ]; then
+    LENGTH=$($PERL -e '
+        use C4::Context;
+        my ($count) = C4::Context->dbh->selectrow_array(qq{
+            SELECT COUNT(*) FROM '"$SQLTABLE"'
+        });
+        print $count;
+    ')
+fi
+
+ZEBRAIDX=`which zebraidx`
+if [ -z $ZEBRAIDX ]; then
+    echo "zebraidx not found"
      exit 1
  fi
  
@@ -158,17 +234,22 @@ fi
  echo ""
  echo "Configuration"
  echo "========================================================================="
+echo "KOHA_CONF: $KOHA_CONF"
+echo "PERL5LIB: $PERL5LIB"
+echo "-------------------------------------------------------------------------"
  echo "Start at offset: $OFFSET"
  echo "Total number of records to index: $LENGTH"
  echo "Initial chunk size: $CHUNKSSIZE"
  echo "Export directory: $EXPORTDIR"
+echo "Exclude re-exporting: $EXCLUDEEXPORT"
  echo "Log directory: $LOGDIR"
  echo "Remove logs before start? $RMLOGS"
  echo "Type of record: $TYPE"
+echo "Reset index before start? $RESETINDEX"
  echo "-------------------------------------------------------------------------"
  echo "zebraidx path: $ZEBRAIDX"
-echo "yaz-marcdump path: $YAZMARCDUMP"
  echo "rebuild_zebra path: $REBUILDZEBRA"
+echo "perl path: $PERL"
  echo "========================================================================="
  
  if [ $NOCONFIRM != "yes" ]; then
@@ -200,24 +281,43 @@ if [ $RMLOGS = "yes" ]; then
      rm -f $LOGDIR/*.log
  fi
  
-REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index"
-echo "\n$REBUILDZEBRA_CMD"
-$REBUILDZEBRA_CMD
+if [ $EXCLUDEEXPORT = "no" ]; then
+    REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -x -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index"
+    echo "\n$REBUILDZEBRA_CMD"
+    $REBUILDZEBRA_CMD
+fi
  
  EXPORTFILE=
  case $TYPE in
      biblios )
          EXPORTFILE="$EXPORTDIR/biblio/exported_records"
+        indexmode_config_name="zebra_bib_index_mode"
          ;;
      authorities )
          EXPORTFILE="$EXPORTDIR/authority/exported_records"
+        indexmode_config_name="zebra_auth_index_mode"
          ;;
      * )
          echo "Error: TYPE '$TYPE' is not supported"
          exit 1
  esac
  
-CONFIGFILE="$(dirname $KOHA_CONF)/zebradb/zebra-$TYPE.cfg"
+INDEXMODE=$(perl -e '
+    use C4::Context;
+    print C4::Context->config('"$indexmode_config_name"');
+')
  
+CONFIGFILE=$(perl -e '
+    use C4::Context;
+    my $zebra_server = ('"$TYPE"' eq "biblios") ? "biblioserver" : "authorityserver";
+    print C4::Context->zebraconfig($zebra_server)->{config};
+')
+
+if [ $RESETINDEX = "yes" ]; then
+    RESETINDEX_CMD="$ZEBRAIDX -c $CONFIGFILE init"
+    echo "\n$RESETINDEX_CMD"
+    $RESETINDEX_CMD
+    echo ""
+fi
  
  indexfile $EXPORTFILE $CHUNKSSIZE