Bug 7286: rebuild_zebra_sliced for biblios and authorities

author Julian Maurice <julian.maurice@biblibre.com>

Mon, 2 Jul 2012 11:57:31 +0000 (13:57 +0200)

committer Paul Poulain <paul.poulain@biblibre.com>

Fri, 6 Jul 2012 13:06:40 +0000 (15:06 +0200)
author Julian Maurice <julian.maurice@biblibre.com>
Mon, 2 Jul 2012 11:57:31 +0000 (13:57 +0200)
committer Paul Poulain <paul.poulain@biblibre.com>
Fri, 6 Jul 2012 13:06:40 +0000 (15:06 +0200)
diff --git a/misc/migration_tools/rebuild_zebra.pl b/misc/migration_tools/rebuild_zebra.pl

index dbbb140..1621e84 100755 (executable)
--- a/misc/migration_tools/rebuild_zebra.pl
+++ b/misc/migration_tools/rebuild_zebra.pl
@@ -24,6 +24,7 @@ my $directory;
  my $nosanitize;
  my $skip_export;
  my $keep_export;
+my $skip_index;
  my $reset;
  my $biblios;
  my $authorities;
@@ -44,6 +45,7 @@ my $result = GetOptions(
      'r|reset'       => \$reset,
      's'             => \$skip_export,
      'k'             => \$keep_export,
+    'I|skip-index'    => \$skip_index,
      'nosanitize'    => \$nosanitize,
      'b'             => \$biblios,
      'noxml'         => \$noxml,
@@ -134,13 +136,13 @@ if ($do_munge) {
  }
  
  if ($authorities) {
-    index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $authorityserverdir);
+    index_records('authority', $directory, $skip_export, $skip_index, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $authorityserverdir);
  } else {
      print "skipping authorities\n" if ( $verbose_logging );
  }
  
  if ($biblios) {
-    index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $biblioserverdir);
+    index_records('biblio', $directory, $skip_export, $skip_index, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $biblioserverdir);
  } else {
      print "skipping biblios\n" if ( $verbose_logging );
  }
@@ -191,7 +193,7 @@ sub check_zebra_dirs {
  }      # ----------  end of subroutine check_zebra_dirs  ----------
  
  sub index_records {
-    my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $server_dir) = @_;
+    my ($record_type, $directory, $skip_export, $skip_index, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $server_dir) = @_;
  
      my $num_records_exported = 0;
      my $records_deleted;
@@ -230,24 +232,32 @@ sub index_records {
              }
          }
      }
-    
+
      #
      # and reindexing everything
      #
-    if ( $verbose_logging ) {
-        print "====================\n";
-        print "REINDEXING zebra\n";
-        print "====================\n";
-    }
-       my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
-    if ($process_zebraqueue) {
-        do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt) 
-            if %$records_deleted;
-        do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
-            if $num_records_exported;
+    if ($skip_index) {
+        if ($verbose_logging) {
+            print "====================\n";
+            print "SKIPPING $record_type indexing\n";
+            print "====================\n";
+        }
      } else {
-        do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
-            if ($num_records_exported or $skip_export);
+        if ( $verbose_logging ) {
+            print "====================\n";
+            print "REINDEXING zebra\n";
+            print "====================\n";
+        }
+        my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
+        if ($process_zebraqueue) {
+            do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+                if %$records_deleted;
+            do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+                if $num_records_exported;
+        } else {
+            do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+                if ($num_records_exported or $skip_export);
+        }
      }
  }
  
diff --git a/misc/migration_tools/rebuild_zebra_sliced.sh b/misc/migration_tools/rebuild_zebra_sliced.sh

new file mode 100755 (executable)

index 0000000..f1b73f9
--- /dev/null
+++ b/misc/migration_tools/rebuild_zebra_sliced.sh
@@ -0,0 +1,223 @@
+#!/bin/sh
+
+usage() {
+    local scriptname=$(basename $0)
+    cat <<EOF
+$scriptname
+
+Index Koha records by chunks. It is useful when some record causes errors and
+stop the indexation process. With this script, if indexation of one chunk fails,
+chunk is splitted in two or more chunks, and indexation continue on these chunks.
+rebuild_zebra.pl is called only once to export records. Splitting and indexing
+is handled by this script (using yaz-marcdump and zebraidx).
+
+Usage:
+$scriptname -t type -l X [-o X] [-s X] [-d /export/dir] [-L /log/dir] [-r] [-f]
+$scriptname -h
+
+    -o | --offset         Offset parameter of rebuild_zebra.pl
+    -l | --length         Length parameter of rebuild_zebra.pl
+    -s | --chunks-size    Initial chunk size (number of records indexed at once)
+    -d | --export-dir     Where rebuild_zebra.pl will export data
+    -L | --log-dir        Log directory
+    -r | --remove-logs    Clean log directory before start
+    -t | --type           Record type ('biblios' or 'authorities')
+    -f | --force          Don't ask for confirmation before start
+    -h | --help           Display this help message
+EOF
+}
+
+indexfile() {
+    local file=$1
+    local chunkssize=$2
+
+    if [ $chunkssize -lt 1 ]; then
+        echo "Fail on file $file"
+    else
+
+        local prefix="${file}_${chunkssize}_"
+        echo "Splitting file in chunks of $chunkssize records"
+        YAZMARCDUMP_CMD="$YAZMARCDUMP -n -s $prefix -C $chunkssize $file"
+        $YAZMARCDUMP_CMD
+
+        dir=$(dirname $prefix)
+        local files="$(find $dir -regex $prefix[0-9]+ | sort | tr '\n' ' ')"
+        for chunkfile in $files; do
+            echo "Indexing $chunkfile"
+            size=$($YAZMARCDUMP -p $chunkfile | grep '<!-- Record [0-9]\+ offset .* -->' | wc -l)
+            logfile="$LOGDIR/zebraidx.$(basename $chunkfile).log"
+            ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g iso2709 update $chunkfile"
+            $ZEBRAIDX_CMD >$logfile 2>&1
+            grep "Records: $size" $logfile >/dev/null 2>&1
+            if [ $? -ne 0 ]; then
+                echo "Indexing failed. Split file and continue..."
+                indexfile $chunkfile $(($chunkssize/2))
+            else
+                ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g iso2709 commit"
+                $ZEBRAIDX_CMD >> $logfile 2>&1
+            fi
+        done
+    fi
+}
+
+OFFSET=0
+LENGTH=
+CHUNKSSIZE=10000
+EXPORTDIR=/tmp/rebuild/export
+LOGDIR=/tmp/rebuild/logs
+RMLOGS=no
+NOCONFIRM=no
+TYPE=biblios
+HELP=no
+
+# Get parameters
+while [ $1 ]; do
+    case $1 in
+        -o | --offset )
+            shift
+            OFFSET=$1
+            ;;
+        -l | --length )
+            shift
+            LENGTH=$1
+            ;;
+        -s | --chunks-size )
+            shift
+            CHUNKSSIZE=$1
+            ;;
+        -d | --export-dir )
+            shift
+            EXPORTDIR=$1
+            ;;
+        -L | --log-dir )
+            shift
+            LOGDIR=$1
+            ;;
+        -r | --remove-logs )
+            RMLOGS=yes
+            ;;
+        -t | --type )
+            shift
+            TYPE=$1
+            ;;
+        -f | --force )
+            NOCONFIRM=yes
+            ;;
+        -h | --help)
+            HELP=yes
+            ;;
+        * )
+            usage
+            exit 1
+    esac
+    shift
+done
+
+if [ $HELP = "yes" ]; then
+    usage
+    exit 0
+fi
+
+if [ -z $LENGTH ]; then
+    echo "--length parameter is mandatory"
+    exit 1
+fi
+
+TYPESWITCH=
+case $TYPE in
+    biblios )
+        TYPESWITCH=-b
+        ;;
+    authorities )
+        TYPESWITCH=-a
+        ;;
+    * )
+        echo "'$TYPE' is an unknown type. Defaulting to 'biblios'"
+        TYPESWITCH=-b
+        TYPE=biblios
+esac
+
+ZEBRAIDX=`which zebraidx`
+if [ -z $ZEBRAIDX ]; then
+    echo "zebraidx not found"
+    exit 1
+fi
+
+YAZMARCDUMP=`which yaz-marcdump`
+if [ -z $YAZMARCDUMP ]; then
+    echo "yaz-marcdump not found"
+    exit 1
+fi
+
+REBUILDZEBRA="`dirname $0`/rebuild_zebra.pl"
+if [ ! -f $REBUILDZEBRA ]; then
+    echo "$REBUILDZEBRA: file not found"
+    exit 1
+fi
+
+echo ""
+echo "Configuration"
+echo "========================================================================="
+echo "Start at offset: $OFFSET"
+echo "Total number of records to index: $LENGTH"
+echo "Initial chunk size: $CHUNKSSIZE"
+echo "Export directory: $EXPORTDIR"
+echo "Log directory: $LOGDIR"
+echo "Remove logs before start? $RMLOGS"
+echo "Type of record: $TYPE"
+echo "-------------------------------------------------------------------------"
+echo "zebraidx path: $ZEBRAIDX"
+echo "yaz-marcdump path: $YAZMARCDUMP"
+echo "rebuild_zebra path: $REBUILDZEBRA"
+echo "========================================================================="
+
+if [ $NOCONFIRM != "yes" ]; then
+    confirm=y
+    echo -n "Confirm ? [Y/n] "
+    read response
+    if [ $response ] && [ $response != "yes" ] && [ $response != "y" ]; then
+        confirm=n
+    fi
+
+    if [ $confirm = "n" ]; then
+        exit 0
+    fi
+fi
+
+mkdir -p $EXPORTDIR
+if [ $? -ne 0 ]; then
+    echo "Failed to create directory $EXPORTDIR. Aborting."
+    exit 1
+fi
+
+mkdir -p $LOGDIR
+if [ $? -ne 0 ]; then
+    echo "Failed to create directory $LOGDIR. Aborting."
+    exit 1
+fi
+
+if [ $RMLOGS = "yes" ]; then
+    rm -f $LOGDIR/*.log
+fi
+
+REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index"
+echo "\n$REBUILDZEBRA_CMD"
+$REBUILDZEBRA_CMD
+
+EXPORTFILE=
+case $TYPE in
+    biblios )
+        EXPORTFILE="$EXPORTDIR/biblio/exported_records"
+        ;;
+    authorities )
+        EXPORTFILE="$EXPORTDIR/authority/exported_records"
+        ;;
+    * )
+        echo "Error: TYPE '$TYPE' is not supported"
+        exit 1
+esac
+
+CONFIGFILE="$(dirname $KOHA_CONF)/zebradb/zebra-$TYPE.cfg"
+
+
+indexfile $EXPORTFILE $CHUNKSSIZE
author	Julian Maurice <julian.maurice@biblibre.com>
	Mon, 2 Jul 2012 11:57:31 +0000 (13:57 +0200)
committer	Paul Poulain <paul.poulain@biblibre.com>
	Fri, 6 Jul 2012 13:06:40 +0000 (15:06 +0200)
misc/migration_tools/rebuild_zebra.pl		patch \| blob \| history
misc/migration_tools/rebuild_zebra_sliced.sh	[new file with mode: 0755]	patch \| blob