misc/migration_tools/rebuild_zebra_sliced.sh

   1 #!/bin/sh
   2
   # Print the help text of this script on standard output.
   #
   # The text embeds the current values of the globals OFFSET, CHUNKSSIZE,
   # EXPORTDIR, LOGDIR, RMLOGS and TYPE, expanded at the moment of the call.
   usage() {
       # Both expansions are quoted: $0 may contain spaces, and some shells
       # word-split an unquoted command substitution that follows "local".
       local scriptname="$(basename "$0")"
       cat <<EOF
$scriptname

Index Koha records by chunks. It is useful when a record causes errors and
stops the indexing process. With this script, if indexing of one chunk fails,
that chunk is split into two or more chunks, and indexing continues on these chunks.
rebuild_zebra.pl is called only once to export records. Splitting and indexing
is handled by this script (using zebraidx for indexing).

Usage:
$scriptname [-t type] [-l X] [-o X] [-s X] [-d /export/dir] [-x] [-L /log/dir] [-r] [-f] [--reset-index]
$scriptname -h

    -o | --offset         Offset parameter of rebuild_zebra.pl.
                          Default: $OFFSET
    -l | --length         Length parameter of rebuild_zebra.pl. If omitted, the
                          length is automatically calculated to index all
                          records
    -s | --chunks-size    Initial chunk size (number of records indexed at once)
                          Default: $CHUNKSSIZE
    -d | --export-dir     Where rebuild_zebra.pl will export data
                          Default: $EXPORTDIR
    -x | --exclude-export Do not export Biblios from Koha, but use the existing
                          export-dir
    -L | --log-dir        Log directory
                          Default: $LOGDIR
    -r | --remove-logs    Clean log directory before start
                          Default: $RMLOGS
    -t | --type           Record type ('biblios' or 'authorities')
                          Default: $TYPE
    -f | --force          Don't ask for confirmation before start
    -h | --help           Display this help message
    --reset-index         Reset Zebra index for 'type'
EOF
   }
  40
  41 splitfile() {
  42     local file=$1
  43     local prefix=$2
  44     local size=$3
  45     local script='
  46         my $indexmode = '"$INDEXMODE"';
  47         my $prefix = '"\"$prefix\""';
  48         my $size = '"$size"';
  49         my ($i,$count) = (0,0);
  50         open(my $fh, "<", '"\"$file\""');
  51         open(my $out, ">", sprintf("$prefix%02d", $i));
  52         my $closed = 0;
  53         while (<$fh>) {
  54             my $line = $_;
  55             if ($closed) {
  56                 open($out, ">", sprintf("$prefix%02d", $i));
  57                 $closed = 0;
  58                 if ($indexmode eq "dom" && $line !~ /<collection>/) {
  59                     print $out "<collection>";
  60                 }
  61             }
  62             print $out $line;
  63             $count++ if ($line =~ m|^</record>|);
  64             if ($count == $size) {
  65                 if ($indexmode eq "dom" && $line !~ m|</collection>|) {
  66                     print $out "</collection>";
  67                 }
  68                 $count = 0;
  69                 $i++;
  70                 close($out);
  71                 $closed = 1;
  72             }
  73         }
  74     '
  75     $PERL -e "$script"
  76 }
  77
  # Index a record file chunk by chunk, subdividing the chunks that fail.
  #
  #   $1  file        record file to index
  #   $2  chunkssize  number of records per chunk
  #
  # The file is cut by splitfile into "<file>_<chunkssize>_NN" chunks and
  # each chunk is passed to "zebraidx update". A chunk counts as indexed when
  # the zebraidx log contains "Records: <n>", n being the number of
  # "</record>" lines of the chunk; it is then committed. Otherwise the
  # function calls itself on that chunk with half the chunk size, and prints
  # "Fail on file ..." once the size drops below 1.
  #
  # Reads the globals ZEBRAIDX, CONFIGFILE, TYPE and LOGDIR; writes one log
  # file per chunk into LOGDIR.
  indexfile() {
      local file="$1"
      local chunkssize="$2"
      # Scratch variables are local so that a recursive call cannot clobber
      # the state of the loop that started it.
      local prefix chunkfile size logfile

      if [ "$chunkssize" -lt 1 ]; then
          echo "Fail on file $file"
          return
      fi

      prefix="${file}_${chunkssize}_"
      # Chunks left behind by an earlier run share this prefix and would be
      # indexed again together with the fresh ones: remove them first.
      rm -f "$prefix"[0-9]*
      echo "Splitting file in chunks of $chunkssize records"
      splitfile "$file" "$prefix" "$chunkssize"

      # The pattern is expanded once, before the loop body runs, so the
      # sub-chunks created by recursive calls are not picked up here.
      for chunkfile in "$prefix"[0-9]*; do
          # An unmatched pattern stays literal; skip it.
          [ -f "$chunkfile" ] || continue

          echo "Indexing $chunkfile"
          # grep -c prints a bare number; "wc -l" may pad it with spaces.
          size=$(grep -c '^</record>' "$chunkfile")
          logfile="$LOGDIR/zebraidx.$(basename "$chunkfile").log"
          "$ZEBRAIDX" -c "$CONFIGFILE" -d "$TYPE" -g marcxml update "$chunkfile" >"$logfile" 2>&1
          if grep "Records: $size" "$logfile" >/dev/null 2>&1; then
              "$ZEBRAIDX" -c "$CONFIGFILE" -d "$TYPE" -g marcxml commit >>"$logfile" 2>&1
          else
              echo "Indexing failed. See log file $logfile"
              echo "Split file and continue..."
              indexfile "$chunkfile" $((chunkssize / 2))
          fi
      done
  }
 110
 # Default settings; the option loop below overrides them.
 OFFSET=0
 LENGTH=
 CHUNKSSIZE=10000
 EXPORTDIR=/tmp/rebuild/export
 EXCLUDEEXPORT=no
 LOGDIR=/tmp/rebuild/logs
 RMLOGS=no
 NOCONFIRM=no
 TYPE=biblios
 HELP=no
 RESETINDEX=no

 # Get parameters
 # The loop tests the argument count, not the value of $1: an argument that
 # is empty or contains spaces must not end or break the parsing.
 while [ $# -gt 0 ]; do
     # Options that take a value need a second word. Without this check the
     # final "shift" of the loop would run with nothing left to shift.
     case $1 in
         -o | --offset | -l | --length | -s | --chunks-size | \
         -d | --export-dir | -L | --log-dir | -t | --type )
             if [ $# -lt 2 ]; then
                 echo "Option $1 requires a value"
                 usage
                 exit 1
             fi
             ;;
     esac

     case $1 in
         -o | --offset )
             shift
             OFFSET=$1
             ;;
         -l | --length )
             shift
             LENGTH=$1
             ;;
         -s | --chunks-size )
             shift
             CHUNKSSIZE=$1
             ;;
         -d | --export-dir )
             shift
             EXPORTDIR=$1
             ;;
         -L | --log-dir )
             shift
             LOGDIR=$1
             ;;
         -x | --exclude-export )
             EXCLUDEEXPORT=yes
             ;;
         -r | --remove-logs )
             RMLOGS=yes
             ;;
         -t | --type )
             shift
             TYPE=$1
             ;;
         -f | --force )
             NOCONFIRM=yes
             ;;
         -h | --help )
             HELP=yes
             ;;
         --reset-index )
             RESETINDEX=yes
             ;;
         * )
             # Unknown option: show the help text and stop.
             usage
             exit 1
             ;;
     esac
     shift
 done
 171
 # Handle --help, then verify the environment and resolve the tools and
 # per-type settings used by the rest of the script.
 # Every tested expansion is quoted so that an empty value or a value with
 # spaces is compared as one word instead of breaking the test.
 if [ "$HELP" = "yes" ]; then
     usage
     exit 0
 fi

 if [ -z "$KOHA_CONF" ]; then
     echo "KOHA_CONF is not set"
     exit 1
 fi

 if [ -z "$PERL5LIB" ]; then
     echo "PERL5LIB is not set"
     exit 1
 fi


 # TYPESWITCH is the rebuild_zebra.pl switch for the record type, SQLTABLE
 # the table whose rows are counted when no length was given.
 TYPESWITCH=
 SQLTABLE=
 case $TYPE in
     biblios )
         TYPESWITCH=-b
         SQLTABLE="biblio"
         ;;
     authorities )
         TYPESWITCH=-a
         SQLTABLE="auth_header"
         ;;
     * )
         echo "'$TYPE' is an unknown type. Defaulting to 'biblios'"
         TYPESWITCH=-b
         TYPE=biblios
         SQLTABLE="biblio"
         ;;
 esac

 # "command -v" is the POSIX way to locate a program; "which" is not
 # guaranteed to exist.
 PERL=$(command -v perl)
 if [ -z "$PERL" ]; then
     echo "perl not found"
     exit 1
 fi

 if [ -z "$LENGTH" ]; then
     # No length given: use the number of rows of the table. The table name
     # is handed over as an argument instead of being pasted into the code.
     LENGTH=$("$PERL" -e '
         use C4::Context;
         my ($count) = C4::Context->dbh->selectrow_array(
             "SELECT COUNT(*) FROM $ARGV[0]"
         );
         print $count;
     ' "$SQLTABLE")
     # An empty result means the query did not run; stop here rather than
     # pass an empty --length to rebuild_zebra.pl later on.
     if [ -z "$LENGTH" ]; then
         echo "Failed to count the records of table $SQLTABLE"
         exit 1
     fi
 fi

 ZEBRAIDX=$(command -v zebraidx)
 if [ -z "$ZEBRAIDX" ]; then
     echo "zebraidx not found"
     exit 1
 fi

 # rebuild_zebra.pl is looked up in the directory of this script.
 REBUILDZEBRA="$(dirname "$0")/rebuild_zebra.pl"
 if [ ! -f "$REBUILDZEBRA" ]; then
     echo "$REBUILDZEBRA: file not found"
     exit 1
 fi
 233
 # Show the effective configuration, one line per setting, so that the
 # operator can review it before the run starts.
 for summary_line in \
     "" \
     "Configuration" \
     "=========================================================================" \
     "KOHA_CONF: $KOHA_CONF" \
     "PERL5LIB: $PERL5LIB" \
     "-------------------------------------------------------------------------" \
     "Start at offset: $OFFSET" \
     "Total number of records to index: $LENGTH" \
     "Initial chunk size: $CHUNKSSIZE" \
     "Export directory: $EXPORTDIR" \
     "Exclude re-exporting: $EXCLUDEEXPORT" \
     "Log directory: $LOGDIR" \
     "Remove logs before start? $RMLOGS" \
     "Type of record: $TYPE" \
     "Reset index before start? $RESETINDEX" \
     "-------------------------------------------------------------------------" \
     "zebraidx path: $ZEBRAIDX" \
     "rebuild_zebra path: $REBUILDZEBRA" \
     "perl path: $PERL" \
     "========================================================================="
 do
     echo "$summary_line"
 done
 # The loop variable is only a helper; do not leave it behind.
 unset summary_line
 254
 # Ask for confirmation unless -f / --force was given.
 # An empty answer means yes, as the "[Y/n]" prompt announces, and "y" or
 # "yes" are accepted in any letter case. Any other answer ends the script
 # with status 0.
 if [ "$NOCONFIRM" != "yes" ]; then
     # printf instead of "echo -n": not every sh supports the -n switch.
     printf 'Confirm ? [Y/n] '
     read -r response
     case $response in
         "" | [Yy] | [Yy][Ee][Ss] )
             ;;
         * )
             exit 0
             ;;
     esac
 fi
 267
 # Prepare the export and log directories, optionally clear old logs, then
 # let rebuild_zebra.pl export the records (unless -x was given).
 # All paths are quoted so that directories with spaces keep working.
 if ! mkdir -p "$EXPORTDIR"; then
     echo "Failed to create directory $EXPORTDIR. Aborting."
     exit 1
 fi

 if ! mkdir -p "$LOGDIR"; then
     echo "Failed to create directory $LOGDIR. Aborting."
     exit 1
 fi

 if [ "$RMLOGS" = "yes" ]; then
     rm -f "$LOGDIR"/*.log
 fi

 if [ "$EXCLUDEEXPORT" = "no" ]; then
     # REBUILDZEBRA_CMD is only the text shown to the operator; the command
     # itself is run word by word below.
     REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -x -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index"
     # printf gives the leading blank line under every sh; whether echo
     # expands "\n" depends on the shell.
     printf '\n%s\n' "$REBUILDZEBRA_CMD"
     # Without a successful export there is nothing valid to index.
     if ! "$REBUILDZEBRA" "$TYPESWITCH" -v -x -k -d "$EXPORTDIR" --offset "$OFFSET" --length "$LENGTH" --skip-index; then
         echo "Export of $TYPE records failed. Aborting."
         exit 1
     fi
 fi
 289
 # Locate the exported file, read the Zebra settings from the Koha
 # configuration, optionally reset the index, then start the chunked indexing.
 EXPORTFILE=
 case $TYPE in
     biblios )
         EXPORTFILE="$EXPORTDIR/biblio/exported_records"
         indexmode_config_name="zebra_bib_index_mode"
         ;;
     authorities )
         EXPORTFILE="$EXPORTDIR/authority/exported_records"
         indexmode_config_name="zebra_auth_index_mode"
         ;;
     * )
         echo "Error: TYPE '$TYPE' is not supported"
         exit 1
         ;;
 esac

 # Shell values reach Perl through @ARGV instead of being pasted into the
 # Perl source. The interpreter is the one resolved earlier in PERL, with
 # plain "perl" as fallback.
 # NOTE(review): INDEXMODE is empty when the configuration entry is not set;
 # confirm which mode should apply in that case.
 INDEXMODE=$("${PERL:-perl}" -e '
     use C4::Context;
     print C4::Context->config($ARGV[0]);
 ' "$indexmode_config_name")

 CONFIGFILE=$("${PERL:-perl}" -e '
     use C4::Context;
     my $zebra_server = ($ARGV[0] eq "biblios") ? "biblioserver" : "authorityserver";
     print C4::Context->zebraconfig($zebra_server)->{config};
 ' "$TYPE")

 # Check both inputs before the index is touched: resetting it and then
 # finding nothing to index would leave it empty.
 if [ -z "$CONFIGFILE" ]; then
     echo "Error: no Zebra configuration file found for '$TYPE'"
     exit 1
 fi

 if [ ! -f "$EXPORTFILE" ]; then
     echo "Error: $EXPORTFILE: file not found"
     exit 1
 fi

 if [ "$RESETINDEX" = "yes" ]; then
     # RESETINDEX_CMD is only the text shown to the operator.
     RESETINDEX_CMD="$ZEBRAIDX -c $CONFIGFILE init"
     # printf gives the leading blank line under every sh; whether echo
     # expands "\n" depends on the shell.
     printf '\n%s\n' "$RESETINDEX_CMD"
     "$ZEBRAIDX" -c "$CONFIGFILE" init
     echo ""
 fi

 indexfile "$EXPORTFILE" "$CHUNKSSIZE"