--- /dev/null
+#!/usr/bin/perl
+#
+# Copyright 2009 Liblime
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA 02111-1307 USA
+
+use strict;
+use warnings;
+
+use MARC::Record;
+use MARC::File::XML;
+use MARC::File::USMARC;
+
+use open OUT => ':utf8';
+
+use Getopt::Long qw(:config auto_help auto_version);
+use Pod::Usage;
+
+use C4::Biblio;
+use C4::Charset;
+use C4::Context;
+use C4::Debug;
+
+use vars qw($VERSION);
+
+BEGIN {
+ # find Koha's Perl modules
+ # test carefully before changing this
+ use FindBin;
+ eval { require "$FindBin::Bin/../kohalib.pl" };
+ $VERSION = 0.03;
+}
+
+our $debug;
+
+## OPTIONS
+my $help = 0;
+my $man = 0;
+my $verbose = 0;
+
+my $limit; # undef, not zero.
+my $offset = 0;
+my $dump = 0;
+my $all = 0;
+my $summary = 1;
+my $lint = 0;
+my $fix = 0;
+my $filename = "/tmp/MARC21_parse_test.$$.marc";
+
+GetOptions(
+ 'help|?' => \$help,
+ 'man' => \$man,
+ 'verbose' => \$verbose,
+ 'limit=i' => \$limit,
+ 'offset=i' => \$offset,
+ 'filename' => \$filename,
+ 'All!' => \$all,
+ 'Lint!' => \$lint,
+ 'dump!' => \$dump,
+ 'summary!' => \$summary,
+ 'fix!' => \$fix,
+) or pod2usage(2);
+pod2usage( -verbose => 2 ) if ($man);
+pod2usage( -verbose => 2 ) if ($help and $verbose);
+pod2usage(1) if $help;
+
+if ($debug) {
+ $summary++;
+ $verbose++;
+ $lint++;
+}
+
+my $lint_object;
+if ($lint) {
+ require MARC::Lint;
+ $lint_object = new MARC::Lint;
+}
+my $marcflavour = C4::Context->preference('marcflavour') or die "No marcflavour (MARC21 or UNIMARC) set in syspref";
+(uc($marcflavour) eq 'MARC21') or die "Only marcflavour MARC21, not '$marcflavour'";
+
+# my $countq = C4::Context->dbh->prepare("SELECT COUNT(*) FROM biblioitems"); # Too SLOW on large systems
+# $countq->execute; $countq->fetchrow();
+my $max = 999999; # arbitrary ceiling
+
+$limit or $limit = $max; # limit becomes max if unspecified
+
+if ($summary) {
+ printf "# Examining marcxml from %s\n", ($all ? 'ALL biblioitems' : 'SELECT biblionumbers');
+ printf "# limit %d, offset %d:\n", $limit, $offset;
+ printf "# MARC::Lint warnings: %s\n", ($lint ? 'ON' : 'OFF');
+ $verbose and print "# Using temp file: $filename\n"
+}
+
+MARC::File::XML->default_record_format($marcflavour) or die "FAILED MARC::File::XML->default_record_format($marcflavour)";
+
+my $query = "SELECT * FROM biblioitems ";
+my $recs;
+if ($all) {
+ if ($limit or $offset) {
+ my $limit_clause = sprintf "LIMIT %d, %d", ($offset || 0), ($limit || $max);
+ $query .= $limit_clause;
+ }
+ $verbose and print "# Query: $query\n";
+ $recs = C4::Context->dbh->prepare($query);
+ $recs->execute();
+} else {
+ $query .= "WHERE biblionumber=?";
+ $verbose and print "# Query: $query\n";
+ $recs = C4::Context->dbh->prepare($query);
+ # no execute, we execute per biblionumber
+ print "# Reading biblionumbers from STDIN\n";
+}
+
+sub next_row {
+ $all and return $recs->fetchrow_hashref(); # no WHERE clause, just get it
+ while (my $biblionumber = <>) {
+ chomp($biblionumber);
+ unless (defined $biblionumber) {
+ print "Skipping blank line $.\n";
+ next;
+ }
+ unless ($biblionumber =~ s/^\s*(\d+)\s*$/$1/ and $biblionumber != 0) {
+ print "Skipping illegal biblionumber: $biblionumber (line $.)\n";
+ next;
+ }
+ ($verbose > 1) and printf("(%9d) plausible biblionumber\n", $biblionumber);
+ $recs->execute($biblionumber);
+ return $recs->fetchrow_hashref();
+ }
+ return undef; # just in case
+}
+
+my $ilimit = $limit;
+$ilimit += $offset unless $all; # increase ilimit for offset. if $all, then offset is built into query.
+my $i = 0;
+my $found = 0;
+my $fixed = 0;
+my $fine = 0;
+my $failed = 0;
+my $warns = 0;
+my $printline = 0;
+while ( my $row = next_row() ) {
+ ++$i;
+ unless ($all) {
+ ($i > $ilimit) and last; # controls for user-input data/files
+ ($i > $offset) or next;
+ }
+ my $xml = $row->{marcxml};
+ my $bibnum_prefix = sprintf "(%9d)", $row->{biblionumber};
+ # $xml now pared down to just the <leader> element
+ $verbose and printf "# %4d of %4d: biblionumber %s\n", ++$printline, $limit, $row->{biblionumber};
+ my $stripped = StripNonXmlChars($xml);
+ ($stripped eq $xml) or printf "$bibnum_prefix: %d NON-XML Characters removed!!\n", (length($xml) - length($stripped));
+ my $record = eval { MARC::Record::new_from_xml( $stripped, 'utf8', $marcflavour ) };
+ if (not $record) {
+ $found++;
+ my $msg = $@ || '';
+ $verbose or $msg =~ s# at /usr/.*$##gs; # shorten common error message
+ print "$bibnum_prefix ERROR: $msg\n";
+ } else {
+ $fine++;
+ }
+ if ($lint) {
+ open (FILE, ">$filename") or die "Cannot write to temp file: $filename";
+ print FILE $xml;
+ close FILE;
+ my $file = MARC::File::XML->in( $filename );
+ while ( my $marc = $file->next() ) { # should be only 1
+ # $marc->field("245") or print "pre check_record 245 check 1: FAIL\n"; use Data::Dumper; print Dumper($marc);
+ $lint_object->check_record( $marc );
+ if ($lint_object->warnings) {
+ $warns++;
+ print join("\n", map {"$bibnum_prefix $_"} $lint_object->warnings), "\n";
+ }
+ }
+ }
+ if ($fix and not $record) {
+ my $record_from_blob = MARC::Record->new_from_usmarc($row->{marc});
+ unless ($record_from_blob) {
+ print "$bibnum_prefix ERROR: Cannot recover from biblioitems.marc\n";
+ $failed++;
+ } else {
+ my $mod = ModBiblioMarc($record_from_blob, $row->{biblionumber}, '');
+ if ($mod) {
+ $fixed++; print "$bibnum_prefix FIXED\n";
+ } else {
+ $failed++; print "$bibnum_prefix FAILED from marc. Manual intervention required.\n";
+ }
+ }
+ }
+ $dump and print $row->{marcxml}, "\n";
+}
+
+(-f $filename) and unlink ($filename); # remove tempfile
+
+if ($summary) {
+ printf "# Examining marcxml from %s\n", ($all ? 'ALL biblioitems' : 'SELECT biblionumbers');
+ printf "# limit %d, offset %d:\n", $limit, $offset;
+ print "\nRESULTS (number of records)...\n";
+ printf " %6d -- OK \n", $fine;
+ printf " %6d -- w/ bad marcxml \n", $found;
+ printf " %6d -- w/ MARC::Lint warnings\n", $warns;
+ printf " %6d -- fixed from marc \n", $fixed;
+ printf " %6d -- failed to fix \n", $failed;
+}
+
+
+__END__
+
+=head1 NAME
+
+MARC21_parse_test.pl - Try parsing and optionally fixing biblioitems.marcxml, report errors
+
+=head1 SYNOPSIS
+
+MARC21_parse_test.pl [ -h | -m ] [ -v ] [ -d ] [ -s ] [ -l=N ] [ -o=N ] [ -l ] [ -f ] [ -A | filename ...]
+
+ Help Options:
+ -h --help -? Brief help message
+ -m --man Full documentation, same as --help --verbose
+ --version Prints version info
+
+ Feedback Options:
+ -d --dump Dump MARCXML of biblioitems processed, default OFF
+ -s --summary Print initial and closing summary of good and bad biblioitems counted, default ON
+ -L --Lint Show any warnings from MARC::Lint, default OFF
+ -v --verbose Increase verbosity of output, default OFF
+
+ Run Options:
+ -f --fix Replace biblioitems.marcxml from data in marc field, default OFF
+ -A --All Use the whole biblioitems table as target set, default OFF
+ -l --limit Number of biblioitems to display or fix
+ -o --offset Number of biblioitems to skip (not displayed or fixed)
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<--All>
+
+Target the entire biblioitems table.
+Beware, on a large table B<--All> can be very costly to performance.
+
+=item B<--fix>
+
+Without this option, no changes to any records are made. With <--fix>, the script attempts to reconstruct
+biblioitems.marcxml from biblioitems.marc.
+
+=item B<--limit=N>
+
+Like a LIMIT statement in SQL, this constrains the number of records targeted by the script to an integer N.
+This applies whether the target records are determined by user input, filenames or <--All>.
+
+=item B<--offset=N>
+
+Like an OFFSET statement in SQL, this tells the script to skip N of the targetted records.
+The default is 0, i.e. skip none of them.
+
+=back
+
+The binary ON/OFF options can be negated like:
+ B<--nosummary> Do not display summary.
+ B<--nodump> Do not dump MARCXML.
+ B<--noLint> Do not show MARC::Lint warnings.
+ B<--nofix> Do not change any records. This is the default mode.
+
+=head1 ARGUMENTS
+
+Any number of filepath arguments can be referenced. They will be read in order and used to select the target
+set of biblioitems. The file format should be simply one biblionumber per line. The B<--limit> and B<--offset>
+options can still be used with biblionumbers specified from file. Files will be ignored under the B<--All> option.
+
+=head1 DESCRIPTION
+
+This checks for data corruption or otherwise unparsable data in biblioitems.marcxml.
+As the name suggests, this script is only useful for MARC21 and will die for marcflavour UNIMARC.
+
+Run MARC21_parse_test.pl the first time with no options and type in individual biblionumbers to test.
+Or run with B<--All> to go through the entire table.
+Run the script again with B<--fix> to attempt repair of the same target set.
+
+After fixing any records, you will need to rebuild your index, e.g. B<rebuild_zebra -b -r -x>.
+
+=head1 USAGE EXAMPLES
+
+B<MARC21_parse_test.pl>
+
+In the most basic form, allows you to input biblionumbers and checks them individually.
+
+B<MARC21_parse_test.pl --fix>
+
+Same thing but fixes them if they fail to parse.
+
+B<MARC21_parse_test.pl --fix --limit=15 bibnumbers1.txt>
+
+Fixes biblioitems from the first 15 biblionumbers in file bibnumbers1.txt. Multiple file arguments can be used.
+
+B<MARC21_parse_test.pl --All --limit=3 --offset=15 --nosummary --dump>
+
+Dumps MARCXML from the 16th, 17th and 18th records found in the database.
+
+B<MARC21_parse_test.pl -A -l=3 -o=15 -s=0 -d>
+
+Same thing as previous example in terse form.
+
+=head1 TODO
+
+Add more documentation for OPTIONS.
+
+Update zebra status so rebuild of index is not necessary.
+
+=head1 SEE ALSO
+
+MARC::Lint
+C4::Biblio
+
+=cut