refactored internal WebPAC::Input::* API a bit, added dump_rec,
authorDobrica Pavlinusic <dpavlin@rot13.org>
Thu, 7 Sep 2006 15:01:45 +0000 (15:01 +0000)
committerDobrica Pavlinusic <dpavlin@rot13.org>
Thu, 7 Sep 2006 15:01:45 +0000 (15:01 +0000)
validate is now more clever and reports all errors from database at end

git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@652 07558da8-63fa-0310-ba24-9fe276d99e06

lib/WebPAC/Input.pm
lib/WebPAC/Input/Excel.pm
lib/WebPAC/Input/ISIS.pm
lib/WebPAC/Input/MARC.pm
lib/WebPAC/Validate.pm
run.pl

index 2fb252d..be16e01 100644 (file)
@@ -107,7 +107,7 @@ sub new {
        #eval $self->{module} .'->import';
 
        # check if required subclasses are implemented
-       foreach my $subclass (qw/open_db fetch_rec init/) {
+       foreach my $subclass (qw/open_db fetch_rec init dump_rec/) {
                my $n = $self->{module} . '::' . $subclass;
                if (! defined &{ $n }) {
                        my $missing = "missing $subclass in $self->{module}";
@@ -298,7 +298,7 @@ sub open {
 
                $log->debug("position: $pos\n");
 
-               my $rec = $self->{fetch_rec}->($self, $db, $pos, sub {
+               my $rec = $self->{fetch_rec}->($self, $pos, sub {
                                my ($l,$f_nr) = @_;
 #                              return unless defined($l);
 #                              return $l unless ($rec_regex && $f_nr);
@@ -348,7 +348,7 @@ sub open {
                if ($self->{stats}) {
 
                        # fetch clean record with regexpes applied for statistics
-                       my $rec = $self->{fetch_rec}->($self, $db, $pos);
+                       my $rec = $self->{fetch_rec}->($self, $pos);
 
                        foreach my $fld (keys %{ $rec }) {
                                $self->{_stats}->{fld}->{ $fld }++;
@@ -546,6 +546,19 @@ sub stats {
        return $out;
 }
 
+=head2 dump
+
+Display humanly readable dump of record
+
+=cut
+
+sub dump {
+       my $self = shift;
+
+       return $self->{dump_rec}->($self, $self->{pos});
+
+}
+
 =head2 modify_record_regexps
 
 Generate hash with regexpes to be applied using l<filter>.
index c05f38d..21679f4 100644 (file)
@@ -96,7 +96,7 @@ sub open_db {
 
 Return record with ID C<$mfn> from database
 
-  my $rec = $self->fetch_rec( $db, $mfn );
+  my $rec = $self->fetch_rec( $mfn );
 
 }
 
@@ -105,7 +105,7 @@ Return record with ID C<$mfn> from database
 sub fetch_rec {
        my $self = shift;
 
-       my (undef, $mfn) = @_;
+       my $mfn = shift;
 
        my $log = $self->_get_logger();
 
index 05bb19c..4b882c5 100644 (file)
@@ -12,11 +12,11 @@ WebPAC::Input::ISIS - support for CDS/ISIS database files
 
 =head1 VERSION
 
-Version 0.06
+Version 0.07
 
 =cut
 
-our $VERSION = '0.06';
+our $VERSION = '0.07';
 
 
 =head1 SYNOPSIS
@@ -72,6 +72,8 @@ sub open_db {
 
        my $size = $isis_db->count;
 
+       $self->{_isis_db} = $isis_db;
+
        return ($isis_db, $size);
 }
 
@@ -79,16 +81,16 @@ sub open_db {
 
 Return record with ID C<$mfn> from database
 
-  my $rec = $self->fetch_rec( $db, $mfn, $filter_coderef);
+  my $rec = $self->fetch_rec( $mfn, $filter_coderef);
 
 =cut
 
 sub fetch_rec {
        my $self = shift;
 
-       my ($isis_db, $mfn, $filter_coderef) = @_;
+       my ($mfn, $filter_coderef) = @_;
 
-       my $rec = $isis_db->to_hash({
+       my $rec = $self->{_isis_db}->to_hash({
                mfn => $mfn,
                include_subfields => 1,
                hash_filter => $filter_coderef,
@@ -104,6 +106,22 @@ sub fetch_rec {
        return $rec;
 }
 
+=head2 dump_rec
+
+Return dump of record ID C<$mfn> from database
+
+  my $rec = $self->dump_rec( $db, $mfn );
+
+=cut
+
+sub dump_rec {
+       my $self = shift;
+
+       my $mfn = shift;
+
+       return $self->{_isis_db}->to_ascii( $mfn );
+}
+
 =head1 AUTHOR
 
 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
index 3870471..a5bce1f 100644 (file)
@@ -55,6 +55,7 @@ sub open_db {
        my $db_size = $db->count - 1;   # FIXME
 
        $self->{_marc_size} = $db_size;
+       $self->{_marc_db} = $db;
 
        return ($db, $db_size);
 }
@@ -63,7 +64,7 @@ sub open_db {
 
 Return record with ID C<$mfn> from database
 
-  my $rec = $self->fetch_rec( $db, $mfn );
+  my $rec = $self->fetch_rec( $mfn );
 
 }
 
@@ -72,12 +73,12 @@ Return record with ID C<$mfn> from database
 sub fetch_rec {
        my $self = shift;
 
-       my ($db, $mfn) = @_;
+       my $mfn = shift;
 
        if ($mfn > $self->{_marc_size}) {
                $self->_get_logger()->warn("seek beyond database size $self->{_marc_size} to $mfn");
        } else {
-               my $row = $db->to_hash($mfn);
+               my $row = $self->{_marc_db}->to_hash($mfn);
                push @{$row->{'000'}}, $mfn;
                return $row;
        }
index 7ce2c50..ae1471b 100644 (file)
@@ -8,7 +8,7 @@ use blib;
 use base 'WebPAC::Common';
 use File::Slurp;
 use List::Util qw/first/;
-use Data::Dumper;
+use Data::Dump qw/dump/;
 use WebPAC::Normalize qw/_pack_subfields_hash/;
 use Storable qw/dclone/;
 
@@ -18,11 +18,11 @@ WebPAC::Validate - provide simple validation for records
 
 =head1 VERSION
 
-Version 0.04
+Version 0.06
 
 =cut
 
-our $VERSION = '0.04';
+our $VERSION = '0.06';
 
 =head1 SYNOPSIS
 
@@ -104,7 +104,7 @@ sub new {
 
        }
 
-       $log->debug("current validation rules: ", Dumper($v));
+       $log->debug("current validation rules: ", dump($v));
 
        $self->{rules} = $v;
 
@@ -131,9 +131,9 @@ sub validate_errors {
        $log->logdie("rec isn't HASH") unless (ref($rec) eq 'HASH');
        $log->logdie("can't find validation rules") unless (my $r = $self->{rules});
 
-       my @errors;
+       my $errors;
 
-       $log->debug("rec = ", sub { Dumper($rec) }, "keys = ", keys %{ $rec });
+       $log->debug("rec = ", sub { dump($rec) }, "keys = ", keys %{ $rec });
 
        my $fields;
 
@@ -144,13 +144,13 @@ sub validate_errors {
                $fields->{$f}++;
 
                if ( ! defined($r->{$f}) ) {
-                       push @errors, "field '$f' shouldn't exists";
+                       $errors->{field}->{ $f }->{extra} = "not expected";
                        next;
                }
 
 
                if (ref($rec->{$f}) ne 'ARRAY') {
-                       push @errors, "field '$f' isn't repetable, probably bug in parsing input data";
+                       $errors->{field}->{ $f }->{not_repeatable} = "probably bug in parsing input data";
                        next;
                }
 
@@ -158,8 +158,11 @@ sub validate_errors {
                        # can we have subfields?
                        if (ref($r->{$f}) eq 'ARRAY') {
                                # are values hashes? (has subfields)
-                               if (ref($v) ne 'HASH') {
-                                       push @errors, "$f has value without subfields: $v";
+                               if (! defined($v)) {
+                                       $errors->{field}->{$f}->{empty} = undef;
+                                       $errors->{dump}->{record}++;
+                               } elsif (ref($v) ne 'HASH') {
+                                       $errors->{field}->{$f}->{missing_subfield} = "value without subfields: $v";
                                        next;
                                } else {
 
@@ -181,50 +184,54 @@ sub validate_errors {
                                                                $sf_repeatable->{$sf}++;
                                                        };
                                                        if (! first { $_ eq $sf } @{ $r->{$f} }) {
-                                                               push @errors, "$f has unknown subfield: $sf";
+                                                               $errors->{field}->{ $f }->{subfield}->{$sf} = "unknown";
                                                        }
                                                }
 
                                        }
                                        if (my @r_sf = sort keys( %$sf_repeatable )) {
-                                               my $plural = $#r_sf > 0 ? 1 : 0;
-
-                                               push @errors, "$f subfield" .
-                                               ( $plural ? 's ' : ' ' ) .
-                                               join(', ', @r_sf) .
-                                               ( $plural ? ' are ' : ' is ' ) .
-                                               'repeatable in: ' .
-                                               join('', _pack_subfields_hash( $h, 1) );
+
+                                               foreach my $sf (@r_sf) {
+                                                       $errors->{field}->{$f}->{subfield}->{$sf} = "repeatable";
+                                                       $errors->{dump}->{field}->{$f} =
+                                                               join('', _pack_subfields_hash( $h, 1 ) );
+                                               }
+
                                        }
 
                                        if ( defined( $self->{must_exist_sf}->{$f} ) ) {
                                                foreach my $sf (sort keys %{ $self->{must_exist_sf}->{$f} }) {
 #warn "====> $f $sf must exist\n";
-                                                       push @errors, "$f missing required subfield $sf"
-                                                               unless (
-                                                                       defined( $subfields->{$sf} )
-                                                               )
+                                                       $errors->{field}->{$f}->{subfield}->{$sf} = "missing"
+                                                               unless defined( $subfields->{$sf} );
                                                }
                                        }
 
                                }
                        } elsif (ref($v) eq 'HASH') {
-                               push @errors, "$f has subfields which is not valid";
+                               $errors->{field}->{$f}->{unexpected_subfields}++;
+                               $errors->{dump}->{field}->{$f} =
+                                       join('', _pack_subfields_hash( $v, 1 ) );
                        }
                }
        }
 
        foreach my $must (sort keys %{ $self->{must_exist} }) {
                next if ($fields->{$must});
-               push @errors,
-                       "field $must should exist, but it doesn't";
+               $errors->{field}->{$must}->{missing}++;
+               $errors->{dump}->{record}++;
        }
 
-       #$log->logcluck("return from this function is ARRAY") unless wantarray;
+       if ($errors) {
+               $log->debug("errors: ", sub { dump( $errors ) } );
 
-       $log->debug("errors: ", join(", ", @errors)) if (@errors);
+               my $mfn = $rec->{'000'}->[0];
+               $self->{errors}->{$mfn} = $errors;
+       }
+
+       #$log->logcluck("return from this function is ARRAY") unless wantarray;
 
-       return @errors;
+       return $errors;
 }
 
 =head1 AUTHOR
diff --git a/run.pl b/run.pl
index 6829ac8..0b61092 100755 (executable)
--- a/run.pl
+++ b/run.pl
@@ -12,7 +12,7 @@ use WebPAC::Input 0.11;
 use WebPAC::Store 0.03;
 use WebPAC::Normalize 0.11;
 use WebPAC::Output::TT;
-use WebPAC::Validate;
+use WebPAC::Validate 0.06;
 use WebPAC::Output::MARC;
 use YAML qw/LoadFile/;
 use Getopt::Long;
@@ -406,8 +406,13 @@ while (my ($database, $db_config) = each %{ $config->{databases} }) {
 
 
                                if ($validate) {
-                                       my @errors = $validate->validate_errors( $row );
-                                       $log->error( "MFN $mfn validation errors:\n", join("\n", @errors) ) if (@errors);
+                                       if ( my $errors = $validate->validate_errors( $row ) ) {
+                                               my $dump_record = defined($errors->{dump}->{record});
+                                               $log->error( "MFN $mfn\n",
+                                                       $dump_record ? $input_db->dump : '',
+                                                       "validation errors:\n", dump( $errors )
+                                               );
+                                       }
                                }
 
                                my $ds_config = dclone($db_config);
@@ -460,6 +465,8 @@ while (my ($database, $db_config) = each %{ $config->{databases} }) {
                                $total_rows++;
                        }
 
+                       $log->info("validation errors:\n", dump( $validate->{errors} ) ) if ($validate && defined($validate->{errors}));
+
                        $log->info("statistics of fields usage:\n", $input_db->stats) if ($stats);
 
                        # close MARC file