use WebPAC::Input 0.16;
use WebPAC::Store 0.15;
use WebPAC::Normalize 0.22;
-use WebPAC::Output::TT;
+#use WebPAC::Output::TT;
use WebPAC::Validate 0.11;
use WebPAC::Output::MARC;
use WebPAC::Config;
use Data::Dump qw/dump/;
use Storable qw/dclone/;
use Pod::Usage qw/pod2usage/;
+use LWP::Simple qw//;
-use Proc::Queue size => 1;
use POSIX ":sys_wait_h"; # imports WNOHANG
=head1 NAME
By default turned on if normalisation file has C<marc*> directives. Lint messages are
turned off by default; you can enable them with C<--marc-lint>.
+=item --marcxml
+
+Create MARCXML file (this can be quite large)
+
=item --marc-dump
Force dump of input and MARC record for debugging.
Create merged index of databases which have links
+=item --mirror http://www.example.com
+
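+Try to download each input file from the mirror URI before processing it. The
+input path is appended to the URI and the file is saved under the same local path.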
+
=back
=cut
my $validate_path;
my $validate_delimiters_path;
my $marc_generate = 1;
-my $marc_lint = 1;
+my $marc_lint = 0;
my $marc_dump = 0;
+my $marc_xml = 0;
my $parallel = 0;
my $only_links = 0;
my $merge = 0;
+my $mirror;
my $help;
my $log = _new WebPAC::Common()->_get_logger();
"marc-generate!" => \$marc_generate,
"marc-lint!" => \$marc_lint,
"marc-dump!" => \$marc_dump,
+ "marcxml!" => \$marc_xml,
"parallel=i" => \$parallel,
"only-links!" => \$only_links,
"merge" => \$merge,
+ "mirror=s" => \$mirror,
"help" => \$help,
);
my @links;
if ($parallel) {
+ eval 'use Proc::Queue size => 1;';
+ die $@ if $@;
$log->info("Using $parallel processes for speedup");
Proc::Queue::size($parallel);
}
$log->info("database $database doesn't have inputs defined");
} );
- if ( -e 'out/debug' ) { # fixme flag?
+ if ( -e 'out/debug' ) { # FIXME flag?
my $out;
foreach my $i ( @inputs ) {
warn dump( $i );
my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
- next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
+ if ( $input->{skip} ) {
+ $log->info("skip $input_name");
+ next;
+ }
+
+ next if defined $only_input && $input_name !~ m#$only_input#i;
my $type = lc($input->{type});
delete($input->{modify_file});
}
+ if ( $mirror ) {
+ my $path = $input->{path} || die "no input path in ",dump( $input );
+
+ my $base = $path;
+ $base =~ s{/[^/]+$}{};
+ mkpath $base unless -e $base;
+
+ my $rc = LWP::Simple::mirror( "$mirror/$path", $path );
+ if (LWP::Simple::is_error( $rc )) {
+ die "can't mirror $mirror/$path -> $path [$rc]";
+ } else {
+ $log->info( "mirror ", $path, " [$rc] ", -s $path, " bytes" );
+ }
+
+ }
+
my $input_db = new WebPAC::Input(
module => $input_module,
- encoding => $config->webpac('webpac_encoding'),
limit => $limit || $input->{limit},
- offset => $offset,
+ offset => $offset || $input->{offset},
recode => $input->{recode},
stats => $stats,
modify_records => $input->{modify_records},
my $maxmfn = $input_db->open(
path => $input->{path},
- code_page => $input->{encoding}, # database encoding
+ input_encoding => $input->{encoding}, # database encoding
lookup_coderef => $lookup_coderef,
lookup => $lookup_jar,
%{ $input },
my $report_fh;
if ($stats || $validate) {
- my $path = "out/report/${database}-${input_name}.txt";
+ my $out_report = 'out/report'; # FIXME move to config
+ mkpath $out_report unless -e $out_report;
+ my $path = "$out_report/${database}-${input_name}.txt";
open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
print $report_fh "Report for database '$database' input '$input_name' records ",
my $marc;
if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
+
+ my $out_marc = 'out/marc'; # FIXME move to config
+ mkpath $out_marc unless -e $out_marc;
+
$marc = new WebPAC::Output::MARC(
- path => "out/marc/${database}-${input_name}.marc",
+ path => "$out_marc/${database}-${input_name}",
lint => $marc_lint,
dump => $marc_dump,
+ marcxml => $marc_xml,
);
}
my $rules = $parser->normalize_rules($database,$input_name);
- $log->logwarn("no normalize rules for $database/$input_name") unless $rules;
+ if ( ! $rules ) {
+ $log->logwarn("no normalize rules for $database/$input_name", $input_db->input_module->can('normalize') ? " using normalize from input module" : '');
+ next;
+ }
$log->debug("parsed normalize rules:\n$rules");
push @{ $row->{'000'} }, $pos;
}
+ foreach my $out ( @output_modules ) {
+ $out->add_row( $mfn, $row ) if $out->can('add_row');
+ }
if ($validate) {
if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
next; # validation doesn't create any output
}
+ my $ds;
+
if ($rules) {
- my $ds = WebPAC::Normalize::data_structure(
+ $ds = WebPAC::Normalize::data_structure(
row => $row,
rules => $rules,
lookup => $lookup_hash,
},
);
- $log->debug("ds = ", sub { dump($ds) });
-
- if ( $ds ) {
-
- $store->save_ds(
- database => $database,
- input => $input_name,
- id => $mfn,
- ds => $ds,
- ) if !$stats;
-
- $indexer->add(
- id => "${input_name}/${mfn}",
- ds => $ds,
- type => $config->get($indexer_config)->{type},
- ) if $indexer;
+ } elsif ( $input_db->input_module->can('normalize') ) {
+ $ds = $input_db->input_module->normalize( $mfn );
+ }
- foreach my $out ( @output_modules ) {
- $out->add( $mfn, $ds ) if $out->can('add');
- }
+ if ( $ds ) {
+ $log->debug("ds = ", sub { dump($ds) });
- } else {
- $log->warn("record $pos didn't produce any output after normalization rules!") unless $marc;
+ $store->save_ds(
+ database => $database,
+ input => $input_name,
+ id => $mfn,
+ ds => $ds,
+ ) if !$stats;
+
+ $indexer->add(
+ id => "${input_name}/${mfn}",
+ ds => $ds,
+ type => $config->get($indexer_config)->{type},
+ ) if $indexer;
+
+ foreach my $out ( @output_modules ) {
+ $out->add( $mfn, $ds ) if $out->can('add');
}
- }
+ } else {
+ $log->warn("record $pos didn't produce any output after normalization rules!") unless $marc;
+ }
if ($marc) {
my $i = 0;
close($report_fh) if ($report_fh);
}
- eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
+ $indexer->finish if $indexer && $indexer->can('finish');
foreach my $out ( @output_modules ) {
$out->finish if $out->can('finish');