6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.15;
13 use WebPAC::Normalize 0.22;
14 #use WebPAC::Output::TT;
15 use WebPAC::Validate 0.11;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
27 use POSIX ":sys_wait_h"; # imports WNOHANG
31 run.pl - start WebPAC indexing
33 B<this command will probably go away. Don't get used to it!>
41 start loading (all) databases at offset 42
45 limit loading to 100 records
49 remove database and Hyper Estraier index before indexing
51 =item --only=database_name/input_filter
53 reindex just single database (legacy name is --one)
55 C</input_filter> is optional part which can be C<name>
58 =item --config conf/config.yml
60 path to YAML configuration file
64 disable indexing, modify_* in configuration and dump statistics about field
65 and subfield usage for each input
67 =item --validate path/to/validation_file
69 turn on extra validation of input records, see L<WebPAC::Validate>
71 You can use special variables C<$database> and C<$input> in this parameter
72 like C<--validate 'conf/validate/$database-$input'> to construct filename
74 =item --validate-delimiters path/to/validate_delimiters_file
76 this option is used with C<--validate> to turn on extra validation of
77 delimiters. If the file does not exist, it will be created on the first run.
81 Generate MARC file. This will automatically be on if file contains C<marc*> directives.
82 You can use this option as C<--no-marc-generate> to disable MARC generation.
86 By default turned on if normalisation file has C<marc*> directives. You can disable lint
87 messages with C<--no-marc-lint>.
91 Create MARCXML file (this can be quite large)
95 Force dump of input and MARC record for debugging.
99 Run databases in parallel (approximately the same as the number of processors in
100 the machine if you want to use full load)
108 Create merged index of databases which have links
110 =item --mirror http://www.example.com
112 Tries to download input path files from mirror URI
127 my $validate_delimiters_path;
128 my $marc_generate = 1;
138 my $log = _new WebPAC::Common()->_get_logger();
141 "limit=i" => \$limit,
142 "offset=i" => \$offset,
144 "one=s" => \$only_filter,
145 "only=s" => \$only_filter,
146 "config=s" => \$config_path,
149 "validate=s" => \$validate_path,
150 "validate-delimiters=s" => \$validate_delimiters_path,
151 "marc-generate!" => \$marc_generate,
152 "marc-lint!" => \$marc_lint,
153 "marc-dump!" => \$marc_dump,
154 "marcxml!" => \$marc_xml,
155 "parallel=i" => \$parallel,
156 "only-links!" => \$only_links,
158 "mirror=s" => \$mirror,
162 $marc_generate = 0 if ( $validate_delimiters_path );
164 pod2usage(-verbose => 2) if ($help);
166 my $config = new WebPAC::Config( path => $config_path );
168 WebPAC::Normalize::_debug( $debug - 1 ) if $debug > 1;
170 #print "config = ",dump($config) if ($debug);
172 die "no databases in config file!\n" unless ($config->databases);
174 $log->info( "-" x 79 );
176 my $log_file = 'log';
178 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
179 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
180 rename($log_file, "${log_file}.old") || $log->logwarn("can't rename $log_file to ${log_file}.old: $!"); # parens make || test rename's result, not the (always true) target name
184 my $estcmd_path = './estcmd-merge.sh';
186 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
187 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
188 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
189 $log->info("created merge batch file $estcmd_path");
193 $validate = new WebPAC::Validate(
194 delimiters => $config->webpac('delimiters'),
195 ) if ($validate_path || $validate_delimiters_path);
197 my $use_indexer = $config->use_indexer;
198 $stats ||= $validate;
200 $log->debug("disabled indexing for stats collection");
201 $use_indexer = undef;
202 } elsif ( $use_indexer ) {
203 $log->info("using $use_indexer indexing engine...");
206 # parse normalize files and create source files for lookup and normalization
208 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : (); # always declare; "my ... if COND" has undefined behaviour (perlsyn)
210 my $parser = new WebPAC::Parser(
212 only_database => $only_database,
213 only_input => $only_input,
217 my $start_t = time();
222 eval 'use Proc::Queue size => 1;';
224 $log->info("Using $parallel processes for speedup");
225 Proc::Queue::size($parallel);
228 sub create_ds_config {
229 my ($db_config, $database, $input, $mfn) = @_;
230 my $c = dclone( $db_config );
231 $c->{_} = $database || $log->logconfess("need database");
232 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
233 $c->{input} = $input || $log->logconfess("need input");
237 foreach my $database ( sort keys %{ $config->databases } ) {
238 my $db_config = $config->databases->{$database};
240 next if ($only_database && $database !~ m/$only_database/i);
244 if(defined ($f) and $f==0) {
245 $log->info("Created processes $$ for speedup");
252 if ($use_indexer && $parser->have_rules( 'search', $database )) {
254 my $cfg_name = $use_indexer;
255 $cfg_name =~ s/\-.*$//;
257 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
258 $indexer_config->{database} = $database;
259 $indexer_config->{clean} = $clean;
260 $indexer_config->{label} = $db_config->{name};
262 # force clean if database has links
263 $indexer_config->{clean} = 1 if ($db_config->{links});
265 if ($use_indexer eq 'hyperestraier') {
267 # open Hyper Estraier database
268 require WebPAC::Output::Estraier;
269 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
271 } elsif ($use_indexer eq 'hyperestraier-native') {
273 # open Hyper Estraier database
274 require WebPAC::Output::EstraierNative;
275 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
277 } elsif ($use_indexer eq 'kinosearch') {
279 die "no longer supported";
282 $log->logdie("unknown use_indexer: $use_indexer");
285 $log->logdie("can't continue without valid indexer") unless ($indexer);
290 # store Hyper Estraier links to other databases
292 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
293 foreach my $link (@{ $db_config->{links} }) {
294 if ($use_indexer eq 'hyperestraier') {
296 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
298 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
300 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
304 credit => $link->{credit},
309 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
313 next if ($only_links);
319 my $store = new WebPAC::Store({
327 my @outputs = force_array( $db_config->{output}, sub {
328 $log->error("Database $database doesn't have any outputs defined. Do you want to remove it from configuration?" );
333 foreach my $output ( @outputs ) {
335 #warn '## output = ',dump( $output );
337 my $module = $output->{module} || $log->logdie("need module in output section of $database");
338 $module = 'WebPAC::Output::' . $module unless $module =~ m/::/;
340 $log->debug("loading output module $module");
341 eval "require $module";
343 # add database to arguments for output filter
344 $output->{database} = $database;
345 $output->{clean} = $clean;
347 $log->debug("calling $module->new(",dump( $output ),")");
348 my $out = new $module->new( $output );
350 push @output_modules, $out;
352 $log->warn("SKIPPED $module");
358 # now, iterate through input formats
362 my @inputs = force_array( $db_config->{input}, sub {
363 $log->info("database $database doesn't have inputs defined");
366 if ( -e 'out/debug' ) { # FIXME flag?
368 foreach my $i ( @inputs ) {
370 next unless defined $i->{normalize};
371 warn dump( $i->{normalize} );
372 foreach my $normalize ( @{ $i->{normalize} } ) {
373 my $path = $normalize->{path};
374 $out .= qq/\n##\n## $path\n##\n\n/;
375 $out .= read_file( $path );
378 my $all = "out/debug/all-normalize.pl";
379 write_file( $all, $out );
380 warn "### all normalize for this input saved to: $all";
383 foreach my $input (@inputs) {
385 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
387 if ( $input->{skip} ) {
388 $log->info("skip $input_name");
392 next if defined $only_input && $input_name !~ m#$only_input#i;
394 my $type = lc($input->{type});
396 # FIXME check if input module exists
397 my $input_module = $input->{module};
399 if ( ! $input_module ) {
400 if ( grep(/$type/, $config->webpac('inputs')) ) {
401 $input_module = $config->webpac('inputs')->{$type};
403 $log->logdie("I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!" );
407 my @lookups = $parser->have_lookup_create($database, $input);
409 $log->info("working on $database/$input_name with $input_module on $input->{path}",
410 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
414 # disable modification of records if --stats is in use
415 delete($input->{modify_records});
416 delete($input->{modify_file});
420 my $path = $input->{path} || die "no input path in ",dump( $input );
423 $base =~ s{/[^/]+$}{};
424 mkpath $base unless -e $base;
426 my $rc = LWP::Simple::mirror( "$mirror/$path", $path );
427 if (LWP::Simple::is_error( $rc )) {
428 die "can't mirror $mirror/$path -> $path [$rc]";
430 $log->info( "mirror ", $path, " [$rc] ", -s $path, " bytes" );
435 my $input_db = new WebPAC::Input(
436 module => $input_module,
437 limit => $limit || $input->{limit},
438 offset => $offset || $input->{offset},
439 recode => $input->{recode},
441 modify_records => $input->{modify_records},
442 modify_file => $input->{modify_file},
443 input_config => $input,
445 $log->logdie("can't create input using $input_module") unless ($input_db); # check the freshly created WebPAC::Input object, not the $input config hash
447 if (defined( $input->{lookup} )) {
448 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
449 delete( $input->{lookup} );
456 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
458 $lookup_coderef = sub {
459 my $rec = shift || die "need rec!";
460 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
462 WebPAC::Normalize::data_structure(
465 config => create_ds_config( $db_config, $database, $input, $mfn ),
468 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
471 WebPAC::Normalize::_set_lookup( undef );
473 $log->debug("created lookup_coderef using:\n$rules");
479 my $maxmfn = $input_db->open(
480 path => $input->{path},
481 input_encoding => $input->{encoding}, # database encoding
482 lookup_coderef => $lookup_coderef,
483 lookup => $lookup_jar,
487 return $store->load_row(
488 database => $database,
489 input => $input_name,
495 return $store->save_row(
496 database => $database,
497 input => $input_name,
505 my $lookup_data = WebPAC::Normalize::_get_lookup();
507 if (defined( $lookup_data->{$database}->{$input_name} )) {
508 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
510 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
512 database => $database,
513 input => $input_name,
515 data => $lookup_data->{$database}->{$input_name}->{$key},
521 if ($stats || $validate) {
522 my $out_report = 'out/report'; # FIXME move to config
523 mkpath $out_report unless -e $out_report;
524 my $path = "$out_report/${database}-${input_name}.txt";
525 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
527 print $report_fh "Report for database '$database' input '$input_name' records ",
528 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
529 $log->info("Generating report file $path");
532 $validate->read_validate_file( $validate->fill_in( $validate_path, database => $database, input => $input_name ) ) if ( $validate_path );
533 $validate->read_validate_delimiters_file( $validate->fill_in( $validate_delimiters_path, database => $database, input => $input_name ) ) if ( $validate_delimiters_path );
538 if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
540 my $out_marc = 'out/marc'; # FIXME move to config
541 mkpath $out_marc unless -e $out_marc;
543 $marc = new WebPAC::Output::MARC(
544 path => "$out_marc/${database}-${input_name}",
547 marcxml => $marc_xml,
551 my $rules = $parser->normalize_rules($database,$input_name);
553 $log->logwarn("no normalize rules for $database/$input_name", $input_db->input_module->can('normalize') ? " using normalize from input module" : '');
557 $log->debug("parsed normalize rules:\n$rules");
559 # reset position in database
562 # generate name of config key for indexer (strip everything after -)
563 my $indexer_config = $use_indexer;
564 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
567 my $depends = $parser->depends($database,$input_name);
570 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
571 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
573 foreach my $db (keys %$depends) {
574 foreach my $i (keys %{$depends->{$db}}) {
575 foreach my $k (keys %{$depends->{$db}->{$i}}) {
577 $log->debug("loading lookup $db/$i");
578 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
583 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
588 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
592 # setup input name for all output filters
593 foreach my $out ( @output_modules ) {
594 if ( $out->can('input') ) {
595 $out->input( $input_name );
597 $log->warn("output filter ",ref($out)," doesn't support input name");
602 foreach my $pos ( 0 ... $input_db->size ) {
604 my $row = $input_db->fetch || next;
608 my $mfn = $row->{'000'}->[0];
610 if (! $mfn || $mfn !~ m{^\d+$}) {
611 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
613 push @{ $row->{'000'} }, $pos;
616 foreach my $out ( @output_modules ) {
617 $out->add_row( $mfn, $row ) if $out->can('add_row');
621 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
622 $log->error( "MFN $mfn validation error:\n",
623 $validate->report_error( $errors )
626 next; # validation doesn't create any output
633 $ds = WebPAC::Normalize::data_structure(
636 lookup => $lookup_hash,
637 config => create_ds_config( $db_config, $database, $input, $mfn ),
638 marc_encoding => 'utf-8',
639 load_row_coderef => sub {
640 my ($database,$input,$mfn) = @_;
641 #warn "### load_row($database,$input,$mfn) from data_structure\n";
642 return $store->load_row(
643 database => $database,
650 } elsif ( $input_db->input_module->can('normalize') ) {
651 $ds = $input_db->input_module->normalize( $mfn );
655 $log->debug("ds = ", sub { dump($ds) });
658 database => $database,
659 input => $input_name,
665 id => "${input_name}/${mfn}",
667 type => $config->get($indexer_config)->{type},
670 foreach my $out ( @output_modules ) {
671 $out->add( $mfn, $ds ) if $out->can('add');
675 $log->warn("record $pos didn't produce any output after normalization rules!") unless $marc;
680 while (my $fields = WebPAC::Normalize::MARC::_get_marc_fields( fetch_next => 1 ) ) {
682 id => $mfn . ( $i ? "/$i" : '' ),
684 leader => WebPAC::Normalize::MARC::_get_marc_leader(),
690 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
696 my $errors = $validate->report;
698 $log->info("validation errors:\n$errors\n" );
699 print $report_fh "$errors\n" if ($report_fh);
702 print $report_fh "\nAll possible subfields/delimiter templates:\n", $validate->delimiters_templates( report => 1, current_input => 1 ), "\n\n";
704 # must be last thing that touches $validate for this input
709 my $s = $input_db->stats;
710 $log->info("statistics of fields usage:\n$s");
711 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
715 $marc->finish if ($marc);
718 close($report_fh) if ($report_fh);
721 $indexer->finish if $indexer && $indexer->can('finish');
723 foreach my $out ( @output_modules ) {
724 $out->finish if $out->can('finish');
727 my $dt = time() - $start_t;
728 $log->info("$total_rows records ", $indexer ? "indexed " : "",
729 sprintf("in %.2f sec [%.2f rec/sec]",
730 $dt, ($total_rows / $dt)
737 $log->info("parallel process $$ finished");
744 # wait all children to finish
745 sleep(1) while wait != -1;
746 $log->info("all parallel processes finished");
749 # save new delimiters if needed
750 $validate->save_delimiters_templates if ( $validate_delimiters_path );
753 # handle links or merge after indexing
757 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
759 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!"); # parens make || test chmod's result (number of files changed), not the path string
762 foreach my $link (@links) {
763 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE'); # dump() is the Data::Dump function this file imports; Dumper is not visibly imported