6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.15;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.11;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
26 use POSIX ":sys_wait_h"; # imports WNOHANG
30 run.pl - start WebPAC indexing
32 B<this command will probably go away. Don't get used to it!>
40 start loading (all) databases at offset 42
44 limit loading to 100 records
48 remove database and Hyper Estraier index before indexing
50 =item --only=database_name/input_filter
52 reindex just single database (legacy name is --one)
54 C</input_filter> is optional part which can be C<name>
57 =item --config conf/config.yml
59 path to YAML configuration file
63 disable indexing, modify_* in configuration and dump statistics about field
64 and subfield usage for each input
66 =item --validate path/to/validation_file
68 turn on extra validation of input records, see L<WebPAC::Validate>
70 You can use special variables C<$database> and C<$input> in this parameter
71 like C<--validate 'conf/validate/$database-$input'> to construct filename
73 =item --validate-delimiters path/to/validate_delimiters_file
75 this option is used with C<--validate> to turn on extra validation of
76 delimiters. If the file does not exist, it will be created on first run.
80 Generate MARC file. This will automatically be on if file contains C<marc*> directives.
81 You can use this option as C<--no-marc-generate> to disable MARC generation.
85 By default turned on if normalisation file has C<marc*> directives. You can disable lint
86 messages with C<--no-marc-lint>.
90 Force dump of input and MARC record for debugging.
94 Run databases in parallel (approximately same as number of processors in
95 machine if you want to use full load)
103 Create merged index of databases which have links
# Command-line option state and option specifications. Only part of the
# original declarations and of the option-parsing call is visible here.
118 my $validate_delimiters_path;
# MARC generation is on by default; the POD documents --no-marc-generate
# as the way to switch it off.
119 my $marc_generate = 1;
127 my $log = _new WebPAC::Common()->_get_logger();
# Option spec => target variable pairs (presumably arguments of a
# Getopt::Long GetOptions call that is not visible here -- TODO confirm).
130 "limit=i" => \$limit,
131 "offset=i" => \$offset,
# --one is the legacy spelling of --only; both fill $only_filter.
133 "one=s" => \$only_filter,
134 "only=s" => \$only_filter,
135 "config=s" => \$config_path,
138 "validate=s" => \$validate_path,
139 "validate-delimiters=s" => \$validate_delimiters_path,
140 "marc-generate!" => \$marc_generate,
141 "marc-lint!" => \$marc_lint,
142 "marc-dump!" => \$marc_dump,
143 "parallel=i" => \$parallel,
144 "only-links!" => \$only_links,
# Supplying a delimiters validation file turns MARC generation off.
149 $marc_generate = 0 if ( $validate_delimiters_path );
# --help prints the full POD and exits.
151 pod2usage(-verbose => 2) if ($help);
# Load the configuration, rotate the previous log file and start writing the
# shell batch file that collects Hyper Estraier merge commands.
153 my $config = new WebPAC::Config( path => $config_path );
# Debug levels above 1 are forwarded (minus one) to the normalizer.
155 WebPAC::Normalize::_debug( $debug - 1 ) if $debug > 1;
157 #print "config = ",dump($config) if ($debug);
159 die "no databases in config file!\n" unless ($config->databases);
161 $log->info( "-" x 79 );
163 my $log_file = 'log';
# Any existing log file is moved aside (the size threshold is commented out).
165 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
166 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
# Low-precedence "or" applies to the result of rename. With "||" the warning
# was attached to the (always true) target file name, so a failed rename
# was never reported.
167 rename $log_file, "${log_file}.old" or $log->logwarn("can't rename $log_file to ${log_file}.old: $!");
171 my $estcmd_path = './estcmd-merge.sh';
173 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
# Batch file preamble: change into the node directory and stop the service
# before any merge commands are appended.
174 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
175 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
176 $log->info("created merge batch file $estcmd_path");
# Create the validator only when a validation file or a delimiters file was
# requested, then decide whether an indexer will be used.
180 $validate = new WebPAC::Validate(
181 delimiters => $config->webpac('delimiters'),
182 ) if ($validate_path || $validate_delimiters_path);
184 my $use_indexer = $config->use_indexer;
# An active validator also switches statistics mode on.
185 $stats ||= $validate;
# NOTE(review): the opening "if" of this branch is not visible here;
# presumably it tests $stats -- confirm.
187 $log->debug("disabled indexing for stats collection");
188 $use_indexer = undef;
189 } elsif ( $use_indexer ) {
190 $log->info("using $use_indexer indexing engine...");
193 # parse normalize files and create source files for lookup and normalization
# --only has the form database[/input]. The variables are declared first and
# assigned conditionally afterwards, because "my ... if COND" has undefined
# behavior when COND is false (see perlsyn, "Statement Modifiers").
195 my ($only_database,$only_input); ($only_database,$only_input) = split(m#/#, $only_filter) if $only_filter;
197 my $parser = new WebPAC::Parser(
199 only_database => $only_database,
200 only_input => $only_input,
# Start of the wall-clock measurement used later for the rec/sec statistics.
204 my $start_t = time();
# Proc::Queue is loaded at run time through a string eval.
# NOTE(review): the enclosing condition is not visible here; presumably this
# only happens when --parallel was given -- confirm.
209 eval 'use Proc::Queue size => 1;';
211 $log->info("Using $parallel processes for speedup");
212 Proc::Queue::size($parallel);
# create_ds_config( $db_config, $database, $input, $mfn )
#
# Builds the structure passed as "config" to WebPAC::Normalize::data_structure:
# a deep copy of the database configuration with three keys added:
#   _     => database name
#   _mfn  => MFN of the current record
#   input => input definition
# $log->logconfess is called when any of the three values is false.
# NOTE(review): the end of this sub (including its return value) is not
# visible here.
215 sub create_ds_config {
216 my ($db_config, $database, $input, $mfn) = @_;
# dclone keeps the caller's $db_config unmodified.
217 my $c = dclone( $db_config );
218 $c->{_} = $database || $log->logconfess("need database");
219 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
220 $c->{input} = $input || $log->logconfess("need input");
# Main loop: visit every configured database in sorted name order and, when
# indexing is enabled, construct the indexer for it.
224 foreach my $database ( sort keys %{ $config->databases } ) {
225 my $db_config = $config->databases->{$database};
# --only: skip databases whose name does not match (case-insensitive regex).
227 next if ($only_database && $database !~ m/$only_database/i);
# NOTE(review): $f is set in lines not visible here; presumably the result of
# a fork, with 0 meaning "child process" -- confirm.
231 if(defined ($f) and $f==0) {
232 $log->info("Created processes $$ for speedup");
# An indexer is only set up when the database has 'search' rules.
239 if ($use_indexer && $parser->have_rules( 'search', $database )) {
# The configuration section name is the indexer name with everything from
# the first '-' onwards removed.
241 my $cfg_name = $use_indexer;
242 $cfg_name =~ s/\-.*$//;
244 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
245 $indexer_config->{database} = $database;
246 $indexer_config->{clean} = $clean;
247 $indexer_config->{label} = $db_config->{name};
249 # force clean if database has links
250 $indexer_config->{clean} = 1 if ($db_config->{links});
# Pick the indexer implementation; modules are loaded lazily with require.
252 if ($use_indexer eq 'hyperestraier') {
254 # open Hyper Estraier database
255 require WebPAC::Output::Estraier;
256 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
258 } elsif ($use_indexer eq 'hyperestraier-native') {
260 # open Hyper Estraier database
261 require WebPAC::Output::EstraierNative;
262 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
264 } elsif ($use_indexer eq 'kinosearch') {
266 die "no longer supported";
269 $log->logdie("unknown use_indexer: $use_indexer");
272 $log->logdie("can't continue without valid indexer") unless ($indexer);
277 # store Hyper Estraier links to other databases
# Links are only processed when the database config holds an array of links
# and an indexer is in use.
279 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
280 foreach my $link (@{ $db_config->{links} }) {
281 if ($use_indexer eq 'hyperestraier') {
# Appends an "estcmd merge" command for this link to the merge batch file.
# NOTE(review): the conditions selecting between "saving" and "adding" a
# link are not visible here.
283 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
285 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
287 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
291 credit => $link->{credit},
296 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
# --only-links: nothing else is done for this database.
300 next if ($only_links);
# Create the record store, then load and instantiate every output module
# configured for this database.
306 my $store = new WebPAC::Store({
# The callback passed to force_array logs an error about missing outputs.
314 my @outputs = force_array( $db_config->{output}, sub {
315 $log->error("Database $database doesn't have any outputs defined. Do you want to remove it from configuration?" );
320 foreach my $output ( @outputs ) {
322 #warn '## output = ',dump( $output );
324 my $module = $output->{module} || $log->logdie("need module in output section of $database");
# Module names without '::' are resolved inside the WebPAC::Output:: namespace.
325 $module = 'WebPAC::Output::' . $module unless $module =~ m/::/;
327 $log->debug("loading output module $module");
328 eval "require $module";
330 # add database to arguments for output filter
331 $output->{database} = $database;
332 $output->{clean} = $clean;
334 $log->debug("calling $module->new(",dump( $output ),")");
# NOTE(review): indirect-object "new" combined with "->new" looks unintended;
# confirm how this parses before changing it.
335 my $out = new $module->new( $output );
337 push @output_modules, $out;
339 $log->warn("SKIPPED $module");
345 # now, iterate through input formats
# The callback passed to force_array logs when a database has no inputs.
349 my @inputs = force_array( $db_config->{input}, sub {
350 $log->info("database $database doesn't have inputs defined");
# Debug aid, active when out/debug exists: the contents of all normalize
# files of all inputs are concatenated, each preceded by a banner with its
# path, and written to a single file.
353 if ( -e 'out/debug' ) { # FIXME flag?
355 foreach my $i ( @inputs ) {
# Inputs without normalize definitions contribute nothing.
357 next unless defined $i->{normalize};
358 warn dump( $i->{normalize} );
359 foreach my $normalize ( @{ $i->{normalize} } ) {
360 my $path = $normalize->{path};
361 $out .= qq/\n##\n## $path\n##\n\n/;
362 $out .= read_file( $path );
365 my $all = "out/debug/all-normalize.pl";
366 write_file( $all, $out );
367 warn "### all normalize for this input saved to: $all";
# Per-input processing: validate the input definition, resolve the input
# module and report what is about to be processed.
370 foreach my $input (@inputs) {
372 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
# --only with an input filter: skip inputs where neither the name nor the
# type matches (case-insensitive regex).
374 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
376 my $type = lc($input->{type});
378 # FIXME check if input module exists
379 my $input_module = $input->{module};
# Without an explicit module, the module is looked up by input type in the
# 'inputs' part of the webpac configuration.
# NOTE(review): webpac('inputs') is used as a list in grep/join and as a
# hash reference on the next line -- confirm what it returns in each context.
381 if ( ! $input_module ) {
382 if ( grep(/$type/, $config->webpac('inputs')) ) {
383 $input_module = $config->webpac('inputs')->{$type};
385 $log->logdie("I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!" );
389 my @lookups = $parser->have_lookup_create($database, $input);
391 $log->info("working on $database/$input_name with $input_module on $input->{path}",
392 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
396 # disable modification of records if --stats is in use
397 delete($input->{modify_records});
398 delete($input->{modify_file});
# Create the input reader for this input definition.
401 my $input_db = new WebPAC::Input(
402 module => $input_module,
# A limit given on the command line takes precedence over the per-input limit.
403 limit => $limit || $input->{limit},
405 recode => $input->{recode},
407 modify_records => $input->{modify_records},
408 modify_file => $input->{modify_file},
409 input_config => $input,
# Check the object that was just created. The previous test looked at $input,
# the configuration hash already dereferenced above, so it could never fail.
411 $log->logdie("can't create input using $input_module") unless ($input_db);
# Drop any legacy "lookup" key from the input definition, build the coderef
# that creates lookups, and open the input.
413 if (defined( $input->{lookup} )) {
414 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
415 delete( $input->{lookup} );
422 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
# The coderef receives one record, requires an MFN in field 000 and runs the
# lookup-creation rules through WebPAC::Normalize::data_structure.
424 $lookup_coderef = sub {
425 my $rec = shift || die "need rec!";
426 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
428 WebPAC::Normalize::data_structure(
431 config => create_ds_config( $db_config, $database, $input, $mfn ),
434 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
# Clears the lookup held by the normalizer.
437 WebPAC::Normalize::_set_lookup( undef );
439 $log->debug("created lookup_coderef using:\n$rules");
# open returns a value stored as $maxmfn, which is later used as the upper
# record bound in the report header.
445 my $maxmfn = $input_db->open(
446 path => $input->{path},
447 input_encoding => $input->{encoding}, # database encoding
448 lookup_coderef => $lookup_coderef,
449 lookup => $lookup_jar,
# NOTE(review): the following calls sit inside callbacks whose headers are
# not visible here; they delegate row loading and saving to the store.
453 return $store->load_row(
454 database => $database,
455 input => $input_name,
461 return $store->save_row(
462 database => $database,
463 input => $input_name,
# Process the lookups created for this database/input, key by key, then open
# the report file and read the validation definitions.
471 my $lookup_data = WebPAC::Normalize::_get_lookup();
473 if (defined( $lookup_data->{$database}->{$input_name} )) {
# Passing a sub defers the dump until the logger actually needs the message
# (presumably -- depends on the logger implementation).
474 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
# NOTE(review): the call receiving these arguments is not visible here.
476 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
478 database => $database,
479 input => $input_name,
481 data => $lookup_data->{$database}->{$input_name}->{$key},
# A text report is written per database/input in stats or validation mode.
487 if ($stats || $validate) {
488 my $path = "out/report/${database}-${input_name}.txt";
489 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
# Header shows the record range: offset (or 1) up to the first true value of
# command-line limit, per-input limit and $maxmfn.
491 print $report_fh "Report for database '$database' input '$input_name' records ",
492 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
493 $log->info("Generating report file $path");
# fill_in is given the path template plus the current database and input
# names; the POD describes $database / $input placeholders in --validate.
496 $validate->read_validate_file( $validate->fill_in( $validate_path, database => $database, input => $input_name ) ) if ( $validate_path );
497 $validate->read_validate_delimiters_file( $validate->fill_in( $validate_delimiters_path, database => $database, input => $input_name ) ) if ( $validate_delimiters_path );
# Prepare MARC output, fetch the normalize rules, load the lookups this input
# depends on and tell every output module which input is being processed.
# MARC output is created only when generation is enabled and the input has
# 'marc' rules.
502 if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
503 $marc = new WebPAC::Output::MARC(
504 path => "out/marc/${database}-${input_name}.marc",
510 my $rules = $parser->normalize_rules($database,$input_name);
511 $log->logwarn("no normalize rules for $database/$input_name") unless $rules;
513 $log->debug("parsed normalize rules:\n$rules");
515 # reset position in database
518 # generate name of config key for indexer (strip everything after -)
519 my $indexer_config = $use_indexer;
520 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
# $depends must be a hash reference; it is walked three levels deep below
# as database -> input -> key.
523 my $depends = $parser->depends($database,$input_name);
526 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
527 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
529 foreach my $db (keys %$depends) {
530 foreach my $i (keys %{$depends->{$db}}) {
531 foreach my $k (keys %{$depends->{$db}->{$i}}) {
533 $log->debug("loading lookup $db/$i");
534 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
# $t is set in a line not visible here; presumably the time just before the
# load_lookup call.
539 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
544 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
548 # setup input name for all output filters
549 foreach my $out ( @output_modules ) {
# Output modules without an input() method only trigger a warning.
550 if ( $out->can('input') ) {
551 $out->input( $input_name );
553 $log->warn("output filter ",ref($out)," doesn't support input name");
# Per-record loop: fetch, check the MFN, optionally validate, normalize, and
# hand the result to the store/indexer/output modules and the MARC writer.
558 foreach my $pos ( 0 ... $input_db->size ) {
# Positions for which fetch returns a false value are skipped.
560 my $row = $input_db->fetch || next;
564 my $mfn = $row->{'000'}->[0];
# A missing or non-numeric MFN is reported and the record position is
# appended to field 000.
566 if (! $mfn || $mfn !~ m{^\d+$}) {
567 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
569 push @{ $row->{'000'} }, $pos;
# NOTE(review): the condition guarding validation is not visible here.
574 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
575 $log->error( "MFN $mfn validation error:\n",
576 $validate->report_error( $errors )
579 next; # validation doesn't create any output
584 my $ds = WebPAC::Normalize::data_structure(
587 lookup => $lookup_hash,
588 config => create_ds_config( $db_config, $database, $input, $mfn ),
589 marc_encoding => 'utf-8',
# Callback through which the normalizer can load rows from the store; its
# parameters shadow the outer $database, $input and $mfn.
590 load_row_coderef => sub {
591 my ($database,$input,$mfn) = @_;
592 #warn "### load_row($database,$input,$mfn) from data_structure\n";
593 return $store->load_row(
594 database => $database,
601 $log->debug("ds = ", sub { dump($ds) });
# NOTE(review): the calls receiving the following arguments are not visible
# here.
606 database => $database,
607 input => $input_name,
613 id => "${input_name}/${mfn}",
615 type => $config->get($indexer_config)->{type},
# Every output module that implements add() receives the record.
618 foreach my $out ( @output_modules ) {
619 $out->add( $mfn, $ds ) if $out->can('add');
623 $log->warn("record $pos didn't produce any output after normalization rules!") unless $marc;
# MARC field sets are pulled one at a time; $i (maintained in lines not
# visible here) appears to count the instances, and from the second one on
# the id gets a "/$i" suffix.
630 while (my $fields = WebPAC::Normalize::MARC::_get_marc_fields( fetch_next => 1 ) ) {
632 id => $mfn . ( $i ? "/$i" : '' ),
634 leader => WebPAC::Normalize::MARC::_get_marc_leader(),
640 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
# After all records of an input: emit validation and statistics reports,
# finish the writers and log throughput.
646 my $errors = $validate->report;
648 $log->info("validation errors:\n$errors\n" );
649 print $report_fh "$errors\n" if ($report_fh);
652 print $report_fh "\nAll possible subfields/delimiter templates:\n", $validate->delimiters_templates( report => 1, current_input => 1 ), "\n\n";
654 # must be last thing that touches $validate for this input
659 my $s = $input_db->stats;
660 $log->info("statistics of fields usage:\n$s");
661 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
665 $marc->finish if ($marc);
668 close($report_fh) if ($report_fh);
# finish() is optional for the indexer and for output modules.
671 $indexer->finish if $indexer && $indexer->can('finish');
673 foreach my $out ( @output_modules ) {
674 $out->finish if $out->can('finish');
# Elapsed wall-clock time since $start_t.
# NOTE(review): the rate below divides by $dt; confirm it cannot be zero.
677 my $dt = time() - $start_t;
678 $log->info("$total_rows records ", $indexer ? "indexed " : "",
679 sprintf("in %.2f sec [%.2f rec/sec]",
680 $dt, ($total_rows / $dt)
687 $log->info("parallel process $$ finished");
694 # wait all children to finish
# wait returns -1 once there are no child processes left.
695 sleep(1) while wait != -1;
696 $log->info("all parallel processes finished");
699 # save new delimiters if needed
700 $validate->save_delimiters_templates if ( $validate_delimiters_path );
703 # handle links or merge after indexing
# Last command of the merge batch file: start the service again.
707 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
# Low-precedence "or" applies to the result of chmod. With "||" the warning
# was attached to the (always true) path string, so a failed chmod was
# never reported.
709 chmod 0700, $estcmd_path or $log->warn("can't chmod 0700 $estcmd_path: $!");
712 foreach my $link (@links) {
# NOTE(review): Dumper is not among the functions visibly imported here
# (Data::Dump exports dump) -- confirm it is imported elsewhere.
713 $log->logdie("coderef in link ", Dumper($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');