6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.15;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.11;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
26 use POSIX ":sys_wait_h"; # imports WNOHANG
30 run.pl - start WebPAC indexing
32 B<this command will probably go away. Don't get used to it!>
40 start loading (all) databases at offset 42
44 limit loading to 100 records
48 remove database and Hyper Estraier index before indexing
50 =item --only=database_name/input_filter
52 reindex just single database (legacy name is --one)
54 C</input_filter> is optional part which can be C<name>
57 =item --config conf/config.yml
59 path to YAML configuration file
63 disable indexing, modify_* in configuration and dump statistics about field
64 and subfield usage for each input
66 =item --validate path/to/validation_file
68 turn on extra validation of input records, see L<WebPAC::Validate>
70 You can use special variables C<$database> and C<$input> in this parameter
71 like C<--validate 'conf/validate/$database-$input'> to construct filename
73 =item --validate-delimiters path/to/validate_delimiters_file
75 this option is used with C<--validate> to turn on extra validation of
76 delimiters. If the file does not exist, it will be created on the first run.
80 Generate MARC file. This will automatically be on if file contains C<marc*> directives.
81 You can use this option as C<--no-marc-generate> to disable MARC generation.
85 By default turned on if normalisation file has C<marc*> directives. You can disable lint
86 messages with C<--no-marc-lint>.
90 Force dump of input and MARC record for debugging.
94 Run databases in parallel (approximately the same as the number of processors in
95 machine if you want to use full load)
103 Create merged index of databases which have links
118 my $validate_delimiters_path;
119 my $marc_generate = 1;
127 my $log = _new WebPAC::Common()->_get_logger();
130 "limit=i" => \$limit,
131 "offset=i" => \$offset,
133 "one=s" => \$only_filter,
134 "only=s" => \$only_filter,
135 "config=s" => \$config_path,
138 "validate=s" => \$validate_path,
139 "validate-delimiters=s" => \$validate_delimiters_path,
140 "marc-generate!" => \$marc_generate,
141 "marc-lint!" => \$marc_lint,
142 "marc-dump!" => \$marc_dump,
143 "parallel=i" => \$parallel,
144 "only-links!" => \$only_links,
149 $marc_generate = 0 if ( $validate_delimiters_path );
151 pod2usage(-verbose => 2) if ($help);
153 my $config = new WebPAC::Config( path => $config_path );
155 WebPAC::Normalize::_debug( $debug - 1 ) if $debug > 1;
157 #print "config = ",dump($config) if ($debug);
159 die "no databases in config file!\n" unless ($config->databases);
161 $log->info( "-" x 79 );
163 my $log_file = 'log';
165 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
166 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
167 rename $log_file, "${log_file}.old" or $log->logwarn("can't rename $log_file to ${log_file}.old: $!"); # low-precedence "or" tests rename()'s result; "||" bound to the (always true) target name, so failures were silent
171 my $estcmd_path = './estcmd-merge.sh';
173 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
174 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
175 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
176 $log->info("created merge batch file $estcmd_path");
180 $validate = new WebPAC::Validate(
181 delimiters => $config->webpac('delimiters'),
182 ) if ($validate_path || $validate_delimiters_path);
184 my $use_indexer = $config->use_indexer;
185 $stats ||= $validate;
187 $log->debug("disabled indexing for stats collection");
188 $use_indexer = undef;
189 } elsif ( $use_indexer ) {
190 $log->info("using $use_indexer indexing engine...");
193 # parse normalize files and create source files for lookup and normalization
195 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : (); # "database/input" filter; avoids "my ... if COND", whose behaviour perlsyn leaves undefined
197 my $parser = new WebPAC::Parser(
199 only_database => $only_database,
200 only_input => $only_input,
204 my $start_t = time();
209 eval 'use Proc::Queue size => 1;';
211 $log->info("Using $parallel processes for speedup");
212 Proc::Queue::size($parallel);
215 sub create_ds_config {
216 my ($db_config, $database, $input, $mfn) = @_;
217 my $c = dclone( $db_config );
218 $c->{_} = $database || $log->logconfess("need database");
219 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
220 $c->{input} = $input || $log->logconfess("need input");
224 foreach my $database ( sort keys %{ $config->databases } ) {
225 my $db_config = $config->databases->{$database};
227 next if ($only_database && $database !~ m/$only_database/i);
231 if(defined ($f) and $f==0) {
232 $log->info("Created processes $$ for speedup");
239 if ($use_indexer && $parser->have_rules( 'search', $database )) {
241 my $cfg_name = $use_indexer;
242 $cfg_name =~ s/\-.*$//;
244 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
245 $indexer_config->{database} = $database;
246 $indexer_config->{clean} = $clean;
247 $indexer_config->{label} = $db_config->{name};
249 # force clean if database has links
250 $indexer_config->{clean} = 1 if ($db_config->{links});
252 if ($use_indexer eq 'hyperestraier') {
254 # open Hyper Estraier database
255 require WebPAC::Output::Estraier;
256 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
258 } elsif ($use_indexer eq 'hyperestraier-native') {
260 # open Hyper Estraier database
261 require WebPAC::Output::EstraierNative;
262 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
264 } elsif ($use_indexer eq 'kinosearch') {
266 die "no longer supported";
269 $log->logdie("unknown use_indexer: $use_indexer");
272 $log->logdie("can't continue without valid indexer") unless ($indexer);
277 # store Hyper Estraier links to other databases
279 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
280 foreach my $link (@{ $db_config->{links} }) {
281 if ($use_indexer eq 'hyperestraier') {
283 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
285 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
287 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
291 credit => $link->{credit},
296 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
300 next if ($only_links);
306 my $store = new WebPAC::Store({
314 my @outputs = force_array( $db_config->{output}, sub {
315 $log->error("Database $database doesn't have any outputs defined. Do you want to remove it from configuration?" );
320 foreach my $output ( @outputs ) {
322 #warn '## output = ',dump( $output );
324 my $module = $output->{module} || $log->logdie("need module in output section of $database");
325 $module = 'WebPAC::Output::' . $module unless $module =~ m/::/;
327 $log->debug("loading output module $module");
328 eval "require $module";
330 # add database to arguments for output filter
331 $output->{database} = $database;
332 $output->{clean} = $clean;
334 $log->debug("calling $module->new(",dump( $output ),")");
335 my $out = new $module->new( $output );
337 push @output_modules, $out;
339 $log->warn("SKIPPED $module");
345 # now, iterate through input formats
349 my @inputs = force_array( $db_config->{input}, sub {
350 $log->info("database $database doesn't have inputs defined");
353 if ( -e 'out/debug' ) { # FIXME flag?
355 foreach my $i ( @inputs ) {
357 next unless defined $i->{normalize};
358 warn dump( $i->{normalize} );
359 foreach my $normalize ( @{ $i->{normalize} } ) {
360 my $path = $normalize->{path};
361 $out .= qq/\n##\n## $path\n##\n\n/;
362 $out .= read_file( $path );
365 my $all = "out/debug/all-normalize.pl";
366 write_file( $all, $out );
367 warn "### all normalize for this input saved to: $all";
370 foreach my $input (@inputs) {
372 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
374 if ( $input->{skip} ) {
375 $log->info("skip $input_name");
379 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
381 my $type = lc($input->{type});
383 # FIXME check if input module exists
384 my $input_module = $input->{module};
386 if ( ! $input_module ) {
387 if ( grep(/$type/, $config->webpac('inputs')) ) {
388 $input_module = $config->webpac('inputs')->{$type};
390 $log->logdie("I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!" );
394 my @lookups = $parser->have_lookup_create($database, $input);
396 $log->info("working on $database/$input_name with $input_module on $input->{path}",
397 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
401 # disable modification of records if --stats is in use
402 delete($input->{modify_records});
403 delete($input->{modify_file});
406 my $input_db = new WebPAC::Input(
407 module => $input_module,
408 limit => $limit || $input->{limit},
410 recode => $input->{recode},
412 modify_records => $input->{modify_records},
413 modify_file => $input->{modify_file},
414 input_config => $input,
416 $log->logdie("can't create input using $input_module") unless ($input_db); # check the object returned by WebPAC::Input's constructor; $input is the config entry already dereferenced above, so testing it could never fail
418 if (defined( $input->{lookup} )) {
419 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
420 delete( $input->{lookup} );
427 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
429 $lookup_coderef = sub {
430 my $rec = shift || die "need rec!";
431 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
433 WebPAC::Normalize::data_structure(
436 config => create_ds_config( $db_config, $database, $input, $mfn ),
439 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
442 WebPAC::Normalize::_set_lookup( undef );
444 $log->debug("created lookup_coderef using:\n$rules");
450 my $maxmfn = $input_db->open(
451 path => $input->{path},
452 input_encoding => $input->{encoding}, # database encoding
453 lookup_coderef => $lookup_coderef,
454 lookup => $lookup_jar,
458 return $store->load_row(
459 database => $database,
460 input => $input_name,
466 return $store->save_row(
467 database => $database,
468 input => $input_name,
476 my $lookup_data = WebPAC::Normalize::_get_lookup();
478 if (defined( $lookup_data->{$database}->{$input_name} )) {
479 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
481 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
483 database => $database,
484 input => $input_name,
486 data => $lookup_data->{$database}->{$input_name}->{$key},
492 if ($stats || $validate) {
493 my $path = "out/report/${database}-${input_name}.txt";
494 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
496 print $report_fh "Report for database '$database' input '$input_name' records ",
497 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
498 $log->info("Generating report file $path");
501 $validate->read_validate_file( $validate->fill_in( $validate_path, database => $database, input => $input_name ) ) if ( $validate_path );
502 $validate->read_validate_delimiters_file( $validate->fill_in( $validate_delimiters_path, database => $database, input => $input_name ) ) if ( $validate_delimiters_path );
507 if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
508 $marc = new WebPAC::Output::MARC(
509 path => "out/marc/${database}-${input_name}.marc",
515 my $rules = $parser->normalize_rules($database,$input_name);
516 $log->logwarn("no normalize rules for $database/$input_name") unless $rules;
518 $log->debug("parsed normalize rules:\n$rules");
520 # reset position in database
523 # generate name of config key for indexer (strip everything after -)
524 my $indexer_config = $use_indexer;
525 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
528 my $depends = $parser->depends($database,$input_name);
531 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
532 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
534 foreach my $db (keys %$depends) {
535 foreach my $i (keys %{$depends->{$db}}) {
536 foreach my $k (keys %{$depends->{$db}->{$i}}) {
538 $log->debug("loading lookup $db/$i");
539 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
544 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
549 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
553 # setup input name for all output filters
554 foreach my $out ( @output_modules ) {
555 if ( $out->can('input') ) {
556 $out->input( $input_name );
558 $log->warn("output filter ",ref($out)," doesn't support input name");
563 foreach my $pos ( 0 ... $input_db->size ) {
565 my $row = $input_db->fetch || next;
569 my $mfn = $row->{'000'}->[0];
571 if (! $mfn || $mfn !~ m{^\d+$}) {
572 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
574 push @{ $row->{'000'} }, $pos;
579 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
580 $log->error( "MFN $mfn validation error:\n",
581 $validate->report_error( $errors )
584 next; # validation doesn't create any output
589 my $ds = WebPAC::Normalize::data_structure(
592 lookup => $lookup_hash,
593 config => create_ds_config( $db_config, $database, $input, $mfn ),
594 marc_encoding => 'utf-8',
595 load_row_coderef => sub {
596 my ($database,$input,$mfn) = @_;
597 #warn "### load_row($database,$input,$mfn) from data_structure\n";
598 return $store->load_row(
599 database => $database,
606 $log->debug("ds = ", sub { dump($ds) });
611 database => $database,
612 input => $input_name,
618 id => "${input_name}/${mfn}",
620 type => $config->get($indexer_config)->{type},
623 foreach my $out ( @output_modules ) {
624 $out->add( $mfn, $ds ) if $out->can('add');
628 $log->warn("record $pos didn't produce any output after normalization rules!") unless $marc;
635 while (my $fields = WebPAC::Normalize::MARC::_get_marc_fields( fetch_next => 1 ) ) {
637 id => $mfn . ( $i ? "/$i" : '' ),
639 leader => WebPAC::Normalize::MARC::_get_marc_leader(),
645 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
651 my $errors = $validate->report;
653 $log->info("validation errors:\n$errors\n" );
654 print $report_fh "$errors\n" if ($report_fh);
657 print $report_fh "\nAll possible subfields/delimiter templates:\n", $validate->delimiters_templates( report => 1, current_input => 1 ), "\n\n";
659 # must be last thing that touches $validate for this input
664 my $s = $input_db->stats;
665 $log->info("statistics of fields usage:\n$s");
666 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
670 $marc->finish if ($marc);
673 close($report_fh) if ($report_fh);
676 $indexer->finish if $indexer && $indexer->can('finish');
678 foreach my $out ( @output_modules ) {
679 $out->finish if $out->can('finish');
682 my $dt = time() - $start_t;
683 $log->info("$total_rows records ", $indexer ? "indexed " : "",
684 sprintf("in %.2f sec [%.2f rec/sec]",
685 $dt, ($total_rows / $dt)
692 $log->info("parallel process $$ finished");
699 # wait for all children to finish
700 sleep(1) while wait != -1;
701 $log->info("all parallel processes finished");
704 # save new delimiters if needed
705 $validate->save_delimiters_templates if ( $validate_delimiters_path );
708 # handle links or merge after indexing
712 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
714 chmod 0700, $estcmd_path or $log->warn("can't chmod 0700 $estcmd_path: $!"); # "or" tests chmod()'s count of changed files; "||" bound to the path string, so the warning never fired
717 foreach my $link (@links) {
718 $log->logdie("coderef in link ", Dumper($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');