6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.15;
13 use WebPAC::Normalize 0.22;
14 #use WebPAC::Output::TT;
15 use WebPAC::Validate 0.11;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
27 use POSIX ":sys_wait_h"; # imports WNOHANG
31 run.pl - start WebPAC indexing
33 B<this command will probably go away. Don't get used to it!>
41 start loading (all) databases at offset 42
45 limit loading to 100 records
49 remove database and Hyper Estraier index before indexing
51 =item --only=database_name/input_filter
53 reindex just single database (legacy name is --one)
55 C</input_filter> is an optional part which can be C<name>
58 =item --config conf/config.yml
60 path to YAML configuration file
64 disable indexing, modify_* in configuration and dump statistics about field
65 and subfield usage for each input
67 =item --validate path/to/validation_file
69 turn on extra validation of input records, see L<WebPAC::Validate>
71 You can use special variables C<$database> and C<$input> in this parameter
72 like C<--validate 'conf/validate/$database-$input'> to construct filename
74 =item --validate-delimiters path/to/validate_delimiters_file
76 this option is used with C<--validate> to turn on extra validation of
77 delimiters. If the file does not exist, it will be created on first run.
81 Generate MARC file. This will automatically be on if file contains C<marc*> directives.
82 You can use this option as C<--no-marc-generate> to disable MARC generation.
86 By default turned on if normalisation file has C<marc*> directives. You can disable lint
87 messages with C<--no-marc-lint>.
91 Force dump of input and MARC record for debugging.
95 Run databases in parallel (approximately the same as number of processors in
96 machine if you want to use full load)
104 Create merged index of databases which have links
106 =item --mirror http://www.example.com
108 Tries to download input path files from mirror URI
123 my $validate_delimiters_path;
124 my $marc_generate = 1;
133 my $log = _new WebPAC::Common()->_get_logger();
136 "limit=i" => \$limit,
137 "offset=i" => \$offset,
139 "one=s" => \$only_filter,
140 "only=s" => \$only_filter,
141 "config=s" => \$config_path,
144 "validate=s" => \$validate_path,
145 "validate-delimiters=s" => \$validate_delimiters_path,
146 "marc-generate!" => \$marc_generate,
147 "marc-lint!" => \$marc_lint,
148 "marc-dump!" => \$marc_dump,
149 "parallel=i" => \$parallel,
150 "only-links!" => \$only_links,
152 "mirror=s" => \$mirror,
156 $marc_generate = 0 if ( $validate_delimiters_path );
158 pod2usage(-verbose => 2) if ($help);
160 my $config = new WebPAC::Config( path => $config_path );
162 WebPAC::Normalize::_debug( $debug - 1 ) if $debug > 1;
164 #print "config = ",dump($config) if ($debug);
166 die "no databases in config file!\n" unless ($config->databases);
168 $log->info( "-" x 79 );
170 my $log_file = 'log';
172 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
173 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
174 rename $log_file, "${log_file}.old" or $log->logwarn("can't rename $log_file to ${log_file}.old: $!"); # low-precedence 'or' so the failure check applies to rename() itself, not to the (always true) target file name
178 my $estcmd_path = './estcmd-merge.sh';
180 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
181 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
182 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
183 $log->info("created merge batch file $estcmd_path");
187 $validate = new WebPAC::Validate(
188 delimiters => $config->webpac('delimiters'),
189 ) if ($validate_path || $validate_delimiters_path);
191 my $use_indexer = $config->use_indexer;
192 $stats ||= $validate;
194 $log->debug("disabled indexing for stats collection");
195 $use_indexer = undef;
196 } elsif ( $use_indexer ) {
197 $log->info("using $use_indexer indexing engine...");
200 # parse normalize files and create source files for lookup and normalization
202 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : (); # always declare both; 'my ... if COND' has undefined behaviour when COND is false
204 my $parser = new WebPAC::Parser(
206 only_database => $only_database,
207 only_input => $only_input,
211 my $start_t = time();
216 eval 'use Proc::Queue size => 1;';
218 $log->info("Using $parallel processes for speedup");
219 Proc::Queue::size($parallel);
222 sub create_ds_config {
223 my ($db_config, $database, $input, $mfn) = @_;
224 my $c = dclone( $db_config );
225 $c->{_} = $database || $log->logconfess("need database");
226 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
227 $c->{input} = $input || $log->logconfess("need input");
231 foreach my $database ( sort keys %{ $config->databases } ) {
232 my $db_config = $config->databases->{$database};
234 next if ($only_database && $database !~ m/$only_database/i);
238 if(defined ($f) and $f==0) {
239 $log->info("Created processes $$ for speedup");
246 if ($use_indexer && $parser->have_rules( 'search', $database )) {
248 my $cfg_name = $use_indexer;
249 $cfg_name =~ s/\-.*$//;
251 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
252 $indexer_config->{database} = $database;
253 $indexer_config->{clean} = $clean;
254 $indexer_config->{label} = $db_config->{name};
256 # force clean if database has links
257 $indexer_config->{clean} = 1 if ($db_config->{links});
259 if ($use_indexer eq 'hyperestraier') {
261 # open Hyper Estraier database
262 require WebPAC::Output::Estraier;
263 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
265 } elsif ($use_indexer eq 'hyperestraier-native') {
267 # open Hyper Estraier database
268 require WebPAC::Output::EstraierNative;
269 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
271 } elsif ($use_indexer eq 'kinosearch') {
273 die "no longer supported";
276 $log->logdie("unknown use_indexer: $use_indexer");
279 $log->logdie("can't continue without valid indexer") unless ($indexer);
284 # store Hyper Estraier links to other databases
286 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
287 foreach my $link (@{ $db_config->{links} }) {
288 if ($use_indexer eq 'hyperestraier') {
290 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
292 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
294 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
298 credit => $link->{credit},
303 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
307 next if ($only_links);
313 my $store = new WebPAC::Store({
321 my @outputs = force_array( $db_config->{output}, sub {
322 $log->error("Database $database doesn't have any outputs defined. Do you want to remove it from configuration?" );
327 foreach my $output ( @outputs ) {
329 #warn '## output = ',dump( $output );
331 my $module = $output->{module} || $log->logdie("need module in output section of $database");
332 $module = 'WebPAC::Output::' . $module unless $module =~ m/::/;
334 $log->debug("loading output module $module");
335 eval "require $module";
337 # add database to arguments for output filter
338 $output->{database} = $database;
339 $output->{clean} = $clean;
341 $log->debug("calling $module->new(",dump( $output ),")");
342 my $out = new $module->new( $output );
344 push @output_modules, $out;
346 $log->warn("SKIPPED $module");
352 # now, iterate through input formats
356 my @inputs = force_array( $db_config->{input}, sub {
357 $log->info("database $database doesn't have inputs defined");
360 if ( -e 'out/debug' ) { # FIXME flag?
362 foreach my $i ( @inputs ) {
364 next unless defined $i->{normalize};
365 warn dump( $i->{normalize} );
366 foreach my $normalize ( @{ $i->{normalize} } ) {
367 my $path = $normalize->{path};
368 $out .= qq/\n##\n## $path\n##\n\n/;
369 $out .= read_file( $path );
372 my $all = "out/debug/all-normalize.pl";
373 write_file( $all, $out );
374 warn "### all normalize for this input saved to: $all";
377 foreach my $input (@inputs) {
379 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
381 if ( $input->{skip} ) {
382 $log->info("skip $input_name");
386 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
388 my $type = lc($input->{type});
390 # FIXME check if input module exists
391 my $input_module = $input->{module};
393 if ( ! $input_module ) {
394 if ( grep(/$type/, $config->webpac('inputs')) ) {
395 $input_module = $config->webpac('inputs')->{$type};
397 $log->logdie("I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!" );
401 my @lookups = $parser->have_lookup_create($database, $input);
403 $log->info("working on $database/$input_name with $input_module on $input->{path}",
404 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
408 # disable modification of records if --stats is in use
409 delete($input->{modify_records});
410 delete($input->{modify_file});
414 my $path = $input->{path} || die "no input path in ",dump( $input );
417 $base =~ s{/[^/]+$}{};
418 mkpath $base unless -e $base;
420 my $rc = LWP::Simple::mirror( "$mirror/$path", $path );
421 if (LWP::Simple::is_error( $rc )) {
422 die "can't mirror $mirror/$path -> $path [$rc]";
424 $log->info( "mirror ", $path, " [$rc] ", -s $path, " bytes" );
429 my $input_db = new WebPAC::Input(
430 module => $input_module,
431 limit => $limit || $input->{limit},
433 recode => $input->{recode},
435 modify_records => $input->{modify_records},
436 modify_file => $input->{modify_file},
437 input_config => $input,
439 $log->logdie("can't create input using $input_module") unless ($input_db); # test the WebPAC::Input object just constructed; $input is the config hash and is always true here
441 if (defined( $input->{lookup} )) {
442 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
443 delete( $input->{lookup} );
450 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
452 $lookup_coderef = sub {
453 my $rec = shift || die "need rec!";
454 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
456 WebPAC::Normalize::data_structure(
459 config => create_ds_config( $db_config, $database, $input, $mfn ),
462 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
465 WebPAC::Normalize::_set_lookup( undef );
467 $log->debug("created lookup_coderef using:\n$rules");
473 my $maxmfn = $input_db->open(
474 path => $input->{path},
475 input_encoding => $input->{encoding}, # database encoding
476 lookup_coderef => $lookup_coderef,
477 lookup => $lookup_jar,
481 return $store->load_row(
482 database => $database,
483 input => $input_name,
489 return $store->save_row(
490 database => $database,
491 input => $input_name,
499 my $lookup_data = WebPAC::Normalize::_get_lookup();
501 if (defined( $lookup_data->{$database}->{$input_name} )) {
502 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
504 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
506 database => $database,
507 input => $input_name,
509 data => $lookup_data->{$database}->{$input_name}->{$key},
515 if ($stats || $validate) {
516 my $out_report = 'out/report'; # FIXME move to config
517 mkpath $out_report unless -e $out_report;
518 my $path = "$out_report/${database}-${input_name}.txt";
519 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
521 print $report_fh "Report for database '$database' input '$input_name' records ",
522 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
523 $log->info("Generating report file $path");
526 $validate->read_validate_file( $validate->fill_in( $validate_path, database => $database, input => $input_name ) ) if ( $validate_path );
527 $validate->read_validate_delimiters_file( $validate->fill_in( $validate_delimiters_path, database => $database, input => $input_name ) ) if ( $validate_delimiters_path );
532 if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
534 my $out_marc = 'out/marc'; # FIXME move to config
535 mkpath $out_marc unless -e $out_marc;
537 $marc = new WebPAC::Output::MARC(
538 path => "$out_marc/${database}-${input_name}.marc",
544 my $rules = $parser->normalize_rules($database,$input_name);
545 $log->logwarn("no normalize rules for $database/$input_name", $input_db->input_module->can('normalize') ? " using normalize from input module" : '') unless $rules;
547 $log->debug("parsed normalize rules:\n$rules");
549 # reset position in database
552 # generate name of config key for indexer (strip everything after -)
553 my $indexer_config = $use_indexer;
554 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
557 my $depends = $parser->depends($database,$input_name);
560 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
561 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
563 foreach my $db (keys %$depends) {
564 foreach my $i (keys %{$depends->{$db}}) {
565 foreach my $k (keys %{$depends->{$db}->{$i}}) {
567 $log->debug("loading lookup $db/$i");
568 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
573 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
578 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
582 # setup input name for all output filters
583 foreach my $out ( @output_modules ) {
584 if ( $out->can('input') ) {
585 $out->input( $input_name );
587 $log->warn("output filter ",ref($out)," doesn't support input name");
592 foreach my $pos ( 0 ... $input_db->size ) {
594 my $row = $input_db->fetch || next;
598 my $mfn = $row->{'000'}->[0];
600 if (! $mfn || $mfn !~ m{^\d+$}) {
601 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
603 push @{ $row->{'000'} }, $pos;
606 foreach my $out ( @output_modules ) {
607 $out->add_row( $mfn, $row ) if $out->can('add_row');
611 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
612 $log->error( "MFN $mfn validation error:\n",
613 $validate->report_error( $errors )
616 next; # validation doesn't create any output
623 $ds = WebPAC::Normalize::data_structure(
626 lookup => $lookup_hash,
627 config => create_ds_config( $db_config, $database, $input, $mfn ),
628 marc_encoding => 'utf-8',
629 load_row_coderef => sub {
630 my ($database,$input,$mfn) = @_;
631 #warn "### load_row($database,$input,$mfn) from data_structure\n";
632 return $store->load_row(
633 database => $database,
640 } elsif ( $input_db->input_module->can('normalize') ) {
641 $ds = $input_db->input_module->normalize( $mfn );
645 $log->debug("ds = ", sub { dump($ds) });
648 database => $database,
649 input => $input_name,
655 id => "${input_name}/${mfn}",
657 type => $config->get($indexer_config)->{type},
660 foreach my $out ( @output_modules ) {
661 $out->add( $mfn, $ds ) if $out->can('add');
665 $log->warn("record $pos didn't produce any output after normalization rules!") unless $marc;
670 while (my $fields = WebPAC::Normalize::MARC::_get_marc_fields( fetch_next => 1 ) ) {
672 id => $mfn . ( $i ? "/$i" : '' ),
674 leader => WebPAC::Normalize::MARC::_get_marc_leader(),
680 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
686 my $errors = $validate->report;
688 $log->info("validation errors:\n$errors\n" );
689 print $report_fh "$errors\n" if ($report_fh);
692 print $report_fh "\nAll possible subfields/delimiter templates:\n", $validate->delimiters_templates( report => 1, current_input => 1 ), "\n\n";
694 # must be last thing that touches $validate for this input
699 my $s = $input_db->stats;
700 $log->info("statistics of fields usage:\n$s");
701 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
705 $marc->finish if ($marc);
708 close($report_fh) if ($report_fh);
711 $indexer->finish if $indexer && $indexer->can('finish');
713 foreach my $out ( @output_modules ) {
714 $out->finish if $out->can('finish');
717 my $dt = time() - $start_t;
718 $log->info("$total_rows records ", $indexer ? "indexed " : "",
719 sprintf("in %.2f sec [%.2f rec/sec]",
720 $dt, ($total_rows / $dt)
727 $log->info("parallel process $$ finished");
734 # wait for all children to finish
735 sleep(1) while wait != -1;
736 $log->info("all parallel processes finished");
739 # save new delimiters if needed
740 $validate->save_delimiters_templates if ( $validate_delimiters_path );
743 # handle links or merge after indexing
747 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
749 chmod 0700, $estcmd_path or $log->warn("can't chmod 0700 $estcmd_path: $!"); # 'or' binds looser than chmod's argument list, so the warning fires when chmod() fails
752 foreach my $link (@links) {
753 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE'); # use dump() from Data::Dump, the dumper this script imports; no Dumper import is visible