6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.15;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.11;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
26 use Proc::Queue size => 1;
27 use POSIX ":sys_wait_h"; # imports WNOHANG
31 run.pl - start WebPAC indexing
33 B<this command will probably go away. Don't get used to it!>
41 start loading (all) databases at offset 42
45 limit loading to 100 records
49 remove database and Hyper Estraier index before indexing
51 =item --only=database_name/input_filter
53 reindex just a single database (legacy name is --one)
55 C</input_filter> is optional part which can be C<name>
58 =item --config conf/config.yml
60 path to YAML configuration file
64 disable indexing, modify_* in configuration and dump statistics about field
65 and subfield usage for each input
67 =item --validate path/to/validation_file
69 turn on extra validation of input records, see L<WebPAC::Validate>
71 You can use special variables C<$database> and C<$input> in this parameter
72 like C<--validate 'conf/validate/$database-$input'> to construct filename
74 =item --validate-delimiters path/to/validate_delimiters_file
76 this option is used with C<--validate> to turn on extra validation of
77 delimiters. If the file does not exist, it will be created on the first run.
81 Generate MARC file. This will automatically be on if file contains C<marc*> directives.
82 You can use this option as C<--no-marc-generate> to disable MARC generation.
86 By default turned on if normalisation file has C<marc*> directives. You can disable lint
87 messages with C<--no-marc-lint>.
91 Force dump of input and MARC record for debugging.
95 Run databases in parallel (approximately the same as the number of processors in
96 machine if you want to use full load)
104 Create merged index of databases which have links
119 my $validate_delimiters_path;
120 my $marc_generate = 1;
128 my $log = _new WebPAC::Common()->_get_logger();
131 "limit=i" => \$limit,
132 "offset=i" => \$offset,
134 "one=s" => \$only_filter,
135 "only=s" => \$only_filter,
136 "config=s" => \$config_path,
139 "validate=s" => \$validate_path,
140 "validate-delimiters=s" => \$validate_delimiters_path,
141 "marc-generate!" => \$marc_generate,
142 "marc-lint!" => \$marc_lint,
143 "marc-dump!" => \$marc_dump,
144 "parallel=i" => \$parallel,
145 "only-links!" => \$only_links,
150 $marc_generate = 0 if ( $validate_delimiters_path );
152 pod2usage(-verbose => 2) if ($help);
154 my $config = new WebPAC::Config( path => $config_path );
156 WebPAC::Normalize::_debug( $debug - 1 ) if $debug > 1;
158 #print "config = ",dump($config) if ($debug);
160 die "no databases in config file!\n" unless ($config->databases);
162 $log->info( "-" x 79 );
164 my $log_file = 'log';
166 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
167 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
# Rotate the previous log out of the way. Use low-precedence `or`: with `||`
# the alternative binds to the "${log_file}.old" string (always true), so a
# failed rename was silently ignored and the warning could never fire.
168 rename $log_file, "${log_file}.old" or $log->logwarn("can't rename $log_file to ${log_file}.old: $!");
172 my $estcmd_path = './estcmd-merge.sh';
174 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
175 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
176 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
177 $log->info("created merge batch file $estcmd_path");
181 $validate = new WebPAC::Validate(
182 delimiters => $config->webpac('delimiters'),
183 ) if ($validate_path || $validate_delimiters_path);
185 my $use_indexer = $config->use_indexer;
186 $stats ||= $validate;
188 $log->debug("disabled indexing for stats collection");
189 $use_indexer = undef;
190 } elsif ( $use_indexer ) {
191 $log->info("using $use_indexer indexing engine...");
194 # parse normalize files and create source files for lookup and normalization
# Split --only=database/input into its two parts. The variables are declared
# unconditionally because `my ... if COND` has undefined behaviour in Perl;
# both stay undef when no filter was given.
196 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : ();
198 my $parser = new WebPAC::Parser(
200 only_database => $only_database,
201 only_input => $only_input,
205 my $start_t = time();
210 $log->info("Using $parallel processes for speedup");
211 Proc::Queue::size($parallel);
214 sub create_ds_config {
215 my ($db_config, $database, $input, $mfn) = @_;
216 my $c = dclone( $db_config );
217 $c->{_} = $database || $log->logconfess("need database");
218 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
219 $c->{input} = $input || $log->logconfess("need input");
223 foreach my $database ( sort keys %{ $config->databases } ) {
224 my $db_config = $config->databases->{$database};
226 next if ($only_database && $database !~ m/$only_database/i);
230 if(defined ($f) and $f==0) {
231 $log->info("Created processes $$ for speedup");
238 if ($use_indexer && $parser->have_rules( 'search', $database )) {
240 my $cfg_name = $use_indexer;
241 $cfg_name =~ s/\-.*$//;
243 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
244 $indexer_config->{database} = $database;
245 $indexer_config->{clean} = $clean;
246 $indexer_config->{label} = $db_config->{name};
248 # force clean if database has links
249 $indexer_config->{clean} = 1 if ($db_config->{links});
251 if ($use_indexer eq 'hyperestraier') {
253 # open Hyper Estraier database
254 require WebPAC::Output::Estraier;
255 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
257 } elsif ($use_indexer eq 'hyperestraier-native') {
259 # open Hyper Estraier database
260 require WebPAC::Output::EstraierNative;
261 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
263 } elsif ($use_indexer eq 'kinosearch') {
265 die "no longer supported";
268 $log->logdie("unknown use_indexer: $use_indexer");
271 $log->logdie("can't continue without valid indexer") unless ($indexer);
276 # store Hyper Estraier links to other databases
278 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
279 foreach my $link (@{ $db_config->{links} }) {
280 if ($use_indexer eq 'hyperestraier') {
282 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
284 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
286 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
290 credit => $link->{credit},
295 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
299 next if ($only_links);
305 my $store = new WebPAC::Store({
313 my @outputs = force_array( $db_config->{output}, sub {
314 $log->error("Database $database doesn't have any outputs defined. Do you want to remove it from configuration?" );
319 foreach my $output ( @outputs ) {
321 #warn '## output = ',dump( $output );
323 my $module = $output->{module} || $log->logdie("need module in output section of $database");
324 $module = 'WebPAC::Output::' . $module unless $module =~ m/::/;
326 $log->debug("loading output module $module");
327 eval "require $module";
329 # add database to arguments for output filter
330 $output->{database} = $database;
331 $output->{clean} = $clean;
333 $log->debug("calling $module->new(",dump( $output ),")");
334 my $out = new $module->new( $output );
336 push @output_modules, $out;
338 $log->warn("SKIPPED $module");
344 # now, iterate through input formats
348 my @inputs = force_array( $db_config->{input}, sub {
349 $log->info("database $database doesn't have inputs defined");
352 foreach my $input (@inputs) {
354 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
356 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
358 my $type = lc($input->{type});
360 # FIXME check if input module exists
361 my $input_module = $input->{module};
363 if ( ! $input_module ) {
364 if ( grep(/$type/, $config->webpac('inputs')) ) {
365 $input_module = $config->webpac('inputs')->{$type};
367 $log->logdie("I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!" );
371 my @lookups = $parser->have_lookup_create($database, $input);
373 $log->info("working on $database/$input_name with $input_module on $input->{path}",
374 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
378 # disable modification of records if --stats is in use
379 delete($input->{modify_records});
380 delete($input->{modify_file});
383 my $input_db = new WebPAC::Input(
384 module => $input_module,
385 encoding => $config->webpac('webpac_encoding'),
386 limit => $limit || $input->{limit},
388 recode => $input->{recode},
390 modify_records => $input->{modify_records},
391 modify_file => $input->{modify_file},
392 input_config => $input,
# Abort if the input object could not be constructed. The check must be on
# $input_db (the object just created); $input is the input configuration,
# which has already been dereferenced above and is therefore always true.
394 $log->logdie("can't create input using $input_module") unless ($input_db);
396 if (defined( $input->{lookup} )) {
397 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
398 delete( $input->{lookup} );
405 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
407 $lookup_coderef = sub {
408 my $rec = shift || die "need rec!";
409 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
411 WebPAC::Normalize::data_structure(
414 config => create_ds_config( $db_config, $database, $input, $mfn ),
417 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
420 WebPAC::Normalize::_set_lookup( undef );
422 $log->debug("created lookup_coderef using:\n$rules");
428 my $maxmfn = $input_db->open(
429 path => $input->{path},
430 code_page => $input->{encoding}, # database encoding
431 lookup_coderef => $lookup_coderef,
432 lookup => $lookup_jar,
436 return $store->load_row(
437 database => $database,
438 input => $input_name,
444 return $store->save_row(
445 database => $database,
446 input => $input_name,
454 my $lookup_data = WebPAC::Normalize::_get_lookup();
456 if (defined( $lookup_data->{$database}->{$input_name} )) {
457 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
459 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
461 database => $database,
462 input => $input_name,
464 data => $lookup_data->{$database}->{$input_name}->{$key},
470 if ($stats || $validate) {
471 my $path = "out/report/${database}-${input_name}.txt";
472 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
474 print $report_fh "Report for database '$database' input '$input_name' records ",
475 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
476 $log->info("Generating report file $path");
479 $validate->read_validate_file( $validate->fill_in( $validate_path, database => $database, input => $input_name ) ) if ( $validate_path );
480 $validate->read_validate_delimiters_file( $validate->fill_in( $validate_delimiters_path, database => $database, input => $input_name ) ) if ( $validate_delimiters_path );
485 if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
486 $marc = new WebPAC::Output::MARC(
487 path => "out/marc/${database}-${input_name}.marc",
493 my $rules = $parser->normalize_rules($database,$input_name);
494 $log->logwarn("no normalize rules for $database/$input_name") unless $rules;
496 $log->debug("parsed normalize rules:\n$rules");
498 # reset position in database
501 # generate name of config key for indexer (strip everything after -)
502 my $indexer_config = $use_indexer;
503 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
506 my $depends = $parser->depends($database,$input_name);
509 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
510 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
512 foreach my $db (keys %$depends) {
513 foreach my $i (keys %{$depends->{$db}}) {
514 foreach my $k (keys %{$depends->{$db}->{$i}}) {
516 $log->debug("loading lookup $db/$i");
517 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
522 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
527 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
531 # setup input name for all output filters
532 foreach my $out ( @output_modules ) {
533 if ( $out->can('input') ) {
534 $out->input( $input_name );
536 $log->warn("output filter ",ref($out)," doesn't support input name");
541 foreach my $pos ( 0 ... $input_db->size ) {
543 my $row = $input_db->fetch || next;
547 my $mfn = $row->{'000'}->[0];
549 if (! $mfn || $mfn !~ m{^\d+$}) {
550 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
552 push @{ $row->{'000'} }, $pos;
557 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
558 $log->error( "MFN $mfn validation error:\n",
559 $validate->report_error( $errors )
562 next; # validation doesn't create any output
567 my $ds = WebPAC::Normalize::data_structure(
570 lookup => $lookup_hash,
571 config => create_ds_config( $db_config, $database, $input, $mfn ),
572 marc_encoding => 'utf-8',
573 load_row_coderef => sub {
574 my ($database,$input,$mfn) = @_;
575 #warn "### load_row($database,$input,$mfn) from data_structure\n";
576 return $store->load_row(
577 database => $database,
584 $log->debug("ds = ", sub { dump($ds) });
589 database => $database,
590 input => $input_name,
596 id => "${input_name}/${mfn}",
598 type => $config->get($indexer_config)->{type},
601 foreach my $out ( @output_modules ) {
602 $out->add( $mfn, $ds ) if $out->can('add');
606 $log->warn("record $pos didn't produce any output after normalization rules!") unless $marc;
613 while (my $fields = WebPAC::Normalize::MARC::_get_marc_fields( fetch_next => 1 ) ) {
615 id => $mfn . ( $i ? "/$i" : '' ),
617 leader => WebPAC::Normalize::MARC::_get_marc_leader(),
623 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
629 my $errors = $validate->report;
631 $log->info("validation errors:\n$errors\n" );
632 print $report_fh "$errors\n" if ($report_fh);
635 print $report_fh "\nAll possible subfields/delimiter templates:\n", $validate->delimiters_templates( report => 1, current_input => 1 ), "\n\n";
637 # must be last thing that touches $validate for this input
642 my $s = $input_db->stats;
643 $log->info("statistics of fields usage:\n$s");
644 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
648 $marc->finish if ($marc);
651 close($report_fh) if ($report_fh);
654 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
656 foreach my $out ( @output_modules ) {
657 $out->finish if $out->can('finish');
660 my $dt = time() - $start_t;
661 $log->info("$total_rows records ", $indexer ? "indexed " : "",
662 sprintf("in %.2f sec [%.2f rec/sec]",
663 $dt, ($total_rows / $dt)
670 $log->info("parallel process $$ finished");
677 # wait for all children to finish
678 sleep(1) while wait != -1;
679 $log->info("all parallel processes finished");
682 # save new delimiters if needed
683 $validate->save_delimiters_templates if ( $validate_delimiters_path );
686 # handle links or merge after indexing
690 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
# Make the generated merge script executable by its owner only. Use
# low-precedence `or`: with `||` the alternative binds to $estcmd_path
# (always true), so a chmod failure was never reported.
692 chmod 0700, $estcmd_path or $log->warn("can't chmod 0700 $estcmd_path: $!");
695 foreach my $link (@links) {
# Every queued link must be a code reference. Format the offending value with
# dump() from Data::Dump, which this file imports and uses elsewhere; Dumper()
# is not among the imports visible here and would fail as an undefined
# subroutine exactly when this diagnostic is needed.
696 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');