6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.15;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.11;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
26 use Proc::Queue size => 1;
27 use POSIX ":sys_wait_h"; # imports WNOHANG
31 run.pl - start WebPAC indexing
33 B<this command will probably go away. Don't get used to it!>
41 start loading (all) databases at offset 42
45 limit loading to 100 records
49 remove database and Hyper Estraier index before indexing
51 =item --only=database_name/input_filter
53 reindex just single database (legacy name is --one)
55 C</input_filter> is optional part which can be C<name>
58 =item --config conf/config.yml
60 path to YAML configuration file
64 disable indexing, modify_* in configuration and dump statistics about field
65 and subfield usage for each input
67 =item --validate path/to/validation_file
69 turn on extra validation of input records, see L<WebPAC::Validate>
71 You can use special variables C<$database> and C<$input> in this parameter
72 like C<--validate 'conf/validate/$database-$input'> to construct filename
74 =item --validate-delimiters path/to/validate_delimiters_file
76 this option is used with C<--validate> to turn on extra validation of
77 delimiters. If the file does not exist, it will be created on the first run.
81 Generate MARC file. This will automatically be on if file contains C<marc*> directives.
82 You can use this option as C<--no-marc-generate> to disable MARC generation.
86 By default turned on if normalisation file has C<marc*> directives. You can disable lint
87 messages with C<--no-marc-lint>.
91 Force dump of input and MARC record for debugging.
95 Run databases in parallel (use approximately the same number as processors
96 in the machine if you want to use full load)
104 Create merged index of databases which have links
119 my $validate_delimiters_path;
120 my $marc_generate = 1;
128 my $log = _new WebPAC::Common()->_get_logger();
131 "limit=i" => \$limit,
132 "offset=i" => \$offset,
134 "one=s" => \$only_filter,
135 "only=s" => \$only_filter,
136 "config=s" => \$config_path,
139 "validate=s" => \$validate_path,
140 "validate-delimiters=s" => \$validate_delimiters_path,
141 "marc-generate!" => \$marc_generate,
142 "marc-lint!" => \$marc_lint,
143 "marc-dump!" => \$marc_dump,
144 "parallel=i" => \$parallel,
145 "only-links!" => \$only_links,
150 $marc_generate = 0 if ( $validate_delimiters_path );
152 pod2usage(-verbose => 2) if ($help);
154 my $config = new WebPAC::Config( path => $config_path );
156 #print "config = ",dump($config) if ($debug);
158 die "no databases in config file!\n" unless ($config->databases);
160 $log->info( "-" x 79 );
162 my $log_file = 'log';
164 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
165 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
166 rename($log_file, "${log_file}.old") or $log->logwarn("can't rename $log_file to ${log_file}.old: $!"); # low-precedence 'or' so the warning fires when rename() itself fails ('||' bound to the string operand and never triggered)
170 my $estcmd_path = './estcmd-merge.sh';
172 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
173 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
174 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
175 $log->info("created merge batch file $estcmd_path");
179 $validate = new WebPAC::Validate(
180 delimiters => $config->webpac('delimiters'),
181 ) if ($validate_path || $validate_delimiters_path);
183 my $use_indexer = $config->use_indexer;
184 $stats ||= $validate;
186 $log->debug("disabled indexing for stats collection");
187 $use_indexer = undef;
188 } elsif ( $use_indexer ) {
189 $log->info("using $use_indexer indexing engine...");
192 # parse normalize files and create source files for lookup and normalization
194 my $parser = new WebPAC::Parser( config => $config );
197 my $start_t = time();
202 $log->info("Using $parallel processes for speedup");
203 Proc::Queue::size($parallel);
206 sub create_ds_config {
207 my ($db_config, $database, $input, $mfn) = @_;
208 my $c = dclone( $db_config );
209 $c->{_} = $database || $log->logconfess("need database");
210 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
211 $c->{input} = $input || $log->logconfess("need input");
215 foreach my $database ( sort keys %{ $config->databases } ) {
216 my $db_config = $config->databases->{$database};
218 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : (); # split --only=database/input; conditional expression instead of 'my ... if COND', whose behaviour is undefined (perlsyn)
219 next if ($only_database && $database !~ m/$only_database/i);
223 if(defined ($f) and $f==0) {
224 $log->info("Created processes $$ for speedup");
231 if ($use_indexer && $parser->have_rules( 'search', $database )) {
233 my $cfg_name = $use_indexer;
234 $cfg_name =~ s/\-.*$//;
236 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
237 $indexer_config->{database} = $database;
238 $indexer_config->{clean} = $clean;
239 $indexer_config->{label} = $db_config->{name};
241 # force clean if database has links
242 $indexer_config->{clean} = 1 if ($db_config->{links});
244 if ($use_indexer eq 'hyperestraier') {
246 # open Hyper Estraier database
247 require WebPAC::Output::Estraier;
248 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
250 } elsif ($use_indexer eq 'hyperestraier-native') {
252 # open Hyper Estraier database
253 require WebPAC::Output::EstraierNative;
254 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
256 } elsif ($use_indexer eq 'kinosearch') {
258 die "no longer supported";
261 $log->logdie("unknown use_indexer: $use_indexer");
264 $log->logdie("can't continue without valid indexer") unless ($indexer);
269 # store Hyper Estraier links to other databases
271 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
272 foreach my $link (@{ $db_config->{links} }) {
273 if ($use_indexer eq 'hyperestraier') {
275 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
277 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
279 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
283 credit => $link->{credit},
288 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
292 next if ($only_links);
298 my $store = new WebPAC::Store({
306 my @outputs = force_array( $db_config->{output}, sub {
307 $log->error("Database $database doesn't have any outputs defined. Do you want to remove it from configuration?" );
312 foreach my $output ( @outputs ) {
314 #warn '## output = ',dump( $output );
316 my $module = $output->{module} || $log->logdie("need module in output section of $database");
317 $module = 'WebPAC::Output::' . $module unless $module =~ m/::/;
319 $log->debug("loading output module $module");
320 eval "require $module";
322 # add database to arguments for output filter
323 $output->{database} = $database;
325 $log->debug("calling $module->new(",dump( $output ),")");
326 my $out = new $module->new( $output );
329 push @output_modules, $out;
334 # now, iterate through input formats
338 my @inputs = force_array( $db_config->{input}, sub {
339 $log->info("database $database doesn't have inputs defined");
342 foreach my $input (@inputs) {
344 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
346 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
348 my $type = lc($input->{type});
350 # FIXME check if input module exists
351 my $input_module = $input->{module};
353 if ( ! $input_module ) {
354 if ( grep(/$type/, $config->webpac('inputs')) ) {
355 $input_module = $config->webpac('inputs')->{$type};
357 $log->logdie("I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!" );
361 my @lookups = $parser->have_lookup_create($database, $input);
363 $log->info("working on input '$input_name' in $input->{path} [type: $input->{type}] using $input_module",
364 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
368 # disable modification of records if --stats is in use
369 delete($input->{modify_records});
370 delete($input->{modify_file});
373 my $input_db = new WebPAC::Input(
374 module => $input_module,
375 encoding => $config->webpac('webpac_encoding'),
376 limit => $limit || $input->{limit},
378 recode => $input->{recode},
380 modify_records => $input->{modify_records},
381 modify_file => $input->{modify_file},
382 input_config => $input,
384 $log->logdie("can't create input using $input_module") unless ($input_db); # test the WebPAC::Input object just constructed; $input is the config hash and is always true here
386 if (defined( $input->{lookup} )) {
387 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
388 delete( $input->{lookup} );
395 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
397 $lookup_coderef = sub {
398 my $rec = shift || die "need rec!";
399 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
401 WebPAC::Normalize::data_structure(
404 config => create_ds_config( $db_config, $database, $input, $mfn ),
407 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
410 WebPAC::Normalize::_set_lookup( undef );
412 $log->debug("created lookup_coderef using:\n$rules");
418 my $maxmfn = $input_db->open(
419 path => $input->{path},
420 code_page => $input->{encoding}, # database encoding
421 lookup_coderef => $lookup_coderef,
422 lookup => $lookup_jar,
426 return $store->load_row(
427 database => $database,
428 input => $input_name,
434 return $store->save_row(
435 database => $database,
436 input => $input_name,
444 my $lookup_data = WebPAC::Normalize::_get_lookup();
446 if (defined( $lookup_data->{$database}->{$input_name} )) {
447 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
449 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
451 database => $database,
452 input => $input_name,
454 data => $lookup_data->{$database}->{$input_name}->{$key},
460 if ($stats || $validate) {
461 my $path = "out/report/${database}-${input_name}.txt";
462 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
464 print $report_fh "Report for database '$database' input '$input_name' records ",
465 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
466 $log->info("Generating report file $path");
469 $validate->read_validate_file( $validate->fill_in( $validate_path, database => $database, input => $input_name ) ) if ( $validate_path );
470 $validate->read_validate_delimiters_file( $validate->fill_in( $validate_delimiters_path, database => $database, input => $input_name ) ) if ( $validate_delimiters_path );
475 if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
476 $marc = new WebPAC::Output::MARC(
477 path => "out/marc/${database}-${input_name}.marc",
483 my $rules = $parser->normalize_rules($database,$input_name) || $log->logdie("no normalize rules found for $database/$input_name");
484 $log->debug("parsed normalize rules:\n$rules");
486 # reset position in database
489 # generate name of config key for indexer (strip everything after -)
490 my $indexer_config = $use_indexer;
491 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
494 my $depends = $parser->depends($database,$input_name);
497 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
498 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
500 foreach my $db (keys %$depends) {
501 foreach my $i (keys %{$depends->{$db}}) {
502 foreach my $k (keys %{$depends->{$db}->{$i}}) {
504 $log->debug("loading lookup $db/$i");
505 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
510 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
515 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
519 # setup input name for all output filters
520 foreach my $out ( @output_modules ) {
521 if ( $out->can('input') ) {
522 $out->input( $input_name );
524 $log->warn("output filter ",ref($out)," doesn't support input name");
529 foreach my $pos ( 0 ... $input_db->size ) {
531 my $row = $input_db->fetch || next;
535 my $mfn = $row->{'000'}->[0];
537 if (! $mfn || $mfn !~ m{^\d+$}) {
538 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
540 push @{ $row->{'000'} }, $pos;
545 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
546 $log->error( "MFN $mfn validation error:\n",
547 $validate->report_error( $errors )
550 next; # validation doesn't create any output
553 if ( my $ds = WebPAC::Normalize::data_structure(
556 lookup => $lookup_hash,
557 config => create_ds_config( $db_config, $database, $input, $mfn ),
558 marc_encoding => 'utf-8',
559 load_row_coderef => sub {
560 my ($database,$input,$mfn) = @_;
561 return $store->load_row(
562 database => $database,
569 $log->debug("ds = ", sub { dump($ds) });
572 database => $database,
573 input => $input_name,
579 id => "${input_name}/${mfn}",
581 type => $config->get($indexer_config)->{type},
587 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
589 id => $mfn . ( $i ? "/$i" : '' ),
591 leader => WebPAC::Normalize::_get_marc_leader(),
597 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
600 foreach my $out ( @output_modules ) {
601 $out->add( $mfn, $ds ) if $out->can('add');
605 $log->warn("record $pos didn't produce any output after normalization rules!");
611 my $errors = $validate->report;
613 $log->info("validation errors:\n$errors\n" );
614 print $report_fh "$errors\n" if ($report_fh);
617 print $report_fh "\nAll possible subfields/delimiter templates:\n", $validate->delimiters_templates( report => 1, current_input => 1 ), "\n\n";
619 # must be last thing that touches $validate for this input
624 my $s = $input_db->stats;
625 $log->info("statistics of fields usage:\n$s");
626 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
630 $marc->finish if ($marc);
633 close($report_fh) if ($report_fh);
636 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
638 foreach my $out ( @output_modules ) {
639 $out->finish if $out->can('finish');
642 my $dt = time() - $start_t;
643 $log->info("$total_rows records ", $indexer ? "indexed " : "",
644 sprintf("in %.2f sec [%.2f rec/sec]",
645 $dt, ($total_rows / $dt)
652 $log->info("parallel process $$ finished");
659 # wait for all children to finish
660 sleep(1) while wait != -1;
661 $log->info("all parallel processes finished");
664 # save new delimiters if needed
665 $validate->save_delimiters_templates if ( $validate_delimiters_path );
668 # handle links or merge after indexing
672 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
674 chmod(0700, $estcmd_path) or $log->warn("can't chmod 0700 $estcmd_path: $!"); # chmod returns the number of files changed; 'or' (not '||') so a failure is actually reported
677 foreach my $link (@links) {
678 $log->logdie("coderef in link ", Dumper($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');