6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.15;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.11;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
26 use Proc::Queue size => 1;
27 use POSIX ":sys_wait_h"; # imports WNOHANG
31 run.pl - start WebPAC indexing
33 B<this command will probably go away. Don't get used to it!>
41 start loading (all) databases at offset 42
45 limit loading to 100 records
49 remove database and Hyper Estraier index before indexing
51 =item --only=database_name/input_filter
53 reindex just a single database (legacy name is --one)
55 C</input_filter> is an optional part which can be C<name>
58 =item --config conf/config.yml
60 path to YAML configuration file
64 disable indexing, modify_* in configuration and dump statistics about field
65 and subfield usage for each input
67 =item --validate path/to/validation_file
69 turn on extra validation of input records, see L<WebPAC::Validate>
71 You can use special variables C<$database> and C<$input> in this parameter
72 like C<--validate 'conf/validate/$database-$input'> to construct filename
74 =item --validate-delimiters path/to/validate_delimiters_file
76 this option is used with C<--validate> to turn on extra validation of
77 delimiters. If the file does not exist, it will be created on first run.
81 Generate MARC file. This will automatically be on if file contains C<marc*> directives.
82 You can use this option as C<--no-marc-generate> to disable MARC generation.
86 By default turned on if normalisation file has C<marc*> directives. You can disable lint
87 messages with C<--no-marc-lint>.
91 Force dump of input and MARC record for debugging.
95 Run databases in parallel (use approximately the number of processors in
96 the machine if you want to use full load)
104 Create merged index of databases which have links
119 my $validate_delimiters_path;
120 my $marc_generate = 1;
128 my $log = _new WebPAC::Common()->_get_logger();
131 "limit=i" => \$limit,
132 "offset=i" => \$offset,
134 "one=s" => \$only_filter,
135 "only=s" => \$only_filter,
136 "config=s" => \$config_path,
139 "validate=s" => \$validate_path,
140 "validate-delimiters=s" => \$validate_delimiters_path,
141 "marc-generate!" => \$marc_generate,
142 "marc-lint!" => \$marc_lint,
143 "marc-dump!" => \$marc_dump,
144 "parallel=i" => \$parallel,
145 "only-links!" => \$only_links,
150 $marc_generate = 0 if ( $validate_delimiters_path );
152 pod2usage(-verbose => 2) if ($help);
154 my $config = new WebPAC::Config( path => $config_path );
156 #print "config = ",dump($config) if ($debug);
158 die "no databases in config file!\n" unless ($config->databases);
160 $log->info( "-" x 79 );
162 my $log_file = 'log';
164 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
165 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
166 rename($log_file, "${log_file}.old") || $log->logwarn("can't rename $log_file to ${log_file}.old: $!"); # parens needed: a bare || would bind to the target-name string, not to rename's result
170 my $estcmd_path = './estcmd-merge.sh';
172 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
173 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
174 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
175 $log->info("created merge batch file $estcmd_path");
179 $validate = new WebPAC::Validate(
180 delimiters => $config->webpac('delimiters'),
181 ) if ($validate_path || $validate_delimiters_path);
183 my $use_indexer = $config->use_indexer;
184 $stats ||= $validate;
186 $log->debug("disabled indexing for stats collection");
187 $use_indexer = undef;
188 } elsif ( $use_indexer ) {
189 $log->info("using $use_indexer indexing engine...");
192 # parse normalize files and create source files for lookup and normalization
194 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : (); # both stay undef without --only; avoids "my ... if COND", whose behaviour is undefined
196 my $parser = new WebPAC::Parser(
198 only_database => $only_database,
199 only_input => $only_input,
203 my $start_t = time();
208 $log->info("Using $parallel processes for speedup");
209 Proc::Queue::size($parallel);
212 sub create_ds_config {
213 my ($db_config, $database, $input, $mfn) = @_;
214 my $c = dclone( $db_config );
215 $c->{_} = $database || $log->logconfess("need database");
216 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
217 $c->{input} = $input || $log->logconfess("need input");
221 foreach my $database ( sort keys %{ $config->databases } ) {
222 my $db_config = $config->databases->{$database};
224 next if ($only_database && $database !~ m/$only_database/i);
228 if(defined ($f) and $f==0) {
229 $log->info("Created processes $$ for speedup");
236 if ($use_indexer && $parser->have_rules( 'search', $database )) {
238 my $cfg_name = $use_indexer;
239 $cfg_name =~ s/\-.*$//;
241 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
242 $indexer_config->{database} = $database;
243 $indexer_config->{clean} = $clean;
244 $indexer_config->{label} = $db_config->{name};
246 # force clean if database has links
247 $indexer_config->{clean} = 1 if ($db_config->{links});
249 if ($use_indexer eq 'hyperestraier') {
251 # open Hyper Estraier database
252 require WebPAC::Output::Estraier;
253 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
255 } elsif ($use_indexer eq 'hyperestraier-native') {
257 # open Hyper Estraier database
258 require WebPAC::Output::EstraierNative;
259 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
261 } elsif ($use_indexer eq 'kinosearch') {
263 die "no longer supported";
266 $log->logdie("unknown use_indexer: $use_indexer");
269 $log->logdie("can't continue without valid indexer") unless ($indexer);
274 # store Hyper Estraier links to other databases
276 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
277 foreach my $link (@{ $db_config->{links} }) {
278 if ($use_indexer eq 'hyperestraier') {
280 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
282 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
284 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
288 credit => $link->{credit},
293 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
297 next if ($only_links);
303 my $store = new WebPAC::Store({
311 my @outputs = force_array( $db_config->{output}, sub {
312 $log->error("Database $database doesn't have any outputs defined. Do you want to remove it from configuration?" );
317 foreach my $output ( @outputs ) {
319 #warn '## output = ',dump( $output );
321 my $module = $output->{module} || $log->logdie("need module in output section of $database");
322 $module = 'WebPAC::Output::' . $module unless $module =~ m/::/;
324 $log->debug("loading output module $module");
325 eval "require $module";
327 # add database to arguments for output filter
328 $output->{database} = $database;
330 $log->debug("calling $module->new(",dump( $output ),")");
331 my $out = new $module->new( $output );
334 push @output_modules, $out;
339 # now, iterate through input formats
343 my @inputs = force_array( $db_config->{input}, sub {
344 $log->info("database $database doesn't have inputs defined");
347 foreach my $input (@inputs) {
349 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
351 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
353 my $type = lc($input->{type});
355 # FIXME check if input module exists
356 my $input_module = $input->{module};
358 if ( ! $input_module ) {
359 if ( grep(/$type/, $config->webpac('inputs')) ) {
360 $input_module = $config->webpac('inputs')->{$type};
362 $log->logdie("I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!" );
366 my @lookups = $parser->have_lookup_create($database, $input);
368 $log->info("working on input '$input_name' in $input->{path} [type: $input->{type}] using $input_module",
369 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
373 # disable modification of records if --stats is in use
374 delete($input->{modify_records});
375 delete($input->{modify_file});
378 my $input_db = new WebPAC::Input(
379 module => $input_module,
380 encoding => $config->webpac('webpac_encoding'),
381 limit => $limit || $input->{limit},
383 recode => $input->{recode},
385 modify_records => $input->{modify_records},
386 modify_file => $input->{modify_file},
387 input_config => $input,
389 $log->logdie("can't create input using $input_module") unless ($input_db); # check the object just constructed, not the $input config entry
391 if (defined( $input->{lookup} )) {
392 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
393 delete( $input->{lookup} );
400 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
402 $lookup_coderef = sub {
403 my $rec = shift || die "need rec!";
404 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
406 WebPAC::Normalize::data_structure(
409 config => create_ds_config( $db_config, $database, $input, $mfn ),
412 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
415 WebPAC::Normalize::_set_lookup( undef );
417 $log->debug("created lookup_coderef using:\n$rules");
423 my $maxmfn = $input_db->open(
424 path => $input->{path},
425 code_page => $input->{encoding}, # database encoding
426 lookup_coderef => $lookup_coderef,
427 lookup => $lookup_jar,
431 return $store->load_row(
432 database => $database,
433 input => $input_name,
439 return $store->save_row(
440 database => $database,
441 input => $input_name,
449 my $lookup_data = WebPAC::Normalize::_get_lookup();
451 if (defined( $lookup_data->{$database}->{$input_name} )) {
452 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
454 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
456 database => $database,
457 input => $input_name,
459 data => $lookup_data->{$database}->{$input_name}->{$key},
465 if ($stats || $validate) {
466 my $path = "out/report/${database}-${input_name}.txt";
467 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
469 print $report_fh "Report for database '$database' input '$input_name' records ",
470 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
471 $log->info("Generating report file $path");
474 $validate->read_validate_file( $validate->fill_in( $validate_path, database => $database, input => $input_name ) ) if ( $validate_path );
475 $validate->read_validate_delimiters_file( $validate->fill_in( $validate_delimiters_path, database => $database, input => $input_name ) ) if ( $validate_delimiters_path );
480 if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
481 $marc = new WebPAC::Output::MARC(
482 path => "out/marc/${database}-${input_name}.marc",
488 my $rules = $parser->normalize_rules($database,$input_name) || $log->logdie("no normalize rules found for $database/$input_name");
489 $log->debug("parsed normalize rules:\n$rules");
491 # reset position in database
494 # generate name of config key for indexer (strip everything after -)
495 my $indexer_config = $use_indexer;
496 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
499 my $depends = $parser->depends($database,$input_name);
502 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
503 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
505 foreach my $db (keys %$depends) {
506 foreach my $i (keys %{$depends->{$db}}) {
507 foreach my $k (keys %{$depends->{$db}->{$i}}) {
509 $log->debug("loading lookup $db/$i");
510 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
515 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
520 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
524 # setup input name for all output filters
525 foreach my $out ( @output_modules ) {
526 if ( $out->can('input') ) {
527 $out->input( $input_name );
529 $log->warn("output filter ",ref($out)," doesn't support input name");
534 foreach my $pos ( 0 ... $input_db->size ) {
536 my $row = $input_db->fetch || next;
540 my $mfn = $row->{'000'}->[0];
542 if (! $mfn || $mfn !~ m{^\d+$}) {
543 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
545 push @{ $row->{'000'} }, $pos;
550 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
551 $log->error( "MFN $mfn validation error:\n",
552 $validate->report_error( $errors )
555 next; # validation doesn't create any output
558 my $ds = WebPAC::Normalize::data_structure(
561 lookup => $lookup_hash,
562 config => create_ds_config( $db_config, $database, $input, $mfn ),
563 marc_encoding => 'utf-8',
564 load_row_coderef => sub {
565 my ($database,$input,$mfn) = @_;
566 #warn "### load_row($database,$input,$mfn) from data_structure\n";
567 return $store->load_row(
568 database => $database,
575 $log->debug("ds = ", sub { dump($ds) });
580 database => $database,
581 input => $input_name,
587 id => "${input_name}/${mfn}",
589 type => $config->get($indexer_config)->{type},
592 foreach my $out ( @output_modules ) {
593 $out->add( $mfn, $ds ) if $out->can('add');
597 $log->warn("record $pos didn't produce any output after normalization rules!") unless $marc;
603 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
605 id => $mfn . ( $i ? "/$i" : '' ),
607 leader => WebPAC::Normalize::_get_marc_leader(),
613 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
619 my $errors = $validate->report;
621 $log->info("validation errors:\n$errors\n" );
622 print $report_fh "$errors\n" if ($report_fh);
625 print $report_fh "\nAll possible subfields/delimiter templates:\n", $validate->delimiters_templates( report => 1, current_input => 1 ), "\n\n";
627 # must be last thing that touches $validate for this input
632 my $s = $input_db->stats;
633 $log->info("statistics of fields usage:\n$s");
634 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
638 $marc->finish if ($marc);
641 close($report_fh) if ($report_fh);
644 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
646 foreach my $out ( @output_modules ) {
647 $out->finish if $out->can('finish');
650 my $dt = time() - $start_t;
651 $log->info("$total_rows records ", $indexer ? "indexed " : "",
652 sprintf("in %.2f sec [%.2f rec/sec]",
653 $dt, ($total_rows / $dt)
660 $log->info("parallel process $$ finished");
667 # wait for all children to finish
668 sleep(1) while wait != -1;
669 $log->info("all parallel processes finished");
672 # save new delimiters if needed
673 $validate->save_delimiters_templates if ( $validate_delimiters_path );
676 # handle links or merge after indexing
680 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
682 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!"); # parens needed: a bare || would bind to $estcmd_path, so failures were never reported
685 foreach my $link (@links) {
686 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE'); # dump() comes from Data::Dump, which this script imports