6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.15;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.11;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
27 use POSIX ":sys_wait_h"; # imports WNOHANG
31 run.pl - start WebPAC indexing
33 B<this command will probably go away. Don't get used to it!>
41 start loading (all) databases at offset 42
45 limit loading to 100 records
49 remove database and Hyper Estraier index before indexing
51 =item --only=database_name/input_filter
53 reindex just single database (legacy name is --one)
55 C</input_filter> is optional part which can be C<name>
58 =item --config conf/config.yml
60 path to YAML configuration file
64 disable indexing, modify_* in configuration and dump statistics about field
65 and subfield usage for each input
67 =item --validate path/to/validation_file
69 turn on extra validation of input records, see L<WebPAC::Validate>
71 You can use special variables C<$database> and C<$input> in this parameter
72 like C<--validate 'conf/validate/$database-$input'> to construct filename
74 =item --validate-delimiters path/to/validate_delimiters_file
76 this option is used with C<--validate> to turn on extra validation of
77 delimiters. If the file does not exist, it will be created on the first run.
81 Generate MARC file. This will automatically be on if file contains C<marc*> directives.
82 You can use this option as C<--no-marc-generate> to disable MARC generation.
86 By default turned on if normalisation file has C<marc*> directives. You can disable lint
87 messages with C<--no-marc-lint>.
91 Force dump of input and MARC record for debugging.
95 Run databases in parallel (approximately the same as the number of processors in
96 the machine if you want to use full load)
104 Create merged index of databases which have links
106 =item --mirror http://www.example.com
108 Tries to download input path files from mirror URI
123 my $validate_delimiters_path;
124 my $marc_generate = 1;
133 my $log = _new WebPAC::Common()->_get_logger();
136 "limit=i" => \$limit,
137 "offset=i" => \$offset,
139 "one=s" => \$only_filter,
140 "only=s" => \$only_filter,
141 "config=s" => \$config_path,
144 "validate=s" => \$validate_path,
145 "validate-delimiters=s" => \$validate_delimiters_path,
146 "marc-generate!" => \$marc_generate,
147 "marc-lint!" => \$marc_lint,
148 "marc-dump!" => \$marc_dump,
149 "parallel=i" => \$parallel,
150 "only-links!" => \$only_links,
152 "mirror=s" => \$mirror,
156 $marc_generate = 0 if ( $validate_delimiters_path );
158 pod2usage(-verbose => 2) if ($help);
160 my $config = new WebPAC::Config( path => $config_path );
162 WebPAC::Normalize::_debug( $debug - 1 ) if $debug > 1;
164 #print "config = ",dump($config) if ($debug);
166 die "no databases in config file!\n" unless ($config->databases);
168 $log->info( "-" x 79 );
170 my $log_file = 'log';
172 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
173 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
174 rename($log_file, "${log_file}.old") || $log->logwarn("can't rename $log_file to ${log_file}.old: $!"); # parenthesised so '||' tests rename's result; before it bound to the always-true filename string and failures went unreported
178 my $estcmd_path = './estcmd-merge.sh';
180 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
181 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
182 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
183 $log->info("created merge batch file $estcmd_path");
187 $validate = new WebPAC::Validate(
188 delimiters => $config->webpac('delimiters'),
189 ) if ($validate_path || $validate_delimiters_path);
191 my $use_indexer = $config->use_indexer;
192 $stats ||= $validate;
194 $log->debug("disabled indexing for stats collection");
195 $use_indexer = undef;
196 } elsif ( $use_indexer ) {
197 $log->info("using $use_indexer indexing engine...");
200 # parse normalize files and create source files for lookup and normalization
202 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : (); # ternary instead of 'my ... if COND', whose behaviour perlsyn documents as undefined; both stay undef without --only
204 my $parser = new WebPAC::Parser(
206 only_database => $only_database,
207 only_input => $only_input,
211 my $start_t = time();
216 eval 'use Proc::Queue size => 1;';
218 $log->info("Using $parallel processes for speedup");
219 Proc::Queue::size($parallel);
222 sub create_ds_config {
223 my ($db_config, $database, $input, $mfn) = @_;
224 my $c = dclone( $db_config );
225 $c->{_} = $database || $log->logconfess("need database");
226 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
227 $c->{input} = $input || $log->logconfess("need input");
231 foreach my $database ( sort keys %{ $config->databases } ) {
232 my $db_config = $config->databases->{$database};
234 next if ($only_database && $database !~ m/$only_database/i);
238 if(defined ($f) and $f==0) {
239 $log->info("Created processes $$ for speedup");
246 if ($use_indexer && $parser->have_rules( 'search', $database )) {
248 my $cfg_name = $use_indexer;
249 $cfg_name =~ s/\-.*$//;
251 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
252 $indexer_config->{database} = $database;
253 $indexer_config->{clean} = $clean;
254 $indexer_config->{label} = $db_config->{name};
256 # force clean if database has links
257 $indexer_config->{clean} = 1 if ($db_config->{links});
259 if ($use_indexer eq 'hyperestraier') {
261 # open Hyper Estraier database
262 require WebPAC::Output::Estraier;
263 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
265 } elsif ($use_indexer eq 'hyperestraier-native') {
267 # open Hyper Estraier database
268 require WebPAC::Output::EstraierNative;
269 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
271 } elsif ($use_indexer eq 'kinosearch') {
273 die "no longer supported";
276 $log->logdie("unknown use_indexer: $use_indexer");
279 $log->logdie("can't continue without valid indexer") unless ($indexer);
284 # store Hyper Estraier links to other databases
286 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
287 foreach my $link (@{ $db_config->{links} }) {
288 if ($use_indexer eq 'hyperestraier') {
290 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
292 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
294 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
298 credit => $link->{credit},
303 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
307 next if ($only_links);
313 my $store = new WebPAC::Store({
321 my @outputs = force_array( $db_config->{output}, sub {
322 $log->error("Database $database doesn't have any outputs defined. Do you want to remove it from configuration?" );
327 foreach my $output ( @outputs ) {
329 #warn '## output = ',dump( $output );
331 my $module = $output->{module} || $log->logdie("need module in output section of $database");
332 $module = 'WebPAC::Output::' . $module unless $module =~ m/::/;
334 $log->debug("loading output module $module");
335 eval "require $module";
337 # add database to arguments for output filter
338 $output->{database} = $database;
339 $output->{clean} = $clean;
341 $log->debug("calling $module->new(",dump( $output ),")");
342 my $out = new $module->new( $output );
344 push @output_modules, $out;
346 $log->warn("SKIPPED $module");
352 # now, iterate through input formats
356 my @inputs = force_array( $db_config->{input}, sub {
357 $log->info("database $database doesn't have inputs defined");
360 if ( -e 'out/debug' ) { # FIXME flag?
362 foreach my $i ( @inputs ) {
364 next unless defined $i->{normalize};
365 warn dump( $i->{normalize} );
366 foreach my $normalize ( @{ $i->{normalize} } ) {
367 my $path = $normalize->{path};
368 $out .= qq/\n##\n## $path\n##\n\n/;
369 $out .= read_file( $path );
372 my $all = "out/debug/all-normalize.pl";
373 write_file( $all, $out );
374 warn "### all normalize for this input saved to: $all";
377 foreach my $input (@inputs) {
379 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
381 if ( $input->{skip} ) {
382 $log->info("skip $input_name");
386 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
388 my $type = lc($input->{type});
390 # FIXME check if input module exists
391 my $input_module = $input->{module};
393 if ( ! $input_module ) {
394 if ( grep(/$type/, $config->webpac('inputs')) ) {
395 $input_module = $config->webpac('inputs')->{$type};
397 $log->logdie("I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!" );
401 my @lookups = $parser->have_lookup_create($database, $input);
403 $log->info("working on $database/$input_name with $input_module on $input->{path}",
404 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
408 # disable modification of records if --stats is in use
409 delete($input->{modify_records});
410 delete($input->{modify_file});
414 my $path = $input->{path} || die "no input path in ",dump( $input );
415 $log->info( "mirror ", $path, " ", -s $path, " bytes" );
417 $log->warn( "$path not modified" )
418 if mirror( "$mirror/$path", $path ) == RC_NOT_MODIFIED;
421 my $input_db = new WebPAC::Input(
422 module => $input_module,
423 limit => $limit || $input->{limit},
425 recode => $input->{recode},
427 modify_records => $input->{modify_records},
428 modify_file => $input->{modify_file},
429 input_config => $input,
431 $log->logdie("can't create input using $input_module") unless ($input_db); # test the WebPAC::Input object just constructed; $input is the config hash and is always true
433 if (defined( $input->{lookup} )) {
434 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
435 delete( $input->{lookup} );
442 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
444 $lookup_coderef = sub {
445 my $rec = shift || die "need rec!";
446 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
448 WebPAC::Normalize::data_structure(
451 config => create_ds_config( $db_config, $database, $input, $mfn ),
454 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
457 WebPAC::Normalize::_set_lookup( undef );
459 $log->debug("created lookup_coderef using:\n$rules");
465 my $maxmfn = $input_db->open(
466 path => $input->{path},
467 input_encoding => $input->{encoding}, # database encoding
468 lookup_coderef => $lookup_coderef,
469 lookup => $lookup_jar,
473 return $store->load_row(
474 database => $database,
475 input => $input_name,
481 return $store->save_row(
482 database => $database,
483 input => $input_name,
491 my $lookup_data = WebPAC::Normalize::_get_lookup();
493 if (defined( $lookup_data->{$database}->{$input_name} )) {
494 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
496 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
498 database => $database,
499 input => $input_name,
501 data => $lookup_data->{$database}->{$input_name}->{$key},
507 if ($stats || $validate) {
508 my $path = "out/report/${database}-${input_name}.txt";
509 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
511 print $report_fh "Report for database '$database' input '$input_name' records ",
512 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
513 $log->info("Generating report file $path");
516 $validate->read_validate_file( $validate->fill_in( $validate_path, database => $database, input => $input_name ) ) if ( $validate_path );
517 $validate->read_validate_delimiters_file( $validate->fill_in( $validate_delimiters_path, database => $database, input => $input_name ) ) if ( $validate_delimiters_path );
522 if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
523 $marc = new WebPAC::Output::MARC(
524 path => "out/marc/${database}-${input_name}.marc",
530 my $rules = $parser->normalize_rules($database,$input_name);
531 $log->logwarn("no normalize rules for $database/$input_name") unless $rules;
533 $log->debug("parsed normalize rules:\n$rules");
535 # reset position in database
538 # generate name of config key for indexer (strip everything after -)
539 my $indexer_config = $use_indexer;
540 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
543 my $depends = $parser->depends($database,$input_name);
546 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
547 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
549 foreach my $db (keys %$depends) {
550 foreach my $i (keys %{$depends->{$db}}) {
551 foreach my $k (keys %{$depends->{$db}->{$i}}) {
553 $log->debug("loading lookup $db/$i");
554 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
559 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
564 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
568 # setup input name for all output filters
569 foreach my $out ( @output_modules ) {
570 if ( $out->can('input') ) {
571 $out->input( $input_name );
573 $log->warn("output filter ",ref($out)," doesn't support input name");
578 foreach my $pos ( 0 ... $input_db->size ) {
580 my $row = $input_db->fetch || next;
584 my $mfn = $row->{'000'}->[0];
586 if (! $mfn || $mfn !~ m{^\d+$}) {
587 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
589 push @{ $row->{'000'} }, $pos;
594 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
595 $log->error( "MFN $mfn validation error:\n",
596 $validate->report_error( $errors )
599 next; # validation doesn't create any output
604 my $ds = WebPAC::Normalize::data_structure(
607 lookup => $lookup_hash,
608 config => create_ds_config( $db_config, $database, $input, $mfn ),
609 marc_encoding => 'utf-8',
610 load_row_coderef => sub {
611 my ($database,$input,$mfn) = @_;
612 #warn "### load_row($database,$input,$mfn) from data_structure\n";
613 return $store->load_row(
614 database => $database,
621 $log->debug("ds = ", sub { dump($ds) });
626 database => $database,
627 input => $input_name,
633 id => "${input_name}/${mfn}",
635 type => $config->get($indexer_config)->{type},
638 foreach my $out ( @output_modules ) {
639 $out->add( $mfn, $ds ) if $out->can('add');
643 $log->warn("record $pos didn't produce any output after normalization rules!") unless $marc;
650 while (my $fields = WebPAC::Normalize::MARC::_get_marc_fields( fetch_next => 1 ) ) {
652 id => $mfn . ( $i ? "/$i" : '' ),
654 leader => WebPAC::Normalize::MARC::_get_marc_leader(),
660 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
666 my $errors = $validate->report;
668 $log->info("validation errors:\n$errors\n" );
669 print $report_fh "$errors\n" if ($report_fh);
672 print $report_fh "\nAll possible subfields/delimiter templates:\n", $validate->delimiters_templates( report => 1, current_input => 1 ), "\n\n";
674 # must be last thing that touches $validate for this input
679 my $s = $input_db->stats;
680 $log->info("statistics of fields usage:\n$s");
681 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
685 $marc->finish if ($marc);
688 close($report_fh) if ($report_fh);
691 $indexer->finish if $indexer && $indexer->can('finish');
693 foreach my $out ( @output_modules ) {
694 $out->finish if $out->can('finish');
697 my $dt = time() - $start_t;
698 $log->info("$total_rows records ", $indexer ? "indexed " : "",
699 sprintf("in %.2f sec [%.2f rec/sec]",
700 $dt, ($total_rows / $dt)
707 $log->info("parallel process $$ finished");
714 # wait for all children to finish
715 sleep(1) while wait != -1;
716 $log->info("all parallel processes finished");
719 # save new delimiters if needed
720 $validate->save_delimiters_templates if ( $validate_delimiters_path );
723 # handle links or merge after indexing
727 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
729 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!"); # parenthesised so '||' tests chmod's result; before it bound to the always-true path string and failures went unreported
732 foreach my $link (@links) {
733 $log->logdie("coderef in link ", Dumper($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');