6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.14;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.06;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
26 use Proc::Queue size => 1;
27 use POSIX ":sys_wait_h"; # imports WNOHANG
31 run.pl - start WebPAC indexing
33 B<this command will probably go away. Don't get used to it!>
41 start loading (all) databases at offset 42
45 limit loading to 100 records
49 remove database and Hyper Estraier index before indexing
51 =item --only=database_name/input_filter
53 reindex just a single database (legacy name is --one)
55 C</input_filter> is optional part which can be C<name>
58 =item --config conf/config.yml
60 path to YAML configuration file
64 disable indexing, modify_* in configuration and dump statistics about field
65 and subfield usage for each input
67 =item --validate path/to/validation_file
69 turn on extra validation of input records, see L<WebPAC::Validate>
73 Generate MARC file. This will automatically be on if file contains C<marc*> directives.
74 You can use this option as C<--no-marc-generate> to disable MARC generation.
78 By default turned on if normalisation file has C<marc*> directives. You can disable lint
79 messages with C<--no-marc-lint>.
83 Force dump of input and MARC record for debugging.
87 Run databases in parallel (approximately the same as the number of processors in
88 the machine if you want to use full load)
96 Create merged index of databases which have links
111 my $marc_generate = 1;
119 my $log = _new WebPAC::Common()->_get_logger();
122 "limit=i" => \$limit,
123 "offset=i" => \$offset,
125 "one=s" => \$only_filter,
126 "only=s" => \$only_filter,
127 "config=s" => \$config_path, # "=s": --config takes the path to the YAML file as its value
130 "validate=s" => \$validate_path,
131 "marc-generate!" => \$marc_generate,
132 "marc-lint!" => \$marc_lint,
133 "marc-dump!" => \$marc_dump,
134 "parallel=i" => \$parallel,
135 "only-links!" => \$only_links,
140 pod2usage(-verbose => 2) if ($help);
142 my $config = new WebPAC::Config( path => $config_path );
144 #print "config = ",dump($config) if ($debug);
146 die "no databases in config file!\n" unless ($config->databases);
148 $log->info( "-" x 79 );
150 my $log_file = 'log';
152 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
153 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
154 rename($log_file, "${log_file}.old") || $log->logwarn("can't rename $log_file to ${log_file}.old: $!"); # parens: || must test rename's result, not the (always true) target name
158 my $estcmd_path = './estcmd-merge.sh';
160 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
161 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
162 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
163 $log->info("created merge batch file $estcmd_path");
168 $validate = new WebPAC::Validate(
169 path => $validate_path,
170 ) if ($validate_path);
173 my $use_indexer = $config->use_indexer;
174 $stats ||= $validate;
176 $log->debug("disabled indexing for stats collection");
177 $use_indexer = undef;
179 $log->info("using $use_indexer indexing engine...");
182 # parse normalize files and create source files for lookup and normalization
184 my $parser = new WebPAC::Parser( config => $config );
187 my $start_t = time();
192 $log->info("Using $parallel processes for speedup");
193 Proc::Queue::size($parallel);
196 sub create_ds_config {
197 my ($db_config, $database, $input, $mfn) = @_;
198 my $c = dclone( $db_config );
199 $c->{_} = $database || $log->logconfess("need database");
200 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
201 $c->{input} = $input || $log->logconfess("need input");
205 while (my ($database, $db_config) = each %{ $config->databases }) {
207 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : (); # no "my ... if": both stay undef when no filter is given
208 next if ($only_database && $database !~ m/$only_database/i);
212 if(defined ($f) and $f==0) {
213 $log->info("Created processes $$ for speedup");
220 if ($use_indexer && $parser->have_rules( 'search', $database )) {
222 my $cfg_name = $use_indexer;
223 $cfg_name =~ s/\-.*$//;
225 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
226 $indexer_config->{database} = $database;
227 $indexer_config->{clean} = $clean;
228 $indexer_config->{label} = $db_config->{name};
230 # force clean if database has links
231 $indexer_config->{clean} = 1 if ($db_config->{links});
233 if ($use_indexer eq 'hyperestraier') {
235 # open Hyper Estraier database
236 use WebPAC::Output::Estraier '0.10';
237 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
239 } elsif ($use_indexer eq 'hyperestraier-native') {
241 # open Hyper Estraier database
242 use WebPAC::Output::EstraierNative;
243 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
245 } elsif ($use_indexer eq 'kinosearch') {
248 use WebPAC::Output::KinoSearch;
249 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
250 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
253 $log->logdie("unknown use_indexer: $use_indexer");
256 $log->logdie("can't continue without valid indexer") unless ($indexer); # fatal when no indexer object was created above
261 # store Hyper Estraier links to other databases
263 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
264 foreach my $link (@{ $db_config->{links} }) {
265 if ($use_indexer eq 'hyperestraier') {
267 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
269 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
271 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
275 credit => $link->{credit},
280 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
284 next if ($only_links);
290 my $abs_path = abs_path($0);
291 $abs_path =~ s#/[^/]*$#/#;
293 my $db_path = $config->webpac('db_path');
296 $log->info("creating new database '$database' in $db_path");
297 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
299 $log->info("working on database '$database' in $db_path");
302 my $store = new WebPAC::Store(
309 # now, iterate through input formats
313 if (ref($db_config->{input}) eq 'ARRAY') {
314 @inputs = @{ $db_config->{input} };
315 } elsif ($db_config->{input}) {
316 push @inputs, $db_config->{input};
318 $log->info("database $database doesn't have inputs defined");
321 foreach my $input (@inputs) {
323 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
325 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
327 my $type = lc($input->{type});
329 die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));
331 my $input_module = $config->webpac('inputs')->{$type};
333 my @lookups = $parser->have_lookup_create($database, $input);
335 $log->info("working on input '$input_name' in $input->{path} [type: $input->{type}] using $input_module",
336 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
340 # disable modification of records if --stats is in use
341 delete($input->{modify_records});
342 delete($input->{modify_file});
345 my $input_db = new WebPAC::Input(
346 module => $input_module,
347 encoding => $config->webpac('webpac_encoding'),
348 limit => $limit || $input->{limit},
350 recode => $input->{recode},
352 modify_records => $input->{modify_records},
353 modify_file => $input->{modify_file},
355 $log->logdie("can't create input using $input_module") unless ($input_db); # check the object just built, not the $input config hash
357 if (defined( $input->{lookup} )) {
358 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
359 delete( $input->{lookup} );
366 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
368 $lookup_coderef = sub {
369 my $rec = shift || die "need rec!";
370 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
372 WebPAC::Normalize::data_structure(
375 config => create_ds_config( $db_config, $database, $input, $mfn ),
378 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
381 WebPAC::Normalize::_set_lookup( undef );
383 $log->debug("created lookup_coderef using:\n$rules");
389 my $maxmfn = $input_db->open(
390 path => $input->{path},
391 code_page => $input->{encoding}, # database encoding
392 lookup_coderef => $lookup_coderef,
393 lookup => $lookup_jar,
397 return $store->load_row(
398 database => $database,
399 input => $input_name,
405 return $store->save_row(
406 database => $database,
407 input => $input_name,
415 my $lookup_data = WebPAC::Normalize::_get_lookup();
417 if (defined( $lookup_data->{$database}->{$input_name} )) {
418 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
420 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
422 database => $database,
423 input => $input_name,
425 data => $lookup_data->{$database}->{$input_name}->{$key},
431 if ($stats || $validate) {
432 my $path = "out/report/${database}-${input_name}.txt";
433 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
435 print $report_fh "Report for database '$database' input '$input_name' records ",
436 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
437 $log->info("Generating report file $path");
441 if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
442 $marc = new WebPAC::Output::MARC(
443 path => "out/marc/${database}-${input_name}.marc",
449 my $rules = $parser->normalize_rules($database,$input_name) || $log->logdie("no normalize rules found for $database/$input_name");
450 $log->debug("parsed normalize rules:\n$rules");
452 # reset position in database
455 # generate name of config key for indexer (strip everything after -)
456 my $indexer_config = $use_indexer;
457 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
460 my $depends = $parser->depends($database,$input_name);
463 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
464 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
466 foreach my $db (keys %$depends) {
467 foreach my $i (keys %{$depends->{$db}}) {
468 foreach my $k (keys %{$depends->{$db}->{$i}}) {
470 $log->debug("loading lookup $db/$i");
471 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
476 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
481 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
485 foreach my $pos ( 0 ... $input_db->size ) {
487 my $row = $input_db->fetch || next;
491 my $mfn = $row->{'000'}->[0];
493 if (! $mfn || $mfn !~ m#^\d+$#) {
494 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
496 push @{ $row->{'000'} }, $pos;
501 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
502 $log->error( "MFN $mfn validation error:\n",
503 $validate->report_error( $errors )
506 next; # validation doesn't create any output
509 my $ds = WebPAC::Normalize::data_structure(
512 lookup => $lookup_hash,
513 config => create_ds_config( $db_config, $database, $input, $mfn ),
514 marc_encoding => 'utf-8',
515 load_row_coderef => sub {
516 my ($database,$input,$mfn) = @_;
517 return $store->load_row(
518 database => $database,
525 $log->debug("ds = ", sub { dump($ds) }) if ($ds);
528 database => $database,
529 input => $input_name,
532 ) if ($ds && !$stats);
535 id => "${input_name}/${mfn}",
537 type => $config->get($indexer_config)->{type},
538 ) if ($indexer && $ds);
543 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
545 id => $mfn . ( $i ? "/$i" : '' ),
547 leader => WebPAC::Normalize::_get_marc_leader(),
553 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
558 my $errors = $validate->report;
560 $log->info("validation errors:\n$errors\n" );
561 print $report_fh "$errors\n" if ($report_fh);
566 my $s = $input_db->stats;
567 $log->info("statistics of fields usage:\n$s");
568 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
572 $marc->finish if ($marc);
575 close($report_fh) if ($report_fh)
579 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
581 my $dt = time() - $start_t;
582 $log->info("$total_rows records ", $indexer ? "indexed " : "",
583 sprintf("in %.2f sec [%.2f rec/sec]",
584 $dt, ($total_rows / $dt)
591 $log->info("parallel process $$ finished");
598 # wait all children to finish
599 sleep(1) while wait != -1;
600 $log->info("all parallel processes finished");
604 # handle links or merge after indexing
608 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
610 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!"); # parens: || must test chmod's result, not the path string
613 foreach my $link (@links) {
614 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE'); # dump() is the Data::Dump export this file imports