6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.14;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.06;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
25 use Proc::Queue size => 1;
26 use POSIX ":sys_wait_h"; # imports WNOHANG
30 run.pl - start WebPAC indexing
32 B<this command will probably go away. Don't get used to it!>
40 start loading (all) databases at offset 42
44 limit loading to 100 records
48 remove database and Hyper Estraier index before indexing
50 =item --only=database_name/input_filter
52 reindex just single database (legacy name is --one)
54 C</input_filter> is an optional part which can be C<name>
57 =item --config conf/config.yml
59 path to YAML configuration file
63 disable indexing, modify_* in configuration and dump statistics about field
64 and subfield usage for each input
66 =item --validate path/to/validation_file
68 turn on extra validation of input records, see L<WebPAC::Validate>
72 By default turned on if normalisation file has C<marc*> directives. You can disable lint
73 messages with C<--no-marc-lint>.
77 Force dump of input and MARC record for debugging.
81 Run databases in parallel (approximately the same as the number of processors
82 in the machine if you want to use full load)
90 Create merged index of databases which have links
# Obtain the logger object from WebPAC::Common; used for all diagnostics below.
111 my $log = _new WebPAC::Common()->_get_logger();
# Command-line option specifications. The syntax ("=i" integer value, "=s"
# string value, "!" negatable flag) is Getopt::Long's; the enclosing call is
# not visible in this excerpt.
114 "limit=i" => \$limit,
115 "offset=i" => \$offset,
# --one is the legacy spelling of --only; both fill the same variable.
117 "one=s" => \$only_filter,
118 "only=s" => \$only_filter,
# --config takes a path (it is later passed as "path" to WebPAC::Config), so
# the spec needs "=s"; a bare "config" would make it a valueless flag.
119 "config=s" => \$config_path,
122 "validate=s" => \$validate_path,
123 "marc-lint!" => \$marc_lint,
124 "marc-dump!" => \$marc_dump,
125 "parallel=i" => \$parallel,
126 "only-links!" => \$only_links,
# Load the configuration from the path given on the command line and refuse
# to continue when it defines no databases.
130 my $config = new WebPAC::Config( path => $config_path );
132 #print "config = ",dump($config) if ($debug);
134 die "no databases in config file!\n" unless ($config->databases);
136 $log->info( "-" x 79 );
138 my $log_file = 'log';
# Move an existing log file out of the way (the size threshold is commented
# out, so any existing file is rotated).
140 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
141 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
# Use low-precedence "or" so the warning is tied to rename()'s result. With
# "||" it bound to the (always true) target string and failures were silent.
142 rename $log_file, "${log_file}.old" or $log->logwarn("can't rename $log_file to ${log_file}.old: $!");
# Start a shell batch file; "estcmd merge" lines are appended to it later,
# while databases with links are processed.
146 my $estcmd_path = './estcmd-merge.sh';
148 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
149 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
150 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
151 $log->info("created merge batch file $estcmd_path");
# Build a record validator only when --validate supplied a path.
156 $validate = new WebPAC::Validate(
157 path => $validate_path,
158 ) if ($validate_path);
# Name of the indexing engine as configured; it may be cleared below.
161 my $use_indexer = $config->use_indexer;
# Having a validator also switches stats mode on.
162 $stats ||= $validate;
# NOTE(review): the condition guarding the next statements is not visible in
# this excerpt; judging by the log message they run only in stats mode.
164 $log->debug("disabled indexing for stats collection");
165 $use_indexer = undef;
167 $log->info("using $use_indexer indexing engine...");
170 # parse normalize files and create source files for lookup and normalization
172 my $parser = new WebPAC::Parser( config => $config );
# Start timestamp (time() comes from Time::HiRes); used for the elapsed-time
# summary logged after indexing.
175 my $start_t = time();
# Hand the --parallel value to Proc::Queue (loaded above with size => 1).
180 $log->info("Using $parallel processes for speedup");
181 Proc::Queue::size($parallel);
# create_ds_config( $db_config, $database, $input, $mfn )
#
# Deep-copies $db_config with Storable's dclone and stores the database name,
# the MFN and the input definition on the copy under the keys "_", "_mfn" and
# "input". Each of the three values must be true, otherwise logconfess() is
# called (so an MFN of 0 is rejected as well).
# NOTE(review): the end of this sub is not visible in this excerpt. Callers
# pass its result as the "config" argument of
# WebPAC::Normalize::data_structure(), so it presumably returns $c.
184 sub create_ds_config {
185 my ($db_config, $database, $input, $mfn) = @_;
# Work on a copy so the shared per-database configuration is not modified.
186 my $c = dclone( $db_config );
187 $c->{_} = $database || $log->logconfess("need database");
188 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
189 $c->{input} = $input || $log->logconfess("need input");
# Main loop: one iteration per database defined in the configuration.
193 while (my ($database, $db_config) = each %{ $config->databases }) {
# Split --only=database/input_filter into its two parts. A ternary is used
# instead of "my ... if ...", whose behaviour perlsyn documents as undefined.
195 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : ();
# Skip databases whose name does not match the filter (case-insensitive
# regex match).
196 next if ($only_database && $database !~ m/$only_database/i);
# NOTE(review): $f is assigned on a line not visible here; presumably it is
# the result of fork(), with 0 meaning "running in the child".
200 if(defined ($f) and $f==0) {
201 $log->info("Created processes $$ for speedup");
# Create an indexer only when an engine is configured and the parser reports
# 'search' rules for this database.
208 if ($use_indexer && $parser->have_rules( 'search', $database )) {
# The configuration section name is the engine name up to the first "-".
210 my $cfg_name = $use_indexer;
211 $cfg_name =~ s/\-.*$//;
213 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
214 $indexer_config->{database} = $database;
215 $indexer_config->{clean} = $clean;
216 $indexer_config->{label} = $db_config->{name};
218 # force clean if database has links
219 $indexer_config->{clean} = 1 if ($db_config->{links});
# Note: "use" is executed at compile time, so all three output modules below
# are loaded regardless of which branch is taken at run time.
221 if ($use_indexer eq 'hyperestraier') {
223 # open Hyper Estraier database
224 use WebPAC::Output::Estraier '0.10';
225 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
227 } elsif ($use_indexer eq 'hyperestraier-native') {
229 # open Hyper Estraier database
230 use WebPAC::Output::EstraierNative;
231 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
233 } elsif ($use_indexer eq 'kinosearch') {
236 use WebPAC::Output::KinoSearch;
# Force a clean start when the index path does not exist yet.
237 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
238 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
241 $log->logdie("unknown use_indexer: $use_indexer");
# Fixed method name: this was "logide", which is not a logger method and
# would have failed with a method-lookup error instead of this message.
244 $log->logdie("can't continue without valid indexer") unless ($indexer);
249 # store Hyper Estraier links to other databases
# Links are handled only when they are given as an array and an indexing
# engine is in use.
251 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
252 foreach my $link (@{ $db_config->{links} }) {
253 if ($use_indexer eq 'hyperestraier') {
# For this engine a merge command is appended to the batch file rather than
# being executed here.
255 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
257 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
259 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
263 credit => $link->{credit},
268 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
# With --only-links nothing beyond link handling is done for this database.
272 next if ($only_links);
# Absolute path of this script with the file name removed, leaving the
# directory with a trailing slash.
278 my $abs_path = abs_path($0);
279 $abs_path =~ s#/[^/]*$#/#;
281 my $db_path = $config->webpac('db_path');
# NOTE(review): the condition guarding the removal below is not visible in
# this excerpt (presumably --clean).
284 $log->info("creating new database '$database' in $db_path");
285 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
287 $log->info("working on database '$database' in $db_path");
290 my $store = new WebPAC::Store(
297 # now, iterate through input formats
# The "input" configuration entry may be a list of inputs or a single one;
# normalise both forms into @inputs.
301 if (ref($db_config->{input}) eq 'ARRAY') {
302 @inputs = @{ $db_config->{input} };
303 } elsif ($db_config->{input}) {
304 push @inputs, $db_config->{input};
306 $log->info("database $database doesn't have inputs defined");
309 foreach my $input (@inputs) {
311 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
# The input part of --only matches against either the input name or its
# type (case-insensitive regex match).
313 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
315 my $type = lc($input->{type});
# NOTE(review): webpac('inputs') is used in list context here and as a hash
# reference on the next statement; confirm the accessor is context-sensitive.
317 die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));
319 my $input_module = $config->webpac('inputs')->{$type};
321 my @lookups = $parser->have_lookup_create($database, $input);
323 $log->info("working on input '$input_name' in $input->{path} [type: $input->{type}] using $input_module",
324 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
328 # disable modification of records if --stats is in use
329 delete($input->{modify_records});
330 delete($input->{modify_file});
# Create the input reader; a --limit given on the command line takes
# precedence over the per-input limit.
333 my $input_db = new WebPAC::Input(
334 module => $input_module,
335 encoding => $config->webpac('webpac_encoding'),
336 limit => $limit || $input->{limit},
338 recode => $input->{recode},
340 modify_records => $input->{modify_records},
341 modify_file => $input->{modify_file},
# Check the object that was just constructed. The original tested $input, the
# loop's configuration hash, which is always true, so the check never fired.
343 $log->logdie("can't create input using $input_module") unless ($input_db);
# An explicit "lookup" key in the input definition is no longer supported; it
# is dropped with a warning.
345 if (defined( $input->{lookup} )) {
346 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
347 delete( $input->{lookup} );
# Fetch the lookup-creation rules. The error message names the input by
# $input_name; interpolating $input itself would print "HASH(0x...)".
354 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input_name");
# Callback invoked with one record: takes the MFN from the first value of
# field 000 and runs the record through WebPAC::Normalize::data_structure().
356 $lookup_coderef = sub {
357 my $rec = shift || die "need rec!";
358 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
360 WebPAC::Normalize::data_structure(
363 config => create_ds_config( $db_config, $database, $input, $mfn ),
366 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
# Clear the lookup data held by WebPAC::Normalize before the input is read.
369 WebPAC::Normalize::_set_lookup( undef );
371 $log->debug("created lookup_coderef using:\n$rules");
# Open the input; the return value is kept as $maxmfn and used later as the
# upper bound in the report header.
377 my $maxmfn = $input_db->open(
378 path => $input->{path},
379 code_page => $input->{encoding}, # database encoding
380 lookup_coderef => $lookup_coderef,
381 lookup => $lookup_jar,
385 return $store->load_row(
386 database => $database,
387 input => $input_name,
393 return $store->save_row(
394 database => $database,
395 input => $input_name,
# Collect the lookups accumulated in WebPAC::Normalize and walk them per key
# for this database/input.
403 my $lookup_data = WebPAC::Normalize::_get_lookup();
405 if (defined( $lookup_data->{$database}->{$input_name} )) {
406 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
408 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
410 database => $database,
411 input => $input_name,
413 data => $lookup_data->{$database}->{$input_name}->{$key},
# In stats or validation mode, open a per-input report file and write a
# header naming the database, the input and the record range.
419 if ($stats || $validate) {
420 my $path = "out/report/${database}-${input_name}.txt";
421 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
423 print $report_fh "Report for database '$database' input '$input_name' records ",
424 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
425 $log->info("Generating report file $path");
# Create a MARC output file only when the parser reports 'marc' rules for
# this database/input.
429 if ($parser->have_rules( 'marc', $database, $input_name )) {
430 $marc = new WebPAC::Output::MARC(
431 path => "out/marc/${database}-${input_name}.marc",
437 my $rules = $parser->normalize_rules($database,$input_name) || $log->logdie("no normalize rules found for $database/$input_name");
438 $log->debug("parsed normalize rules:\n$rules");
440 # reset position in database
443 # generate name of config key for indexer (strip everything after -)
444 my $indexer_config = $use_indexer;
445 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
# Lookups this input depends on, as a nested hash: database -> input -> key.
448 my $depends = $parser->depends($database,$input_name);
451 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
452 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
# Load every lookup named in $depends from the store into $lookup_hash.
454 foreach my $db (keys %$depends) {
455 foreach my $i (keys %{$depends->{$db}}) {
456 foreach my $k (keys %{$depends->{$db}->{$i}}) {
458 $log->debug("loading lookup $db/$i");
459 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
# NOTE(review): $t is set on a line not visible here; presumably a timestamp
# taken before load_lookup().
464 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
469 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
# Per-record loop. In list context "..." is the same range operator as "..".
473 foreach my $pos ( 0 ... $input_db->size ) {
# Skip positions for which fetch returns nothing.
475 my $row = $input_db->fetch || next;
# The MFN is the first value of field 000 and must consist of digits only.
479 my $mfn = $row->{'000'}->[0];
481 if (! $mfn || $mfn !~ m#^\d+$#) {
482 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
# The loop position is appended to field 000 as a substitute MFN.
484 push @{ $row->{'000'} }, $pos;
# Validation: on errors, log them and move on to the next record.
489 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
490 $log->error( "MFN $mfn validation error:\n",
491 $validate->report_error( $errors )
494 next; # validation doesn't create any output
# Normalise the record into a data structure, passing the loaded lookups and
# a per-record configuration copy.
497 my $ds = WebPAC::Normalize::data_structure(
500 lookup => $lookup_hash,
501 config => create_ds_config( $db_config, $database, $input, $mfn ),
502 marc_encoding => 'utf-8',
# Callback that loads a row from the store. Its parameters shadow the outer
# $database, $input and $mfn.
503 load_row_coderef => sub {
504 my ($database,$input,$mfn) = @_;
505 return $store->load_row(
506 database => $database,
513 $log->debug("ds = ", sub { dump($ds) }) if ($ds);
# The call closed below runs only when a data structure was produced and
# stats mode is off.
516 database => $database,
517 input => $input_name,
520 ) if ($ds && !$stats);
# The call closed below runs only when an indexer exists and a data structure
# was produced.
523 id => "${input_name}/${mfn}",
525 type => $config->get($indexer_config)->{type},
526 ) if ($indexer && $ds);
# Drain the MARC field sets produced for this record. $i is maintained on
# lines not visible here; it suffixes the id when it is non-zero.
531 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
533 id => $mfn . ( $i ? "/$i" : '' ),
535 leader => WebPAC::Normalize::marc_leader(),
541 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
# Log the validation report and copy it to the report file when one is open.
546 my $errors = $validate->report;
548 $log->info("validation errors:\n$errors\n" );
549 print $report_fh "$errors\n" if ($report_fh);
# Log the field usage statistics and copy them to the report file as well.
554 my $s = $input_db->stats;
555 $log->info("statistics of fields usage:\n$s");
556 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
# Finish the MARC output, if one was created.
560 $marc->finish if ($marc);
563 close($report_fh) if ($report_fh)
# Finish the indexer if it has a finish method; errors raised by it are
# swallowed by the eval.
567 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
# Elapsed time since $start_t and the resulting throughput.
569 my $dt = time() - $start_t;
570 $log->info("$total_rows records ", $indexer ? "indexed " : "",
571 sprintf("in %.2f sec [%.2f rec/sec]",
572 $dt, ($total_rows / $dt)
579 $log->info("parallel process $$ finished");
586 # wait all children to finish
# wait returns -1 once there are no child processes left.
587 sleep(1) while wait != -1;
588 $log->info("all parallel processes finished");
592 # handle links or merge after indexing
# Last line of the merge batch file: start the service that was stopped by
# the file's first commands.
596 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
# Use low-precedence "or" so the warning is tied to chmod()'s result. With
# "||" it bound to the (always true) path and failures were silent.
598 chmod 0700, $estcmd_path or $log->warn("can't chmod 0700 $estcmd_path: $!");
# Every queued link must be a code reference. dump() from Data::Dump is the
# dumper this file imports; Dumper() has no visible import.
601 foreach my $link (@links) {
602 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');