6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.14;
12 use WebPAC::Store 0.14;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.06;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
25 use Proc::Queue size => 1;
26 use POSIX ":sys_wait_h"; # imports WNOHANG
30 run.pl - start WebPAC indexing
32 B<this command will probably go away. Don't get used to it!>
40 start loading (all) databases at offset 42
44 limit loading to 100 records
48 remove database and Hyper Estraier index before indexing
50 =item --only=database_name/input_filter
52 reindex just a single database (legacy name is --one)
54 C</input_filter> is an optional part which can be C<name>
57 =item --config conf/config.yml
59 path to YAML configuration file
63 disable indexing, modify_* in configuration and dump statistics about field
64 and subfield usage for each input
66 =item --validate path/to/validation_file
68 turn on extra validation of input records, see L<WebPAC::Validate>
72 By default turned on if normalisation file has C<marc*> directives. You can disable lint
73 messages with C<--no-marc-lint>.
77 Force dump of input and MARC record for debugging.
81 Run databases in parallel (use approximately the same number as processors
82 in the machine if you want to use full load)
90 Create merged index of databases which have links
111 my $log = _new WebPAC::Common()->_get_logger();
114 "limit=i" => \$limit,
115 "offset=i" => \$offset,
117 "one=s" => \$only_filter,
118 "only=s" => \$only_filter,
119 "config=s" => \$config_path, # takes a value: path to the YAML configuration file
122 "validate=s" => \$validate_path,
123 "marc-lint!" => \$marc_lint,
124 "marc-dump!" => \$marc_dump,
125 "parallel=i" => \$parallel,
126 "only-links!" => \$only_links,
130 my $config = new WebPAC::Config( path => $config_path );
132 #print "config = ",dump($config) if ($debug);
134 die "no databases in config file!\n" unless ($config->databases);
136 $log->info( "-" x 79 );
140 my $estcmd_path = './estcmd-merge.sh';
142 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
143 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
144 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
145 $log->info("created merge batch file $estcmd_path");
150 $validate = new WebPAC::Validate(
151 path => $validate_path,
152 ) if ($validate_path);
155 my $use_indexer = $config->use_indexer;
157 $log->debug("option --stats disables update of indexing engine...");
158 $use_indexer = undef;
160 $log->info("using $use_indexer indexing engine...");
163 # parse normalize files and create source files for lookup and normalization
165 my $parser = new WebPAC::Parser( config => $config );
168 my $start_t = time();
173 $log->info("Using $parallel processes for speedup");
174 Proc::Queue::size($parallel);
177 sub create_ds_config {
178 my ($db_config, $database, $input, $mfn) = @_;
179 my $c = dclone( $db_config );
180 $c->{_} = $database || $log->logconfess("need database");
181 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
182 $c->{input} = $input || $log->logconfess("need input");
186 while (my ($database, $db_config) = each %{ $config->databases }) {
188 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : (); # ternary instead of "my ... if", whose behaviour is undefined
189 next if ($only_database && $database !~ m/$only_database/i);
193 if(defined ($f) and $f==0) {
194 $log->info("Created processes $$ for speedup");
201 if ($use_indexer && $parser->have_rules( 'search', $database )) {
203 my $cfg_name = $use_indexer;
204 $cfg_name =~ s/\-.*$//;
206 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
207 $indexer_config->{database} = $database;
208 $indexer_config->{clean} = $clean;
209 $indexer_config->{label} = $db_config->{name};
211 # force clean if database has links
212 $indexer_config->{clean} = 1 if ($db_config->{links});
214 if ($use_indexer eq 'hyperestraier') {
216 # open Hyper Estraier database
217 use WebPAC::Output::Estraier '0.10';
218 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
220 } elsif ($use_indexer eq 'hyperestraier-native') {
222 # open Hyper Estraier database
223 use WebPAC::Output::EstraierNative;
224 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
226 } elsif ($use_indexer eq 'kinosearch') {
229 use WebPAC::Output::KinoSearch;
230 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
231 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
234 $log->logdie("unknown use_indexer: $use_indexer");
237 $log->logdie("can't continue without valid indexer") unless ($indexer); # abort: nothing below can run without an indexer object
242 # store Hyper Estraier links to other databases
244 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
245 foreach my $link (@{ $db_config->{links} }) {
246 if ($use_indexer eq 'hyperestraier') {
248 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
250 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
252 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
256 credit => $link->{credit},
261 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
265 next if ($only_links);
271 my $abs_path = abs_path($0);
272 $abs_path =~ s#/[^/]*$#/#;
274 my $db_path = $config->webpac('db_path');
277 $log->info("creating new database '$database' in $db_path");
278 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
280 $log->info("working on database '$database' in $db_path");
283 my $store = new WebPAC::Store(
290 # now, iterate through input formats
294 if (ref($db_config->{input}) eq 'ARRAY') {
295 @inputs = @{ $db_config->{input} };
296 } elsif ($db_config->{input}) {
297 push @inputs, $db_config->{input};
299 $log->info("database $database doesn't have inputs defined");
302 foreach my $input (@inputs) {
304 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
306 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
308 my $type = lc($input->{type});
310 die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));
312 my $input_module = $config->webpac('inputs')->{$type};
314 my @lookups = $parser->have_lookup_create($database, $input);
316 $log->info("working on input '$input_name' in $input->{path} [type: $input->{type}] using $input_module",
317 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
321 # disable modification of records if --stats is in use
322 delete($input->{modify_records});
323 delete($input->{modify_file});
326 my $input_db = new WebPAC::Input(
327 module => $input_module,
328 encoding => $config->webpac('webpac_encoding'),
329 limit => $limit || $input->{limit},
331 recode => $input->{recode},
333 modify_records => $input->{modify_records},
334 modify_file => $input->{modify_file},
336 $log->logdie("can't create input using $input_module") unless ($input);
338 if (defined( $input->{lookup} )) {
339 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
340 delete( $input->{lookup} );
347 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
349 $lookup_coderef = sub {
350 my $rec = shift || die "need rec!";
351 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
353 WebPAC::Normalize::data_structure(
356 config => create_ds_config( $db_config, $database, $input, $mfn ),
359 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
362 WebPAC::Normalize::_set_lookup( undef );
364 $log->debug("created lookup_coderef using:\n$rules");
370 my $maxmfn = $input_db->open(
371 path => $input->{path},
372 code_page => $input->{encoding}, # database encoding
373 lookup_coderef => $lookup_coderef,
374 lookup => $lookup_jar,
378 return $store->load_row(
379 database => $database,
380 input => $input_name,
386 return $store->save_row(
387 database => $database,
388 input => $input_name,
396 my $lookup_data = WebPAC::Normalize::_get_lookup();
398 if (defined( $lookup_data->{$database}->{$input_name} )) {
399 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
401 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
403 database => $database,
404 input => $input_name,
406 data => $lookup_data->{$database}->{$input_name}->{$key},
412 if ($stats || $validate) {
413 my $path = "out/report/${database}-${input_name}.txt";
414 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
416 print $report_fh "Report for database '$database' input '$input_name' records ",
417 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
418 $log->info("Generating report file $path");
422 if ($parser->have_rules( 'marc', $database, $input_name )) {
423 $marc = new WebPAC::Output::MARC(
424 path => "out/marc/${database}-${input_name}.marc",
430 my $rules = $parser->normalize_rules($database,$input_name) || $log->logdie("no normalize rules found for $database/$input_name");
431 $log->debug("parsed normalize rules:\n$rules");
433 # reset position in database
436 # generate name of config key for indexer (strip everything after -)
437 my $indexer_config = $use_indexer;
438 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
441 my $depends = $parser->depends($database,$input_name);
444 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
445 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
447 foreach my $db (keys %$depends) {
448 foreach my $i (keys %{$depends->{$db}}) {
449 foreach my $k (keys %{$depends->{$db}->{$i}}) {
451 $log->debug("loading lookup $db/$i");
452 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
457 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
462 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
466 foreach my $pos ( 0 ... $input_db->size ) {
468 my $row = $input_db->fetch || next;
470 my $mfn = $row->{'000'}->[0];
472 if (! $mfn || $mfn !~ m#^\d+$#) {
473 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
475 push @{ $row->{'000'} }, $pos;
480 if ( my $errors = $validate->validate_rec( $row, $input_db->dump ) ) {
481 $log->error( "MFN $mfn validation error:\n",
482 $validate->report_error( $errors )
487 my $ds = WebPAC::Normalize::data_structure(
490 lookup => $lookup_hash,
491 config => create_ds_config( $db_config, $database, $input, $mfn ),
492 marc_encoding => 'utf-8',
493 load_row_coderef => sub {
494 my ($database,$input,$mfn) = @_;
495 return $store->load_row(
496 database => $database,
503 $log->debug("ds = ", sub { dump($ds) }) if ($ds);
506 database => $database,
507 input => $input_name,
510 ) if ($ds && !$stats);
513 id => "${input_name}/${mfn}",
515 type => $config->get($indexer_config)->{type},
516 ) if ($indexer && $ds);
521 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
523 id => $mfn . ( $i ? "/$i" : '' ),
525 leader => WebPAC::Normalize::marc_leader(),
531 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
538 my $errors = $validate->report;
540 $log->info("validation errors:\n$errors\n" );
541 print $report_fh "$errors\n" if ($report_fh);
546 my $s = $input_db->stats;
547 $log->info("statistics of fields usage:\n$s");
548 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
552 $marc->finish if ($marc);
555 close($report_fh) if ($report_fh)
559 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
561 my $dt = time() - $start_t;
562 $log->info("$total_rows records ", $indexer ? "indexed " : "",
563 sprintf("in %.2f sec [%.2f rec/sec]",
564 $dt, ($total_rows / $dt)
571 $log->info("parallel process $$ finished");
578 # wait for all children to finish
579 sleep(1) while wait != -1;
580 $log->info("all parallel processes finished");
584 # handle links or merge after indexing
588 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
590 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!"); # parenthesized so || tests chmod's result, not the path string
593 foreach my $link (@links) {
594 $log->logdie("coderef in link ", Dumper($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');