6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.14;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.06;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
25 use Proc::Queue size => 1;
26 use POSIX ":sys_wait_h"; # imports WNOHANG
30 run.pl - start WebPAC indexing
32 B<this command will probably go away. Don't get used to it!>
40 start loading (all) databases at offset 42
44 limit loading to 100 records
48 remove database and Hyper Estraier index before indexing
50 =item --only=database_name/input_filter
52 reindex just single database (legacy name is --one)
54 C</input_filter> is optional part which can be C<name>
57 =item --config conf/config.yml
59 path to YAML configuration file
63 disable indexing, modify_* in configuration and dump statistics about field
64 and subfield usage for each input
66 =item --validate path/to/validation_file
68 turn on extra validation of input records, see L<WebPAC::Validate>
72 By default turned on if normalisation file has C<marc*> directives. You can disable lint
73 messages with C<--no-marc-lint>.
77 Force dump of input and MARC record for debugging.
81 Run databases in parallel (use approximately the number of processors in
82 the machine if you want to use full load)
90 Create merged index of databases which have links
111 my $log = _new WebPAC::Common()->_get_logger();
114 "limit=i" => \$limit,
115 "offset=i" => \$offset,
117 "one=s" => \$only_filter,
118 "only=s" => \$only_filter,
119 "config=s" => \$config_path, # --config takes a path argument (string), not a boolean flag
122 "validate=s" => \$validate_path,
123 "marc-lint!" => \$marc_lint,
124 "marc-dump!" => \$marc_dump,
125 "parallel=i" => \$parallel,
126 "only-links!" => \$only_links,
130 my $config = new WebPAC::Config( path => $config_path );
132 #print "config = ",dump($config) if ($debug);
134 die "no databases in config file!\n" unless ($config->databases);
136 $log->info( "-" x 79 );
140 my $estcmd_path = './estcmd-merge.sh';
142 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
143 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
144 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
145 $log->info("created merge batch file $estcmd_path");
150 $validate = new WebPAC::Validate(
151 path => $validate_path,
152 ) if ($validate_path);
155 my $use_indexer = $config->use_indexer;
156 $stats ||= $validate;
158 $log->debug("disabled indexing for stats collection");
159 $use_indexer = undef;
161 $log->info("using $use_indexer indexing engine...");
164 # parse normalize files and create source files for lookup and normalization
166 my $parser = new WebPAC::Parser( config => $config );
169 my $start_t = time();
174 $log->info("Using $parallel processes for speedup");
175 Proc::Queue::size($parallel);
178 sub create_ds_config {
179 my ($db_config, $database, $input, $mfn) = @_;
180 my $c = dclone( $db_config );
181 $c->{_} = $database || $log->logconfess("need database");
182 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
183 $c->{input} = $input || $log->logconfess("need input");
187 while (my ($database, $db_config) = each %{ $config->databases }) {
189 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : (); # declare unconditionally; "my ... if" has undefined behaviour
190 next if ($only_database && $database !~ m/$only_database/i);
194 if(defined ($f) and $f==0) {
195 $log->info("Created processes $$ for speedup");
202 if ($use_indexer && $parser->have_rules( 'search', $database )) {
204 my $cfg_name = $use_indexer;
205 $cfg_name =~ s/\-.*$//;
207 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
208 $indexer_config->{database} = $database;
209 $indexer_config->{clean} = $clean;
210 $indexer_config->{label} = $db_config->{name};
212 # force clean if database has links
213 $indexer_config->{clean} = 1 if ($db_config->{links});
215 if ($use_indexer eq 'hyperestraier') {
217 # open Hyper Estraier database
218 use WebPAC::Output::Estraier '0.10';
219 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
221 } elsif ($use_indexer eq 'hyperestraier-native') {
223 # open Hyper Estraier database
224 use WebPAC::Output::EstraierNative;
225 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
227 } elsif ($use_indexer eq 'kinosearch') {
230 use WebPAC::Output::KinoSearch;
231 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
232 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
235 $log->logdie("unknown use_indexer: $use_indexer");
238 $log->logdie("can't continue without valid indexer") unless ($indexer); # abort: indexing was requested but no indexer object was created
243 # store Hyper Estraier links to other databases
245 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
246 foreach my $link (@{ $db_config->{links} }) {
247 if ($use_indexer eq 'hyperestraier') {
249 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
251 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
253 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
257 credit => $link->{credit},
262 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
266 next if ($only_links);
272 my $abs_path = abs_path($0);
273 $abs_path =~ s#/[^/]*$#/#;
275 my $db_path = $config->webpac('db_path');
278 $log->info("creating new database '$database' in $db_path");
279 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
281 $log->info("working on database '$database' in $db_path");
284 my $store = new WebPAC::Store(
291 # now, iterate through input formats
295 if (ref($db_config->{input}) eq 'ARRAY') {
296 @inputs = @{ $db_config->{input} };
297 } elsif ($db_config->{input}) {
298 push @inputs, $db_config->{input};
300 $log->info("database $database doesn't have inputs defined");
303 foreach my $input (@inputs) {
305 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
307 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
309 my $type = lc($input->{type});
311 die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));
313 my $input_module = $config->webpac('inputs')->{$type};
315 my @lookups = $parser->have_lookup_create($database, $input);
317 $log->info("working on input '$input_name' in $input->{path} [type: $input->{type}] using $input_module",
318 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
322 # disable modification of records if --stats is in use
323 delete($input->{modify_records});
324 delete($input->{modify_file});
327 my $input_db = new WebPAC::Input(
328 module => $input_module,
329 encoding => $config->webpac('webpac_encoding'),
330 limit => $limit || $input->{limit},
332 recode => $input->{recode},
334 modify_records => $input->{modify_records},
335 modify_file => $input->{modify_file},
337 $log->logdie("can't create input using $input_module") unless ($input_db); # check the WebPAC::Input object, not the (always true) input config hash
339 if (defined( $input->{lookup} )) {
340 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
341 delete( $input->{lookup} );
348 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
350 $lookup_coderef = sub {
351 my $rec = shift || die "need rec!";
352 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
354 WebPAC::Normalize::data_structure(
357 config => create_ds_config( $db_config, $database, $input, $mfn ),
360 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
363 WebPAC::Normalize::_set_lookup( undef );
365 $log->debug("created lookup_coderef using:\n$rules");
371 my $maxmfn = $input_db->open(
372 path => $input->{path},
373 code_page => $input->{encoding}, # database encoding
374 lookup_coderef => $lookup_coderef,
375 lookup => $lookup_jar,
379 return $store->load_row(
380 database => $database,
381 input => $input_name,
387 return $store->save_row(
388 database => $database,
389 input => $input_name,
397 my $lookup_data = WebPAC::Normalize::_get_lookup();
399 if (defined( $lookup_data->{$database}->{$input_name} )) {
400 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
402 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
404 database => $database,
405 input => $input_name,
407 data => $lookup_data->{$database}->{$input_name}->{$key},
413 if ($stats || $validate) {
414 my $path = "out/report/${database}-${input_name}.txt";
415 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
417 print $report_fh "Report for database '$database' input '$input_name' records ",
418 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
419 $log->info("Generating report file $path");
423 if ($parser->have_rules( 'marc', $database, $input_name )) {
424 $marc = new WebPAC::Output::MARC(
425 path => "out/marc/${database}-${input_name}.marc",
431 my $rules = $parser->normalize_rules($database,$input_name) || $log->logdie("no normalize rules found for $database/$input_name");
432 $log->debug("parsed normalize rules:\n$rules");
434 # reset position in database
437 # generate name of config key for indexer (strip everything after -)
438 my $indexer_config = $use_indexer;
439 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
442 my $depends = $parser->depends($database,$input_name);
445 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
446 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
448 foreach my $db (keys %$depends) {
449 foreach my $i (keys %{$depends->{$db}}) {
450 foreach my $k (keys %{$depends->{$db}->{$i}}) {
452 $log->debug("loading lookup $db/$i");
453 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
458 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
463 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
467 foreach my $pos ( 0 ... $input_db->size ) {
469 my $row = $input_db->fetch || next;
473 my $mfn = $row->{'000'}->[0];
475 if (! $mfn || $mfn !~ m#^\d+$#) {
476 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
478 push @{ $row->{'000'} }, $pos;
483 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
484 $log->error( "MFN $mfn validation error:\n",
485 $validate->report_error( $errors )
488 next; # validation doesn't create any output
491 my $ds = WebPAC::Normalize::data_structure(
494 lookup => $lookup_hash,
495 config => create_ds_config( $db_config, $database, $input, $mfn ),
496 marc_encoding => 'utf-8',
497 load_row_coderef => sub {
498 my ($database,$input,$mfn) = @_;
499 return $store->load_row(
500 database => $database,
507 $log->debug("ds = ", sub { dump($ds) }) if ($ds);
510 database => $database,
511 input => $input_name,
514 ) if ($ds && !$stats);
517 id => "${input_name}/${mfn}",
519 type => $config->get($indexer_config)->{type},
520 ) if ($indexer && $ds);
525 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
527 id => $mfn . ( $i ? "/$i" : '' ),
529 leader => WebPAC::Normalize::marc_leader(),
535 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
540 my $errors = $validate->report;
542 $log->info("validation errors:\n$errors\n" );
543 print $report_fh "$errors\n" if ($report_fh);
548 my $s = $input_db->stats;
549 $log->info("statistics of fields usage:\n$s");
550 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
554 $marc->finish if ($marc);
557 close($report_fh) if ($report_fh)
561 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
563 my $dt = time() - $start_t;
564 $log->info("$total_rows records ", $indexer ? "indexed " : "",
565 sprintf("in %.2f sec [%.2f rec/sec]",
566 $dt, ($total_rows / $dt)
573 $log->info("parallel process $$ finished");
580 # wait for all children to finish
581 sleep(1) while wait != -1;
582 $log->info("all parallel processes finished");
586 # handle links or merge after indexing
590 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
592 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!"); # parens so || tests chmod's result, not the path string
595 foreach my $link (@links) {
596 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE'); # use Data::Dump's dump(), the dumper this file imports