6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.14;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.11;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
26 use Proc::Queue size => 1;
27 use POSIX ":sys_wait_h"; # imports WNOHANG
31 run.pl - start WebPAC indexing
33 B<this command will probably go away. Don't get used to it!>
41 start loading (all) databases at offset 42
45 limit loading to 100 records
49 remove database and Hyper Estraier index before indexing
51 =item --only=database_name/input_filter
53 reindex just a single database (legacy name is --one)
55 C</input_filter> is an optional part which can be C<name>
58 =item --config conf/config.yml
60 path to YAML configuration file
64 disable indexing, modify_* in configuration and dump statistics about field
65 and subfield usage for each input
67 =item --validate path/to/validation_file
69 turn on extra validation of input records, see L<WebPAC::Validate>
71 You can use special variables C<$database> and C<$input> in this parameter
72 like C<--validate 'conf/validate/$database-$input'> to construct filename
74 =item --validate-delimiters path/to/validate_delimiters_file
76 this option is used with C<--validate> to turn on extra validation of
77 delimiters. If the file does not exist, it will be created on the first run.
81 Generate MARC file. This will automatically be on if file contains C<marc*> directives.
82 You can use this option as C<--no-marc-generate> to disable MARC generation.
86 By default turned on if normalisation file has C<marc*> directives. You can disable lint
87 messages with C<--no-marc-lint>.
91 Force dump of input and MARC record for debugging.
95 Run databases in parallel (approximately the same as the number of processors in
96 the machine if you want to use full load)
104 Create merged index of databases which have links
119 my $validate_delimiters_path;
120 my $marc_generate = 1;
128 my $log = _new WebPAC::Common()->_get_logger();
131 "limit=i" => \$limit,
132 "offset=i" => \$offset,
134 "one=s" => \$only_filter,
135 "only=s" => \$only_filter,
136 "config=s" => \$config_path, # --config takes the YAML file path as its value; without "=s" it would only be a boolean flag
139 "validate=s" => \$validate_path,
140 "validate-delimiters=s" => \$validate_delimiters_path,
141 "marc-generate!" => \$marc_generate,
142 "marc-lint!" => \$marc_lint,
143 "marc-dump!" => \$marc_dump,
144 "parallel=i" => \$parallel,
145 "only-links!" => \$only_links,
150 $marc_generate = 0 if ( $validate_delimiters_path ); # MARC generation is switched off whenever a delimiters validation file is given
152 pod2usage(-verbose => 2) if ($help);
154 my $config = new WebPAC::Config( path => $config_path );
156 #print "config = ",dump($config) if ($debug);
158 die "no databases in config file!\n" unless ($config->databases);
160 $log->info( "-" x 79 );
162 my $log_file = 'log';
164 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
165 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
166 rename($log_file, "${log_file}.old") || $log->logwarn("can't rename $log_file to ${log_file}.old: $!"); # parenthesised so that || tests the result of rename(), not the (always true) target name
170 my $estcmd_path = './estcmd-merge.sh';
172 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
173 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
174 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
175 $log->info("created merge batch file $estcmd_path");
179 $validate = new WebPAC::Validate(
180 delimiters => $config->webpac('delimiters'),
181 ) if ($validate_path || $validate_delimiters_path);
183 my $use_indexer = $config->use_indexer;
184 $stats ||= $validate; # when a validator object exists, $stats becomes true as well
186 $log->debug("disabled indexing for stats collection");
187 $use_indexer = undef;
188 } elsif ( $use_indexer ) {
189 $log->info("using $use_indexer indexing engine...");
192 # parse normalize files and create source files for lookup and normalization
194 my $parser = new WebPAC::Parser( config => $config );
197 my $start_t = time();
202 $log->info("Using $parallel processes for speedup");
203 Proc::Queue::size($parallel);
206 sub create_ds_config {
207 my ($db_config, $database, $input, $mfn) = @_;
208 my $c = dclone( $db_config );
209 $c->{_} = $database || $log->logconfess("need database");
210 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
211 $c->{input} = $input || $log->logconfess("need input");
215 foreach my $database ( sort keys %{ $config->databases } ) {
216 my $db_config = $config->databases->{$database};
218 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : (); # ternary instead of "my ... if", whose behaviour perlsyn documents as undefined
219 next if ($only_database && $database !~ m/$only_database/i);
223 if(defined ($f) and $f==0) {
224 $log->info("Created processes $$ for speedup");
231 if ($use_indexer && $parser->have_rules( 'search', $database )) {
233 my $cfg_name = $use_indexer; # name of the configuration section looked up for this indexer
234 $cfg_name =~ s/\-.*$//; # drop everything from the first '-' onwards, e.g. 'hyperestraier-native' becomes 'hyperestraier'
236 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
237 $indexer_config->{database} = $database;
238 $indexer_config->{clean} = $clean;
239 $indexer_config->{label} = $db_config->{name};
241 # force clean if database has links
242 $indexer_config->{clean} = 1 if ($db_config->{links});
244 if ($use_indexer eq 'hyperestraier') {
246 # open Hyper Estraier database
247 require WebPAC::Output::Estraier;
248 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
250 } elsif ($use_indexer eq 'hyperestraier-native') {
252 # open Hyper Estraier database
253 require WebPAC::Output::EstraierNative;
254 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
256 } elsif ($use_indexer eq 'kinosearch') {
259 require WebPAC::Output::KinoSearch;
260 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path}); # index path does not exist yet, so set the clean flag
261 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
264 $log->logdie("unknown use_indexer: $use_indexer");
267 $log->logdie("can't continue without valid indexer") unless ($indexer);
272 # store Hyper Estraier links to other databases
274 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
275 foreach my $link (@{ $db_config->{links} }) {
276 if ($use_indexer eq 'hyperestraier') {
278 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
280 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
282 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
286 credit => $link->{credit},
291 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
295 next if ($only_links);
301 my $abs_path = abs_path($0);
302 $abs_path =~ s#/[^/]*$#/#;
304 my $db_path = $config->webpac('db_path');
307 $log->info("creating new database '$database' in $db_path");
308 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
310 $log->info("working on database '$database' in $db_path");
313 my $store = new WebPAC::Store(
320 # now, iterate through input formats
324 if (ref($db_config->{input}) eq 'ARRAY') {
325 @inputs = @{ $db_config->{input} };
326 } elsif ($db_config->{input}) {
327 push @inputs, $db_config->{input};
329 $log->info("database $database doesn't have inputs defined");
332 foreach my $input (@inputs) {
334 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
336 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
338 my $type = lc($input->{type});
340 die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));
342 my $input_module = $config->webpac('inputs')->{$type};
344 my @lookups = $parser->have_lookup_create($database, $input);
346 $log->info("working on input '$input_name' in $input->{path} [type: $input->{type}] using $input_module",
347 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
351 # disable modification of records if --stats is in use
352 delete($input->{modify_records});
353 delete($input->{modify_file});
356 my $input_db = new WebPAC::Input(
357 module => $input_module,
358 encoding => $config->webpac('webpac_encoding'),
359 limit => $limit || $input->{limit},
361 recode => $input->{recode},
363 modify_records => $input->{modify_records},
364 modify_file => $input->{modify_file},
366 $log->logdie("can't create input using $input_module") unless ($input_db); # check the object just constructed; $input is the configuration hash and is always true here
368 if (defined( $input->{lookup} )) {
369 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
370 delete( $input->{lookup} );
377 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
379 $lookup_coderef = sub {
380 my $rec = shift || die "need rec!";
381 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
383 WebPAC::Normalize::data_structure(
386 config => create_ds_config( $db_config, $database, $input, $mfn ),
389 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
392 WebPAC::Normalize::_set_lookup( undef );
394 $log->debug("created lookup_coderef using:\n$rules");
400 my $maxmfn = $input_db->open(
401 path => $input->{path},
402 code_page => $input->{encoding}, # database encoding
403 lookup_coderef => $lookup_coderef,
404 lookup => $lookup_jar,
408 return $store->load_row(
409 database => $database,
410 input => $input_name,
416 return $store->save_row(
417 database => $database,
418 input => $input_name,
426 my $lookup_data = WebPAC::Normalize::_get_lookup();
428 if (defined( $lookup_data->{$database}->{$input_name} )) {
429 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
431 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
433 database => $database,
434 input => $input_name,
436 data => $lookup_data->{$database}->{$input_name}->{$key},
442 if ($stats || $validate) {
443 my $path = "out/report/${database}-${input_name}.txt";
444 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
446 print $report_fh "Report for database '$database' input '$input_name' records ",
447 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
448 $log->info("Generating report file $path");
451 $validate->read_validate_file( $validate->fill_in( $validate_path, database => $database, input => $input_name ) ) if ( $validate_path );
452 $validate->read_validate_delimiters_file( $validate->fill_in( $validate_delimiters_path, database => $database, input => $input_name ) ) if ( $validate_delimiters_path );
457 if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
458 $marc = new WebPAC::Output::MARC(
459 path => "out/marc/${database}-${input_name}.marc",
465 my $rules = $parser->normalize_rules($database,$input_name) || $log->logdie("no normalize rules found for $database/$input_name");
466 $log->debug("parsed normalize rules:\n$rules");
468 # reset position in database
471 # generate name of config key for indexer (strip everything after -)
472 my $indexer_config = $use_indexer;
473 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
476 my $depends = $parser->depends($database,$input_name);
479 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
480 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
482 foreach my $db (keys %$depends) {
483 foreach my $i (keys %{$depends->{$db}}) {
484 foreach my $k (keys %{$depends->{$db}->{$i}}) {
486 $log->debug("loading lookup $db/$i");
487 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
492 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
497 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
501 foreach my $pos ( 0 ... $input_db->size ) {
503 my $row = $input_db->fetch || next;
507 my $mfn = $row->{'000'}->[0];
509 if (! $mfn || $mfn !~ m#^\d+$#) {
510 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
512 push @{ $row->{'000'} }, $pos;
517 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
518 $log->error( "MFN $mfn validation error:\n",
519 $validate->report_error( $errors )
522 next; # validation doesn't create any output
525 my $ds = WebPAC::Normalize::data_structure(
528 lookup => $lookup_hash,
529 config => create_ds_config( $db_config, $database, $input, $mfn ),
530 marc_encoding => 'utf-8',
531 load_row_coderef => sub {
532 my ($database,$input,$mfn) = @_;
533 return $store->load_row(
534 database => $database,
541 $log->debug("ds = ", sub { dump($ds) }) if ($ds);
544 database => $database,
545 input => $input_name,
548 ) if ($ds && !$stats);
551 id => "${input_name}/${mfn}",
553 type => $config->get($indexer_config)->{type},
554 ) if ($indexer && $ds);
559 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
561 id => $mfn . ( $i ? "/$i" : '' ),
563 leader => WebPAC::Normalize::_get_marc_leader(),
569 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
574 my $errors = $validate->report;
576 $log->info("validation errors:\n$errors\n" );
577 print $report_fh "$errors\n" if ($report_fh);
580 print $report_fh "\nAll possible subfields/delimiter templates:\n", $validate->delimiters_templates( report => 1, current_input => 1 ), "\n\n";
582 # must be last thing that touches $validate for this input
587 my $s = $input_db->stats;
588 $log->info("statistics of fields usage:\n$s");
589 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
593 $marc->finish if ($marc);
596 close($report_fh) if ($report_fh);
599 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
601 my $dt = time() - $start_t;
602 $log->info("$total_rows records ", $indexer ? "indexed " : "",
603 sprintf("in %.2f sec [%.2f rec/sec]",
604 $dt, ($total_rows / $dt)
611 $log->info("parallel process $$ finished");
618 # wait for all children to finish
619 sleep(1) while wait != -1; # wait() returns -1 once there are no child processes left
620 $log->info("all parallel processes finished");
623 # save new delimiters if needed
624 $validate->save_delimiters_templates if ( $validate_delimiters_path );
627 # handle links or merge after indexing
631 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
633 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!"); # parenthesised so that || tests the result of chmod(), not the (always true) path
636 foreach my $link (@links) {
637 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE'); # dump() is the Data::Dump function this file imports and uses elsewhere