6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.14;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.11;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
26 use Proc::Queue size => 1;
27 use POSIX ":sys_wait_h"; # imports WNOHANG
31 run.pl - start WebPAC indexing
33 B<this command will probably go away. Don't get used to it!>
41 start loading (all) databases at offset 42
45 limit loading to 100 records
49 remove database and Hyper Estraier index before indexing
51 =item --only=database_name/input_filter
53 reindex just single database (legacy name is --one)
55 C</input_filter> is optional part which can be C<name>
58 =item --config conf/config.yml
60 path to YAML configuration file
64 disable indexing, modify_* in configuration and dump statistics about field
65 and subfield usage for each input
67 =item --validate path/to/validation_file
69 turn on extra validation of input records, see L<WebPAC::Validate>
71 You can use special variables C<$database> and C<$input> in this parameter
72 like C<--validate 'conf/validate/$database-$input'> to construct filename
74 =item --validate-delimiters path/to/validate_delimiters_file
76 this option is used with C<--validate> to turn on extra validation of
77 delimiters. If the file does not exist, it will be created on the first run.
81 Generate MARC file. This will automatically be on if file contains C<marc*> directives.
82 You can use this option as C<--no-marc-generate> to disable MARC generation.
86 By default turned on if normalisation file has C<marc*> directives. You can disable lint
87 messages with C<--no-marc-lint>.
91 Force dump of input and MARC record for debugging.
95 Run databases in parallel (approximately the same as the number of processors in
96 machine if you want to use full load)
104 Create merged index of databases which have links
119 my $validate_delimiters_path;
120 my $marc_generate = 1;
128 my $log = _new WebPAC::Common()->_get_logger();
131 "limit=i" => \$limit,
132 "offset=i" => \$offset,
134 "one=s" => \$only_filter,
135 "only=s" => \$only_filter,
# --config takes a path value (documented as "--config conf/config.yml"), so the spec needs "=s";
# a bare "config" would be a boolean flag and store 1 in $config_path
136 "config=s" => \$config_path,
139 "validate=s" => \$validate_path,
140 "validate-delimiters=s" => \$validate_delimiters_path,
141 "marc-generate!" => \$marc_generate,
142 "marc-lint!" => \$marc_lint,
143 "marc-dump!" => \$marc_dump,
144 "parallel=i" => \$parallel,
145 "only-links!" => \$only_links,
150 $marc_generate = 0 if ( $validate_delimiters_path );
152 pod2usage(-verbose => 2) if ($help);
154 my $config = new WebPAC::Config( path => $config_path );
156 #print "config = ",dump($config) if ($debug);
158 die "no databases in config file!\n" unless ($config->databases);
160 $log->info( "-" x 79 );
162 my $log_file = 'log';
164 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
165 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
# parenthesize rename's arguments: without them "||" binds to the (always true) target
# string and the warning can never fire when rename fails
166 rename($log_file, "${log_file}.old") || $log->logwarn("can't rename $log_file to ${log_file}.old: $!");
170 my $estcmd_path = './estcmd-merge.sh';
172 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
173 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
174 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
175 $log->info("created merge batch file $estcmd_path");
179 $validate = new WebPAC::Validate(
180 path => $validate_path,
181 delimiters => $config->webpac('delimiters'),
182 delimiters_path => $validate_delimiters_path,
183 ) if ($validate_path || $validate_delimiters_path);
185 my $use_indexer = $config->use_indexer;
186 $stats ||= $validate;
188 $log->debug("disabled indexing for stats collection");
189 $use_indexer = undef;
191 $log->info("using $use_indexer indexing engine...");
194 # parse normalize files and create source files for lookup and normalization
196 my $parser = new WebPAC::Parser( config => $config );
199 my $start_t = time();
204 $log->info("Using $parallel processes for speedup");
205 Proc::Queue::size($parallel);
208 sub create_ds_config {
209 my ($db_config, $database, $input, $mfn) = @_;
210 my $c = dclone( $db_config );
211 $c->{_} = $database || $log->logconfess("need database");
212 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
213 $c->{input} = $input || $log->logconfess("need input");
217 while (my ($database, $db_config) = each %{ $config->databases }) {
# declare unconditionally, assign conditionally: "my ... if COND" has undefined behaviour
# when COND is false and may carry values over between loop iterations
219 my ($only_database,$only_input); ($only_database,$only_input) = split(m#/#, $only_filter) if ($only_filter);
220 next if ($only_database && $database !~ m/$only_database/i);
224 if(defined ($f) and $f==0) {
225 $log->info("Created processes $$ for speedup");
232 if ($use_indexer && $parser->have_rules( 'search', $database )) {
234 my $cfg_name = $use_indexer;
235 $cfg_name =~ s/\-.*$//;
237 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
238 $indexer_config->{database} = $database;
239 $indexer_config->{clean} = $clean;
240 $indexer_config->{label} = $db_config->{name};
242 # force clean if database has links
243 $indexer_config->{clean} = 1 if ($db_config->{links});
245 if ($use_indexer eq 'hyperestraier') {
247 # open Hyper Estraier database
248 use WebPAC::Output::Estraier '0.10';
249 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
251 } elsif ($use_indexer eq 'hyperestraier-native') {
253 # open Hyper Estraier database
254 use WebPAC::Output::EstraierNative;
255 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
257 } elsif ($use_indexer eq 'kinosearch') {
260 use WebPAC::Output::KinoSearch;
261 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
262 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
265 $log->logdie("unknown use_indexer: $use_indexer");
# abort when no indexer object was created ("logide" was a typo for logdie)
268 $log->logdie("can't continue without valid indexer") unless ($indexer);
273 # store Hyper Estraier links to other databases
275 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
276 foreach my $link (@{ $db_config->{links} }) {
277 if ($use_indexer eq 'hyperestraier') {
279 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
281 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
283 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
287 credit => $link->{credit},
292 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
296 next if ($only_links);
302 my $abs_path = abs_path($0);
303 $abs_path =~ s#/[^/]*$#/#;
305 my $db_path = $config->webpac('db_path');
308 $log->info("creating new database '$database' in $db_path");
309 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
311 $log->info("working on database '$database' in $db_path");
314 my $store = new WebPAC::Store(
321 # now, iterate through input formats
325 if (ref($db_config->{input}) eq 'ARRAY') {
326 @inputs = @{ $db_config->{input} };
327 } elsif ($db_config->{input}) {
328 push @inputs, $db_config->{input};
330 $log->info("database $database doesn't have inputs defined");
333 foreach my $input (@inputs) {
335 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
337 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
339 my $type = lc($input->{type});
341 die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));
343 my $input_module = $config->webpac('inputs')->{$type};
345 my @lookups = $parser->have_lookup_create($database, $input);
347 $log->info("working on input '$input_name' in $input->{path} [type: $input->{type}] using $input_module",
348 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
352 # disable modification of records if --stats is in use
353 delete($input->{modify_records});
354 delete($input->{modify_file});
357 my $input_db = new WebPAC::Input(
358 module => $input_module,
359 encoding => $config->webpac('webpac_encoding'),
360 limit => $limit || $input->{limit},
362 recode => $input->{recode},
364 modify_records => $input->{modify_records},
365 modify_file => $input->{modify_file},
# check the freshly constructed input object; $input is the input configuration
# that was already dereferenced above, so testing it could never fail here
367 $log->logdie("can't create input using $input_module") unless ($input_db);
369 if (defined( $input->{lookup} )) {
370 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
371 delete( $input->{lookup} );
378 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
380 $lookup_coderef = sub {
381 my $rec = shift || die "need rec!";
382 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
384 WebPAC::Normalize::data_structure(
387 config => create_ds_config( $db_config, $database, $input, $mfn ),
390 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
393 WebPAC::Normalize::_set_lookup( undef );
395 $log->debug("created lookup_coderef using:\n$rules");
401 my $maxmfn = $input_db->open(
402 path => $input->{path},
403 code_page => $input->{encoding}, # database encoding
404 lookup_coderef => $lookup_coderef,
405 lookup => $lookup_jar,
409 return $store->load_row(
410 database => $database,
411 input => $input_name,
417 return $store->save_row(
418 database => $database,
419 input => $input_name,
427 my $lookup_data = WebPAC::Normalize::_get_lookup();
429 if (defined( $lookup_data->{$database}->{$input_name} )) {
430 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
432 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
434 database => $database,
435 input => $input_name,
437 data => $lookup_data->{$database}->{$input_name}->{$key},
443 if ($stats || $validate) {
444 my $path = "out/report/${database}-${input_name}.txt";
445 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
447 print $report_fh "Report for database '$database' input '$input_name' records ",
448 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
449 $log->info("Generating report file $path");
453 if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
454 $marc = new WebPAC::Output::MARC(
455 path => "out/marc/${database}-${input_name}.marc",
461 my $rules = $parser->normalize_rules($database,$input_name) || $log->logdie("no normalize rules found for $database/$input_name");
462 $log->debug("parsed normalize rules:\n$rules");
464 # reset position in database
467 # generate name of config key for indexer (strip everything after -)
468 my $indexer_config = $use_indexer;
469 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
472 my $depends = $parser->depends($database,$input_name);
475 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
476 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
478 foreach my $db (keys %$depends) {
479 foreach my $i (keys %{$depends->{$db}}) {
480 foreach my $k (keys %{$depends->{$db}->{$i}}) {
482 $log->debug("loading lookup $db/$i");
483 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
488 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
493 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
497 foreach my $pos ( 0 ... $input_db->size ) {
499 my $row = $input_db->fetch || next;
503 my $mfn = $row->{'000'}->[0];
505 if (! $mfn || $mfn !~ m#^\d+$#) {
506 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
508 push @{ $row->{'000'} }, $pos;
513 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
514 $log->error( "MFN $mfn validation error:\n",
515 $validate->report_error( $errors )
518 next; # validation doesn't create any output
521 my $ds = WebPAC::Normalize::data_structure(
524 lookup => $lookup_hash,
525 config => create_ds_config( $db_config, $database, $input, $mfn ),
526 marc_encoding => 'utf-8',
527 load_row_coderef => sub {
528 my ($database,$input,$mfn) = @_;
529 return $store->load_row(
530 database => $database,
537 $log->debug("ds = ", sub { dump($ds) }) if ($ds);
540 database => $database,
541 input => $input_name,
544 ) if ($ds && !$stats);
547 id => "${input_name}/${mfn}",
549 type => $config->get($indexer_config)->{type},
550 ) if ($indexer && $ds);
555 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
557 id => $mfn . ( $i ? "/$i" : '' ),
559 leader => WebPAC::Normalize::_get_marc_leader(),
565 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
570 my $errors = $validate->report;
572 $log->info("validation errors:\n$errors\n" );
573 print $report_fh "$errors\n" if ($report_fh);
576 print $report_fh "\nAll possible subfields/delimiter templates:\n", $validate->delimiters_templates( report => 1, current_input => 1 ), "\n\n";
578 # must be last thing that touches $validate for this input
583 my $s = $input_db->stats;
584 $log->info("statistics of fields usage:\n$s");
585 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
589 $marc->finish if ($marc);
592 close($report_fh) if ($report_fh);
595 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
597 my $dt = time() - $start_t;
598 $log->info("$total_rows records ", $indexer ? "indexed " : "",
599 sprintf("in %.2f sec [%.2f rec/sec]",
600 $dt, ($total_rows / $dt)
607 $log->info("parallel process $$ finished");
614 # wait all children to finish
615 sleep(1) while wait != -1;
616 $log->info("all parallel processes finished");
619 # save new delimiters if needed
620 $validate->save_delimiters_templates if ( $validate );
623 # handle links or merge after indexing
627 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
# parenthesize chmod's arguments: without them "||" binds to $estcmd_path (a non-empty,
# true string) and a failed chmod is never reported
629 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!");
632 foreach my $link (@links) {
# every queued link must be a code reference; use dump() (imported from Data::Dump and used
# throughout this file) -- no Dumper import is visible, so calling it would presumably fail at runtime
633 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');