6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.14;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.11;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
26 use Proc::Queue size => 1;
27 use POSIX ":sys_wait_h"; # imports WNOHANG
31 run.pl - start WebPAC indexing
33 B<this command will probably go away. Don't get used to it!>
41 start loading (all) databases at offset 42
45 limit loading to 100 records
49 remove database and Hyper Estraier index before indexing
51 =item --only=database_name/input_filter
53 reindex just a single database (legacy name is --one)
55 C</input_filter> is optional part which can be C<name>
58 =item --config conf/config.yml
60 path to YAML configuration file
64 disable indexing, modify_* in configuration and dump statistics about field
65 and subfield usage for each input
67 =item --validate path/to/validation_file
69 turn on extra validation of input records, see L<WebPAC::Validate>
73 Generate MARC file. This will automatically be on if file contains C<marc*> directives.
74 You can use this option as C<--no-marc-generate> to disable MARC generation.
78 By default turned on if normalisation file has C<marc*> directives. You can disable lint
79 messages with C<--no-marc-lint>.
83 Force dump of input and MARC record for debugging.
87 Run databases in parallel (approximately the same as the number of processors in
88 the machine if you want to use full load)
96 Create merged index of databases which have links
# MARC generation defaults to on; --no-marc-generate switches it off.
111 my $marc_generate = 1;
# Logger used by the rest of the script.
119 my $log = _new WebPAC::Common()->_get_logger();
# Getopt::Long option table: "=i" takes an integer, "=s" takes a string,
# "!" declares a negatable boolean flag (--foo / --no-foo).
122 "limit=i" => \$limit,
123 "offset=i" => \$offset,
# --one is the legacy spelling of --only; both fill the same variable.
125 "one=s" => \$only_filter,
126 "only=s" => \$only_filter,
# "=s" is required so that --config receives the path that follows it;
# a bare "config" is a boolean flag and would store 1 in $config_path.
127 "config=s" => \$config_path,
130 "validate=s" => \$validate_path,
131 "marc-generate!" => \$marc_generate,
132 "marc-lint!" => \$marc_lint,
133 "marc-dump!" => \$marc_dump,
134 "parallel=i" => \$parallel,
135 "only-links!" => \$only_links,
# --help prints the complete POD as a manual page and exits.
140 pod2usage(-verbose => 2) if ($help);
# Load the configuration file named by --config.
142 my $config = new WebPAC::Config( path => $config_path );
144 #print "config = ",dump($config) if ($debug);
# There is nothing to do without at least one configured database.
146 die "no databases in config file!\n" unless ($config->databases);
148 $log->info( "-" x 79 );
150 my $log_file = 'log';
# Move any existing log file aside to log.old (the size threshold is commented out).
152 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
153 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
# rename() is parenthesised so that || tests its return value. Without the
# parentheses || binds to the (always true) new file name and a failed
# rename is never reported.
154 rename($log_file, "${log_file}.old") || $log->logwarn("can't rename $log_file to ${log_file}.old: $!");
# Shell batch file that collects the Hyper Estraier merge commands written
# later in this script.
158 my $estcmd_path = './estcmd-merge.sh';
160 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
# Batch preamble: change to the node directory (abort the batch if that fails)
# and stop the hyperestraier service.
161 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
162 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
163 $log->info("created merge batch file $estcmd_path");
# Build a validator only when --validate was given.
168 $validate = new WebPAC::Validate(
169 path => $validate_path,
170 delimiters => $config->webpac('delimiters'),
171 ) if ($validate_path);
# Name of the indexing engine, as returned by the configuration object.
174 my $use_indexer = $config->use_indexer;
# A validator implies statistics mode.
175 $stats ||= $validate;
# NOTE(review): the conditions guarding the next three statements are not
# visible here; presumably indexing is dropped when $stats is set and the
# engine name is announced otherwise -- confirm against the full file.
177 $log->debug("disabled indexing for stats collection");
178 $use_indexer = undef;
180 $log->info("using $use_indexer indexing engine...");
183 # parse normalize files and create source files for lookup and normalization
185 my $parser = new WebPAC::Parser( config => $config );
# Start time, used for the records-per-second summary after each database.
188 my $start_t = time();
# NOTE(review): presumably executed only when --parallel was given; the guard
# is not visible here.
193 $log->info("Using $parallel processes for speedup");
194 Proc::Queue::size($parallel);
# create_ds_config( $db_config, $database, $input, $mfn )
#
# Builds the config structure passed as "config" to
# WebPAC::Normalize::data_structure. Works on a deep copy of $db_config, so
# the caller's structure is not modified, and adds three keys:
#   _     - database name
#   _mfn  - MFN of the current record
#   input - input definition
# Calls $log->logconfess if any of the three values is false (so an MFN of 0
# is rejected as well).
# NOTE(review): the return statement and closing brace are outside the
# visible lines.
197 sub create_ds_config {
198 my ($db_config, $database, $input, $mfn) = @_;
199 my $c = dclone( $db_config );
200 $c->{_} = $database || $log->logconfess("need database");
201 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
202 $c->{input} = $input || $log->logconfess("need input");
# Main loop: one pass per database defined in the configuration.
206 while (my ($database, $db_config) = each %{ $config->databases }) {
# --only takes "database" or "database/input_filter". A ternary is used
# instead of "my ... if ...", because perlsyn documents the behaviour of a
# my declaration under a statement modifier as undefined.
208 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : ();
# Skip databases whose name does not match the filter (case-insensitive regex).
209 next if ($only_database && $database !~ m/$only_database/i);
# NOTE(review): $f is set on a line not visible here (presumably the result
# of fork); a defined zero value selects the child process.
213 if(defined ($f) and $f==0) {
214 $log->info("Created processes $$ for speedup");
# Create the indexer object, but only when an engine is configured and the
# parser reports 'search' rules for this database.
221 if ($use_indexer && $parser->have_rules( 'search', $database )) {
# The config section name is the engine name with everything from the first
# "-" removed (e.g. "hyperestraier-native" -> "hyperestraier").
223 my $cfg_name = $use_indexer;
224 $cfg_name =~ s/\-.*$//;
226 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
227 $indexer_config->{database} = $database;
228 $indexer_config->{clean} = $clean;
229 $indexer_config->{label} = $db_config->{name};
231 # force clean if database has links
232 $indexer_config->{clean} = 1 if ($db_config->{links});
# Note: "use" is processed at compile time, so all three output modules are
# loaded no matter which branch runs.
234 if ($use_indexer eq 'hyperestraier') {
236 # open Hyper Estraier database
237 use WebPAC::Output::Estraier '0.10';
238 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
240 } elsif ($use_indexer eq 'hyperestraier-native') {
242 # open Hyper Estraier database
243 use WebPAC::Output::EstraierNative;
244 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
246 } elsif ($use_indexer eq 'kinosearch') {
249 use WebPAC::Output::KinoSearch;
# Force a clean build when the index path does not exist yet.
250 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
251 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
254 $log->logdie("unknown use_indexer: $use_indexer");
# Was misspelled "logide", which would abort with "Can't locate object
# method" instead of the intended message.
257 $log->logdie("can't continue without valid indexer") unless ($indexer);
262 # store Hyper Estraier links to other databases
# Links are processed only when the database defines them as an array and an
# indexing engine is active.
264 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
265 foreach my $link (@{ $db_config->{links} }) {
266 if ($use_indexer eq 'hyperestraier') {
# Append an "estcmd merge" command for this link to the merge batch file.
268 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
# NOTE(review): the branch structure separating the "saving" and "adding"
# messages is not visible here.
270 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
272 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
276 credit => $link->{credit},
281 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
# With --only-links nothing else is done for this database.
285 next if ($only_links);
# Directory of this script: absolute path of $0 with the file name stripped
# and the trailing slash kept.
291 my $abs_path = abs_path($0);
292 $abs_path =~ s#/[^/]*$#/#;
294 my $db_path = $config->webpac('db_path');
# NOTE(review): the condition choosing between "creating new database"
# (which removes $db_path) and "working on database" is not visible here;
# presumably it is the --clean option.
297 $log->info("creating new database '$database' in $db_path");
298 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
300 $log->info("working on database '$database' in $db_path");
# Record store for this database (constructor arguments are not visible here).
303 my $store = new WebPAC::Store(
310 # now, iterate through input formats
# The 'input' entry may be an array of definitions or a single definition;
# both forms are normalised into @inputs.
314 if (ref($db_config->{input}) eq 'ARRAY') {
315 @inputs = @{ $db_config->{input} };
316 } elsif ($db_config->{input}) {
317 push @inputs, $db_config->{input};
# No 'input' entry at all: only logged, not treated as fatal.
319 $log->info("database $database doesn't have inputs defined");
# Process every input definition of the current database.
322 foreach my $input (@inputs) {
324 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
# --only=database/filter: skip inputs whose name and type both fail to match
# the filter (case-insensitive regex).
326 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
328 my $type = lc($input->{type});
# Refuse input types that are not listed under 'inputs' in the configuration.
330 die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));
# Module name implementing this input type.
332 my $input_module = $config->webpac('inputs')->{$type};
334 my @lookups = $parser->have_lookup_create($database, $input);
336 $log->info("working on input '$input_name' in $input->{path} [type: $input->{type}] using $input_module",
337 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
341 # disable modification of records if --stats is in use
342 delete($input->{modify_records});
343 delete($input->{modify_file});
# --limit on the command line takes precedence over the per-input limit.
346 my $input_db = new WebPAC::Input(
347 module => $input_module,
348 encoding => $config->webpac('webpac_encoding'),
349 limit => $limit || $input->{limit},
351 recode => $input->{recode},
353 modify_records => $input->{modify_records},
354 modify_file => $input->{modify_file},
# Check the object that was just constructed. The original tested $input,
# the definition hash, which is always true here, so a failed constructor
# went unnoticed.
356 $log->logdie("can't create input using $input_module") unless ($input_db);
# Old-style 'lookup' entries in the input definition are dropped with a warning.
358 if (defined( $input->{lookup} )) {
359 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
360 delete( $input->{lookup} );
# Fetch the lookup-creation rules for this input; missing rules are fatal.
# NOTE(review): the error message interpolates $input, which is a hash
# reference, so it prints as HASH(0x...); $input_name was probably intended.
367 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input");
# Callback that runs one record through WebPAC::Normalize::data_structure.
# It dies when called without a record or when the record has no MFN in
# field 000.
369 $lookup_coderef = sub {
370 my $rec = shift || die "need rec!";
371 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
373 WebPAC::Normalize::data_structure(
376 config => create_ds_config( $db_config, $database, $input, $mfn ),
379 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
# Clear the lookup data held inside WebPAC::Normalize.
382 WebPAC::Normalize::_set_lookup( undef );
384 $log->debug("created lookup_coderef using:\n$rules");
# Open the input; the return value is kept as $maxmfn and later used as the
# upper bound printed in the report header.
390 my $maxmfn = $input_db->open(
391 path => $input->{path},
392 code_page => $input->{encoding}, # database encoding
393 lookup_coderef => $lookup_coderef,
394 lookup => $lookup_jar,
# NOTE(review): the two "return $store->..." statements below sit inside
# callbacks whose opening lines are not visible here.
398 return $store->load_row(
399 database => $database,
400 input => $input_name,
406 return $store->save_row(
407 database => $database,
408 input => $input_name,
# Lookup data accumulated by WebPAC::Normalize while the input was read.
416 my $lookup_data = WebPAC::Normalize::_get_lookup();
418 if (defined( $lookup_data->{$database}->{$input_name} )) {
419 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
# One pass per lookup key of this database/input.
# NOTE(review): the call receiving the arguments below is not visible here.
421 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
423 database => $database,
424 input => $input_name,
426 data => $lookup_data->{$database}->{$input_name}->{$key},
# In statistics or validation mode, open a per-input report file and write
# its header with the record range being processed.
432 if ($stats || $validate) {
433 my $path = "out/report/${database}-${input_name}.txt";
434 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
436 print $report_fh "Report for database '$database' input '$input_name' records ",
437 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
438 $log->info("Generating report file $path");
# Create MARC output only when generation is enabled and the parser reports
# 'marc' rules for this database/input.
442 if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
443 $marc = new WebPAC::Output::MARC(
444 path => "out/marc/${database}-${input_name}.marc",
# Normalisation rules for this input; missing rules are fatal.
450 my $rules = $parser->normalize_rules($database,$input_name) || $log->logdie("no normalize rules found for $database/$input_name");
451 $log->debug("parsed normalize rules:\n$rules");
453 # reset position in database
456 # generate name of config key for indexer (strip everything after -)
457 my $indexer_config = $use_indexer;
458 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
# Lookups this input depends on. The loops below treat the result as a
# database -> input -> key hash; anything that is not a hash is fatal.
461 my $depends = $parser->depends($database,$input_name);
464 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
465 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
# Load every dependent lookup from the store into $lookup_hash.
467 foreach my $db (keys %$depends) {
468 foreach my $i (keys %{$depends->{$db}}) {
469 foreach my $k (keys %{$depends->{$db}->{$i}}) {
471 $log->debug("loading lookup $db/$i");
472 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
# NOTE(review): $t is set on a line not visible here (presumably the time
# before the load started).
477 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
482 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
# Walk over all record positions. In list context "..." is the same range
# operator as "..", so this runs from 0 to size inclusive.
486 foreach my $pos ( 0 ... $input_db->size ) {
# Positions for which fetch returns nothing are skipped.
488 my $row = $input_db->fetch || next;
# The MFN is the first value of field 000.
492 my $mfn = $row->{'000'}->[0];
# A missing or non-numeric MFN is reported and the position is appended to
# field 000.
# NOTE(review): a line is missing between the warn and the push; presumably
# it assigns $pos to $mfn -- confirm against the full file.
494 if (! $mfn || $mfn !~ m#^\d+$#) {
495 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
497 push @{ $row->{'000'} }, $pos;
# Validate the record; validation errors are logged.
# NOTE(review): the guard around this block (presumably "if ($validate)") is
# not visible here.
502 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
503 $log->error( "MFN $mfn validation error:\n",
504 $validate->report_error( $errors )
507 next; # validation doesn't create any output
# Normalise the record into the data structure $ds.
510 my $ds = WebPAC::Normalize::data_structure(
513 lookup => $lookup_hash,
514 config => create_ds_config( $db_config, $database, $input, $mfn ),
515 marc_encoding => 'utf-8',
516 load_row_coderef => sub {
517 my ($database,$input,$mfn) = @_;
518 return $store->load_row(
519 database => $database,
526 $log->debug("ds = ", sub { dump($ds) }) if ($ds);
# NOTE(review): the calls that own the argument lists below are not visible
# here. The first runs only when $ds exists and statistics mode is off, the
# second only when an indexer and $ds both exist.
529 database => $database,
530 input => $input_name,
533 ) if ($ds && !$stats);
536 id => "${input_name}/${mfn}",
538 type => $config->get($indexer_config)->{type},
539 ) if ($indexer && $ds);
# Drain the MARC field sets produced for this record; from the second set on
# the id gets a "/$i" suffix.
544 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
546 id => $mfn . ( $i ? "/$i" : '' ),
548 leader => WebPAC::Normalize::_get_marc_leader(),
554 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
# Validation summary: logged, and written to the report file when one is open.
# NOTE(review): the guards around this section are not visible here.
559 my $errors = $validate->report;
561 $log->info("validation errors:\n$errors\n" );
562 print $report_fh "$errors\n" if ($report_fh);
565 print $report_fh "\nAll possible subfields/delimiter templates:\n", $validate->delimiters_templates( report => 1 ), "\n\n";
# Field usage statistics from the input: logged, and written to the report
# file when one is open.
569 my $s = $input_db->stats;
570 $log->info("statistics of fields usage:\n$s");
571 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
# Finalise the MARC output and close the report, if either was opened.
575 $marc->finish if ($marc);
578 close($report_fh) if ($report_fh)
# Let the indexer finalise if it implements finish(); any exception it throws
# is swallowed by the eval.
582 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
# Elapsed time since $start_t and the resulting throughput.
# NOTE(review): a zero $dt would make the division fail.
584 my $dt = time() - $start_t;
585 $log->info("$total_rows records ", $indexer ? "indexed " : "",
586 sprintf("in %.2f sec [%.2f rec/sec]",
587 $dt, ($total_rows / $dt)
594 $log->info("parallel process $$ finished");
601 # wait all children to finish
# wait returns -1 once there are no child processes left.
602 sleep(1) while wait != -1;
603 $log->info("all parallel processes finished");
607 # handle links or merge after indexing
# Batch epilogue: start the hyperestraier service again.
611 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
# chmod() is parenthesised so that || tests its return value (the number of
# files changed). Without the parentheses || binds to the always-true path
# and a failed chmod is never reported.
613 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!");
# Every queued link must be a code reference; anything else is fatal.
616 foreach my $link (@links) {
# Uses dump() from Data::Dump, which this file imports and uses everywhere
# else, instead of Dumper().
617 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');