6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.14;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.11;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
26 use Proc::Queue size => 1;
27 use POSIX ":sys_wait_h"; # imports WNOHANG
31 run.pl - start WebPAC indexing
33 B<this command will probably go away. Don't get used to it!>
41 start loading (all) databases at offset 42
45 limit loading to 100 records
49 remove database and Hyper Estraier index before indexing
51 =item --only=database_name/input_filter
53 reindex just a single database (legacy name is C<--one>)
55 C</input_filter> is optional part which can be C<name>
58 =item --config conf/config.yml
60 path to YAML configuration file
64 disable indexing and C<modify_*> directives in configuration, and dump statistics about field
65 and subfield usage for each input
67 =item --validate path/to/validation_file
69 turn on extra validation of input records, see L<WebPAC::Validate>
71 =item --validate-delimiters path/to/validate_delimiters_file
73 this option is used with C<--validate> to turn on extra validation of
74 delimiters. If the file does not exist, it will be created on the first run.
78 Generate MARC file. This will automatically be on if file contains C<marc*> directives.
79 You can use this option as C<--no-marc-generate> to disable MARC generation.
83 By default turned on if normalisation file has C<marc*> directives. You can disable lint
84 messages with C<--no-marc-lint>.
88 Force dump of input and MARC record for debugging.
92 Run databases in parallel (use approximately the same number as there are processors
93 in the machine if you want to use full load)
101 Create merged index of databases which have links
# Command-line option state. Only some of the declarations are visible in
# this excerpt; the other option variables are declared elsewhere.
116 my $validate_delimiters_path;
# MARC generation defaults to on; the "marc-generate!" spec below allows
# --no-marc-generate to switch it off.
117 my $marc_generate = 1;
# Logger obtained from WebPAC::Common; used for all diagnostics below.
125 my $log = _new WebPAC::Common()->_get_logger();
# Option specifications in Getopt::Long syntax: "=i" takes an integer,
# "=s" takes a string, "!" is a negatable flag.
# NOTE(review): the call receiving this list is outside this excerpt --
# presumably GetOptions(); confirm.
128 "limit=i" => \$limit,
129 "offset=i" => \$offset,
# --one is the legacy spelling of --only; both write the same variable.
131 "one=s" => \$only_filter,
132 "only=s" => \$only_filter,
# --config takes a path, so the spec needs "=s". A bare "config" is parsed
# as a boolean flag and the path would never be stored in $config_path.
133 "config=s" => \$config_path,
136 "validate=s" => \$validate_path,
137 "validate-delimiters=s" => \$validate_delimiters_path,
138 "marc-generate!" => \$marc_generate,
139 "marc-lint!" => \$marc_lint,
140 "marc-dump!" => \$marc_dump,
141 "parallel=i" => \$parallel,
142 "only-links!" => \$only_links,
# Print the complete POD manual and exit when help was requested.
147 pod2usage(-verbose => 2) if ($help);
# Load the YAML configuration from the path given with --config.
149 my $config = new WebPAC::Config( path => $config_path );
151 #print "config = ",dump($config) if ($debug);
# Nothing to do without at least one configured database.
153 die "no databases in config file!\n" unless ($config->databases);
# Visual separator between runs in the log output.
155 $log->info( "-" x 79 );
157 my $log_file = 'log';
# Rotate an existing log file to "log.old" (the size threshold is disabled).
159 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
160 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
# Low-precedence "or" is required here: with "||" the right-hand side binds
# to the string argument, so a failed rename was never reported.
161 rename $log_file, "${log_file}.old" or $log->logwarn("can't rename $log_file to ${log_file}.old: $!");
# Shell script that collects the Hyper Estraier merge commands.
165 my $estcmd_path = './estcmd-merge.sh';
# The parenthesised open() makes "||" apply to the call's result, so it is safe.
167 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
# Script preamble: change into the node directory and stop the service.
168 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
169 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
170 $log->info("created merge batch file $estcmd_path");
# Build the validator only when a validation file or a delimiters file was
# given on the command line.
174 $validate = new WebPAC::Validate(
175 path => $validate_path,
176 delimiters => $config->webpac('delimiters'),
177 delimiters_path => $validate_delimiters_path,
178 ) if ($validate_path || $validate_delimiters_path);
# Name of the indexing engine, taken from the configuration.
180 my $use_indexer = $config->use_indexer;
# Validation implies statistics mode: $stats becomes true whenever a
# validator object exists.
181 $stats ||= $validate;
# NOTE(review): the conditions guarding the next three statements are outside
# this excerpt. Judging by the messages, indexing is switched off for stats
# collection and otherwise the selected engine is announced -- confirm.
183 $log->debug("disabled indexing for stats collection");
184 $use_indexer = undef;
186 $log->info("using $use_indexer indexing engine...");
189 # parse normalize files and create source files for lookup and normalization
191 my $parser = new WebPAC::Parser( config => $config );
# Start timestamp; subtracted from time() later for the records/sec report.
194 my $start_t = time();
# Pass the requested process count to Proc::Queue.
# NOTE(review): presumably guarded by a check on $parallel that is outside
# this excerpt -- confirm.
199 $log->info("Using $parallel processes for speedup");
200 Proc::Queue::size($parallel);
# create_ds_config( $db_config, $database, $input, $mfn )
#
# Deep-copies the per-database configuration and annotates the copy with the
# database name (key "_"), the record MFN (key "_mfn") and the input
# definition (key "input"). All three are mandatory: a false value triggers
# $log->logconfess. The original $db_config is not modified, because all
# writes go to the dclone() copy.
# NOTE(review): the end of this sub is outside this excerpt. Callers pass the
# result as "config", so it presumably returns $c -- confirm.
203 sub create_ds_config {
204 my ($db_config, $database, $input, $mfn) = @_;
205 my $c = dclone( $db_config );
206 $c->{_} = $database || $log->logconfess("need database");
207 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
208 $c->{input} = $input || $log->logconfess("need input");
# Main loop: one pass per database defined in the configuration.
212 while (my ($database, $db_config) = each %{ $config->databases }) {
# --only may be "database" or "database/input_filter"; split it into its two
# parts. The ternary keeps the declaration unconditional, because the
# behaviour of "my ... if COND" is undefined in Perl.
214 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : ();
# Skip databases whose name does not match the requested one
# (case-insensitive regex match).
215 next if ($only_database && $database !~ m/$only_database/i);
# A defined $f equal to 0 marks a child process.
# NOTE(review): the fork that sets $f is outside this excerpt -- confirm.
219 if(defined ($f) and $f==0) {
220 $log->info("Created processes $$ for speedup");
# Set up an indexer only when indexing is enabled and the parser reports
# 'search' rules for this database.
227 if ($use_indexer && $parser->have_rules( 'search', $database )) {
# The configuration section is named after the indexer with everything from
# the first dash onwards removed.
229 my $cfg_name = $use_indexer;
230 $cfg_name =~ s/\-.*$//;
232 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
233 $indexer_config->{database} = $database;
234 $indexer_config->{clean} = $clean;
235 $indexer_config->{label} = $db_config->{name};
237 # force clean if database has links
238 $indexer_config->{clean} = 1 if ($db_config->{links});
# Pick the output module that matches the configured engine. "use" is
# resolved at compile time, so all three modules are loaded whichever branch
# runs.
240 if ($use_indexer eq 'hyperestraier') {
242 # open Hyper Estraier database
243 use WebPAC::Output::Estraier '0.10';
244 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
246 } elsif ($use_indexer eq 'hyperestraier-native') {
248 # open Hyper Estraier database
249 use WebPAC::Output::EstraierNative;
250 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
252 } elsif ($use_indexer eq 'kinosearch') {
255 use WebPAC::Output::KinoSearch;
# A missing index path forces a clean (fresh) index.
256 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
257 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
# NOTE(review): presumably the fallback branch for an unrecognised engine
# name; the enclosing "else" is outside this excerpt.
260 $log->logdie("unknown use_indexer: $use_indexer");
# Fixed method name: "logide" does not exist, so this guard would itself have
# failed with an unknown-method error instead of the intended message.
263 $log->logdie("can't continue without valid indexer") unless ($indexer);
268 # store Hyper Estraier links to other databases
# Links are processed only when they are configured as an array reference
# and an indexer is in use.
270 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
271 foreach my $link (@{ $db_config->{links} }) {
272 if ($use_indexer eq 'hyperestraier') {
# Append a merge command for this database/target pair to the batch script.
# NOTE(review): the conditions selecting between the "saving link" and
# "adding link" paths are outside this excerpt.
274 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
276 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
278 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
282 credit => $link->{credit},
# NOTE(review): presumably the branch for engines other than 'hyperestraier'.
287 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
# With --only-links nothing beyond link handling is done for this database.
291 next if ($only_links);
# Directory of the running script: absolute path with the file name stripped
# and the trailing slash kept.
297 my $abs_path = abs_path($0);
298 $abs_path =~ s#/[^/]*$#/#;
300 my $db_path = $config->webpac('db_path');
# NOTE(review): the removal is presumably guarded by the --clean flag and the
# "working on" message is its alternative; the if/else lines are outside this
# excerpt.
303 $log->info("creating new database '$database' in $db_path");
304 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
306 $log->info("working on database '$database' in $db_path");
# Record store for this database; the constructor arguments are outside this
# excerpt.
309 my $store = new WebPAC::Store(
316 # now, iterate through input formats
# Normalise the "input" setting to a list: it may be an array reference or a
# single definition.
320 if (ref($db_config->{input}) eq 'ARRAY') {
321 @inputs = @{ $db_config->{input} };
322 } elsif ($db_config->{input}) {
323 push @inputs, $db_config->{input};
325 $log->info("database $database doesn't have inputs defined");
328 foreach my $input (@inputs) {
# Every input must be named; the name is used in paths and log messages.
330 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
# Honour the optional input part of --only: skip the input unless its name or
# its type matches (case-insensitive).
332 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
334 my $type = lc($input->{type});
# NOTE(review): webpac('inputs') is dereferenced as a hash just below, yet it
# is used in list context here; whether this grep really checks the known
# types depends on what webpac() returns in list context -- confirm.
336 die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));
# Map the input type to the module that implements it.
338 my $input_module = $config->webpac('inputs')->{$type};
340 my @lookups = $parser->have_lookup_create($database, $input);
342 $log->info("working on input '$input_name' in $input->{path} [type: $input->{type}] using $input_module",
343 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
347 # disable modification of records if --stats is in use
348 delete($input->{modify_records});
349 delete($input->{modify_file});
# Create the input reader; a global --limit takes precedence over the
# per-input limit.
352 my $input_db = new WebPAC::Input(
353 module => $input_module,
354 encoding => $config->webpac('webpac_encoding'),
355 limit => $limit || $input->{limit},
357 recode => $input->{recode},
359 modify_records => $input->{modify_records},
360 modify_file => $input->{modify_file},
# Check the object that was just constructed. The previous test of $input
# (the configuration hash, which is always true) could never fire.
362 $log->logdie("can't create input using $input_module") unless ($input_db);
# An inline "lookup" key in the input definition is obsolete; warn and drop it.
364 if (defined( $input->{lookup} )) {
365 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
366 delete( $input->{lookup} );
# Fetch the lookup-creation rules for this input. The error message names the
# input by $input_name; interpolating the $input hash reference would only
# print "HASH(0x...)".
373 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input_name");
# Callback handed to the input reader: it requires a record and its MFN
# (first value of field 000) and passes them to the normalizer.
375 $lookup_coderef = sub {
376 my $rec = shift || die "need rec!";
377 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
379 WebPAC::Normalize::data_structure(
382 config => create_ds_config( $db_config, $database, $input, $mfn ),
385 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
# Reset the normalizer's lookup state.
388 WebPAC::Normalize::_set_lookup( undef );
390 $log->debug("created lookup_coderef using:\n$rules");
# Open the input. The value returned by open() is kept as $maxmfn and used
# later as the upper record bound in the report header.
396 my $maxmfn = $input_db->open(
397 path => $input->{path},
398 code_page => $input->{encoding}, # database encoding
399 lookup_coderef => $lookup_coderef,
400 lookup => $lookup_jar,
# NOTE(review): the two returns below appear to be bodies of row load/save
# callbacks passed to open(); the surrounding sub declarations are outside
# this excerpt.
404 return $store->load_row(
405 database => $database,
406 input => $input_name,
412 return $store->save_row(
413 database => $database,
414 input => $input_name,
# Collect the lookups gathered while reading and walk them key by key.
# NOTE(review): the call these arguments belong to is outside this excerpt --
# presumably a store save of each lookup; confirm.
422 my $lookup_data = WebPAC::Normalize::_get_lookup();
424 if (defined( $lookup_data->{$database}->{$input_name} )) {
425 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
427 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
429 database => $database,
430 input => $input_name,
432 data => $lookup_data->{$database}->{$input_name}->{$key},
# In statistics or validation mode, open a per-input report file and write
# its header.
438 if ($stats || $validate) {
439 my $path = "out/report/${database}-${input_name}.txt";
440 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
# Header range: the first record is the offset (or 1); the last is the first
# true value among the global limit, the input limit and $maxmfn.
442 print $report_fh "Report for database '$database' input '$input_name' records ",
443 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
444 $log->info("Generating report file $path");
# Create MARC output only when it is enabled and the parser reports 'marc'
# rules for this input.
448 if ($marc_generate && $parser->have_rules( 'marc', $database, $input_name )) {
449 $marc = new WebPAC::Output::MARC(
450 path => "out/marc/${database}-${input_name}.marc",
# Normalization rules are mandatory for every input.
456 my $rules = $parser->normalize_rules($database,$input_name) || $log->logdie("no normalize rules found for $database/$input_name");
457 $log->debug("parsed normalize rules:\n$rules");
459 # reset position in database
462 # generate name of config key for indexer (strip everything after -)
463 my $indexer_config = $use_indexer;
464 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
# Load every lookup this input depends on. The dependency data is expected to
# be a three-level hash (database, input, key).
467 my $depends = $parser->depends($database,$input_name);
470 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
471 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
473 foreach my $db (keys %$depends) {
474 foreach my $i (keys %{$depends->{$db}}) {
475 foreach my $k (keys %{$depends->{$db}->{$i}}) {
477 $log->debug("loading lookup $db/$i");
478 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
# $t is presumably a start timestamp taken before the load; its assignment is
# outside this excerpt.
483 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
# The dump is wrapped in a sub, presumably so it is only evaluated when debug
# logging is active.
488 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
# Walk the records of the input; positions for which fetch returns a false
# value are skipped.
492 foreach my $pos ( 0 ... $input_db->size ) {
494 my $row = $input_db->fetch || next;
# The MFN (record number) is the first value of field 000.
498 my $mfn = $row->{'000'}->[0];
# A missing or non-numeric MFN is replaced by the loop position.
# NOTE(review): only the push onto field 000 is visible here; any assignment
# to $mfn itself is outside this excerpt.
500 if (! $mfn || $mfn !~ m#^\d+$#) {
501 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
503 push @{ $row->{'000'} }, $pos;
# Validation: log any errors found for this record.
# NOTE(review): the guard on $validate and the exact scope of the "next" are
# outside this excerpt.
508 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
509 $log->error( "MFN $mfn validation error:\n",
510 $validate->report_error( $errors )
513 next; # validation doesn't create any output
# Normalize the record into the data structure consumed by the store, the
# indexer and the MARC output.
516 my $ds = WebPAC::Normalize::data_structure(
519 lookup => $lookup_hash,
520 config => create_ds_config( $db_config, $database, $input, $mfn ),
521 marc_encoding => 'utf-8',
# Callback that loads a row from the store; its parameters shadow the outer
# $database, $input and $mfn.
522 load_row_coderef => sub {
523 my ($database,$input,$mfn) = @_;
524 return $store->load_row(
525 database => $database,
532 $log->debug("ds = ", sub { dump($ds) }) if ($ds);
# NOTE(review): the calls these argument lists belong to are outside this
# excerpt. Per the trailing conditions, the first runs only when a data
# structure exists and stats mode is off, the second only when an indexer and
# a data structure exist.
535 database => $database,
536 input => $input_name,
539 ) if ($ds && !$stats);
542 id => "${input_name}/${mfn}",
544 type => $config->get($indexer_config)->{type},
545 ) if ($indexer && $ds);
# Fetch MARC field sets one by one; every set after the first gets an
# "/<n>" suffix on its id.
550 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
552 id => $mfn . ( $i ? "/$i" : '' ),
554 leader => WebPAC::Normalize::_get_marc_leader(),
560 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
# Summarise validation errors in the log and, when a report file is open, in
# the report as well.
# NOTE(review): the guards on $validate and $errors are outside this excerpt.
565 my $errors = $validate->report;
567 $log->info("validation errors:\n$errors\n" );
568 print $report_fh "$errors\n" if ($report_fh);
571 print $report_fh "\nAll possible subfields/delimiter templates:\n", $validate->delimiters_templates( report => 1 ), "\n\n";
# Field-usage statistics gathered by the input reader.
575 my $s = $input_db->stats;
576 $log->info("statistics of fields usage:\n$s");
577 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
# Finish the MARC output if one was created.
581 $marc->finish if ($marc);
584 close($report_fh) if ($report_fh)
# finish() is optional on indexers; the eval keeps an exception from it from
# propagating.
588 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
# Elapsed time and throughput. Note that a zero $dt would make the division
# fail.
590 my $dt = time() - $start_t;
591 $log->info("$total_rows records ", $indexer ? "indexed " : "",
592 sprintf("in %.2f sec [%.2f rec/sec]",
593 $dt, ($total_rows / $dt)
600 $log->info("parallel process $$ finished");
607 # wait all children to finish
# wait returns -1 once there are no child processes left.
608 sleep(1) while wait != -1;
609 $log->info("all parallel processes finished");
612 # save new delimiters if needed
613 $validate->save_delimiters_templates if ( $validate );
616 # handle links or merge after indexing
# Close the merge batch script by restarting the service it stopped at the top.
620 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
# Low-precedence "or" is required: with "||" the right-hand side binds to
# $estcmd_path, so a failed chmod was never reported.
622 chmod 0700, $estcmd_path or $log->warn("can't chmod 0700 $estcmd_path: $!");
# Each collected link must be a code reference.
625 foreach my $link (@links) {
# Use dump() from Data::Dump, as the rest of the script does; the visible
# imports do not bring in Data::Dumper's Dumper().
626 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');