6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.04;
11 use WebPAC::Input 0.13;
12 use WebPAC::Store 0.11;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.06;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
25 use Proc::Queue size => 1;
26 use POSIX ":sys_wait_h"; # imports WNOHANG
30 run.pl - start WebPAC indexing
32 B<this command will probably go away. Don't get used to it!>
40 start loading (all) databases at offset 42
44 limit loading to 100 records
48 remove database and Hyper Estraier index before indexing
50 =item --only=database_name/input_filter
52 reindex just a single database (legacy name is --one)
54 C</input_filter> is optional part which can be C<name>
57 =item --config conf/config.yml
59 path to YAML configuration file
63 disable indexing, modify_* in configuration and dump statistics about field
64 and subfield usage for each input
66 =item --validate path/to/validation_file
68 turn on extra validation of input records, see L<WebPAC::Validate>
70 =item --marc-normalize conf/normalize/mapping.pl
72 This option specifies normalisation file for MARC creation
74 =item --marc-output out/marc/test.marc
76 Optional path to output file
80 By default turned on if C<--marc-normalize> is used. You can disable lint
81 messages with C<--no-marc-lint>.
85 Force dump of input and MARC record for debugging.
89 Run databases in parallel (approximately the same as the number of processors in
90 the machine if you want to use full load)
98 Create merged index of databases which have links
113 my ($marc_normalize, $marc_output);
120 my $log = _new WebPAC::Common()->_get_logger();
123 "limit=i" => \$limit, # integer: maximum number of records to load
124 "offset=i" => \$offset, # integer: record offset to start loading at
126 "one=s" => \$only_filter, # legacy name of --only, stored in the same variable
127 "only=s" => \$only_filter, # database_name[/input_filter]
128 "config=s" => \$config_path, # path to YAML config; needs "=s" or the path stays in @ARGV and $config_path becomes 1
131 "validate=s" => \$validate_path, # path to validation file
132 "marc-normalize=s" => \$marc_normalize, # normalisation file for MARC creation
133 "marc-output=s" => \$marc_output, # optional MARC output path
134 "marc-lint!" => \$marc_lint, # negatable: --no-marc-lint
135 "marc-dump!" => \$marc_dump, # negatable: --no-marc-dump
136 "parallel=i" => \$parallel, # integer: number of parallel processes
137 "only-links!" => \$only_links, # negatable: --no-only-links
141 my $config = new WebPAC::Config( path => $config_path );
143 #print "config = ",dump($config) if ($debug);
145 die "no databases in config file!\n" unless ($config->databases);
147 $log->info( "-" x 79 );
151 my $estcmd_path = './estcmd-merge.sh';
153 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
154 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
155 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
156 $log->info("created merge batch file $estcmd_path");
161 $validate = new WebPAC::Validate(
162 path => $validate_path,
163 ) if ($validate_path);
166 my $use_indexer = $config->use_indexer;
168 $log->debug("option --stats disables update of indexing engine...");
169 $use_indexer = undef;
171 $log->info("using $use_indexer indexing engine...");
174 # disable indexing when creating marc
175 $use_indexer = undef if ($marc_normalize);
177 # parse normalize files and create source files for lookup and normalization
179 my $parser = new WebPAC::Parser( config => $config );
182 my $start_t = time();
187 $log->info("Using $parallel processes for speedup");
188 Proc::Queue::size($parallel);
191 sub create_ds_config { # build the config hash handed to WebPAC::Normalize::data_structure (passed as its "config" argument at the call sites)
192 my ($db_config, $database, $input, $mfn) = @_;
193 my $c = dclone( $db_config ); # deep copy, so the caller's $db_config is not modified
194 $c->{_} = $database || $log->logconfess("need database"); # any false value is rejected
195 $c->{_mfn} = $mfn || $log->logconfess("need mfn"); # note: an MFN of 0 is false and is rejected too
196 $c->{input} = $input || $log->logconfess("need input");
200 while (my ($database, $db_config) = each %{ $config->databases }) {
202 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : (); # --only=database[/input]; avoids "my ... if", whose behaviour is undefined
203 next if ($only_database && $database !~ m/$only_database/i); # the database part is matched as a case-insensitive regex
207 if(defined ($f) and $f==0) {
208 $log->info("Created processes $$ for speedup");
217 my $cfg_name = $use_indexer;
218 $cfg_name =~ s/\-.*$//;
220 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
221 $indexer_config->{database} = $database;
222 $indexer_config->{clean} = $clean;
223 $indexer_config->{label} = $db_config->{name};
225 # force clean if database has links
226 $indexer_config->{clean} = 1 if ($db_config->{links});
228 if ($use_indexer eq 'hyperestraier') {
230 # open Hyper Estraier database
231 use WebPAC::Output::Estraier '0.10';
232 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
234 } elsif ($use_indexer eq 'hyperestraier-native') {
236 # open Hyper Estraier database
237 use WebPAC::Output::EstraierNative;
238 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
240 } elsif ($use_indexer eq 'kinosearch') {
243 use WebPAC::Output::KinoSearch;
244 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
245 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
248 $log->logdie("unknown use_indexer: $use_indexer");
251 $log->logdie("can't continue without valid indexer") unless ($indexer); # was "logide", a method that does not exist
256 # store Hyper Estraier links to other databases
258 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
259 foreach my $link (@{ $db_config->{links} }) {
260 if ($use_indexer eq 'hyperestraier') {
262 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
264 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
266 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
270 credit => $link->{credit},
275 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
279 next if ($only_links);
285 my $abs_path = abs_path($0);
286 $abs_path =~ s#/[^/]*$#/#;
288 my $db_path = $config->webpac('db_path');
291 $log->info("creating new database '$database' in $db_path");
292 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
294 $log->info("working on database '$database' in $db_path");
297 my $store = new WebPAC::Store(
304 # now, iterate through input formats
308 if (ref($db_config->{input}) eq 'ARRAY') {
309 @inputs = @{ $db_config->{input} };
310 } elsif ($db_config->{input}) {
311 push @inputs, $db_config->{input};
313 $log->info("database $database doesn't have inputs defined");
316 foreach my $input (@inputs) {
318 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
320 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
322 my $type = lc($input->{type});
324 die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));
326 my $input_module = $config->webpac('inputs')->{$type};
328 my @lookups = $parser->have_lookup_create($database, $input);
330 $log->info("working on input '$input_name' in $input->{path} [type: $input->{type}] using $input_module",
331 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
335 # disable modification of records if --stats is in use
336 delete($input->{modify_records});
337 delete($input->{modify_file});
340 my $input_db = new WebPAC::Input(
341 module => $input_module,
342 encoding => $config->webpac('webpac_encoding'),
343 limit => $limit || $input->{limit},
345 recode => $input->{recode},
347 modify_records => $input->{modify_records},
348 modify_file => $input->{modify_file},
350 $log->logdie("can't create input using $input_module") unless ($input_db); # check the object just created; $input (the config entry) is always true here
352 if (defined( $input->{lookup} )) {
353 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
354 delete( $input->{lookup} );
361 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input_name"); # report the input by name; $input is a hash ref and would print as HASH(0x...)
363 $lookup_coderef = sub {
364 my $rec = shift || die "need rec!";
365 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
368 database => $database,
369 input => $input_name,
374 WebPAC::Normalize::data_structure(
377 config => create_ds_config( $db_config, $database, $input, $mfn ),
380 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
383 WebPAC::Normalize::_set_lookup( undef );
385 $log->debug("created lookup_coderef using:\n$rules");
391 my $maxmfn = $input_db->open(
392 path => $input->{path},
393 code_page => $input->{encoding}, # database encoding
394 lookup_coderef => $lookup_coderef,
395 lookup => $lookup_jar,
399 my $lookup_data = WebPAC::Normalize::_get_lookup();
401 if (defined( $lookup_data->{$database}->{$input_name} )) {
402 $log->debug("created following lookups: ", dump( $lookup_data ));
404 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
406 database => $database,
407 input => $input_name,
409 data => $lookup_data->{$database}->{$input_name}->{$key},
415 if ($stats || $validate) {
416 my $path = "out/report/${database}-${input_name}.txt";
417 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
419 print $report_fh "Report for database '$database' input '$input_name' records ",
420 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
421 $log->info("Generating report file $path");
425 if ($marc_normalize) {
426 $marc = new WebPAC::Output::MARC(
427 path => $marc_output || "out/marc/${database}-${input_name}.marc",
433 my $rules = $parser->normalize_rules($database,$input_name) || $log->logdie("no normalize rules found for $database/$input_name");
434 $log->debug("parsed normalize rules:\n$rules");
436 # reset position in database
439 # generate name of config key for indexer (strip everything after -)
440 my $indexer_config = $use_indexer;
441 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
444 my $depends = $parser->depends($database,$input_name);
447 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
448 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
450 foreach my $db (keys %$depends) {
451 foreach my $i (keys %{$depends->{$db}}) {
452 foreach my $k (keys %{$depends->{$db}->{$i}}) {
453 $log->debug("loading lookup $db/$i");
454 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
463 $log->debug("lookup_hash = ", dump( $lookup_hash ));
467 foreach my $pos ( 0 ... $input_db->size ) {
469 my $row = $input_db->fetch || next;
471 my $mfn = $row->{'000'}->[0];
473 if (! $mfn || $mfn !~ m#^\d+$#) {
474 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
476 push @{ $row->{'000'} }, $pos;
481 if ( my $errors = $validate->validate_errors( $row, $input_db->dump ) ) {
482 $log->error( "MFN $mfn validation error:\n",
483 $validate->report_error( $errors )
488 my $ds = WebPAC::Normalize::data_structure(
491 lookup => $lookup_hash,
492 config => create_ds_config( $db_config, $database, $input, $mfn ),
493 marc_encoding => 'utf-8',
494 load_row_coderef => sub {
495 my ($database,$input,$mfn) = @_;
496 return $store->load_row(
497 database => $database,
504 $log->debug("ds = ",dump($ds));
507 database => $database,
508 input => $input_name,
511 ) if ($ds && !$stats);
514 id => "${input_name}/${mfn}",
516 type => $config->get($indexer_config)->{type},
517 ) if ($indexer && $ds);
522 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
524 id => $mfn . ( $i ? "/$i" : '' ),
526 leader => WebPAC::Normalize::marc_leader(),
532 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
539 my $errors = $validate->report;
541 $log->info("validation errors:\n$errors\n" );
542 print $report_fh "$errors\n" if ($report_fh);
547 my $s = $input_db->stats;
548 $log->info("statistics of fields usage:\n$s");
549 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
553 $marc->finish if ($marc);
556 close($report_fh) if ($report_fh)
560 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
562 my $dt = time() - $start_t;
563 $log->info("$total_rows records ", $indexer ? "indexed " : "",
564 sprintf("in %.2f sec [%.2f rec/sec]",
565 $dt, ($total_rows / $dt)
572 $log->info("parallel process $$ finished");
579 # wait for all children to finish
580 sleep(1) while wait != -1;
581 $log->info("all parallel processes finished");
585 # handle links or merge after indexing
589 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
591 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!"); # parenthesised so that || tests chmod's result, not the (always true) path string
594 foreach my $link (@links) { # every queued link must be a code reference
595 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE'); # use Data::Dump's dump(), the dumper this file imports and uses elsewhere