6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.08;
11 use WebPAC::Input 0.16;
12 use WebPAC::Store 0.14;
13 use WebPAC::Normalize 0.22;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.06;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
24 use Pod::Usage qw/pod2usage/;
26 use Proc::Queue size => 1;
27 use POSIX ":sys_wait_h"; # imports WNOHANG
31 run.pl - start WebPAC indexing
33 B<this command will probably go away. Don't get used to it!>
41 start loading (all) databases at offset 42
45 limit loading to 100 records
49 remove database and Hyper Estraier index before indexing
51 =item --only=database_name/input_filter
53 reindex just single database (legacy name is --one)
55 C</input_filter> is optional part which can be C<name>
58 =item --config conf/config.yml
60 path to YAML configuration file
64 disable indexing, modify_* in configuration and dump statistics about field
65 and subfield usage for each input
67 =item --validate path/to/validation_file
69 turn on extra validation of input records, see L<WebPAC::Validate>
73 By default turned on if normalisation file has C<marc*> directives. You can disable lint
74 messages with C<--no-marc-lint>.
78 Force dump of input and MARC record for debugging.
82 Run databases in parallel (approximately the same as the number of processors in
83 the machine if you want to use full load)
91 Create merged index of databases which have links
113 my $log = _new WebPAC::Common()->_get_logger();
116 "limit=i" => \$limit,
117 "offset=i" => \$offset,
119 "one=s" => \$only_filter,
120 "only=s" => \$only_filter,
121 "config=s" => \$config_path, # takes a path argument, as documented for --config
124 "validate=s" => \$validate_path,
125 "marc-lint!" => \$marc_lint,
126 "marc-dump!" => \$marc_dump,
127 "parallel=i" => \$parallel,
128 "only-links!" => \$only_links,
133 pod2usage(-verbose => 2) if ($help);
135 my $config = new WebPAC::Config( path => $config_path );
137 #print "config = ",dump($config) if ($debug);
139 die "no databases in config file!\n" unless ($config->databases);
141 $log->info( "-" x 79 );
143 my $log_file = 'log';
145 if (-e $log_file ) { # && -s $log_file > 5 * 1024 * 1024) {
146 $log->info("moved old log with ", -s $log_file, " bytes to '${log_file}.old'");
147 rename($log_file, "${log_file}.old") || $log->logwarn("can't rename $log_file to ${log_file}.old: $!"); # parenthesised so || tests rename's result, not the target file name
151 my $estcmd_path = './estcmd-merge.sh';
153 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
154 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
155 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
156 $log->info("created merge batch file $estcmd_path");
161 $validate = new WebPAC::Validate(
162 path => $validate_path,
163 ) if ($validate_path);
166 my $use_indexer = $config->use_indexer;
167 $stats ||= $validate;
169 $log->debug("disabled indexing for stats collection");
170 $use_indexer = undef;
172 $log->info("using $use_indexer indexing engine...");
175 # parse normalize files and create source files for lookup and normalization
177 my $parser = new WebPAC::Parser( config => $config );
180 my $start_t = time();
185 $log->info("Using $parallel processes for speedup");
186 Proc::Queue::size($parallel);
189 sub create_ds_config {
190 my ($db_config, $database, $input, $mfn) = @_;
191 my $c = dclone( $db_config );
192 $c->{_} = $database || $log->logconfess("need database");
193 $c->{_mfn} = $mfn || $log->logconfess("need mfn");
194 $c->{input} = $input || $log->logconfess("need input");
198 while (my ($database, $db_config) = each %{ $config->databases }) {
200 my ($only_database,$only_input); ($only_database,$only_input) = split(m#/#, $only_filter) if ($only_filter); # declare first: "my ... if" has undefined behaviour
201 next if ($only_database && $database !~ m/$only_database/i);
205 if(defined ($f) and $f==0) {
206 $log->info("Created processes $$ for speedup");
213 if ($use_indexer && $parser->have_rules( 'search', $database )) {
215 my $cfg_name = $use_indexer;
216 $cfg_name =~ s/\-.*$//;
218 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
219 $indexer_config->{database} = $database;
220 $indexer_config->{clean} = $clean;
221 $indexer_config->{label} = $db_config->{name};
223 # force clean if database has links
224 $indexer_config->{clean} = 1 if ($db_config->{links});
226 if ($use_indexer eq 'hyperestraier') {
228 # open Hyper Estraier database
229 use WebPAC::Output::Estraier '0.10';
230 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
232 } elsif ($use_indexer eq 'hyperestraier-native') {
234 # open Hyper Estraier database
235 use WebPAC::Output::EstraierNative;
236 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
238 } elsif ($use_indexer eq 'kinosearch') {
241 use WebPAC::Output::KinoSearch;
242 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
243 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
246 $log->logdie("unknown use_indexer: $use_indexer");
249 $log->logdie("can't continue without valid indexer") unless ($indexer); # abort: nothing below can work without an indexer object
254 # store Hyper Estraier links to other databases
256 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
257 foreach my $link (@{ $db_config->{links} }) {
258 if ($use_indexer eq 'hyperestraier') {
260 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
262 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
264 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
268 credit => $link->{credit},
273 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
277 next if ($only_links);
283 my $abs_path = abs_path($0);
284 $abs_path =~ s#/[^/]*$#/#;
286 my $db_path = $config->webpac('db_path');
289 $log->info("creating new database '$database' in $db_path");
290 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
292 $log->info("working on database '$database' in $db_path");
295 my $store = new WebPAC::Store(
302 # now, iterate through input formats
306 if (ref($db_config->{input}) eq 'ARRAY') {
307 @inputs = @{ $db_config->{input} };
308 } elsif ($db_config->{input}) {
309 push @inputs, $db_config->{input};
311 $log->info("database $database doesn't have inputs defined");
314 foreach my $input (@inputs) {
316 my $input_name = $input->{name} || $log->logdie("input without a name isn't valid: ",dump($input));
318 next if ($only_input && ($input_name !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
320 my $type = lc($input->{type});
322 die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));
324 my $input_module = $config->webpac('inputs')->{$type};
326 my @lookups = $parser->have_lookup_create($database, $input);
328 $log->info("working on input '$input_name' in $input->{path} [type: $input->{type}] using $input_module",
329 @lookups ? " creating lookups: ".join(", ", @lookups) : ""
333 # disable modification of records if --stats is in use
334 delete($input->{modify_records});
335 delete($input->{modify_file});
338 my $input_db = new WebPAC::Input(
339 module => $input_module,
340 encoding => $config->webpac('webpac_encoding'),
341 limit => $limit || $input->{limit},
343 recode => $input->{recode},
345 modify_records => $input->{modify_records},
346 modify_file => $input->{modify_file},
348 $log->logdie("can't create input using $input_module") unless ($input_db); # check the object just constructed, not the input config hash
350 if (defined( $input->{lookup} )) {
351 $log->warn("$database/$input_name has depriciated lookup definition, removing it...");
352 delete( $input->{lookup} );
359 my $rules = $parser->lookup_create_rules($database, $input) || $log->logdie("no rules found for $database/$input_name"); # report the input's name; $input is a hash ref and would print as HASH(0x...)
361 $lookup_coderef = sub {
362 my $rec = shift || die "need rec!";
363 my $mfn = $rec->{'000'}->[0] || die "need mfn in 000";
365 WebPAC::Normalize::data_structure(
368 config => create_ds_config( $db_config, $database, $input, $mfn ),
371 #warn "current lookup: ", dump(WebPAC::Normalize::_get_lookup());
374 WebPAC::Normalize::_set_lookup( undef );
376 $log->debug("created lookup_coderef using:\n$rules");
382 my $maxmfn = $input_db->open(
383 path => $input->{path},
384 code_page => $input->{encoding}, # database encoding
385 lookup_coderef => $lookup_coderef,
386 lookup => $lookup_jar,
390 return $store->load_row(
391 database => $database,
392 input => $input_name,
398 return $store->save_row(
399 database => $database,
400 input => $input_name,
408 my $lookup_data = WebPAC::Normalize::_get_lookup();
410 if (defined( $lookup_data->{$database}->{$input_name} )) {
411 $log->debug("created following lookups: ", sub { dump( $lookup_data ) } );
413 foreach my $key (keys %{ $lookup_data->{$database}->{$input_name} }) {
415 database => $database,
416 input => $input_name,
418 data => $lookup_data->{$database}->{$input_name}->{$key},
424 if ($stats || $validate) {
425 my $path = "out/report/${database}-${input_name}.txt";
426 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
428 print $report_fh "Report for database '$database' input '$input_name' records ",
429 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
430 $log->info("Generating report file $path");
434 if ($parser->have_rules( 'marc', $database, $input_name )) {
435 $marc = new WebPAC::Output::MARC(
436 path => "out/marc/${database}-${input_name}.marc",
442 my $rules = $parser->normalize_rules($database,$input_name) || $log->logdie("no normalize rules found for $database/$input_name");
443 $log->debug("parsed normalize rules:\n$rules");
445 # reset position in database
448 # generate name of config key for indexer (strip everything after -)
449 my $indexer_config = $use_indexer;
450 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
453 my $depends = $parser->depends($database,$input_name);
456 $log->debug("$database/$input_name depends on: ", dump($depends)) if ($depends);
457 $log->logdie("parser->depends didn't return HASH") unless (ref($depends) eq 'HASH');
459 foreach my $db (keys %$depends) {
460 foreach my $i (keys %{$depends->{$db}}) {
461 foreach my $k (keys %{$depends->{$db}->{$i}}) {
463 $log->debug("loading lookup $db/$i");
464 $lookup_hash->{$db}->{$i}->{$k} = $store->load_lookup(
469 $log->debug(sprintf("lookup $db/$i took %.2fs", time() - $t));
474 $log->debug("lookup_hash = ", sub { dump( $lookup_hash ) });
478 foreach my $pos ( 0 ... $input_db->size ) {
480 my $row = $input_db->fetch || next;
484 my $mfn = $row->{'000'}->[0];
486 if (! $mfn || $mfn !~ m#^\d+$#) {
487 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
489 push @{ $row->{'000'} }, $pos;
494 if ( my $errors = $validate->validate_rec( $row, $input_db->dump_ascii ) ) {
495 $log->error( "MFN $mfn validation error:\n",
496 $validate->report_error( $errors )
499 next; # validation doesn't create any output
502 my $ds = WebPAC::Normalize::data_structure(
505 lookup => $lookup_hash,
506 config => create_ds_config( $db_config, $database, $input, $mfn ),
507 marc_encoding => 'utf-8',
508 load_row_coderef => sub {
509 my ($database,$input,$mfn) = @_;
510 return $store->load_row(
511 database => $database,
518 $log->debug("ds = ", sub { dump($ds) }) if ($ds);
521 database => $database,
522 input => $input_name,
525 ) if ($ds && !$stats);
528 id => "${input_name}/${mfn}",
530 type => $config->get($indexer_config)->{type},
531 ) if ($indexer && $ds);
536 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
538 id => $mfn . ( $i ? "/$i" : '' ),
540 leader => WebPAC::Normalize::marc_leader(),
546 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
551 my $errors = $validate->report;
553 $log->info("validation errors:\n$errors\n" );
554 print $report_fh "$errors\n" if ($report_fh);
559 my $s = $input_db->stats;
560 $log->info("statistics of fields usage:\n$s");
561 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
565 $marc->finish if ($marc);
568 close($report_fh) if ($report_fh)
572 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
574 my $dt = time() - $start_t;
575 $log->info("$total_rows records ", $indexer ? "indexed " : "",
576 sprintf("in %.2f sec [%.2f rec/sec]",
577 $dt, ($total_rows / $dt)
584 $log->info("parallel process $$ finished");
591 # wait all children to finish
592 sleep(1) while wait != -1;
593 $log->info("all parallel processes finished");
597 # handle links or merge after indexing
601 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
603 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!"); # parenthesised so || tests chmod's result, not the path string
606 foreach my $link (@links) {
607 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE'); # use Data::Dump's dump(), the dumper imported and used throughout this file