6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.04;
11 use WebPAC::Input 0.11;
12 use WebPAC::Store 0.03;
13 use WebPAC::Normalize 0.11;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.06;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
25 use Proc::Queue size => 1;
26 use POSIX ":sys_wait_h"; # imports WNOHANG
30 run.pl - start WebPAC indexing
32 B<this command will probably go away. Don't get used to it!>
40 start loading (all) databases at offset 42
44 limit loading to 100 records
48 remove database and Hyper Estraier index before indexing
50 =item --only=database_name/input_filter
52 reindex just single database (legacy name is --one)
54 C</input_filter> is optional part which can be C<name>
57 =item --config conf/config.yml
59 path to YAML configuration file
63 disable indexing, modify_* in configuration and dump statistics about field
64 and subfield usage for each input
66 =item --validate path/to/validation_file
68 turn on extra validation of input records, see L<WebPAC::Validate>
70 =item --marc-normalize conf/normalize/mapping.pl
72 This option specifies normalisation file for MARC creation
74 =item --marc-output out/marc/test.marc
76 Optional path to output file
80 By default turned on if C<--marc-normalize> is used. You can disable lint
81 messages with C<--no-marc-lint>.
85 Force dump of input and MARC records for debugging.
89 Run databases in parallel (approximately the same as the number of processors in
90 the machine if you want to use full load)
98 Create merged index of databases which have links
113 my ($marc_normalize, $marc_output);
120 my $log = _new WebPAC::Common()->_get_logger();
123 "limit=i" => \$limit,
124 "offset=i" => \$offset,
126 "one=s" => \$only_filter,
127 "only=s" => \$only_filter,
128 "config=s" => \$config_path, # --config takes a path argument, so it needs the =s string spec
131 "validate=s" => \$validate_path,
132 "marc-normalize=s" => \$marc_normalize,
133 "marc-output=s" => \$marc_output,
134 "marc-lint!" => \$marc_lint,
135 "marc-dump!" => \$marc_dump,
136 "parallel=i" => \$parallel,
137 "only-links!" => \$only_links,
141 my $config = new WebPAC::Config( path => $config_path );
143 #print "config = ",dump($config) if ($debug);
145 die "no databases in config file!\n" unless ($config->databases);
147 $log->info( "-" x 79 );
151 my $estcmd_path = './estcmd-merge.sh';
153 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
154 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
155 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
156 $log->info("created merge batch file $estcmd_path");
161 $validate = new WebPAC::Validate(
162 path => $validate_path,
163 ) if ($validate_path);
166 my $use_indexer = $config->use_indexer;
168 $log->debug("option --stats disables update of indexing engine...");
169 $use_indexer = undef;
171 $log->info("using $use_indexer indexing engine...");
174 # disable indexing when creating marc
175 $use_indexer = undef if ($marc_normalize);
177 # parse normalize files and create source files for lookup and normalization
179 my $parser = new WebPAC::Parser( config => $config );
182 my $start_t = time();
187 $log->info("Using $parallel processes for speedup");
188 Proc::Queue::size($parallel);
191 while (my ($database, $db_config) = each %{ $config->databases }) {
193 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : (); # ternary instead of "my ... if", whose behaviour is undefined
194 next if ($only_database && $database !~ m/$only_database/i);
198 if(defined ($f) and $f==0) {
199 $log->info("Created processes $$ for speedup");
208 my $cfg_name = $use_indexer;
209 $cfg_name =~ s/\-.*$//;
211 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
212 $indexer_config->{database} = $database;
213 $indexer_config->{clean} = $clean;
214 $indexer_config->{label} = $db_config->{name};
216 # force clean if database has links
217 $indexer_config->{clean} = 1 if ($db_config->{links});
219 if ($use_indexer eq 'hyperestraier') {
221 # open Hyper Estraier database
222 use WebPAC::Output::Estraier '0.10';
223 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
225 } elsif ($use_indexer eq 'hyperestraier-native') {
227 # open Hyper Estraier database
228 use WebPAC::Output::EstraierNative;
229 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
231 } elsif ($use_indexer eq 'kinosearch') {
234 use WebPAC::Output::KinoSearch;
235 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
236 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
239 $log->logdie("unknown use_indexer: $use_indexer");
242 $log->logdie("can't continue without valid indexer") unless ($indexer); # was misspelled "logide", which is not a logger method
247 # store Hyper Estraier links to other databases
249 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
250 foreach my $link (@{ $db_config->{links} }) {
251 if ($use_indexer eq 'hyperestraier') {
253 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
255 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
257 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
261 credit => $link->{credit},
266 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
270 next if ($only_links);
276 my $abs_path = abs_path($0);
277 $abs_path =~ s#/[^/]*$#/#;
279 my $db_path = $config->get('webpac')->{db_path} . '/' . $database;
282 $log->info("creating new database '$database' in $db_path");
283 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
285 $log->info("working on database '$database' in $db_path");
288 my $db = new WebPAC::Store(
290 database => $database,
296 # now, iterate through input formats
300 if (ref($db_config->{input}) eq 'ARRAY') {
301 @inputs = @{ $db_config->{input} };
302 } elsif ($db_config->{input}) {
303 push @inputs, $db_config->{input};
305 $log->info("database $database doesn't have inputs defined");
308 foreach my $input (@inputs) {
310 next if ($only_input && ($input->{name} !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
312 my $type = lc($input->{type});
314 die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));
316 my $input_module = $config->webpac('inputs')->{$type};
318 my @lookups = $parser->have_lookup_create($database, $input);
320 $log->info("working on input '$input->{name}' in $input->{path} [type: $input->{type}] using $input_module",
321 @lookups ? "lookup ".join(", ", @lookups) : ""
324 warn "lookups = ", dump( @lookups );
327 # disable modification of records if --stats is in use
328 delete($input->{modify_records});
329 delete($input->{modify_file});
332 warn "parser->depends = ", dump( $parser->{depends} );
333 warn "depends on: ", dump( $parser->depends($database, $input->{name}) );
334 warn "lookup_create_rules = ", dump( $parser->lookup_create_rules($database, $input->{name}) );
335 warn "parser->_lookup_create = ", dump( $parser->{_lookup_create} );
339 my $input_db = new WebPAC::Input(
340 module => $input_module,
341 encoding => $config->webpac('webpac_encoding'),
342 limit => $limit || $input->{limit},
344 lookup_coderef => sub {
345 my $rec = shift || return;
346 $lookup->add( $rec );
348 recode => $input->{recode},
350 modify_records => $input->{modify_records},
351 modify_file => $input->{modify_file},
353 $log->logdie("can't create input using $input_module") unless ($input_db); # test the object just constructed; $input is the config hashref and is always true
355 if (defined( $input->{lookup} )) {
356 $log->warn("$database/", $input->{name}, " has depriciated lookup definition, removing it...");
357 delete( $input->{lookup} );
360 my $maxmfn = $input_db->open(
361 path => $input->{path},
362 code_page => $input->{encoding}, # database encoding
367 if ($stats || $validate) {
368 my $path = "out/report/" . $database . '-' . $input->{name} . '.txt';
369 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
371 print $report_fh "Report for database '$database' input '$input->{name}' records ",
372 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
373 $log->info("Generating report file $path");
376 my @norm_array = ref($input->{normalize}) eq 'ARRAY' ?
377 @{ $input->{normalize} } : ( $input->{normalize} );
379 if ($marc_normalize) {
381 path => $marc_normalize,
382 output => $marc_output || 'out/marc/' . $database . '-' . $input->{name} . '.marc',
386 foreach my $normalize (@norm_array) {
388 my $normalize_path = $normalize->{path} || $log->logdie("can't find normalize path in config");
390 $log->logdie("Found '$normalize_path' as normalization file which isn't supported any more!") unless ( $normalize_path =~ m!\.pl$!i );
392 my $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";
394 $log->info("Using $normalize_path for normalization...");
396 my $marc = new WebPAC::Output::MARC(
397 path => $normalize->{output},
400 ) if ($normalize->{output});
402 # reset position in database
405 # generate name of config key for indexer (strip everything after -)
406 my $indexer_config = $use_indexer;
407 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
409 foreach my $pos ( 0 ... $input_db->size ) {
411 my $row = $input_db->fetch || next;
413 my $mfn = $row->{'000'}->[0];
415 if (! $mfn || $mfn !~ m#^\d+$#) {
416 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
418 push @{ $row->{'000'} }, $pos;
423 if ( my $errors = $validate->validate_errors( $row, $input_db->dump ) ) {
424 $log->error( "MFN $mfn validation error:\n",
425 $validate->report_error( $errors )
430 my $ds_config = dclone($db_config);
432 # default values -> database key
433 $ds_config->{_} = $database;
436 $ds_config->{_mfn} = $mfn;
438 # attach current input
439 $ds_config->{input} = $input;
441 my $ds = WebPAC::Normalize::data_structure(
444 lookup => $lookup ? $lookup->lookup_hash : undef,
445 config => $ds_config,
446 marc_encoding => 'utf-8',
452 prefix => $input->{name},
453 ) if ($ds && !$stats);
456 id => $input->{name} . "/" . $mfn,
458 type => $config->get($indexer_config)->{type},
459 ) if ($indexer && $ds);
464 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
466 id => $mfn . ( $i ? "/$i" : '' ),
468 leader => WebPAC::Normalize::marc_leader(),
474 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
481 my $errors = $validate->report;
483 $log->info("validation errors:\n$errors\n" );
484 print $report_fh "$errors\n" if ($report_fh);
489 my $s = $input_db->stats;
490 $log->info("statistics of fields usage:\n$s");
491 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
495 $marc->finish if ($marc);
498 close($report_fh) if ($report_fh)
503 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
505 my $dt = time() - $start_t;
506 $log->info("$total_rows records ", $indexer ? "indexed " : "",
507 sprintf("in %.2f sec [%.2f rec/sec]",
508 $dt, ($total_rows / $dt)
515 $log->info("parallel process $$ finished");
522 # wait for all children to finish
523 sleep(1) while wait != -1;
524 $log->info("all parallel processes finished");
528 # handle links or merge after indexing
532 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
534 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!"); # parentheses make || test chmod's result rather than $estcmd_path
537 foreach my $link (@links) {
538 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE'); # dump() is the Data::Dump function this script imports and uses elsewhere