6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Parser 0.04;
11 use WebPAC::Lookup 0.03;
12 use WebPAC::Input 0.11;
13 use WebPAC::Store 0.03;
14 use WebPAC::Normalize 0.11;
15 use WebPAC::Output::TT;
16 use WebPAC::Validate 0.06;
17 use WebPAC::Output::MARC;
21 use Time::HiRes qw/time/;
23 use Data::Dump qw/dump/;
24 use Storable qw/dclone/;
26 use Proc::Queue size => 1;
27 use POSIX ":sys_wait_h"; # imports WNOHANG
31 run.pl - start WebPAC indexing
33 B<this command will probably go away. Don't get used to it!>
41 start loading (all) databases at offset 42
45 limit loading to 100 records
49 remove database and Hyper Estraier index before indexing
51 =item --only=database_name/input_filter
53 reindex just a single database (legacy name is --one)
55 C</input_filter> is an optional part which can be C<name>
58 =item --config conf/config.yml
60 path to YAML configuration file
64 disable indexing, modify_* in configuration and dump statistics about field
65 and subfield usage for each input
67 =item --validate path/to/validation_file
69 turn on extra validation of input records, see L<WebPAC::Validate>
71 =item --marc-normalize conf/normalize/mapping.pl
73 This option specifies the normalisation file for MARC creation
75 =item --marc-output out/marc/test.marc
77 Optional path to output file
81 By default turned on if C<--marc-normalize> is used. You can disable lint
82 messages with C<--no-marc-lint>.
86 Force dump of input and MARC record for debugging.
90 Run databases in parallel (approximately the same as the number of processors
91 in the machine if you want to use full load)
99 Create merged index of databases which have links
114 my ($marc_normalize, $marc_output);
121 my $log = _new WebPAC::Common()->_get_logger();
124 "limit=i" => \$limit,
125 "offset=i" => \$offset,
127 "one=s" => \$only_filter,
128 "only=s" => \$only_filter,
# --config takes a path (it is handed to WebPAC::Config as path =>), so it
# must be a string option; a bare "config" is a boolean flag that would set
# $config_path to 1 and leave the file name unparsed in @ARGV
129 "config=s" => \$config_path,
132 "validate=s" => \$validate_path,
133 "marc-normalize=s" => \$marc_normalize,
134 "marc-output=s" => \$marc_output,
135 "marc-lint!" => \$marc_lint,
136 "marc-dump!" => \$marc_dump,
137 "parallel=i" => \$parallel,
138 "only-links!" => \$only_links,
142 my $config = new WebPAC::Config( path => $config_path );
144 #print "config = ",dump($config) if ($debug);
146 die "no databases in config file!\n" unless ($config->databases);
148 $log->info( "-" x 79 );
152 my $estcmd_path = './estcmd-merge.sh';
154 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
155 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
156 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
157 $log->info("created merge batch file $estcmd_path");
# build the record validator only when --validate supplied a path
162 if ($validate_path) {
163 $validate = WebPAC::Validate->new( path => $validate_path );
164 }
167 my $use_indexer = $config->use_indexer;
169 $log->debug("option --stats disables update of indexing engine...");
170 $use_indexer = undef;
172 $log->info("using $use_indexer indexing engine...");
175 # disable indexing when creating marc
176 $use_indexer = undef if ($marc_normalize);
178 # parse normalize files and create source files for lookup and normalization
180 my $parser = new WebPAC::Parser( config => $config );
183 my $start_t = time();
188 $log->info("Using $parallel processes for speedup");
189 Proc::Queue::size($parallel);
192 while (my ($database, $db_config) = each %{ $config->databases }) {
# split --only=database/input into its two parts; both stay undef when no
# filter was given. Declared unconditionally because "my ... if COND" has
# undefined behaviour in Perl.
194 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : ();
# skip databases whose name doesn't match the (case-insensitive) filter pattern
195 next if ($only_database && $database !~ m/$only_database/i);
199 if(defined ($f) and $f==0) {
200 $log->info("Created processes $$ for speedup");
# configuration section name is the indexer name with everything from the
# first dash removed
209 my $cfg_name = $use_indexer;
210 $cfg_name =~ s/\-.*$//;
# fetch that section from the config; abort if it is missing
212 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
# add per-database settings on top of the configured ones
213 $indexer_config->{database} = $database;
214 $indexer_config->{clean} = $clean;
215 $indexer_config->{label} = $db_config->{name};
217 # force clean if database has links
218 $indexer_config->{clean} = 1 if ($db_config->{links});
220 if ($use_indexer eq 'hyperestraier') {
222 # open Hyper Estraier database
223 use WebPAC::Output::Estraier '0.10';
224 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
226 } elsif ($use_indexer eq 'hyperestraier-native') {
228 # open Hyper Estraier database
229 use WebPAC::Output::EstraierNative;
230 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
232 } elsif ($use_indexer eq 'kinosearch') {
235 use WebPAC::Output::KinoSearch;
236 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
237 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
240 $log->logdie("unknown use_indexer: $use_indexer");
# abort when no indexer object was created above
# (the method is logdie; "logide" was a typo)
243 $log->logdie("can't continue without valid indexer") unless ($indexer);
248 # store Hyper Estraier links to other databases
250 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
251 foreach my $link (@{ $db_config->{links} }) {
252 if ($use_indexer eq 'hyperestraier') {
254 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
256 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
258 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
262 credit => $link->{credit},
267 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
271 next if ($only_links);
# absolute path of this script, then strip the file name so that only the
# directory (with trailing slash) remains
277 my $abs_path = abs_path($0);
278 $abs_path =~ s#/[^/]*$#/#;
280 my $db_path = $config->get('webpac')->{db_path} . '/' . $database;
283 $log->info("creating new database '$database' in $db_path");
284 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
286 $log->info("working on database '$database' in $db_path");
289 my $db = new WebPAC::Store(
291 database => $database,
297 # now, iterate through input formats
# the input key may hold either an array of input definitions or a single
# definition; collect them into @inputs either way
301 if (ref($db_config->{input}) eq 'ARRAY') {
302 @inputs = @{ $db_config->{input} };
303 } elsif ($db_config->{input}) {
304 push @inputs, $db_config->{input};
306 $log->info("database $database doesn't have inputs defined");
309 foreach my $input (@inputs) {
311 next if ($only_input && ($input->{name} !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
313 my $type = lc($input->{type});
315 die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));
317 my $input_module = $config->webpac('inputs')->{$type};
319 $log->info("working on input '$input->{name}' in $input->{path} [type: $input->{type}] using $input_module",
320 $input->{lookup} ? "lookup '$input->{lookup}'" : ""
324 # disable modification of records if --stats is in use
325 delete($input->{modify_records});
326 delete($input->{modify_file});
329 warn "parser->depends = ", dump( $parser->{depends} );
330 warn "depends on: ", dump( $parser->depends($database, $input->{name}) );
331 warn "lookup_create_rules = ", dump( $parser->lookup_create_rules($database, $input->{name}) );
332 warn "parser->_lookup_create = ", dump( $parser->{_lookup_create} );
336 my $input_db = new WebPAC::Input(
337 module => $input_module,
338 encoding => $config->webpac('webpac_encoding'),
339 limit => $limit || $input->{limit},
341 lookup_coderef => sub {
342 my $rec = shift || return;
343 $lookup->add( $rec );
345 recode => $input->{recode},
347 modify_records => $input->{modify_records},
348 modify_file => $input->{modify_file},
# verify that the WebPAC::Input object was created; $input is the loop's
# config entry and is always true, so testing it could never fail
350 $log->logdie("can't create input using $input_module") unless ($input_db);
352 if (defined( $input->{lookup} )) {
353 $log->warn("$database/", $input->{name}, " has depriciated lookup definition, removing it...");
354 delete( $input->{lookup} );
357 my $maxmfn = $input_db->open(
358 path => $input->{path},
359 code_page => $input->{encoding}, # database encoding
364 if ($stats || $validate) {
365 my $path = "out/report/" . $database . '-' . $input->{name} . '.txt';
366 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
368 print $report_fh "Report for database '$database' input '$input->{name}' records ",
369 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
370 $log->info("Generating report file $path");
373 my @norm_array = ref($input->{normalize}) eq 'ARRAY' ?
374 @{ $input->{normalize} } : ( $input->{normalize} );
376 if ($marc_normalize) {
378 path => $marc_normalize,
379 output => $marc_output || 'out/marc/' . $database . '-' . $input->{name} . '.marc',
383 foreach my $normalize (@norm_array) {
385 my $normalize_path = $normalize->{path} || $log->logdie("can't find normalize path in config");
387 $log->logdie("Found '$normalize_path' as normalization file which isn't supported any more!") unless ( $normalize_path =~ m!\.pl$!i );
389 my $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";
391 $log->info("Using $normalize_path for normalization...");
393 my $marc = new WebPAC::Output::MARC(
394 path => $normalize->{output},
397 ) if ($normalize->{output});
399 # reset position in database
402 # generate name of config key for indexer (strip everything after -)
403 my $indexer_config = $use_indexer;
404 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
# walk over every position reported by the input database
406 foreach my $pos ( 0 ... $input_db->size ) {
# skip positions for which fetch returns nothing
408 my $row = $input_db->fetch || next;
# MFN is taken from the first value of field 000
410 my $mfn = $row->{'000'}->[0];
# when MFN is missing or not purely numeric, warn and record the loop
# position in field 000 instead
412 if (! $mfn || $mfn !~ m#^\d+$#) {
413 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
415 push @{ $row->{'000'} }, $pos;
420 if ( my $errors = $validate->validate_errors( $row, $input_db->dump ) ) {
421 $log->error( "MFN $mfn validation error:\n",
422 $validate->report_error( $errors )
427 my $ds_config = dclone($db_config);
429 # default values -> database key
430 $ds_config->{_} = $database;
433 $ds_config->{_mfn} = $mfn;
435 # attach current input
436 $ds_config->{input} = $input;
438 my $ds = WebPAC::Normalize::data_structure(
441 lookup => $lookup ? $lookup->lookup_hash : undef,
442 config => $ds_config,
443 marc_encoding => 'utf-8',
449 prefix => $input->{name},
450 ) if ($ds && !$stats);
453 id => $input->{name} . "/" . $mfn,
455 type => $config->get($indexer_config)->{type},
456 ) if ($indexer && $ds);
461 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
463 id => $mfn . ( $i ? "/$i" : '' ),
465 leader => WebPAC::Normalize::marc_leader(),
471 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
478 my $errors = $validate->report;
480 $log->info("validation errors:\n$errors\n" );
481 print $report_fh "$errors\n" if ($report_fh);
486 my $s = $input_db->stats;
487 $log->info("statistics of fields usage:\n$s");
488 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
492 $marc->finish if ($marc);
495 close($report_fh) if ($report_fh)
500 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
502 my $dt = time() - $start_t;
503 $log->info("$total_rows records ", $indexer ? "indexed " : "",
504 sprintf("in %.2f sec [%.2f rec/sec]",
505 $dt, ($total_rows / $dt)
512 $log->info("parallel process $$ finished");
519 # wait for all children to finish
# wait returns -1 once there are no more child processes to reap
520 sleep(1) while wait != -1;
521 $log->info("all parallel processes finished");
525 # handle links or merge after indexing
529 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
# make the merge batch file executable for its owner only; parentheses are
# required because || binds tighter than the list operator, so without them
# the warning was attached to $estcmd_path and could never fire
531 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!");
534 foreach my $link (@links) {
535 $log->logdie("coderef in link ", Dumper($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');