6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Lookup 0.03;
11 use WebPAC::Input 0.11;
12 use WebPAC::Store 0.03;
13 use WebPAC::Normalize 0.11;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.06;
16 use WebPAC::Output::MARC;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
25 use Proc::Queue size => 1;
26 use POSIX ":sys_wait_h"; # imports WNOHANG
30 run.pl - start WebPAC indexing
32 B<this command will probably go away. Don't get used to it!>
40 start loading (all) databases at offset 42
44 limit loading to 100 records
48 remove database and Hyper Estraier index before indexing
50 =item --only=database_name/input_filter
52 reindex just single database (legacy name is --one)
54 C</input_filter> is optional part which can be C<name>
57 =item --config conf/config.yml
59 path to YAML configuration file
63 disable indexing, modify_* in configuration and dump statistics about field
64 and subfield usage for each input
66 =item --validate path/to/validation_file
68 turn on extra validation of input records, see L<WebPAC::Validate>
70 =item --marc-normalize conf/normalize/mapping.pl
72 This option specifies normalisation file for MARC creation
74 =item --marc-output out/marc/test.marc
76 Optional path to output file
80 By default turned on if C<--marc-normalize> is used. You can disable lint
81 messages with C<--no-marc-lint>.
85 Force dump of input and MARC record for debugging.
89 Run databases in parallel (use approximately the same number as processors
90 in the machine if you want to use full load)
98 Create merged index of databases which have links
113 my ($marc_normalize, $marc_output);
120 my $log = _new WebPAC::Common()->_get_logger();
123 "limit=i" => \$limit,
124 "offset=i" => \$offset,
126 "one=s" => \$only_filter,
127 "only=s" => \$only_filter,
128 "config=s" => \$config_path, # "=s": --config takes a path value (a bare "config" spec would only store a boolean flag)
131 "validate=s" => \$validate_path,
132 "marc-normalize=s" => \$marc_normalize,
133 "marc-output=s" => \$marc_output,
134 "marc-lint!" => \$marc_lint,
135 "marc-dump!" => \$marc_dump,
136 "parallel=i" => \$parallel,
137 "only-links!" => \$only_links,
141 my $config = new WebPAC::Config( path => $config_path );
143 #print "config = ",dump($config) if ($debug);
145 die "no databases in config file!\n" unless ($config->databases);
147 $log->info( "-" x 79 );
151 my $estcmd_path = './estcmd-merge.sh';
153 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
154 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
155 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
156 $log->info("created merge batch file $estcmd_path");
161 $validate = new WebPAC::Validate(
162 path => $validate_path,
163 ) if ($validate_path);
166 my $use_indexer = $config->use_indexer;
168 $log->debug("option --stats disables update of indexing engine...");
169 $use_indexer = undef;
171 $log->info("using $use_indexer indexing engine...");
174 # disable indexing when creating marc
175 $use_indexer = undef if ($marc_normalize);
178 my $start_t = time();
183 $log->info("Using $parallel processes for speedup");
184 Proc::Queue::size($parallel);
187 while (my ($database, $db_config) = each %{ $config->databases }) {
189 my ($only_database, $only_input) = $only_filter ? split(m#/#, $only_filter) : (); # ternary instead of "my ... if COND", whose behaviour is undefined (perlsyn)
190 next if ($only_database && $database !~ m/$only_database/i);
194 if(defined ($f) and $f==0) {
195 $log->info("Created processes $$ for speedup");
204 my $cfg_name = $use_indexer;
205 $cfg_name =~ s/\-.*$//;
207 my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
208 $indexer_config->{database} = $database;
209 $indexer_config->{clean} = $clean;
210 $indexer_config->{label} = $db_config->{name};
212 # force clean if database has links
213 $indexer_config->{clean} = 1 if ($db_config->{links});
215 if ($use_indexer eq 'hyperestraier') {
217 # open Hyper Estraier database
218 use WebPAC::Output::Estraier '0.10';
219 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
221 } elsif ($use_indexer eq 'hyperestraier-native') {
223 # open Hyper Estraier database
224 use WebPAC::Output::EstraierNative;
225 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
227 } elsif ($use_indexer eq 'kinosearch') {
230 use WebPAC::Output::KinoSearch;
231 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
232 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
235 $log->logdie("unknown use_indexer: $use_indexer");
238 $log->logdie("can't continue without valid indexer") unless ($indexer); # abort: no indexer object was constructed
243 # store Hyper Estraier links to other databases
245 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
246 foreach my $link (@{ $db_config->{links} }) {
247 if ($use_indexer eq 'hyperestraier') {
249 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
251 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
253 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
257 credit => $link->{credit},
262 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
266 next if ($only_links);
272 my $abs_path = abs_path($0);
273 $abs_path =~ s#/[^/]*$#/#;
275 my $db_path = $config->get('webpac')->{db_path} . '/' . $database;
278 $log->info("creating new database '$database' in $db_path");
279 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
281 $log->info("working on database '$database' in $db_path");
284 my $db = new WebPAC::Store(
286 database => $database,
292 # now, iterate through input formats
296 if (ref($db_config->{input}) eq 'ARRAY') {
297 @inputs = @{ $db_config->{input} };
298 } elsif ($db_config->{input}) {
299 push @inputs, $db_config->{input};
301 $log->info("database $database doesn't have inputs defined");
304 foreach my $input (@inputs) {
306 next if ($only_input && ($input->{name} !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
308 my $type = lc($input->{type});
310 die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));
313 if ($input->{lookup}) {
314 $lookup = new WebPAC::Lookup(
315 lookup_file => $input->{lookup},
317 delete( $input->{lookup} );
320 my $input_module = $config->webpac('inputs')->{$type};
322 $log->info("working on input '$input->{name}' in $input->{path} [type: $input->{type}] using $input_module",
323 $input->{lookup} ? "lookup '$input->{lookup}'" : ""
327 # disable modification of records if --stats is in use
328 delete($input->{modify_records});
329 delete($input->{modify_file});
332 my $input_db = new WebPAC::Input(
333 module => $input_module,
334 encoding => $config->webpac('webpac_encoding'),
335 limit => $limit || $input->{limit},
337 lookup_coderef => sub {
338 my $rec = shift || return;
339 $lookup->add( $rec );
341 recode => $input->{recode},
343 modify_records => $input->{modify_records},
344 modify_file => $input->{modify_file},
346 $log->logdie("can't create input using $input_module") unless ($input_db); # check the WebPAC::Input object just created, not the $input config entry
348 my $maxmfn = $input_db->open(
349 path => $input->{path},
350 code_page => $input->{encoding}, # database encoding
355 if ($stats || $validate) {
356 my $path = "out/report/" . $database . '-' . $input->{name} . '.txt';
357 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
359 print $report_fh "Report for database '$database' input '$input->{name}' records ",
360 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
361 $log->info("Generating report file $path");
364 my @norm_array = ref($input->{normalize}) eq 'ARRAY' ?
365 @{ $input->{normalize} } : ( $input->{normalize} );
367 if ($marc_normalize) {
369 path => $marc_normalize,
370 output => $marc_output || 'out/marc/' . $database . '-' . $input->{name} . '.marc',
374 foreach my $normalize (@norm_array) {
376 my $normalize_path = $normalize->{path} || $log->logdie("can't find normalize path in config");
378 $log->logdie("Found '$normalize_path' as normalization file which isn't supported any more!") unless ( $normalize_path =~ m!\.pl$!i );
380 my $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";
382 $log->info("Using $normalize_path for normalization...");
384 my $marc = new WebPAC::Output::MARC(
385 path => $normalize->{output},
388 ) if ($normalize->{output});
390 # reset position in database
393 # generate name of config key for indexer (strip everything after -)
394 my $indexer_config = $use_indexer;
395 $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);
397 foreach my $pos ( 0 ... $input_db->size ) {
399 my $row = $input_db->fetch || next;
401 my $mfn = $row->{'000'}->[0];
403 if (! $mfn || $mfn !~ m#^\d+$#) {
404 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
406 push @{ $row->{'000'} }, $pos;
411 if ( my $errors = $validate->validate_errors( $row, $input_db->dump ) ) {
412 $log->error( "MFN $mfn validation error:\n",
413 $validate->report_error( $errors )
418 my $ds_config = dclone($db_config);
420 # default values -> database key
421 $ds_config->{_} = $database;
424 $ds_config->{_mfn} = $mfn;
426 # attach current input
427 $ds_config->{input} = $input;
429 my $ds = WebPAC::Normalize::data_structure(
432 lookup => $lookup ? $lookup->lookup_hash : undef,
433 config => $ds_config,
434 marc_encoding => 'utf-8',
440 prefix => $input->{name},
441 ) if ($ds && !$stats);
444 id => $input->{name} . "/" . $mfn,
446 type => $config->get($indexer_config)->{type},
447 ) if ($indexer && $ds);
452 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
454 id => $mfn . ( $i ? "/$i" : '' ),
456 leader => WebPAC::Normalize::marc_leader(),
462 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
469 my $errors = $validate->report;
471 $log->info("validation errors:\n$errors\n" );
472 print $report_fh "$errors\n" if ($report_fh);
477 my $s = $input_db->stats;
478 $log->info("statistics of fields usage:\n$s");
479 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
483 $marc->finish if ($marc);
486 close($report_fh) if ($report_fh)
491 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
493 my $dt = time() - $start_t;
494 $log->info("$total_rows records ", $indexer ? "indexed " : "",
495 sprintf("in %.2f sec [%.2f rec/sec]",
496 $dt, ($total_rows / $dt)
503 $log->info("parallel process $$ finished");
510 # wait for all children to finish
511 sleep(1) while wait != -1;
512 $log->info("all parallel processes finished");
516 # handle links or merge after indexing
520 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
522 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!"); # parens so || tests chmod's result, not $estcmd_path
525 foreach my $link (@links) {
526 $log->logdie("coderef in link ", Dumper($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');