6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Lookup 0.03;
11 use WebPAC::Input 0.11;
12 use WebPAC::Store 0.03;
13 use WebPAC::Normalize 0.11;
14 use WebPAC::Output::TT;
15 use WebPAC::Validate 0.06;
16 use WebPAC::Output::MARC;
17 use YAML qw/LoadFile/;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
25 use Proc::Queue size => 1;
26 use POSIX ":sys_wait_h"; # imports WNOHANG
30 run.pl - start WebPAC indexing
32 B<this command will probably go away. Don't get used to it!>
40 start loading (all) databases at offset 42
44 limit loading to 100 records
48 remove database and Hyper Estraier index before indexing
50 =item --only=database_name/input_filter
52 reindex just single database (legacy name is --one)
54 C</input_filter> is optional part which can be C<name>
57 =item --config conf/config.yml
59 path to YAML configuration file
63 disable indexing, modify_* in configuration and dump statistics about field
64 and subfield usage for each input
66 =item --validate path/to/validation_file
68 turn on extra validation of input records, see L<WebPAC::Validate>
70 =item --marc-normalize conf/normalize/mapping.pl
72 This option specifies normalisation file for MARC creation
74 =item --marc-output out/marc/test.marc
76 Optional path to output file
80 By default turned on if C<--marc-normalize> is used. You can disable lint
81 messages with C<--no-marc-lint>.
85 Force dump of input and MARC record for debugging.
89 Run databases in parallel (use approximately the number of processors in
90 the machine if you want to use full load)
98 Create merged index of databases which have links
108 my $config = 'conf/config.yml';
113 my ($marc_normalize, $marc_output);
120 my $log = _new WebPAC::Common()->_get_logger();
122 my $hostname = `hostname`;
124 $hostname =~ s/\..+$//;
125 if (-e "conf/$hostname.yml") {
126 $config = "conf/$hostname.yml";
127 $log->info("using host configuration file: $config");
131 "limit=i" => \$limit,
132 "offset=i" => \$offset,
134 "one=s" => \$only_filter,
135 "only=s" => \$only_filter,
136 "config=s" => \$config, # --config takes a path (e.g. conf/config.yml); a bare flag would store 1 and break LoadFile
139 "validate=s" => \$validate_path,
140 "marc-normalize=s" => \$marc_normalize,
141 "marc-output=s" => \$marc_output,
142 "marc-lint!" => \$marc_lint,
143 "marc-dump!" => \$marc_dump,
144 "parallel=i" => \$parallel,
145 "only-links!" => \$only_links,
149 $config = LoadFile($config);
151 #print "config = ",dump($config) if ($debug);
153 die "no databases in config file!\n" unless ($config->{databases});
155 $log->info( "-" x 79 );
159 my $estcmd_path = './estcmd-merge.sh';
161 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
162 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
163 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
164 $log->info("created merge batch file $estcmd_path");
169 $validate = new WebPAC::Validate(
170 path => $validate_path,
171 ) if ($validate_path);
174 my $use_indexer = $config->{use_indexer} || 'hyperestraier';
176 $log->debug("option --stats disables update of indexing engine...");
177 $use_indexer = undef;
179 $log->info("using $use_indexer indexing engine...");
182 # disable indexing when creating marc
183 $use_indexer = undef if ($marc_normalize);
186 my $start_t = time();
191 $log->info("Using $parallel processes for speedup");
192 Proc::Queue::size($parallel);
195 while (my ($database, $db_config) = each %{ $config->{databases} }) {
197 my ($only_database,$only_input); ($only_database,$only_input) = split(m#/#, $only_filter) if ($only_filter); # declare unconditionally: "my ... if COND" has undefined behaviour (perlsyn)
198 next if ($only_database && $database !~ m/$only_database/i);
202 if(defined ($f) and $f==0) {
203 $log->info("Created processes $$ for speedup");
212 my $cfg_name = $use_indexer;
213 $cfg_name =~ s/\-.*$//;
215 my $indexer_config = $config->{$cfg_name} || $log->logdie("can't find '$cfg_name' part in confguration");
216 $indexer_config->{database} = $database;
217 $indexer_config->{clean} = $clean;
218 $indexer_config->{label} = $db_config->{name};
220 # force clean if database has links
221 $indexer_config->{clean} = 1 if ($db_config->{links});
223 if ($use_indexer eq 'hyperestraier') {
225 # open Hyper Estraier database
226 use WebPAC::Output::Estraier '0.10';
227 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
229 } elsif ($use_indexer eq 'hyperestraier-native') {
231 # open Hyper Estraier database
232 use WebPAC::Output::EstraierNative;
233 $indexer = new WebPAC::Output::EstraierNative( %{ $indexer_config } );
235 } elsif ($use_indexer eq 'kinosearch') {
238 use WebPAC::Output::KinoSearch;
239 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
240 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
243 $log->logdie("unknown use_indexer: $use_indexer");
246 $log->logdie("can't continue without valid indexer") unless ($indexer); # was misspelled "logide"; every other fatal log call in this script uses logdie
251 # store Hyper Estraier links to other databases
253 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
254 foreach my $link (@{ $db_config->{links} }) {
255 if ($use_indexer eq 'hyperestraier') {
257 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
259 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
261 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
265 credit => $link->{credit},
270 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
274 next if ($only_links);
280 my $abs_path = abs_path($0);
281 $abs_path =~ s#/[^/]*$#/#;
283 my $db_path = $config->{webpac}->{db_path} . '/' . $database;
286 $log->info("creating new database '$database' in $db_path");
287 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
289 $log->info("working on database '$database' in $db_path");
292 my $db = new WebPAC::Store(
294 database => $database,
300 # now, iterate through input formats
304 if (ref($db_config->{input}) eq 'ARRAY') {
305 @inputs = @{ $db_config->{input} };
306 } elsif ($db_config->{input}) {
307 push @inputs, $db_config->{input};
309 $log->info("database $database doesn't have inputs defined");
312 my @supported_inputs = keys %{ $config->{webpac}->{inputs} };
314 foreach my $input (@inputs) {
316 next if ($only_input && ($input->{name} !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
318 my $type = lc($input->{type});
320 die "I know only how to handle input types ", join(",", @supported_inputs), " not '$type'!\n" unless (grep { $_ eq $type } @supported_inputs); # exact match: $type is later used verbatim as the key into the inputs hash, so a substring/regex match would let unknown types through
323 if ($input->{lookup}) {
324 $lookup = new WebPAC::Lookup(
325 lookup_file => $input->{lookup},
327 delete( $input->{lookup} );
330 my $input_module = $config->{webpac}->{inputs}->{$type};
332 $log->info("working on input '$input->{name}' in $input->{path} [type: $input->{type}] using $input_module",
333 $input->{lookup} ? "lookup '$input->{lookup}'" : ""
337 # disable modification of records if --stats is in use
338 delete($input->{modify_records});
339 delete($input->{modify_file});
342 my $input_db = new WebPAC::Input(
343 module => $input_module,
344 encoding => $config->{webpac}->{webpac_encoding},
345 limit => $limit || $input->{limit},
347 lookup_coderef => sub {
348 my $rec = shift || return;
349 $lookup->add( $rec );
351 recode => $input->{recode},
353 modify_records => $input->{modify_records},
354 modify_file => $input->{modify_file},
356 $log->logdie("can't create input using $input_module") unless ($input_db); # test the object just constructed; $input is the config entry and is always true at this point
358 my $maxmfn = $input_db->open(
359 path => $input->{path},
360 code_page => $input->{encoding}, # database encoding
365 if ($stats || $validate) {
366 my $path = "out/report/" . $database . '-' . $input->{name} . '.txt';
367 open($report_fh, '>', $path) || $log->logdie("can't open $path: $!");
369 print $report_fh "Report for database '$database' input '$input->{name}' records ",
370 $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
371 $log->info("Generating report file $path");
374 my @norm_array = ref($input->{normalize}) eq 'ARRAY' ?
375 @{ $input->{normalize} } : ( $input->{normalize} );
377 if ($marc_normalize) {
379 path => $marc_normalize,
380 output => $marc_output || 'out/marc/' . $database . '-' . $input->{name} . '.marc',
384 foreach my $normalize (@norm_array) {
386 my $normalize_path = $normalize->{path} || $log->logdie("can't find normalize path in config");
388 $log->logdie("Found '$normalize_path' as normalization file which isn't supported any more!") unless ( $normalize_path =~ m!\.pl$!i );
390 my $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";
392 $log->info("Using $normalize_path for normalization...");
394 my $marc = new WebPAC::Output::MARC(
395 path => $normalize->{output},
398 ) if ($normalize->{output});
400 # reset position in database
403 # generate name of config key for indexer (strip everything after -)
404 my $indexer_config = $use_indexer;
405 $indexer_config =~ s/^(\w+)-?.*$/$1/g;
407 foreach my $pos ( 0 ... $input_db->size ) {
409 my $row = $input_db->fetch || next;
411 my $mfn = $row->{'000'}->[0];
413 if (! $mfn || $mfn !~ m#^\d+$#) {
414 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
416 push @{ $row->{'000'} }, $pos;
421 if ( my $errors = $validate->validate_errors( $row, $input_db->dump ) ) {
422 $log->error( "MFN $mfn validation error:\n",
423 $validate->report_error( $errors )
428 my $ds_config = dclone($db_config);
430 # default values -> database key
431 $ds_config->{_} = $database;
434 $ds_config->{_mfn} = $mfn;
436 # attach current input
437 $ds_config->{input} = $input;
439 my $ds = WebPAC::Normalize::data_structure(
442 lookup => $lookup ? $lookup->lookup_hash : undef,
443 config => $ds_config,
444 marc_encoding => 'utf-8',
450 prefix => $input->{name},
451 ) if ($ds && !$stats);
454 id => $input->{name} . "/" . $mfn,
456 type => $config->{$indexer_config}->{type},
457 ) if ($indexer && $ds);
462 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
464 id => $mfn . ( $i ? "/$i" : '' ),
466 leader => WebPAC::Normalize::marc_leader(),
472 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
479 my $errors = $validate->report;
481 $log->info("validation errors:\n$errors\n" );
482 print $report_fh "$errors\n" if ($report_fh);
487 my $s = $input_db->stats;
488 $log->info("statistics of fields usage:\n$s");
489 print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
493 $marc->finish if ($marc);
496 close($report_fh) if ($report_fh)
501 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
503 my $dt = time() - $start_t;
504 $log->info("$total_rows records ", $indexer ? "indexed " : "",
505 sprintf("in %.2f sec [%.2f rec/sec]",
506 $dt, ($total_rows / $dt)
513 $log->info("parallel process $$ finished");
520 # wait for all children to finish
521 sleep(1) while wait != -1;
522 $log->info("all parallel processes finished");
526 # handle links or merge after indexing
530 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
532 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!"); # parenthesised call: in "chmod LIST || ..." the || binds to $estcmd_path, so a failed chmod was never reported
535 foreach my $link (@links) {
536 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE'); # dump() is the function this script imports from Data::Dump; no Dumper import is visible