6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Lookup 0.03;
11 use WebPAC::Input 0.07;
12 use WebPAC::Store 0.03;
13 use WebPAC::Normalize 0.11;
14 use WebPAC::Output::TT;
16 use WebPAC::Output::MARC;
17 use YAML qw/LoadFile/;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
25 use Proc::Queue size => 1;
26 use POSIX ":sys_wait_h"; # imports WNOHANG
30 run.pl - start WebPAC indexing
32 B<this command will probably go away. Don't get used to it!>
40 start loading (all) databases at offset 42
44 limit loading to 100 records
48 remove database and Hyper Estraier index before indexing
50 =item --only=database_name/input_filter
52 reindex just a single database (legacy name is --one)
54 C</input_filter> is optional part which can be C<name>
57 =item --config conf/config.yml
59 path to YAML configuration file
63 disable indexing and dump statistics about field and subfield
66 =item --validate path/to/validation_file
68 turn on extra validation of input records, see L<WebPAC::Validation>
70 =item --marc-normalize conf/normalize/mapping.pl
72 This option specifies normalisation file for MARC creation
74 =item --marc-output out/marc/test.marc
76 Optional path to output file
80 By default turned on if C<--marc-normalize> is used. You can disable lint
81 messages with C<--no-marc-lint>.
85 Force dump of input and MARC record for debugging.
89 Run databases in parallel (use approximately the same number as processors in
90 the machine if you want to use full load)
98 Create merged index of databases which have links
108 my $config = 'conf/config.yml';
113 my ($marc_normalize, $marc_output);
120 my $log = _new WebPAC::Common()->_get_logger();
122 my $hostname = `hostname`; # run the external hostname command and capture its output
124 $hostname =~ s/\..+$//; # drop everything from the first dot onward (short host name)
125 if (-e "conf/$hostname.yml") { # a host-specific file, when present, replaces the default config path
126 $config = "conf/$hostname.yml";
127 $log->info("using host configuration file: $config");
131 "limit=i" => \$limit,
132 "offset=i" => \$offset,
134 "one=s" => \$only_filter,
135 "only=s" => \$only_filter,
136 "config=s" => \$config, # --config takes a path (string); a bare "config" spec would store a boolean 1
139 "validate=s" => \$validate_path,
140 "marc-normalize=s" => \$marc_normalize,
141 "marc-output=s" => \$marc_output,
142 "marc-lint!" => \$marc_lint,
143 "marc-dump!" => \$marc_dump,
144 "parallel=i" => \$parallel,
145 "only-links!" => \$only_links,
149 $config = LoadFile($config);
151 #print "config = ",dump($config) if ($debug);
153 die "no databases in config file!\n" unless ($config->{databases});
155 $log->info( "-" x 79 );
159 my $estcmd_path = './estcmd-merge.sh';
161 open($estcmd_fh, '>', $estcmd_path) || $log->logdie("can't open $estcmd_path: $!");
162 print $estcmd_fh 'cd /data/estraier/_node/ || exit 1',$/;
163 print $estcmd_fh 'sudo /etc/init.d/hyperestraier stop',$/;
164 $log->info("created merge batch file $estcmd_path");
169 if ($validate_path) { # build the validator only when a validation file path was supplied
170 $validate = WebPAC::Validate->new( path => $validate_path );
171 }
174 my $use_indexer = $config->{use_indexer} || 'hyperestraier';
176 $log->debug("option --stats disables update of indexing engine...");
177 $use_indexer = undef;
179 $log->info("using $use_indexer indexing engine...");
182 # disable indexing when creating marc
183 $use_indexer = undef if ($marc_normalize);
186 my $start_t = time();
191 $log->info("Using $parallel processes for speedup");
192 Proc::Queue::size($parallel);
195 while (my ($database, $db_config) = each %{ $config->{databases} }) {
197 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : (); # declared unconditionally: "my ... if" has undefined behaviour and may keep values across loop iterations
198 next if ($only_database && $database !~ m/$only_database/i);
202 if(defined ($f) and $f==0) {
203 $log->info("Created processes $$ for speedup");
211 my $indexer_config = $config->{$use_indexer} || $log->logdie("can't find '$use_indexer' part in confguration");
212 $indexer_config->{database} = $database;
213 $indexer_config->{clean} = $clean;
214 $indexer_config->{label} = $db_config->{name};
216 # force clean if database has links
217 $indexer_config->{clean} = 1 if ($db_config->{links});
219 if ($use_indexer eq 'hyperestraier') {
221 # open Hyper Estraier database
222 use WebPAC::Output::Estraier '0.10';
223 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
225 } elsif ($use_indexer eq 'kinosearch') {
228 use WebPAC::Output::KinoSearch;
229 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
230 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
233 $log->logdie("unknown use_indexer: $use_indexer");
236 $log->logdie("can't continue without valid indexer") unless ($indexer); # abort when no backend produced an indexer object
241 # store Hyper Estraier links to other databases
243 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
244 foreach my $link (@{ $db_config->{links} }) {
245 if ($use_indexer eq 'hyperestraier') {
247 print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
249 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
251 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
255 credit => $link->{credit},
260 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
264 next if ($only_links);
270 my $abs_path = abs_path($0);
271 $abs_path =~ s#/[^/]*$#/#;
273 my $db_path = $config->{webpac}->{db_path} . '/' . $database;
276 $log->info("creating new database '$database' in $db_path");
277 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
279 $log->info("working on database '$database' in $db_path");
282 my $db = new WebPAC::Store(
284 database => $database,
290 # now, iterate through input formats
294 if (ref($db_config->{input}) eq 'ARRAY') {
295 @inputs = @{ $db_config->{input} };
296 } elsif ($db_config->{input}) {
297 push @inputs, $db_config->{input};
299 $log->info("database $database doesn't have inputs defined");
302 my @supported_inputs = keys %{ $config->{webpac}->{inputs} };
304 foreach my $input (@inputs) {
306 next if ($only_input && ($input->{name} !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
308 my $type = lc($input->{type});
310 die "I know only how to handle input types ", join(",", @supported_inputs), " not '$type'!\n" unless (grep { $_ eq $type } @supported_inputs); # exact match: $type is used verbatim as a key into the inputs hash
313 if ($input->{lookup}) {
314 $lookup = new WebPAC::Lookup(
315 lookup_file => $input->{lookup},
317 delete( $input->{lookup} );
320 my $input_module = $config->{webpac}->{inputs}->{$type};
322 $log->info("working on input '$input->{name}' in $input->{path} [type: $input->{type}] using $input_module",
323 $input->{lookup} ? "lookup '$input->{lookup}'" : ""
326 my $input_db = new WebPAC::Input(
327 module => $input_module,
328 encoding => $config->{webpac}->{webpac_encoding},
329 limit => $limit || $input->{limit},
331 lookup_coderef => sub {
332 my $rec = shift || return;
333 $lookup->add( $rec );
335 recode => $input->{recode},
337 modify_records => $input->{modify_records},
339 $log->logdie("can't create input using $input_module") unless ($input_db); # test the object just constructed, not the (always true) input config hash
341 my $maxmfn = $input_db->open(
342 path => $input->{path},
343 code_page => $input->{encoding}, # database encoding
347 my @norm_array = ref($input->{normalize}) eq 'ARRAY' ?
348 @{ $input->{normalize} } : ( $input->{normalize} );
350 if ($marc_normalize) {
352 path => $marc_normalize,
353 output => $marc_output || 'out/marc/' . $database . '-' . $input->{name} . '.marc',
357 foreach my $normalize (@norm_array) {
359 my $normalize_path = $normalize->{path} || $log->logdie("can't find normalize path in config");
361 $log->logdie("Found '$normalize_path' as normalization file which isn't supported any more!") unless ( $normalize_path =~ m!\.pl$!i );
363 my $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";
365 $log->info("Using $normalize_path for normalization...");
367 my $marc = new WebPAC::Output::MARC(
368 path => $normalize->{output},
371 ) if ($normalize->{output});
373 # reset position in database
376 foreach my $pos ( 0 ... $input_db->size ) {
378 my $row = $input_db->fetch || next;
380 my $mfn = $row->{'000'}->[0];
382 if (! $mfn || $mfn !~ m#^\d+$#) {
383 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
385 push @{ $row->{'000'} }, $pos;
390 my @errors = $validate->validate_errors( $row );
391 $log->error( "MFN $mfn validation errors:\n", join("\n", @errors) ) if (@errors);
394 my $ds_config = dclone($db_config);
396 # default values -> database key
397 $ds_config->{_} = $database;
400 $ds_config->{_mfn} = $mfn;
402 # attach current input
403 $ds_config->{input} = $input;
405 my $ds = WebPAC::Normalize::data_structure(
408 lookup => $lookup ? $lookup->lookup_hash : undef,
409 config => $ds_config,
410 marc_encoding => 'utf-8',
416 prefix => $input->{name},
417 ) if ($ds && !$stats);
420 id => $input->{name} . "/" . $mfn,
422 type => $config->{$use_indexer}->{type},
423 ) if ($indexer && $ds);
428 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
430 id => $mfn . ( $i ? "/$i" : '' ),
432 leader => WebPAC::Normalize::marc_leader(),
438 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
444 $log->info("statistics of fields usage:\n", $input_db->stats) if ($stats);
447 $marc->finish if ($marc);
453 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
455 my $dt = time() - $start_t;
456 $log->info("$total_rows records ", $indexer ? "indexed " : "",
457 sprintf("in %.2f sec [%.2f rec/sec]",
458 $dt, ($total_rows / $dt)
465 $log->info("parallel process $$ finished");
472 # wait for all children to finish; wait returns -1 once no child processes remain
473 sleep(1) while wait != -1;
474 $log->info("all parallel processes finished");
478 # handle links or merge after indexing
482 print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
484 chmod(0700, $estcmd_path) || $log->warn("can't chmod 0700 $estcmd_path: $!"); # parentheses make || test chmod's result rather than the (always true) path string
487 foreach my $link (@links) {
488 $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE'); # dump() is the serializer this file imports from Data::Dump