6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Lookup 0.03;
11 use WebPAC::Input 0.07;
12 use WebPAC::Store 0.03;
13 use WebPAC::Normalize 0.11;
14 use WebPAC::Output::TT;
16 use WebPAC::Output::MARC;
17 use YAML qw/LoadFile/;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
25 use Proc::Queue size => 1;
26 use POSIX ":sys_wait_h"; # imports WNOHANG
30 run.pl - start WebPAC indexing
32 B<this command will probably go away. Don't get used to it!>
40 start loading (all) databases at offset 42
44 limit loading to 100 records
48 remove database and Hyper Estraier index before indexing
50 =item --only=database_name/input_filter
52 reindex just a single database (legacy name is --one)
54 C</input_filter> is an optional part which can be C<name>
57 =item --config conf/config.yml
59 path to YAML configuration file
63 disable indexing and dump statistics about field and subfield
66 =item --validate path/to/validation_file
68 turn on extra validation of input records, see L<WebPAC::Validate>
70 =item --marc-normalize conf/normalize/mapping.pl
72 This option specifies normalisation file for MARC creation
74 =item --marc-output out/marc/test.marc
76 Optional path to output file
80 By default turned on if C<--marc-normalize> is used. You can disable lint
81 messages with C<--no-marc-lint>.
85 Force dump of input and MARC record for debugging.
89 Run databases in parallel (approximately same as number of processors in
90 machine if you want to use full load)
# Option defaults followed by Getopt::Long-style option specs (the call that consumes the table is outside this excerpt).
104 my $config = 'conf/config.yml'; # default configuration path, replaced by --config
109 my ($marc_normalize, $marc_output);
116 "limit=i" => \$limit,
117 "offset=i" => \$offset,
119 "one=s" => \$only_filter, # legacy name of --only, both write the same variable
120 "only=s" => \$only_filter,
121 "config=s" => \$config, # needs "=s": the option takes a path; a bare "config" spec would store a boolean flag instead
124 "validate=s" => \$validate_path,
125 "marc-normalize=s" => \$marc_normalize,
126 "marc-output=s" => \$marc_output,
127 "marc-lint!" => \$marc_lint, # "!" makes the flag negatable (--no-marc-lint)
128 "marc-dump!" => \$marc_dump,
129 "parallel=i" => \$parallel,
130 "only-links!" => \$only_links,
# Load the YAML configuration, set up logging and optional validation, and choose the indexing engine.
133 $config = LoadFile($config); # $config held the file path until now; afterwards it is the parsed structure
135 print "config = ",dump($config) if ($debug);
137 die "no databases in config file!\n" unless ($config->{databases});
139 my $log = _new WebPAC::Common()->_get_logger();
140 $log->info( "-" x 79 ); # separator line in the log
143 $validate = new WebPAC::Validate(
144 path => $validate_path,
145 ) if ($validate_path); # validator is built only when --validate supplied a path
147 my $use_indexer = $config->{use_indexer} || 'hyperestraier'; # default engine when the config names none
# NOTE(review): the condition guarding the next two lines is not visible in this excerpt; the message suggests --stats.
149 $log->debug("option --stats disables update of indexing engine...");
150 $use_indexer = undef;
152 $log->info("using $use_indexer indexing engine...");
155 # disable indexing when creating marc
156 $use_indexer = undef if ($marc_normalize);
159 my $start_t = time(); # time() is imported from Time::HiRes, so this is fractional seconds
# NOTE(review): presumably only reached when --parallel was given; the guard is not visible here.
164 $log->info("Using $parallel processes for speedup");
165 Proc::Queue::size($parallel); # replaces the size of 1 given when Proc::Queue was imported
# Main loop over the configured databases: apply the --only filter, set up the indexer and record links.
# NOTE(review): several lines of this loop (process creation, branch delimiters, link calls) are not visible in this excerpt.
168 while (my ($database, $db_config) = each %{ $config->{databases} }) {
# Unconditional my() with a ternary: "my ... if ..." has undefined behaviour in Perl (see perlsyn).
170 my ($only_database,$only_input) = $only_filter ? split(m#/#, $only_filter) : ();
171 next if ($only_database && $database !~ m/$only_database/i); # --only: skip databases whose name does not match
175 if(defined ($f) and $f==0) { # presumably the child side of a fork -- TODO confirm, the call that sets $f is not visible
176 $log->info("Created processes $$ for speedup");
184 my $indexer_config = $config->{$use_indexer} || $log->logdie("can't find '$use_indexer' part in confguration");
185 $indexer_config->{database} = $database;
186 $indexer_config->{clean} = $clean;
187 $indexer_config->{label} = $db_config->{name};
189 if ($use_indexer eq 'hyperestraier') {
191 # open Hyper Estraier database
192 use WebPAC::Output::Estraier '0.10';
193 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
195 } elsif ($use_indexer eq 'kinosearch') {
198 use WebPAC::Output::KinoSearch;
199 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path}); # no index on disk yet: force clean
200 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
203 $log->logdie("unknown use_indexer: $use_indexer");
206 $log->logdie("can't continue without valid indexer") unless ($indexer); # method name was misspelled "logide", which would fail with a missing-method error
211 # store Hyper Estraier links to other databases
213 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
214 foreach my $link (@{ $db_config->{links} }) {
215 if ($use_indexer eq 'hyperestraier') {
216 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
218 $log->info("adding link $database -> $link->{to} [$link->{credit}]");
222 credit => $link->{credit},
226 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
230 next if ($only_links); # --only-links: skip the rest of this database's iteration
# Work out where this database lives on disk and open the store for it.
# NOTE(review): abs_path and rmtree are not imported in the visible part of the file (presumably Cwd and File::Path) -- confirm.
236 my $abs_path = abs_path($0);
# Drop the last path component (the script file name), keeping the directory with its trailing slash.
237 $abs_path =~ s#/[^/]*$#/#;
239 my $db_path = $config->{webpac}->{db_path} . '/' . $database;
# NOTE(review): the branch conditions around the next three lines are not visible; the first two presumably run only with --clean.
242 $log->info("creating new database '$database' in $db_path");
243 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!"); # a false return from rmtree is only logged, not fatal
245 $log->info("working on database '$database' in $db_path");
248 my $db = new WebPAC::Store(
250 database => $database,
256 # now, iterate through input formats
# Collect the inputs of this database (a list or a single entry), then open each one through WebPAC::Input.
260 if (ref($db_config->{input}) eq 'ARRAY') {
261 @inputs = @{ $db_config->{input} };
262 } elsif ($db_config->{input}) {
263 push @inputs, $db_config->{input};
265 $log->info("database $database doesn't have inputs defined");
268 my @supported_inputs = keys %{ $config->{webpac}->{inputs} };
270 foreach my $input (@inputs) {
# --only=database/filter: keep the input when either its name or its type matches the filter.
272 next if ($only_input && ($input->{name} !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
274 my $type = lc($input->{type});
# Exact comparison: the module lookup below uses $type as an exact hash key, so a substring or regex match must not pass here.
276 die "I know only how to handle input types ", join(",", @supported_inputs), " not '$type'!\n" unless (grep { $_ eq $type } @supported_inputs);
279 if ($input->{lookup}) {
280 $lookup = new WebPAC::Lookup(
281 lookup_file => $input->{lookup},
283 delete( $input->{lookup} );
286 my $input_module = $config->{webpac}->{inputs}->{$type};
# NOTE(review): the lookup key was deleted from $input above, so the ternary below always yields the empty string.
288 $log->info("working on input '$input->{name}' in $input->{path} [type: $input->{type}] using $input_module",
289 $input->{lookup} ? "lookup '$input->{lookup}'" : ""
292 my $input_db = new WebPAC::Input(
293 module => $input_module,
294 encoding => $config->{webpac}->{webpac_encoding},
295 limit => $limit || $input->{limit}, # command-line --limit wins over the per-input limit
297 lookup_coderef => sub {
298 my $rec = shift || return;
299 $lookup->add( $rec );
301 recode => $input->{recode},
303 modify_records => $input->{modify_records},
305 $log->logdie("can't create input using $input_module") unless ($input_db); # test the new object; $input is the config hash and is always true
307 my $maxmfn = $input_db->open(
308 path => $input->{path},
309 code_page => $input->{encoding}, # database encoding
# normalize may be a single entry or a list; always work with a list.
313 my @norm_array = ref($input->{normalize}) eq 'ARRAY' ?
314 @{ $input->{normalize} } : ( $input->{normalize} );
# MARC output setup for --marc-normalize, then the start of the loop over normalization files.
# NOTE(review): the two argument lines after the if belong to a call whose opening line is not visible in this excerpt.
316 if ($marc_normalize) {
318 path => $marc_normalize,
319 output => $marc_output || 'out/marc/' . $database . '-' . $input->{name} . '.marc', # default output name built from database and input name
323 foreach my $normalize (@norm_array) {
325 my $normalize_path = $normalize->{path} || $log->logdie("can't find normalize path in config");
327 $log->logdie("Found '$normalize_path' as normalization file which isn't supported any more!") unless ( $normalize_path =~ m!\.pl$!i ); # only .pl rule files are accepted
329 my $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";
331 $log->info("Using $normalize_path for normalization...");
# Declare first, assign conditionally: "my $marc = ... if ..." has undefined behaviour and can leave the previous iteration's object in $marc.
333 my $marc; $marc = new WebPAC::Output::MARC(
334 path => $normalize->{output},
337 ) if ($normalize->{output});
339 # reset position in database
# Per-record loop: fetch each row, determine its MFN, validate it and build the normalized data structure.
# NOTE(review): the opening lines of several calls in this loop are not visible in this excerpt.
# The three-dot range behaves like ".." in list context, so $pos runs from 0 through size inclusive.
342 foreach my $pos ( 0 ... $input_db->size ) {
344 my $row = $input_db->fetch || next; # skip positions for which fetch returns nothing
346 my $mfn = $row->{'000'}->[0]; # MFN is the first value of field 000
# Fall back to the loop position when the MFN is missing or not purely numeric.
348 if (! $mfn || $mfn !~ m#^\d+$#) {
349 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
# NOTE(review): whether $mfn itself is reassigned to $pos is not visible in this excerpt.
351 push @{ $row->{'000'} }, $pos;
# NOTE(review): presumably guarded by a check that $validate exists; the guard is not visible here.
356 my @errors = $validate->validate_errors( $row );
357 $log->error( "MFN $mfn validation errors:\n", join("\n", @errors) ) if (@errors);
360 my $ds_config = dclone($db_config); # deep copy, so the per-record keys set below do not modify $db_config
362 # default values -> database key
363 $ds_config->{_} = $database;
366 $ds_config->{_mfn} = $mfn;
368 # attach current input
369 $ds_config->{input} = $input;
371 my $ds = WebPAC::Normalize::data_structure(
374 lookup => $lookup ? $lookup->lookup_hash : undef,
375 config => $ds_config,
376 marc_encoding => 'utf-8',
382 prefix => $input->{name},
383 ) if ($ds && !$stats); # this call is skipped when --stats is on or no data structure was produced
386 id => $input->{name} . "/" . $mfn,
388 type => $config->{$use_indexer}->{type},
389 ) if ($indexer && $ds); # this call runs only with an active indexer and a data structure
394 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
396 id => $mfn . ( $i ? "/$i" : '' ), # a "/$i" suffix is appended once $i is non-zero
398 leader => WebPAC::Normalize::marc_leader(),
404 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
# Wrap-up: report field statistics, finish the MARC and indexer outputs, log throughput and wait for child processes.
410 $log->info("statistics of fields usage:\n", $input_db->stats) if ($stats);
413 $marc->finish if ($marc);
419 eval { $indexer->finish } if ($indexer && $indexer->can('finish')); # errors raised by finish are swallowed by the eval
421 my $dt = time() - $start_t; # elapsed seconds since $start_t was taken
422 $log->info("$total_rows records ", $indexer ? "indexed " : "",
423 sprintf("in %.2f sec [%.2f rec/sec]",
424 $dt, ($total_rows / $dt)
431 $log->info("parallel process $$ finished");
438 # wait for all children to finish
439 sleep(1) while wait != -1; # wait returns -1 once there are no child processes left
440 $log->info("all parallel processes finished");
444 foreach my $link (@links) {
445 $log->logdie("coderef in link ", Dumper($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');