6 use File::Temp qw/tempdir/;
9 use WebPAC::Common 0.02;
10 use WebPAC::Lookup 0.03;
11 use WebPAC::Input 0.07;
12 use WebPAC::Store 0.03;
13 use WebPAC::Normalize 0.11;
14 use WebPAC::Output::TT;
16 use WebPAC::Output::MARC;
17 use YAML qw/LoadFile/;
20 use Time::HiRes qw/time/;
22 use Data::Dump qw/dump/;
23 use Storable qw/dclone/;
25 use Proc::Queue size => 1;
26 use POSIX ":sys_wait_h"; # imports WNOHANG
30 run.pl - start WebPAC indexing
32 B<this command will probably go away. Don't get used to it!>
40 start loading (all) databases at offset 42
44 limit loading to 100 records
48 remove database and Hyper Estraier index before indexing
50 =item --only=database_name/input_filter
52 reindex just a single database (legacy name is --one)
54 C</input_filter> is an optional part which can be C<name>
57 =item --config conf/config.yml
59 path to YAML configuration file
63 disable indexing and dump statistics about field and subfield
66 =item --validate path/to/validation_file
68 turn on extra validation of input records, see L<WebPAC::Validation>
70 =item --marc-normalize conf/normalize/mapping.pl
72 This option specifies normalisation file for MARC creation
74 =item --marc-output out/marc/test.marc
76 Optional path to output file
80 By default turned on if C<--marc-normalize> is used. You can disable lint
81 messages with C<--no-marc-lint>.
85 Force dump of input and MARC records for debugging.
89 Run databases in parallel (approximately the same as the number of processors in
90 the machine if you want to use full load)
100 my $config = 'conf/config.yml';
105 my ($marc_normalize, $marc_output);
112 "limit=i" => \$limit,
113 "offset=i" => \$offset,
115 "one=s" => \$only_filter,
116 "only=s" => \$only_filter,
117 "config=s" => \$config, # takes a path argument, e.g. --config conf/config.yml
120 "validate=s" => \$validate_path,
121 "marc-normalize=s" => \$marc_normalize,
122 "marc-output=s" => \$marc_output,
123 "marc-lint!" => \$marc_lint,
124 "marc-dump!" => \$marc_dump,
125 "parallel=i" => \$parallel,
128 $config = LoadFile($config);
130 print "config = ",dump($config) if ($debug);
132 die "no databases in config file!\n" unless ($config->{databases});
134 my $log = _new WebPAC::Common()->_get_logger();
135 $log->info( "-" x 79 );
138 $validate = new WebPAC::Validate(
139 path => $validate_path,
140 ) if ($validate_path);
142 my $use_indexer = $config->{use_indexer} || 'hyperestraier';
144 $log->debug("option --stats disables update of indexing engine...");
145 $use_indexer = undef;
147 $log->info("using $use_indexer indexing engine...");
150 # disable indexing when creating marc
151 $use_indexer = undef if ($marc_normalize);
154 my $start_t = time();
160 $log->info("Using $parallel processes for speedup");
161 Proc::Queue::size($parallel);
164 while (my ($database, $db_config) = each %{ $config->{databases} }) {
166 my ($only_database,$only_input); ($only_database,$only_input) = split(m#/#, $only_filter) if ($only_filter); # declare first: "my ... if" has undefined behaviour
167 next if ($only_database && $database !~ m/$only_database/i);
171 if(defined ($f) and $f==0) {
172 $log->info("Created processes $$ for speedup");
179 my $indexer_config = $config->{$use_indexer} || $log->logdie("can't find '$use_indexer' part in confguration");
180 $indexer_config->{database} = $database;
181 $indexer_config->{clean} = $clean;
182 $indexer_config->{label} = $db_config->{name};
184 if ($use_indexer eq 'hyperestraier') {
186 # open Hyper Estraier database
187 use WebPAC::Output::Estraier '0.10';
188 $indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
190 } elsif ($use_indexer eq 'kinosearch') {
193 use WebPAC::Output::KinoSearch;
194 $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
195 $indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
198 $log->logdie("unknown use_indexer: $use_indexer");
201 $log->logdie("can't continue without valid indexer") unless ($indexer); # fatal: same logdie call used for the other indexer errors
208 my $abs_path = abs_path($0);
209 $abs_path =~ s#/[^/]*$#/#;
211 my $db_path = $config->{webpac}->{db_path} . '/' . $database;
214 $log->info("creating new database '$database' in $db_path");
215 rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
217 $log->info("working on database '$database' in $db_path");
220 my $db = new WebPAC::Store(
222 database => $database,
228 # now, iterate through input formats
232 if (ref($db_config->{input}) eq 'ARRAY') {
233 @inputs = @{ $db_config->{input} };
234 } elsif ($db_config->{input}) {
235 push @inputs, $db_config->{input};
237 $log->info("database $database doesn't have inputs defined");
240 my @supported_inputs = keys %{ $config->{webpac}->{inputs} };
242 foreach my $input (@inputs) {
244 next if ($only_input && ($input->{name} !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));
246 my $type = lc($input->{type});
248 die "I know only how to handle input types ", join(",", @supported_inputs), " not '$type'!\n" unless (grep { $_ eq $type } @supported_inputs); # exact match: $type is used verbatim as the inputs key below
251 if ($input->{lookup}) {
252 $lookup = new WebPAC::Lookup(
253 lookup_file => $input->{lookup},
255 delete( $input->{lookup} );
258 my $input_module = $config->{webpac}->{inputs}->{$type};
260 $log->info("working on input '$input->{name}' in $input->{path} [type: $input->{type}] using $input_module",
261 $input->{lookup} ? "lookup '$input->{lookup}'" : ""
264 my $input_db = new WebPAC::Input(
265 module => $input_module,
266 encoding => $config->{webpac}->{webpac_encoding},
267 limit => $limit || $input->{limit},
269 lookup_coderef => sub {
270 my $rec = shift || return;
271 $lookup->add( $rec );
273 recode => $input->{recode},
275 modify_records => $input->{modify_records},
277 $log->logdie("can't create input using $input_module") unless ($input_db); # check the new WebPAC::Input object, not the config entry $input
279 my $maxmfn = $input_db->open(
280 path => $input->{path},
281 code_page => $input->{encoding}, # database encoding
285 my @norm_array = ref($input->{normalize}) eq 'ARRAY' ?
286 @{ $input->{normalize} } : ( $input->{normalize} );
288 if ($marc_normalize) {
290 path => $marc_normalize,
291 output => $marc_output || 'out/marc/' . $database . '-' . $input->{name} . '.marc',
295 foreach my $normalize (@norm_array) {
297 my $normalize_path = $normalize->{path} || $log->logdie("can't find normalize path in config");
299 $log->logdie("Found '$normalize_path' as normalization file which isn't supported any more!") unless ( $normalize_path =~ m!\.pl$!i );
301 my $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";
303 $log->info("Using $normalize_path for normalization...");
305 my $marc = new WebPAC::Output::MARC(
306 path => $normalize->{output},
309 ) if ($normalize->{output});
311 # reset position in database
314 foreach my $pos ( 0 ... $input_db->size ) {
316 my $row = $input_db->fetch || next;
318 my $mfn = $row->{'000'}->[0];
320 if (! $mfn || $mfn !~ m#^\d+$#) {
321 $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
323 push @{ $row->{'000'} }, $pos;
328 my @errors = $validate->validate_errors( $row );
329 $log->error( "MFN $mfn validation errors:\n", join("\n", @errors) ) if (@errors);
332 my $ds_config = dclone($db_config);
334 # default values -> database key
335 $ds_config->{_} = $database;
338 $ds_config->{_mfn} = $mfn;
340 # attach current input
341 $ds_config->{input} = $input;
343 my $ds = WebPAC::Normalize::data_structure(
346 lookup => $lookup ? $lookup->lookup_hash : undef,
347 config => $ds_config,
348 marc_encoding => 'utf-8',
354 prefix => $input->{name},
355 ) if ($ds && !$stats);
358 id => $input->{name} . "/" . $mfn,
360 type => $config->{$use_indexer}->{type},
361 ) if ($indexer && $ds);
366 while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
368 id => $mfn . ( $i ? "/$i" : '' ),
370 leader => WebPAC::Normalize::marc_leader(),
376 $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
382 $log->info("statistics of fields usage:\n", $input_db->stats) if ($stats);
385 $marc->finish if ($marc);
391 eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
393 my $dt = time() - $start_t;
394 $log->info("$total_rows records ", $indexer ? "indexed " : "",
395 sprintf("in %.2f sec [%.2f rec/sec]",
396 $dt, ($total_rows / $dt)
401 # add Hyper Estraier links to other databases
403 if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
404 foreach my $link (@{ $db_config->{links} }) {
405 if ($use_indexer eq 'hyperestraier') {
406 $log->info("saving link $database -> $link->{to} [$link->{credit}]");
410 credit => $link->{credit},
413 $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
420 $log->info("parallel process $$ finished");
427 # wait for all children to finish
428 sleep(1) while wait != -1;
429 $log->info("all parallel processes finished");
432 foreach my $link (@links) {
433 $log->info("adding link $link->{from} -> $link->{to} [$link->{credit}]");
434 $indexer->add_link( %{ $link } );