9 use base qw/WebPAC::Common/;
11 use Encode qw/from_to/;
15 WebPAC::Input - read different file formats into WebPAC
23 our $VERSION = '0.14';
27 This module implements input as database which have fixed and known
28 I<size> while indexing and single unique numeric identifier for database
29 position ranging from 1 to I<size>.
31 Simply, something that is indexed by unmber from 1 .. I<size>.
33 Examples of such databases are CDS/ISIS files, MARC files, lines in
36 Specific file formats are implemented using low-level interface modules,
37 located in C<WebPAC::Input::*> namespace which export C<open_db>,
38 C<fetch_rec> and optional C<init> functions.
40 Perhaps a little code snippet.
44 my $db = WebPAC::Input->new(
45 module => 'WebPAC::Input::ISIS',
48 $db->open( path => '/path/to/database' );
49 print "database size: ",$db->size,"\n";
50 while (my $rec = $db->fetch) {
51 # do something with $rec
60 Create new input database object.
62 my $db = new WebPAC::Input(
63 module => 'WebPAC::Input::MARC',
64 encoding => 'ISO-8859-2',
65 recode => 'char pairs',
69 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
70 L<WebPAC::Input::MARC>.
72 Optional parametar C<encoding> specify application code page (which will be
73 used internally). This should probably be your terminal encoding, and by
74 default, it C<ISO-8859-2>.
76 C<recode> is optional string constisting of character or words pairs that
77 should be replaced in input stream.
79 C<no_progress_bar> disables progress bar output on C<STDOUT>
81 This function will also call low-level C<init> if it exists with same
91 my $log = $self->_get_logger;
93 $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup});
94 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup});
95 $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if ($self->{low_mem});
97 $log->logconfess("specify low-level file format module") unless ($self->{module});
98 my $module_path = $self->{module};
99 $module_path =~ s#::#/#g;
100 $module_path .= '.pm';
101 $log->debug("require low-level module $self->{module} from $module_path");
103 require $module_path;
105 # check if required subclasses are implemented
106 foreach my $subclass (qw/open_db fetch_rec init dump_rec/) {
110 $self->{'encoding'} ||= 'ISO-8859-2';
112 $self ? return $self : return undef;
117 This function will read whole database in memory and produce lookups.
119 my $store; # simple in-memory hash
122 path => '/path/to/database/file',
123 code_page => 'cp852',
127 lookup_coderef => sub {
132 900 => { '^a' => { ' : ' => '^b' } },
133 901 => { '*' => { '^b' => ' ; ' } },
135 modify_file => 'conf/modify/mapping.map',
138 $store->{ $a->{id} } = $a->{row};
142 return defined($store->{ $a->{id} }) &&
143 $store->{ $a->{id} };
148 By default, C<code_page> is assumed to be C<cp852>.
150 C<offset> is optional parametar to position at some offset before reading from database.
152 C<limit> is optional parametar to read just C<limit> records from database
154 C<stats> create optional report about usage of fields and subfields
156 C<lookup_coderef> is closure to called to save data into lookups
158 C<modify_records> specify mapping from subfields to delimiters or from
159 delimiters to subfields, as well as oprations on fields (if subfield is
162 C<modify_file> is alternative for C<modify_records> above which preserves order and offers
163 (hopefully) simplier sintax than YAML or perl (see L</modify_file_regex>). This option
164 overrides C<modify_records> if both exists for same input.
166 C<save_row> and C<load_row> are low-level implementation of store engine. Calling convention
167 is documented in example above.
169 Returns size of database, regardless of C<offset> and C<limit>
170 parametars, see also C<size>.
178 my $log = $self->_get_logger();
180 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
181 $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
182 if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
184 $log->debug( $arg->{lookup_coderef} ? '' : 'not ', "using lookup_coderef");
186 $log->logcroak("need path") if (! $arg->{'path'});
187 my $code_page = $arg->{'code_page'} || 'cp852';
189 # store data in object
190 $self->{'input_code_page'} = $code_page;
191 foreach my $v (qw/path offset limit/) {
192 $self->{$v} = $arg->{$v} if ($arg->{$v});
195 if ($arg->{load_row} || $arg->{save_row}) {
196 $log->logconfess("save_row and load_row must be defined in pair and be CODE") unless (
197 ref($arg->{load_row}) eq 'CODE' &&
198 ref($arg->{save_row}) eq 'CODE'
200 $self->{load_row} = $arg->{load_row};
201 $self->{save_row} = $arg->{save_row};
202 $log->debug("using load_row and save_row instead of in-memory hash");
209 if ($self->{recode}) {
210 my @r = split(/\s/, $self->{recode});
212 $log->logwarn("recode needs even number of elements (some number of valid pairs)");
217 $recode_map->{$from} = $to;
220 $recode_regex = join '|' => keys %{ $recode_map };
222 $log->debug("using recode regex: $recode_regex");
228 if (my $p = $arg->{modify_file}) {
229 $log->debug("using modify_file $p");
230 $rec_regex = $self->modify_file_regexps( $p );
231 } elsif (my $h = $arg->{modify_records}) {
232 $log->debug("using modify_records ", Dumper( $h ));
233 $rec_regex = $self->modify_record_regexps(%{ $h });
235 $log->debug("rec_regex: ", Dumper($rec_regex)) if ($rec_regex);
237 my $class = $self->{module} || $log->logconfess("can't get low-level module name!");
239 my $ll_db = $class->new(
240 path => $arg->{path},
242 # my ($l,$f_nr) = @_;
243 # return unless defined($l);
244 # from_to($l, $code_page, $self->{'encoding'});
245 # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
251 unless (defined($ll_db)) {
252 $log->logwarn("can't open database $arg->{path}, skipping...");
256 my $size = $ll_db->size;
259 $log->logwarn("no records in database $arg->{path}, skipping...");
266 if (my $s = $self->{offset}) {
267 $log->debug("skipping to MFN $s");
270 $self->{offset} = $from_rec;
273 if ($self->{limit}) {
274 $log->debug("limiting to ",$self->{limit}," records");
275 $to_rec = $from_rec + $self->{limit} - 1;
276 $to_rec = $size if ($to_rec > $size);
279 # store size for later
280 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
282 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : '');
285 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
287 $log->debug("position: $pos\n");
289 my $rec = $ll_db->fetch_rec($pos, sub {
291 # return unless defined($l);
292 # return $l unless ($rec_regex && $f_nr);
294 $log->debug("-=> $f_nr ## $l");
296 # codepage conversion and recode_regex
297 from_to($l, $code_page, $self->{'encoding'});
298 $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
301 if ($rec_regex && defined($rec_regex->{$f_nr})) {
302 $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
304 foreach my $r (@{ $rec_regex->{$f_nr} }) {
308 $log->debug("REGEX on $f_nr eval \$l =~ $r\n## old l: [$old_l]\n## new l: [$l]");
310 $log->error("error applying regex: $r") if ($@);
314 $log->debug("<=- $f_nr ## $l");
318 $log->debug(sub { Dumper($rec) });
321 $log->warn("record $pos empty? skipping...");
326 if ($self->{save_row}) {
327 $self->{save_row}->({
332 $self->{data}->{$pos} = $rec;
336 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
338 # update counters for statistics
339 if ($self->{stats}) {
341 # fetch clean record with regexpes applied for statistics
342 my $rec = $ll_db->fetch_rec($pos);
344 foreach my $fld (keys %{ $rec }) {
345 $self->{_stats}->{fld}->{ $fld }++;
347 $log->logdie("invalid record fild $fld, not ARRAY")
348 unless (ref($rec->{ $fld }) eq 'ARRAY');
350 foreach my $row (@{ $rec->{$fld} }) {
352 if (ref($row) eq 'HASH') {
354 foreach my $sf (keys %{ $row }) {
355 next if ($sf eq 'subfields');
356 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
357 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
358 if (ref($row->{$sf}) eq 'ARRAY');
362 $self->{_stats}->{repeatable}->{ $fld }++;
368 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
373 $self->{last_pcnt} = 0;
375 # store max mfn and return it.
376 $self->{max_pos} = $to_rec;
377 $log->debug("max_pos: $to_rec");
380 $self->{ll_db} = $ll_db;
387 Fetch next record from database. It will also displays progress bar.
389 my $rec = $isis->fetch;
391 Record from this function should probably go to C<data_structure> for
399 my $log = $self->_get_logger();
401 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
403 if ($self->{pos} == -1) {
404 $self->{pos} = $self->{offset};
409 my $mfn = $self->{pos};
411 if ($mfn > $self->{max_pos}) {
412 $self->{pos} = $self->{max_pos};
413 $log->debug("at EOF");
417 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
421 if ($self->{load_row}) {
422 $rec = $self->{load_row}->({ id => $mfn });
424 $rec = $self->{data}->{$mfn};
432 Returns current record number (MFN).
436 First record in database has position 1.
448 Returns number of records in database
452 Result from this function can be used to loop through all records
454 foreach my $mfn ( 1 ... $isis->size ) { ... }
456 because it takes into account C<offset> and C<limit>.
462 return $self->{size};
467 Seek to specified MFN in file.
471 First record in database has position 1.
477 my $pos = shift || return;
479 my $log = $self->_get_logger();
482 $log->warn("seek before first record");
484 } elsif ($pos > $self->{max_pos}) {
485 $log->warn("seek beyond last record");
486 $pos = $self->{max_pos};
489 return $self->{pos} = (($pos - 1) || -1);
494 Dump statistics about field and subfield usage
503 my $log = $self->_get_logger();
505 my $s = $self->{_stats};
507 $log->warn("called stats, but there is no statistics collected");
515 my $f = $_ || die "no field";
516 my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
517 $max_fld = $v if ($v > $max_fld);
519 my $o = sprintf("%4s %d ~", $f, $v);
521 if (defined($s->{sf}->{$f})) {
523 $o .= sprintf(" %s:%d%s", $_,
524 $s->{sf}->{$f}->{$_}->{count},
525 $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
527 } sort keys %{ $s->{sf}->{$f} };
530 if (my $v_r = $s->{repeatable}->{$f}) {
531 $o .= " ($v_r)" if ($v_r != $v);
535 } sort { $a cmp $b } keys %{ $s->{fld} }
538 $log->debug( sub { Dumper($s) } );
545 Display humanly readable dump of record
552 return $self->{ll_db}->dump_rec( $self->{pos} );
556 =head2 modify_record_regexps
558 Generate hash with regexpes to be applied using l<filter>.
560 my $regexpes = $input->modify_record_regexps(
561 900 => { '^a' => { ' : ' => '^b' } },
562 901 => { '*' => { '^b' => ' ; ' } },
568 my ($sf,$from,$to) = @_;
571 's/\Q'. $sf .'\E([^\^]*?)\Q'. $from .'\E([^\^]*?)/'. $sf .'$1'. $to .'$2/';
574 's/\Q'. $from .'\E/'. $to .'/g';
578 sub modify_record_regexps {
580 my $modify_record = {@_};
584 my $log = $self->_get_logger();
586 foreach my $f (keys %$modify_record) {
587 $log->debug("field: $f");
589 foreach my $sf (keys %{ $modify_record->{$f} }) {
590 $log->debug("subfield: $sf");
592 foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
593 my $to = $modify_record->{$f}->{$sf}->{$from};
594 #die "no field?" unless defined($to);
595 $log->debug("transform: |$from| -> |$to|");
597 my $regex = _get_regex($sf,$from,$to);
598 push @{ $regexpes->{$f} }, $regex;
599 $log->debug("regex: $regex");
607 =head2 modify_file_regexps
609 Generate hash with regexpes to be applied using l<filter> from
610 pseudo hash/yaml format for regex mappings.
612 It should be obvious:
619 In field I<200> find C<'^a'> and then C<' : '>, and replace it with C<'^e'>.
620 In field I<200> find C<'^a'> and then C<' = '>, and replace it with C<'^d'>.
622 my $regexpes = $input->modify_file_regexps( 'conf/modify/common.pl' );
624 On undef path it will just return.
628 sub modify_file_regexps {
631 my $modify_path = shift || return;
633 my $log = $self->_get_logger();
637 CORE::open(my $fh, $modify_path) || $log->logdie("can't open modify file $modify_path: $!");
643 next if (/^#/ || /^\s*$/);
645 if (/^\s*(\d+)\s*$/) {
647 $log->debug("field: $f");
649 } elsif (/^\s*'([^']*)'\s*$/) {
651 $log->die("can't define subfiled before field in: $_") unless ($f);
652 $log->debug("subfield: $sf");
653 } elsif (/^\s*'([^']*)'\s*=>\s*'([^']*)'\s*$/) {
654 my ($from,$to) = ($1, $2);
656 $log->debug("transform: |$from| -> |$to|");
658 my $regex = _get_regex($sf,$from,$to);
659 push @{ $regexpes->{$f} }, $regex;
660 $log->debug("regex: $regex");
669 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
671 =head1 COPYRIGHT & LICENSE
673 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
675 This program is free software; you can redistribute it and/or modify it
676 under the same terms as Perl itself.
680 1; # End of WebPAC::Input