9 use base qw/WebPAC::Common/;
10 use Data::Dump qw/dump/;
11 use Encode qw/decode from_to/;
15 WebPAC::Input - read different file formats into WebPAC
19 our $VERSION = '0.19';
23 This module implements input as database which have fixed and known
24 I<size> while indexing and single unique numeric identifier for database
25 position ranging from 1 to I<size>.
27 Simply, something that is indexed by unmber from 1 .. I<size>.
29 Examples of such databases are CDS/ISIS files, MARC files, lines in
32 Specific file formats are implemented using low-level interface modules,
33 located in C<WebPAC::Input::*> namespace which export C<open_db>,
34 C<fetch_rec> and optional C<init> functions.
36 Perhaps a little code snippet.
40 my $db = WebPAC::Input->new(
41 module => 'WebPAC::Input::ISIS',
44 $db->open( path => '/path/to/database' );
45 print "database size: ",$db->size,"\n";
46 while (my $rec = $db->fetch) {
47 # do something with $rec
56 Create new input database object.
58 my $db = new WebPAC::Input(
59 module => 'WebPAC::Input::MARC',
60 recode => 'char pairs',
63 mapping => [ 'foo', 'bar', 'baz' ],
67 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
68 L<WebPAC::Input::MARC>.
70 C<recode> is optional string constisting of character or words pairs that
71 should be replaced in input stream.
73 C<no_progress_bar> disables progress bar output on C<STDOUT>
75 This function will also call low-level C<init> if it exists with same
85 my $log = $self->_get_logger;
87 $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page};
88 $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding};
89 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if $self->{lookup};
90 $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if $self->{low_mem};
92 $log->logconfess("specify low-level file format module") unless ($self->{module});
93 my $module_path = $self->{module};
94 $module_path =~ s#::#/#g;
95 $module_path .= '.pm';
96 $log->debug("require low-level module $self->{module} from $module_path");
100 $self ? return $self : return undef;
105 This function will read whole database in memory and produce lookups.
107 my $store; # simple in-memory hash
110 path => '/path/to/database/file',
111 input_encoding => 'cp852',
112 strict_encoding => 0,
116 lookup_coderef => sub {
121 900 => { '^a' => { ' : ' => '^b' } },
122 901 => { '*' => { '^b' => ' ; ' } },
124 modify_file => 'conf/modify/mapping.map',
127 $store->{ $a->{id} } = $a->{row};
131 return defined($store->{ $a->{id} }) &&
132 $store->{ $a->{id} };
137 By default, C<input_encoding> is assumed to be C<cp852>.
139 C<offset> is optional parametar to position at some offset before reading from database.
141 C<limit> is optional parametar to read just C<limit> records from database
143 C<stats> create optional report about usage of fields and subfields
145 C<lookup_coderef> is closure to called to save data into lookups
147 C<modify_records> specify mapping from subfields to delimiters or from
148 delimiters to subfields, as well as oprations on fields (if subfield is
151 C<modify_file> is alternative for C<modify_records> above which preserves order and offers
152 (hopefully) simplier sintax than YAML or perl (see L</modify_file_regex>). This option
153 overrides C<modify_records> if both exists for same input.
155 C<save_row> and C<load_row> are low-level implementation of store engine. Calling convention
156 is documented in example above.
158 C<strict_encoding> should really default to 1, but it doesn't for now.
160 Returns size of database, regardless of C<offset> and C<limit>
161 parametars, see also C<size>.
169 my $log = $self->_get_logger();
170 $log->debug( "arguments: ",dump( $arg ));
172 $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding};
173 $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page};
174 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
175 $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
176 if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
178 $log->debug( $arg->{lookup_coderef} ? '' : 'not ', "using lookup_coderef");
180 $log->logcroak("need path") if (! $arg->{'path'});
181 my $input_encoding = $arg->{'input_encoding'} || $self->{'input_encoding'} || 'cp852';
183 # store data in object
184 foreach my $v (qw/path offset limit/) {
185 $self->{$v} = $arg->{$v} if ($arg->{$v});
188 if ($arg->{load_row} || $arg->{save_row}) {
189 $log->logconfess("save_row and load_row must be defined in pair and be CODE") unless (
190 ref($arg->{load_row}) eq 'CODE' &&
191 ref($arg->{save_row}) eq 'CODE'
193 $self->{load_row} = $arg->{load_row};
194 $self->{save_row} = $arg->{save_row};
195 $log->debug("using load_row and save_row instead of in-memory hash");
202 if ($self->{recode}) {
203 my @r = split(/\s/, $self->{recode});
205 $log->logwarn("recode needs even number of elements (some number of valid pairs)");
210 $recode_map->{$from} = $to;
213 $recode_regex = join '|' => keys %{ $recode_map };
215 $log->debug("using recode regex: $recode_regex");
221 if (my $p = $arg->{modify_file}) {
222 $log->debug("using modify_file $p");
223 $rec_regex = $self->modify_file_regexps( $p );
224 } elsif (my $h = $arg->{modify_records}) {
225 $log->debug("using modify_records ", sub { dump( $h ) });
226 $rec_regex = $self->modify_record_regexps(%{ $h });
228 $log->debug("rec_regex: ", sub { dump($rec_regex) }) if ($rec_regex);
230 my $class = $self->{module} || $log->logconfess("can't get low-level module name!");
232 my $ll_db = $class->new(
233 path => $arg->{path},
234 input_config => $arg->{input_config} || $self->{input_config},
236 # my ($l,$f_nr) = @_;
237 # return unless defined($l);
238 # $l = decode($input_encoding, $l);
239 # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
245 unless (defined($ll_db)) {
246 $log->logwarn("can't open database $arg->{path}, skipping...");
250 my $size = $ll_db->size;
253 $log->logwarn("no records in database $arg->{path}, skipping...");
260 if (my $s = $self->{offset}) {
261 $log->debug("skipping to MFN $s");
264 $self->{offset} = $from_rec;
267 if ($self->{limit}) {
268 $log->debug("limiting to ",$self->{limit}," records");
269 $to_rec = $from_rec + $self->{limit} - 1;
270 $to_rec = $size if ($to_rec > $size);
273 # store size for later
274 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
276 my $strict_encoding = $arg->{strict_encoding} || $self->{strict_encoding}; ## FIXME should be 1 really
278 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec]",
279 " encoding $input_encoding ", $strict_encoding ? ' [strict]' : '',
280 $self->{stats} ? ' [stats]' : '',
284 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
286 $log->debug("position: $pos\n");
288 my $rec = $ll_db->fetch_rec($pos, sub {
289 my ($l,$f_nr,$debug) = @_;
290 # return unless defined($l);
291 # return $l unless ($rec_regex && $f_nr);
293 return unless ( defined($l) && defined($f_nr) );
295 warn "-=> $f_nr ## |$l|\n" if ($debug);
296 $log->debug("-=> $f_nr ## $l");
298 # codepage conversion and recode_regex
299 $l = decode($input_encoding, $l, 1);
300 $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
303 if ($rec_regex && defined($rec_regex->{$f_nr})) {
304 $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
306 foreach my $r (@{ $rec_regex->{$f_nr} }) {
308 $log->logconfess("expected regex in ", dump( $r )) unless defined($r->{regex});
309 eval '$l =~ ' . $r->{regex};
311 my $d = "|$old_l| -> |$l| "; # . $r->{regex};
312 $d .= ' +' . $r->{line} . ' ' . $r->{file} if defined($r->{line});
313 $d .= ' ' . $r->{debug} if defined($r->{debug});
314 $log->debug("MODIFY $d");
315 warn "*** $d\n" if ($debug);
318 $log->error("error applying regex: $r") if ($@);
322 $log->debug("<=- $f_nr ## |$l|");
323 warn "<=- $f_nr ## $l\n" if ($debug);
327 $log->debug(sub { dump($rec) });
330 $log->warn("record $pos empty? skipping...");
335 if ($self->{save_row}) {
336 $self->{save_row}->({
341 $self->{data}->{$pos} = $rec;
345 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
347 # update counters for statistics
348 if ($self->{stats}) {
350 # fetch clean record with regexpes applied for statistics
351 my $rec = $ll_db->fetch_rec($pos);
353 foreach my $fld (keys %{ $rec }) {
354 $self->{_stats}->{fld}->{ $fld }++;
356 #$log->logdie("invalid record fild $fld, not ARRAY")
357 next unless (ref($rec->{ $fld }) eq 'ARRAY');
359 foreach my $row (@{ $rec->{$fld} }) {
361 if (ref($row) eq 'HASH') {
363 foreach my $sf (keys %{ $row }) {
364 next if ($sf eq 'subfields');
365 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
366 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
367 if (ref($row->{$sf}) eq 'ARRAY');
371 $self->{_stats}->{repeatable}->{ $fld }++;
377 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
382 $self->{last_pcnt} = 0;
384 # store max mfn and return it.
385 $self->{max_pos} = $to_rec;
386 $log->debug("max_pos: $to_rec");
389 $self->{ll_db} = $ll_db;
396 Fetch next record from database. It will also displays progress bar.
398 my $rec = $isis->fetch;
400 Record from this function should probably go to C<data_structure> for
408 my $log = $self->_get_logger();
410 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
412 if ($self->{pos} == -1) {
413 $self->{pos} = $self->{offset};
418 my $mfn = $self->{pos};
420 if ($mfn > $self->{max_pos}) {
421 $self->{pos} = $self->{max_pos};
422 $log->debug("at EOF");
426 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
430 if ($self->{load_row}) {
431 $rec = $self->{load_row}->({ id => $mfn });
433 $rec = $self->{data}->{$mfn};
441 Returns current record number (MFN).
445 First record in database has position 1.
457 Returns number of records in database
461 Result from this function can be used to loop through all records
463 foreach my $mfn ( 1 ... $isis->size ) { ... }
465 because it takes into account C<offset> and C<limit>.
471 return $self->{size};
476 Seek to specified MFN in file.
480 First record in database has position 1.
488 my $log = $self->_get_logger();
490 $log->logconfess("called without pos") unless defined($pos);
493 $log->warn("seek before first record");
495 } elsif ($pos > $self->{max_pos}) {
496 $log->warn("seek beyond last record");
497 $pos = $self->{max_pos};
500 return $self->{pos} = (($pos - 1) || -1);
505 Dump statistics about field and subfield usage
514 my $log = $self->_get_logger();
516 my $s = $self->{_stats};
518 $log->warn("called stats, but there is no statistics collected");
527 die "no field in ", dump( $s->{fld} ) unless defined( $f );
528 my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
529 $max_fld = $v if ($v > $max_fld);
531 my $o = sprintf("%4s %d ~", $f, $v);
533 if (defined($s->{sf}->{$f})) {
534 my @subfields = keys %{ $s->{sf}->{$f} };
536 $o .= sprintf(" %s:%d%s", $_,
537 $s->{sf}->{$f}->{$_}->{count},
538 $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
541 # first indicators and other special subfields
542 sort( grep { length($_) > 1 } @subfields ),
543 # then subfileds (single char)
544 sort( grep { length($_) == 1 } @subfields ),
548 if (my $v_r = $s->{repeatable}->{$f}) {
549 $o .= " ($v_r)" if ($v_r != $v);
554 if ( $a =~ m/^\d+$/ && $b =~ m/^\d+$/ ) {
559 } keys %{ $s->{fld} }
562 $log->debug( sub { dump($s) } );
569 Display humanly readable dump of record
576 return unless $self->{ll_db};
578 if ($self->{ll_db}->can('dump_ascii')) {
579 return $self->{ll_db}->dump_ascii( $self->{pos} );
581 return dump( $self->{ll_db}->fetch_rec( $self->{pos} ) );
587 Helper function called which create regexps to be execute on code.
589 _get_regex( 900, 'regex:[0-9]+' ,'numbers' );
590 _get_regex( 900, '^b', ' : ^b' );
592 It supports perl regexps with C<regex:> prefix to from value and has
593 additional logic to skip empty subfields.
598 my ($sf,$from,$to) = @_;
604 if ($from =~ m/^regex:(.+)$/) {
607 $from = '\Q' . $from . '\E';
610 my $need_subfield_data = '*'; # no
611 # if from is also subfield, require some data in between
612 # to correctly skip empty subfields
613 $need_subfield_data = '+' if ($from =~ m/^\\Q\^/);
615 's/\Q'. $sf .'\E([^\^]' . $need_subfield_data . '?)'. $from .'([^\^]*?)/'. $sf .'$1'. $to .'$2/';
618 's/'. $from .'/'. $to .'/g';
623 =head2 modify_record_regexps
625 Generate hash with regexpes to be applied using L<filter>.
627 my $regexpes = $input->modify_record_regexps(
628 900 => { '^a' => { ' : ' => '^b' } },
629 901 => { '*' => { '^b' => ' ; ' } },
634 sub modify_record_regexps {
636 my $modify_record = {@_};
640 my $log = $self->_get_logger();
642 foreach my $f (keys %$modify_record) {
643 $log->debug("field: $f");
645 foreach my $sf (keys %{ $modify_record->{$f} }) {
646 $log->debug("subfield: $sf");
648 foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
649 my $to = $modify_record->{$f}->{$sf}->{$from};
650 #die "no field?" unless defined($to);
651 my $d = "|$from| -> |$to|";
652 $log->debug("transform: $d");
654 my $regex = _get_regex($sf,$from,$to);
655 push @{ $regexpes->{$f} }, { regex => $regex, debug => $d };
656 $log->debug("regex: $regex");
664 =head2 modify_file_regexps
666 Generate hash with regexpes to be applied using L<filter> from
667 pseudo hash/yaml format for regex mappings.
669 It should be obvious:
676 In field I<200> find C<'^a'> and then C<' : '>, and replace it with C<'^e'>.
677 In field I<200> find C<'^a'> and then C<' = '>, and replace it with C<'^d'>.
679 my $regexpes = $input->modify_file_regexps( 'conf/modify/common.pl' );
681 On undef path it will just return.
685 sub modify_file_regexps {
688 my $modify_path = shift || return;
690 my $log = $self->_get_logger();
694 CORE::open(my $fh, $modify_path) || $log->logdie("can't open modify file $modify_path: $!");
700 next if (/^#/ || /^\s*$/);
702 if (/^\s*(\d+)\s*$/) {
704 $log->debug("field: $f");
706 } elsif (/^\s*'([^']*)'\s*$/) {
708 $log->die("can't define subfiled before field in: $_") unless ($f);
709 $log->debug("subfield: $sf");
710 } elsif (/^\s*'([^']*)'\s*=>\s*'([^']*)'\s*$/) {
711 my ($from,$to) = ($1, $2);
713 $log->debug("transform: |$from| -> |$to|");
715 my $regex = _get_regex($sf,$from,$to);
716 push @{ $regexpes->{$f} }, {
718 file => $modify_path,
721 $log->debug("regex: $regex");
730 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
732 =head1 COPYRIGHT & LICENSE
734 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
736 This program is free software; you can redistribute it and/or modify it
737 under the same terms as Perl itself.
741 1; # End of WebPAC::Input