9 use base qw/WebPAC::Common/;
10 use Data::Dump qw/dump/;
11 use Encode qw/decode from_to/;
16 WebPAC::Input - read different file formats into WebPAC
20 our $VERSION = '0.19';
24 This module implements input as database which have fixed and known
25 I<size> while indexing and single unique numeric identifier for database
26 position ranging from 1 to I<size>.
28 Simply, something that is indexed by unmber from 1 .. I<size>.
30 Examples of such databases are CDS/ISIS files, MARC files, lines in
33 Specific file formats are implemented using low-level interface modules,
34 located in C<WebPAC::Input::*> namespace which export C<open_db>,
35 C<fetch_rec> and optional C<init> functions.
37 Perhaps a little code snippet.
41 my $db = WebPAC::Input->new(
42 module => 'WebPAC::Input::ISIS',
45 $db->open( path => '/path/to/database' );
46 print "database size: ",$db->size,"\n";
47 while (my $rec = $db->fetch) {
48 # do something with $rec
57 Create new input database object.
59 my $db = new WebPAC::Input(
60 module => 'WebPAC::Input::MARC',
61 recode => 'char pairs',
64 mapping => [ 'foo', 'bar', 'baz' ],
68 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
69 L<WebPAC::Input::MARC>.
71 C<recode> is optional string constisting of character or words pairs that
72 should be replaced in input stream.
74 C<no_progress_bar> disables progress bar output on C<STDOUT>
76 This function will also call low-level C<init> if it exists with same
86 my $log = $self->_get_logger;
88 $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page};
89 $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding};
90 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if $self->{lookup};
91 $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if $self->{low_mem};
93 $log->logconfess("specify low-level file format module") unless ($self->{module});
94 my $module_path = $self->{module};
95 $module_path =~ s#::#/#g;
96 $module_path .= '.pm';
97 $log->debug("require low-level module $self->{module} from $module_path");
101 $self ? return $self : return undef;
106 This function will read whole database in memory and produce lookups.
108 my $store; # simple in-memory hash
111 path => '/path/to/database/file',
112 input_encoding => 'cp852',
113 strict_encoding => 0,
117 lookup_coderef => sub {
122 900 => { '^a' => { ' : ' => '^b' } },
123 901 => { '*' => { '^b' => ' ; ' } },
125 modify_file => 'conf/modify/mapping.map',
128 $store->{ $a->{id} } = $a->{row};
132 return defined($store->{ $a->{id} }) &&
133 $store->{ $a->{id} };
138 By default, C<input_encoding> is assumed to be C<cp852>.
140 C<offset> is optional parametar to position at some offset before reading from database.
142 C<limit> is optional parametar to read just C<limit> records from database
144 C<stats> create optional report about usage of fields and subfields
146 C<lookup_coderef> is closure to called to save data into lookups
148 C<modify_records> specify mapping from subfields to delimiters or from
149 delimiters to subfields, as well as oprations on fields (if subfield is
152 C<modify_file> is alternative for C<modify_records> above which preserves order and offers
153 (hopefully) simplier sintax than YAML or perl (see L</modify_file_regex>). This option
154 overrides C<modify_records> if both exists for same input.
156 C<save_row> and C<load_row> are low-level implementation of store engine. Calling convention
157 is documented in example above.
159 C<strict_encoding> should really default to 1, but it doesn't for now.
161 Returns size of database, regardless of C<offset> and C<limit>
162 parametars, see also C<size>.
170 my $log = $self->_get_logger();
171 $log->debug( "arguments: ",dump( $arg ));
173 $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding};
174 $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page};
175 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
176 $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
177 if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
179 $log->debug( $arg->{lookup_coderef} ? '' : 'not ', "using lookup_coderef");
181 $log->logcroak("need path") if (! $arg->{'path'});
182 my $input_encoding = $arg->{'input_encoding'} || $self->{'input_encoding'} || 'cp852';
184 # store data in object
185 foreach my $v (qw/path offset limit/) {
186 $self->{$v} = $arg->{$v} if defined $arg->{$v};
189 if ($arg->{load_row} || $arg->{save_row}) {
190 $log->logconfess("save_row and load_row must be defined in pair and be CODE") unless (
191 ref($arg->{load_row}) eq 'CODE' &&
192 ref($arg->{save_row}) eq 'CODE'
194 $self->{load_row} = $arg->{load_row};
195 $self->{save_row} = $arg->{save_row};
196 $log->debug("using load_row and save_row instead of in-memory hash");
203 if ($self->{recode}) {
204 my @r = split(/\s/, $self->{recode});
206 $log->logwarn("recode needs even number of elements (some number of valid pairs)");
211 $recode_map->{$from} = $to;
214 $recode_regex = join '|' => keys %{ $recode_map };
216 $log->debug("using recode regex: $recode_regex");
222 if (my $p = $arg->{modify_file}) {
223 $log->debug("using modify_file $p");
224 $rec_regex = $self->modify_file_regexps( $p );
225 } elsif (my $h = $arg->{modify_records}) {
226 $log->debug("using modify_records ", sub { dump( $h ) });
227 $rec_regex = $self->modify_record_regexps(%{ $h });
229 $log->debug("rec_regex: ", sub { dump($rec_regex) }) if ($rec_regex);
231 my $class = $self->{module} || $log->logconfess("can't get low-level module name!");
233 $arg->{$_} = $self->{$_} foreach qw(offset limit);
235 my $ll_db = $class->new(
236 path => $arg->{path},
237 input_config => $arg->{input_config} || $self->{input_config},
239 # my ($l,$f_nr) = @_;
240 # return unless defined($l);
241 # $l = decode($input_encoding, $l);
242 # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
248 # save for dump and input_module
249 $self->{ll_db} = $ll_db;
251 unless (defined($ll_db)) {
252 $log->logwarn("can't open database $arg->{path}, skipping...");
256 my $size = $ll_db->size;
259 $log->logwarn("no records in database $arg->{path}, skipping...");
266 if (my $s = $self->{offset}) {
267 $log->debug("skipping to MFN $s");
270 $self->{offset} = $from_rec;
273 if ($self->{limit}) {
274 $log->debug("limiting to ",$self->{limit}," records");
275 $to_rec = $from_rec + $self->{limit} - 1;
276 $to_rec = $size if ($to_rec > $size);
279 # store size for later
280 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
282 my $strict_encoding = $arg->{strict_encoding} || $self->{strict_encoding}; ## FIXME should be 1 really
284 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec]",
285 " encoding $input_encoding ", $strict_encoding ? ' [strict]' : '',
286 $self->{stats} ? ' [stats]' : '',
290 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
292 $log->debug("position: $pos\n");
294 my $rec = $ll_db->fetch_rec($pos, sub {
295 my ($l,$f_nr,$debug) = @_;
296 # return unless defined($l);
297 # return $l unless ($rec_regex && $f_nr);
299 return unless ( defined($l) && defined($f_nr) );
301 warn "-=> $f_nr ## |$l|\n" if ($debug);
302 $log->debug("-=> $f_nr ## $l");
304 # codepage conversion and recode_regex
305 $l = decode($input_encoding, $l, 1);
306 $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
309 if ($rec_regex && defined($rec_regex->{$f_nr})) {
310 $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
312 foreach my $r (@{ $rec_regex->{$f_nr} }) {
314 $log->logconfess("expected regex in ", dump( $r )) unless defined($r->{regex});
315 eval '$l =~ ' . $r->{regex};
317 my $d = "|$old_l| -> |$l| "; # . $r->{regex};
318 $d .= ' +' . $r->{line} . ' ' . $r->{file} if defined($r->{line});
319 $d .= ' ' . $r->{debug} if defined($r->{debug});
320 $log->debug("MODIFY $d");
321 warn "*** $d\n" if ($debug);
324 $log->error("error applying regex: ",dump($r), $@) if $@;
328 $log->debug("<=- $f_nr ## |$l|");
329 warn "<=- $f_nr ## $l\n" if ($debug);
333 $log->debug(sub { dump($rec) });
336 $log->warn("record $pos empty? skipping...");
341 if ($self->{save_row}) {
342 $self->{save_row}->({
347 $self->{data}->{$pos} = $rec;
351 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
353 # update counters for statistics
354 if ($self->{stats}) {
356 # fetch clean record with regexpes applied for statistics
357 my $rec = $ll_db->fetch_rec($pos);
359 foreach my $fld (keys %{ $rec }) {
360 $self->{_stats}->{fld}->{ $fld }++;
362 #$log->logdie("invalid record fild $fld, not ARRAY")
363 next unless (ref($rec->{ $fld }) eq 'ARRAY');
365 foreach my $row (@{ $rec->{$fld} }) {
367 if (ref($row) eq 'HASH') {
369 foreach my $sf (keys %{ $row }) {
370 next if ($sf eq 'subfields');
371 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
372 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
373 if (ref($row->{$sf}) eq 'ARRAY');
377 $self->{_stats}->{repeatable}->{ $fld }++;
383 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
388 $self->{last_pcnt} = 0;
390 # store max mfn and return it.
391 $self->{max_pos} = $to_rec;
392 $log->debug("max_pos: $to_rec");
397 sub input_module { $_[0]->{ll_db} }
401 Fetch next record from database. It will also displays progress bar.
403 my $rec = $isis->fetch;
405 Record from this function should probably go to C<data_structure> for
413 my $log = $self->_get_logger();
415 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
417 if ($self->{pos} == -1) {
418 $self->{pos} = $self->{offset};
423 my $mfn = $self->{pos};
425 if ($mfn > $self->{max_pos}) {
426 $self->{pos} = $self->{max_pos};
427 $log->debug("at EOF");
431 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
435 if ($self->{load_row}) {
436 $rec = $self->{load_row}->({ id => $mfn });
438 $rec = $self->{data}->{$mfn};
446 Returns current record number (MFN).
450 First record in database has position 1.
462 Returns number of records in database
466 Result from this function can be used to loop through all records
468 foreach my $mfn ( 1 ... $isis->size ) { ... }
470 because it takes into account C<offset> and C<limit>.
476 return $self->{size};
481 Seek to specified MFN in file.
485 First record in database has position 1.
493 my $log = $self->_get_logger();
495 $log->logconfess("called without pos") unless defined($pos);
498 $log->warn("seek before first record");
500 } elsif ($pos > $self->{max_pos}) {
501 $log->warn("seek beyond last record");
502 $pos = $self->{max_pos};
505 return $self->{pos} = (($pos - 1) || -1);
510 Dump statistics about field and subfield usage
519 my $log = $self->_get_logger();
521 my $s = $self->{_stats};
523 $log->warn("called stats, but there is no statistics collected");
532 die "no field in ", dump( $s->{fld} ) unless defined( $f );
533 my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
534 $max_fld = $v if ($v > $max_fld);
536 my $o = sprintf("%4s %d ~", $f, $v);
538 if (defined($s->{sf}->{$f})) {
539 my @subfields = keys %{ $s->{sf}->{$f} };
541 $o .= sprintf(" %s:%d%s", $_,
542 $s->{sf}->{$f}->{$_}->{count},
543 $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
546 # first indicators and other special subfields
547 sort( grep { length($_) > 1 } @subfields ),
548 # then subfileds (single char)
549 sort( grep { length($_) == 1 } @subfields ),
553 if (my $v_r = $s->{repeatable}->{$f}) {
554 $o .= " ($v_r)" if ($v_r != $v);
559 if ( $a =~ m/^\d+$/ && $b =~ m/^\d+$/ ) {
564 } keys %{ $s->{fld} }
567 $log->debug( sub { dump($s) } );
569 my $path = 'var/stats.yml';
570 YAML::DumpFile( $path, $s );
571 $log->info( 'created ', $path, ' with ', -s $path, ' bytes' );
578 Display humanly readable dump of record
585 return unless $self->{ll_db};
587 if ($self->{ll_db}->can('dump_ascii')) {
588 return $self->{ll_db}->dump_ascii( $self->{pos} );
590 return dump( $self->{ll_db}->fetch_rec( $self->{pos} ) );
596 Helper function called which create regexps to be execute on code.
598 _get_regex( 900, 'regex:[0-9]+' ,'numbers' );
599 _get_regex( 900, '^b', ' : ^b' );
601 It supports perl regexps with C<regex:> prefix to from value and has
602 additional logic to skip empty subfields.
607 my ($sf,$from,$to) = @_;
613 if ($from =~ m/^regex:(.+)$/) {
616 $from = '\Q' . $from . '\E';
619 my $need_subfield_data = '*'; # no
620 # if from is also subfield, require some data in between
621 # to correctly skip empty subfields
622 $need_subfield_data = '+' if ($from =~ m/^\\Q\^/);
624 's/\Q'. $sf .'\E([^\^]' . $need_subfield_data . '?)'. $from .'([^\^]*?)/'. $sf .'$1'. $to .'$2/';
627 's/'. $from .'/'. $to .'/g';
632 =head2 modify_record_regexps
634 Generate hash with regexpes to be applied using L<filter>.
636 my $regexpes = $input->modify_record_regexps(
637 900 => { '^a' => { ' : ' => '^b' } },
638 901 => { '*' => { '^b' => ' ; ' } },
643 sub modify_record_regexps {
645 my $modify_record = {@_};
649 my $log = $self->_get_logger();
651 foreach my $f (keys %$modify_record) {
652 $log->debug("field: $f");
654 foreach my $sf (keys %{ $modify_record->{$f} }) {
655 $log->debug("subfield: $sf");
657 foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
658 my $to = $modify_record->{$f}->{$sf}->{$from};
659 #die "no field?" unless defined($to);
660 my $d = "|$from| -> |$to|";
661 $log->debug("transform: $d");
663 my $regex = _get_regex($sf,$from,$to);
664 push @{ $regexpes->{$f} }, { regex => $regex, debug => $d };
665 $log->debug("regex: $regex");
673 =head2 modify_file_regexps
675 Generate hash with regexpes to be applied using L<filter> from
676 pseudo hash/yaml format for regex mappings.
678 It should be obvious:
685 In field I<200> find C<'^a'> and then C<' : '>, and replace it with C<'^e'>.
686 In field I<200> find C<'^a'> and then C<' = '>, and replace it with C<'^d'>.
688 my $regexpes = $input->modify_file_regexps( 'conf/modify/common.pl' );
690 On undef path it will just return.
694 sub modify_file_regexps {
697 my $modify_path = shift || return;
699 my $log = $self->_get_logger();
703 CORE::open(my $fh, $modify_path) || $log->logdie("can't open modify file $modify_path: $!");
709 next if (/^#/ || /^\s*$/);
711 if (/^\s*(\d+)\s*$/) {
713 $log->debug("field: $f");
715 } elsif (/^\s*'([^']*)'\s*$/) {
717 $log->die("can't define subfiled before field in: $_") unless ($f);
718 $log->debug("subfield: $sf");
719 } elsif (/^\s*'([^']*)'\s*=>\s*'([^']*)'\s*$/) {
720 my ($from,$to) = ($1, $2);
722 $log->debug("transform: |$from| -> |$to|");
724 my $regex = _get_regex($sf,$from,$to);
725 push @{ $regexpes->{$f} }, {
727 file => $modify_path,
730 $log->debug("regex: $regex");
732 die "can't parse: $_";
741 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
743 =head1 COPYRIGHT & LICENSE
745 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
747 This program is free software; you can redistribute it and/or modify it
748 under the same terms as Perl itself.
752 1; # End of WebPAC::Input