9 use base qw/WebPAC::Common/;
10 use Data::Dump qw/dump/;
11 use Encode qw/decode from_to/;
16 WebPAC::Input - read different file formats into WebPAC
20 our $VERSION = '0.19';
24 This module implements input as database which have fixed and known
25 I<size> while indexing and single unique numeric identifier for database
26 position ranging from 1 to I<size>.
28 Simply, something that is indexed by unmber from 1 .. I<size>.
30 Examples of such databases are CDS/ISIS files, MARC files, lines in
33 Specific file formats are implemented using low-level interface modules,
34 located in C<WebPAC::Input::*> namespace which export C<open_db>,
35 C<fetch_rec> and optional C<init> functions.
37 Perhaps a little code snippet.
41 my $db = WebPAC::Input->new(
42 module => 'WebPAC::Input::ISIS',
45 $db->open( path => '/path/to/database' );
46 print "database size: ",$db->size,"\n";
47 while (my $rec = $db->fetch) {
48 # do something with $rec
57 Create new input database object.
59 my $db = new WebPAC::Input(
60 module => 'WebPAC::Input::MARC',
61 recode => 'char pairs',
64 mapping => [ 'foo', 'bar', 'baz' ],
68 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
69 L<WebPAC::Input::MARC>.
71 C<recode> is optional string constisting of character or words pairs that
72 should be replaced in input stream.
74 C<no_progress_bar> disables progress bar output on C<STDOUT>
76 This function will also call low-level C<init> if it exists with same
86 my $log = $self->_get_logger;
88 $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page};
89 $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding};
90 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if $self->{lookup};
91 $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if $self->{low_mem};
93 $log->logconfess("specify low-level file format module") unless ($self->{module});
94 my $module_path = $self->{module};
95 $module_path =~ s#::#/#g;
96 $module_path .= '.pm';
97 $log->debug("require low-level module $self->{module} from $module_path");
101 $self ? return $self : return undef;
106 This function will read whole database in memory and produce lookups.
108 my $store; # simple in-memory hash
111 path => '/path/to/database/file',
112 input_encoding => 'cp852',
113 strict_encoding => 0,
117 lookup_coderef => sub {
122 900 => { '^a' => { ' : ' => '^b' } },
123 901 => { '*' => { '^b' => ' ; ' } },
125 modify_file => 'conf/modify/mapping.map',
128 $store->{ $a->{id} } = $a->{row};
132 return defined($store->{ $a->{id} }) &&
133 $store->{ $a->{id} };
138 By default, C<input_encoding> is assumed to be C<cp852>.
140 C<offset> is optional parametar to skip records at beginning.
142 C<limit> is optional parametar to read just C<limit> records from database
144 C<stats> create optional report about usage of fields and subfields
146 C<lookup_coderef> is closure to called to save data into lookups
148 C<modify_records> specify mapping from subfields to delimiters or from
149 delimiters to subfields, as well as oprations on fields (if subfield is
152 C<modify_file> is alternative for C<modify_records> above which preserves order and offers
153 (hopefully) simplier sintax than YAML or perl (see L</modify_file_regex>). This option
154 overrides C<modify_records> if both exists for same input.
156 C<save_row> and C<load_row> are low-level implementation of store engine. Calling convention
157 is documented in example above.
159 C<strict_encoding> should really default to 1, but it doesn't for now.
161 Returns size of database, regardless of C<offset> and C<limit>
162 parametars, see also C<size>.
170 my $log = $self->_get_logger();
171 $log->debug( "arguments: ",dump( $arg ));
173 $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding};
174 $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page};
175 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
176 $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
177 if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
179 $log->debug( $arg->{lookup_coderef} ? '' : 'not ', "using lookup_coderef");
181 $log->logcroak("need path") if (! $arg->{'path'});
182 my $input_encoding = $arg->{'input_encoding'} || $self->{'input_encoding'} || 'cp852';
184 # store data in object
185 $self->{$_} = $arg->{$_} foreach grep { defined $arg->{$_} } qw(path offset limit);
187 if ($arg->{load_row} || $arg->{save_row}) {
188 $log->logconfess("save_row and load_row must be defined in pair and be CODE") unless (
189 ref($arg->{load_row}) eq 'CODE' &&
190 ref($arg->{save_row}) eq 'CODE'
192 $self->{load_row} = $arg->{load_row};
193 $self->{save_row} = $arg->{save_row};
194 $log->debug("using load_row and save_row instead of in-memory hash");
201 if ($self->{recode}) {
202 my @r = split(/\s/, $self->{recode});
204 $log->logwarn("recode needs even number of elements (some number of valid pairs)");
209 $from =~ s/^\\x([0-9a-f]{2})/chr(hex($1))/eig;
210 $recode_map->{$from} = $to;
213 $recode_regex = join '|' => keys %{ $recode_map };
215 $log->debug("using recode regex: $recode_regex");
221 if (my $p = $arg->{modify_file}) {
222 $log->debug("using modify_file $p");
223 $rec_regex = $self->modify_file_regexps( $p );
224 } elsif (my $h = $arg->{modify_records}) {
225 $log->debug("using modify_records ", sub { dump( $h ) });
226 $rec_regex = $self->modify_record_regexps(%{ $h });
228 $log->debug("rec_regex: ", sub { dump($rec_regex) }) if ($rec_regex);
230 my $class = $self->{module} || $log->logconfess("can't get low-level module name!");
232 $arg->{$_} = $self->{$_} foreach qw(offset limit);
234 my $ll_db = $class->new(
235 path => $arg->{path},
236 input_config => $arg->{input_config} || $self->{input_config},
238 # my ($l,$f_nr) = @_;
239 # return unless defined($l);
240 # $l = decode($input_encoding, $l);
241 # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
247 # save for dump and input_module
248 $self->{ll_db} = $ll_db;
250 unless (defined($ll_db)) {
251 $log->logwarn("can't open database $arg->{path}, skipping...");
255 my $size = $ll_db->size;
258 $log->logwarn("no records in database $arg->{path}, skipping...");
265 if (my $s = $self->{offset}) {
266 $log->debug("offset $s records");
269 $self->{offset} = $from_rec - 1;
272 if ($self->{limit}) {
273 $log->debug("limiting to ",$self->{limit}," records");
274 $to_rec = $from_rec + $self->{limit} - 1;
275 $to_rec = $size if ($to_rec > $size);
278 my $strict_encoding = $arg->{strict_encoding} || $self->{strict_encoding}; ## FIXME should be 1 really
280 $log->info("processing ", $self->{size} || 'all', "/$size records [$from_rec-$to_rec]",
281 " encoding $input_encoding ", $strict_encoding ? ' [strict]' : '',
282 $self->{stats} ? ' [stats]' : '',
288 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
290 $log->debug("position: $pos\n");
292 $self->{size}++; # XXX I could move this more down if I didn't want empty records...
294 my $rec = $ll_db->fetch_rec($pos, sub {
295 my ($l,$f_nr,$debug) = @_;
296 # return unless defined($l);
297 # return $l unless ($rec_regex && $f_nr);
299 return unless ( defined($l) && defined($f_nr) );
301 my $marc_subfields = $l =~ s/\x1F(\w)/\^$1/g; # fix MARC subfiled delimiters to ^
303 warn "-=> $f_nr ## |$l|\n" if ($debug);
304 $log->debug("-=> $f_nr ## $l");
306 # codepage conversion and recode_regex
307 $l = decode($input_encoding, $l, 1);
308 $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
311 if ($rec_regex && defined($rec_regex->{$f_nr})) {
312 $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
314 foreach my $r (@{ $rec_regex->{$f_nr} }) {
316 $log->logconfess("expected regex in ", dump( $r )) unless defined($r->{regex});
317 eval '$l =~ ' . $r->{regex};
319 my $d = "|$old_l| -> |$l| "; # . $r->{regex};
320 $d .= ' +' . $r->{line} . ' ' . $r->{file} if defined($r->{line});
321 $d .= ' ' . $r->{debug} if defined($r->{debug});
322 $log->debug("MODIFY $d");
323 warn "*** $d\n" if ($debug);
326 $log->error("error applying regex: ",dump($r), $@) if $@;
330 $l =~ s/\^(\w)/\x1F$1/g if $marc_subfields;
332 $log->debug("<=- $f_nr ## |$l|");
333 warn "<=- $f_nr ## $l\n" if ($debug);
337 $log->debug(sub { dump($rec) });
340 $log->warn("record $pos empty? skipping...");
345 if ($self->{save_row}) {
346 $self->{save_row}->({
351 $self->{data}->{$pos} = $rec;
355 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
357 # update counters for statistics
358 if ($self->{stats}) {
360 # fetch clean record with regexpes applied for statistics
361 my $rec = $ll_db->fetch_rec($pos);
363 foreach my $fld (keys %{ $rec }) {
364 $self->{_stats}->{fld}->{ $fld }++;
366 #$log->logdie("invalid record fild $fld, not ARRAY")
367 next unless (ref($rec->{ $fld }) eq 'ARRAY');
369 foreach my $row (@{ $rec->{$fld} }) {
371 if (ref($row) eq 'HASH') {
373 foreach my $sf (keys %{ $row }) {
374 next if ($sf eq 'subfields');
375 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
376 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
377 if (ref($row->{$sf}) eq 'ARRAY');
381 $self->{_stats}->{repeatable}->{ $fld }++;
387 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
392 $self->{last_pcnt} = 0;
394 # store max mfn and return it.
395 $self->{max_pos} = $to_rec;
396 $log->debug("max_pos: $to_rec");
401 sub input_module { $_[0]->{ll_db} }
405 Fetch next record from database. It will also displays progress bar.
407 my $rec = $isis->fetch;
409 Record from this function should probably go to C<data_structure> for
417 my $log = $self->_get_logger();
419 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
421 if ($self->{pos} == -1) {
422 $self->{pos} = $self->{offset} + 1;
427 my $mfn = $self->{pos};
429 if ($mfn > $self->{max_pos}) {
430 $self->{pos} = $self->{max_pos};
431 $log->debug("at EOF");
435 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
439 if ($self->{load_row}) {
440 $rec = $self->{load_row}->({ id => $mfn });
442 $rec = $self->{data}->{$mfn};
450 Returns current record number (MFN).
454 First record in database has position 1.
466 Returns number of records in database
470 Result from this function can be used to loop through all records
472 foreach my $mfn ( 1 ... $isis->size ) { ... }
474 because it takes into account C<offset> and C<limit>.
480 return $self->{size}; # FIXME this is buggy if open is called multiple times!
485 Seek to specified MFN in file.
489 First record in database has position 1.
497 my $log = $self->_get_logger();
499 $log->logconfess("called without pos") unless defined($pos);
502 $log->warn("seek before first record");
504 } elsif ($pos > $self->{max_pos}) {
505 $log->warn("seek beyond last record");
506 $pos = $self->{max_pos};
509 return $self->{pos} = (($pos - 1) || -1);
514 Dump statistics about field and subfield usage
523 my $log = $self->_get_logger();
525 my $s = $self->{_stats};
527 $log->warn("called stats, but there is no statistics collected");
536 die "no field in ", dump( $s->{fld} ) unless defined( $f );
537 my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
538 $max_fld = $v if ($v > $max_fld);
540 my $o = sprintf("%4s %d ~", $f, $v);
542 if (defined($s->{sf}->{$f})) {
543 my @subfields = keys %{ $s->{sf}->{$f} };
545 $o .= sprintf(" %s:%d%s", $_,
546 $s->{sf}->{$f}->{$_}->{count},
547 $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
550 # first indicators and other special subfields
551 sort( grep { length($_) > 1 } @subfields ),
552 # then subfileds (single char)
553 sort( grep { length($_) == 1 } @subfields ),
557 if (my $v_r = $s->{repeatable}->{$f}) {
558 $o .= " ($v_r)" if ($v_r != $v);
563 if ( $a =~ m/^\d+$/ && $b =~ m/^\d+$/ ) {
568 } keys %{ $s->{fld} }
571 $log->debug( sub { dump($s) } );
573 my $path = 'var/stats.yml';
574 YAML::DumpFile( $path, $s );
575 $log->info( 'created ', $path, ' with ', -s $path, ' bytes' );
582 Display humanly readable dump of record
589 return unless $self->{ll_db};
591 if ($self->{ll_db}->can('dump_ascii')) {
592 return $self->{ll_db}->dump_ascii( $self->{pos} );
594 return dump( $self->{ll_db}->fetch_rec( $self->{pos} ) );
600 Helper function called which create regexps to be execute on code.
602 _get_regex( 900, 'regex:[0-9]+' ,'numbers' );
603 _get_regex( 900, '^b', ' : ^b' );
605 It supports perl regexps with C<regex:> prefix to from value and has
606 additional logic to skip empty subfields.
611 my ($sf,$from,$to) = @_;
617 if ($from =~ m/^regex:(.+)$/) {
620 $from = '\Q' . $from . '\E';
623 my $need_subfield_data = '*'; # no
624 # if from is also subfield, require some data in between
625 # to correctly skip empty subfields
626 $need_subfield_data = '+' if ($from =~ m/^\\Q\^/);
628 's/\Q'. $sf .'\E([^\^]' . $need_subfield_data . '?)'. $from .'([^\^]*?)/'. $sf .'$1'. $to .'$2/';
631 's/'. $from .'/'. $to .'/g';
636 =head2 modify_record_regexps
638 Generate hash with regexpes to be applied using L<filter>.
640 my $regexpes = $input->modify_record_regexps(
641 900 => { '^a' => { ' : ' => '^b' } },
642 901 => { '*' => { '^b' => ' ; ' } },
647 sub modify_record_regexps {
649 my $modify_record = {@_};
653 my $log = $self->_get_logger();
655 foreach my $f (keys %$modify_record) {
656 $log->debug("field: $f");
658 foreach my $sf (keys %{ $modify_record->{$f} }) {
659 $log->debug("subfield: $sf");
661 foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
662 my $to = $modify_record->{$f}->{$sf}->{$from};
663 #die "no field?" unless defined($to);
664 my $d = "|$from| -> |$to|";
665 $log->debug("transform: $d");
667 my $regex = _get_regex($sf,$from,$to);
668 push @{ $regexpes->{$f} }, { regex => $regex, debug => $d };
669 $log->debug("regex: $regex");
677 =head2 modify_file_regexps
679 Generate hash with regexpes to be applied using L<filter> from
680 pseudo hash/yaml format for regex mappings.
682 It should be obvious:
689 In field I<200> find C<'^a'> and then C<' : '>, and replace it with C<'^e'>.
690 In field I<200> find C<'^a'> and then C<' = '>, and replace it with C<'^d'>.
692 my $regexpes = $input->modify_file_regexps( 'conf/modify/common.pl' );
694 On undef path it will just return.
698 sub modify_file_regexps {
701 my $modify_path = shift || return;
703 my $log = $self->_get_logger();
707 CORE::open(my $fh, $modify_path) || $log->logdie("can't open modify file $modify_path: $!");
713 next if (/^#/ || /^\s*$/);
715 if (/^\s*(\d+)\s*$/) {
717 $log->debug("field: $f");
719 } elsif (/^\s*'([^']*)'\s*$/) {
721 $log->die("can't define subfiled before field in: $_") unless ($f);
722 $log->debug("subfield: $sf");
723 } elsif (/^\s*'([^']*)'\s*=>\s*'([^']*)'\s*$/) {
724 my ($from,$to) = ($1, $2);
726 $log->debug("transform: |$from| -> |$to|");
728 my $regex = _get_regex($sf,$from,$to);
729 push @{ $regexpes->{$f} }, {
731 file => $modify_path,
734 $log->debug("regex: $regex");
736 die "can't parse: $_";
745 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
747 =head1 COPYRIGHT & LICENSE
749 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
751 This program is free software; you can redistribute it and/or modify it
752 under the same terms as Perl itself.
756 1; # End of WebPAC::Input