9 use base qw/WebPAC::Common/;
10 use Data::Dump qw/dump/;
11 use Encode qw/decode from_to/;
16 WebPAC::Input - read different file formats into WebPAC
20 our $VERSION = '0.19';
24 This module implements input as database which have fixed and known
25 I<size> while indexing and single unique numeric identifier for database
26 position ranging from 1 to I<size>.
28 Simply, something that is indexed by unmber from 1 .. I<size>.
30 Examples of such databases are CDS/ISIS files, MARC files, lines in
33 Specific file formats are implemented using low-level interface modules,
34 located in C<WebPAC::Input::*> namespace which export C<open_db>,
35 C<fetch_rec> and optional C<init> functions.
37 Perhaps a little code snippet.
41 my $db = WebPAC::Input->new(
42 module => 'WebPAC::Input::ISIS',
45 $db->open( path => '/path/to/database' );
46 print "database size: ",$db->size,"\n";
47 while (my $rec = $db->fetch) {
48 # do something with $rec
57 Create new input database object.
59 my $db = new WebPAC::Input(
60 module => 'WebPAC::Input::MARC',
61 recode => 'char pairs',
64 mapping => [ 'foo', 'bar', 'baz' ],
68 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
69 L<WebPAC::Input::MARC>.
71 C<recode> is optional string constisting of character or words pairs that
72 should be replaced in input stream.
74 C<no_progress_bar> disables progress bar output on C<STDOUT>
76 This function will also call low-level C<init> if it exists with same
86 my $log = $self->_get_logger;
88 $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page};
89 $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding};
90 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if $self->{lookup};
91 $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if $self->{low_mem};
93 $log->logconfess("specify low-level file format module") unless ($self->{module});
94 my $module_path = $self->{module};
95 $module_path =~ s#::#/#g;
96 $module_path .= '.pm';
97 $log->debug("require low-level module $self->{module} from $module_path");
101 $self ? return $self : return undef;
106 This function will read whole database in memory and produce lookups.
108 my $store; # simple in-memory hash
111 path => '/path/to/database/file',
112 input_encoding => 'cp852',
113 strict_encoding => 0,
117 lookup_coderef => sub {
122 900 => { '^a' => { ' : ' => '^b' } },
123 901 => { '*' => { '^b' => ' ; ' } },
125 modify_file => 'conf/modify/mapping.map',
128 $store->{ $a->{id} } = $a->{row};
132 return defined($store->{ $a->{id} }) &&
133 $store->{ $a->{id} };
138 By default, C<input_encoding> is assumed to be C<cp852>.
140 C<offset> is optional parametar to skip records at beginning.
142 C<limit> is optional parametar to read just C<limit> records from database
144 C<stats> create optional report about usage of fields and subfields
146 C<lookup_coderef> is closure to called to save data into lookups
148 C<modify_records> specify mapping from subfields to delimiters or from
149 delimiters to subfields, as well as oprations on fields (if subfield is
152 C<modify_file> is alternative for C<modify_records> above which preserves order and offers
153 (hopefully) simplier sintax than YAML or perl (see L</modify_file_regex>). This option
154 overrides C<modify_records> if both exists for same input.
156 C<save_row> and C<load_row> are low-level implementation of store engine. Calling convention
157 is documented in example above.
159 C<strict_encoding> should really default to 1, but it doesn't for now.
161 Returns size of database, regardless of C<offset> and C<limit>
162 parametars, see also C<size>.
170 my $log = $self->_get_logger();
171 $log->debug( "arguments: ",dump( $arg ));
173 $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding};
174 $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page};
175 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
176 $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
177 if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
179 $log->debug( $arg->{lookup_coderef} ? '' : 'not ', "using lookup_coderef");
181 $log->logcroak("need path") if (! $arg->{'path'});
182 my $input_encoding = $arg->{'input_encoding'} || $self->{'input_encoding'} || 'cp852';
184 # store data in object
185 $self->{$_} = $arg->{$_} foreach grep { defined $arg->{$_} } qw(path offset limit);
187 if ($arg->{load_row} || $arg->{save_row}) {
188 $log->logconfess("save_row and load_row must be defined in pair and be CODE") unless (
189 ref($arg->{load_row}) eq 'CODE' &&
190 ref($arg->{save_row}) eq 'CODE'
192 $self->{load_row} = $arg->{load_row};
193 $self->{save_row} = $arg->{save_row};
194 $log->debug("using load_row and save_row instead of in-memory hash");
201 if ($self->{recode}) {
202 my @r = split(/\s/, $self->{recode});
204 $log->logwarn("recode needs even number of elements (some number of valid pairs)");
209 $recode_map->{$from} = $to;
212 $recode_regex = join '|' => keys %{ $recode_map };
214 $log->debug("using recode regex: $recode_regex");
220 if (my $p = $arg->{modify_file}) {
221 $log->debug("using modify_file $p");
222 $rec_regex = $self->modify_file_regexps( $p );
223 } elsif (my $h = $arg->{modify_records}) {
224 $log->debug("using modify_records ", sub { dump( $h ) });
225 $rec_regex = $self->modify_record_regexps(%{ $h });
227 $log->debug("rec_regex: ", sub { dump($rec_regex) }) if ($rec_regex);
229 my $class = $self->{module} || $log->logconfess("can't get low-level module name!");
231 $arg->{$_} = $self->{$_} foreach qw(offset limit);
233 my $ll_db = $class->new(
234 path => $arg->{path},
235 input_config => $arg->{input_config} || $self->{input_config},
237 # my ($l,$f_nr) = @_;
238 # return unless defined($l);
239 # $l = decode($input_encoding, $l);
240 # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
246 # save for dump and input_module
247 $self->{ll_db} = $ll_db;
249 unless (defined($ll_db)) {
250 $log->logwarn("can't open database $arg->{path}, skipping...");
254 my $size = $ll_db->size;
257 $log->logwarn("no records in database $arg->{path}, skipping...");
264 if (my $s = $self->{offset}) {
265 $log->debug("offset $s records");
268 $self->{offset} = $from_rec - 1;
271 if ($self->{limit}) {
272 $log->debug("limiting to ",$self->{limit}," records");
273 $to_rec = $from_rec + $self->{limit} - 1;
274 $to_rec = $size if ($to_rec > $size);
277 my $strict_encoding = $arg->{strict_encoding} || $self->{strict_encoding}; ## FIXME should be 1 really
279 $log->info("processing ", $self->{size} || 'all', "/$size records [$from_rec-$to_rec]",
280 " encoding $input_encoding ", $strict_encoding ? ' [strict]' : '',
281 $self->{stats} ? ' [stats]' : '',
287 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
289 $log->debug("position: $pos\n");
291 $self->{size}++; # XXX I could move this more down if I didn't want empty records...
293 my $rec = $ll_db->fetch_rec($pos, sub {
294 my ($l,$f_nr,$debug) = @_;
295 # return unless defined($l);
296 # return $l unless ($rec_regex && $f_nr);
298 return unless ( defined($l) && defined($f_nr) );
300 my $marc_subfields = $l =~ s/\x1F(\w)/\^$1/g; # fix MARC subfiled delimiters to ^
302 warn "-=> $f_nr ## |$l|\n" if ($debug);
303 $log->debug("-=> $f_nr ## $l");
305 # codepage conversion and recode_regex
306 $l = decode($input_encoding, $l, 1);
307 $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
310 if ($rec_regex && defined($rec_regex->{$f_nr})) {
311 $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
313 foreach my $r (@{ $rec_regex->{$f_nr} }) {
315 $log->logconfess("expected regex in ", dump( $r )) unless defined($r->{regex});
316 eval '$l =~ ' . $r->{regex};
318 my $d = "|$old_l| -> |$l| "; # . $r->{regex};
319 $d .= ' +' . $r->{line} . ' ' . $r->{file} if defined($r->{line});
320 $d .= ' ' . $r->{debug} if defined($r->{debug});
321 $log->debug("MODIFY $d");
322 warn "*** $d\n" if ($debug);
325 $log->error("error applying regex: ",dump($r), $@) if $@;
329 $l =~ s/\^(\w)/\x1F$1/g if $marc_subfields;
331 $log->debug("<=- $f_nr ## |$l|");
332 warn "<=- $f_nr ## $l\n" if ($debug);
336 $log->debug(sub { dump($rec) });
339 $log->warn("record $pos empty? skipping...");
344 if ($self->{save_row}) {
345 $self->{save_row}->({
350 $self->{data}->{$pos} = $rec;
354 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
356 # update counters for statistics
357 if ($self->{stats}) {
359 # fetch clean record with regexpes applied for statistics
360 my $rec = $ll_db->fetch_rec($pos);
362 foreach my $fld (keys %{ $rec }) {
363 $self->{_stats}->{fld}->{ $fld }++;
365 #$log->logdie("invalid record fild $fld, not ARRAY")
366 next unless (ref($rec->{ $fld }) eq 'ARRAY');
368 foreach my $row (@{ $rec->{$fld} }) {
370 if (ref($row) eq 'HASH') {
372 foreach my $sf (keys %{ $row }) {
373 next if ($sf eq 'subfields');
374 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
375 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
376 if (ref($row->{$sf}) eq 'ARRAY');
380 $self->{_stats}->{repeatable}->{ $fld }++;
386 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
391 $self->{last_pcnt} = 0;
393 # store max mfn and return it.
394 $self->{max_pos} = $to_rec;
395 $log->debug("max_pos: $to_rec");
400 sub input_module { $_[0]->{ll_db} }
404 Fetch next record from database. It will also displays progress bar.
406 my $rec = $isis->fetch;
408 Record from this function should probably go to C<data_structure> for
416 my $log = $self->_get_logger();
418 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
420 if ($self->{pos} == -1) {
421 $self->{pos} = $self->{offset} + 1;
426 my $mfn = $self->{pos};
428 if ($mfn > $self->{max_pos}) {
429 $self->{pos} = $self->{max_pos};
430 $log->debug("at EOF");
434 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
438 if ($self->{load_row}) {
439 $rec = $self->{load_row}->({ id => $mfn });
441 $rec = $self->{data}->{$mfn};
449 Returns current record number (MFN).
453 First record in database has position 1.
465 Returns number of records in database
469 Result from this function can be used to loop through all records
471 foreach my $mfn ( 1 ... $isis->size ) { ... }
473 because it takes into account C<offset> and C<limit>.
479 return $self->{size}; # FIXME this is buggy if open is called multiple times!
484 Seek to specified MFN in file.
488 First record in database has position 1.
496 my $log = $self->_get_logger();
498 $log->logconfess("called without pos") unless defined($pos);
501 $log->warn("seek before first record");
503 } elsif ($pos > $self->{max_pos}) {
504 $log->warn("seek beyond last record");
505 $pos = $self->{max_pos};
508 return $self->{pos} = (($pos - 1) || -1);
513 Dump statistics about field and subfield usage
522 my $log = $self->_get_logger();
524 my $s = $self->{_stats};
526 $log->warn("called stats, but there is no statistics collected");
535 die "no field in ", dump( $s->{fld} ) unless defined( $f );
536 my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
537 $max_fld = $v if ($v > $max_fld);
539 my $o = sprintf("%4s %d ~", $f, $v);
541 if (defined($s->{sf}->{$f})) {
542 my @subfields = keys %{ $s->{sf}->{$f} };
544 $o .= sprintf(" %s:%d%s", $_,
545 $s->{sf}->{$f}->{$_}->{count},
546 $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
549 # first indicators and other special subfields
550 sort( grep { length($_) > 1 } @subfields ),
551 # then subfileds (single char)
552 sort( grep { length($_) == 1 } @subfields ),
556 if (my $v_r = $s->{repeatable}->{$f}) {
557 $o .= " ($v_r)" if ($v_r != $v);
562 if ( $a =~ m/^\d+$/ && $b =~ m/^\d+$/ ) {
567 } keys %{ $s->{fld} }
570 $log->debug( sub { dump($s) } );
572 my $path = 'var/stats.yml';
573 YAML::DumpFile( $path, $s );
574 $log->info( 'created ', $path, ' with ', -s $path, ' bytes' );
581 Display humanly readable dump of record
588 return unless $self->{ll_db};
590 if ($self->{ll_db}->can('dump_ascii')) {
591 return $self->{ll_db}->dump_ascii( $self->{pos} );
593 return dump( $self->{ll_db}->fetch_rec( $self->{pos} ) );
599 Helper function called which create regexps to be execute on code.
601 _get_regex( 900, 'regex:[0-9]+' ,'numbers' );
602 _get_regex( 900, '^b', ' : ^b' );
604 It supports perl regexps with C<regex:> prefix to from value and has
605 additional logic to skip empty subfields.
610 my ($sf,$from,$to) = @_;
616 if ($from =~ m/^regex:(.+)$/) {
619 $from = '\Q' . $from . '\E';
622 my $need_subfield_data = '*'; # no
623 # if from is also subfield, require some data in between
624 # to correctly skip empty subfields
625 $need_subfield_data = '+' if ($from =~ m/^\\Q\^/);
627 's/\Q'. $sf .'\E([^\^]' . $need_subfield_data . '?)'. $from .'([^\^]*?)/'. $sf .'$1'. $to .'$2/';
630 's/'. $from .'/'. $to .'/g';
635 =head2 modify_record_regexps
637 Generate hash with regexpes to be applied using L<filter>.
639 my $regexpes = $input->modify_record_regexps(
640 900 => { '^a' => { ' : ' => '^b' } },
641 901 => { '*' => { '^b' => ' ; ' } },
646 sub modify_record_regexps {
648 my $modify_record = {@_};
652 my $log = $self->_get_logger();
654 foreach my $f (keys %$modify_record) {
655 $log->debug("field: $f");
657 foreach my $sf (keys %{ $modify_record->{$f} }) {
658 $log->debug("subfield: $sf");
660 foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
661 my $to = $modify_record->{$f}->{$sf}->{$from};
662 #die "no field?" unless defined($to);
663 my $d = "|$from| -> |$to|";
664 $log->debug("transform: $d");
666 my $regex = _get_regex($sf,$from,$to);
667 push @{ $regexpes->{$f} }, { regex => $regex, debug => $d };
668 $log->debug("regex: $regex");
676 =head2 modify_file_regexps
678 Generate hash with regexpes to be applied using L<filter> from
679 pseudo hash/yaml format for regex mappings.
681 It should be obvious:
688 In field I<200> find C<'^a'> and then C<' : '>, and replace it with C<'^e'>.
689 In field I<200> find C<'^a'> and then C<' = '>, and replace it with C<'^d'>.
691 my $regexpes = $input->modify_file_regexps( 'conf/modify/common.pl' );
693 On undef path it will just return.
697 sub modify_file_regexps {
700 my $modify_path = shift || return;
702 my $log = $self->_get_logger();
706 CORE::open(my $fh, $modify_path) || $log->logdie("can't open modify file $modify_path: $!");
712 next if (/^#/ || /^\s*$/);
714 if (/^\s*(\d+)\s*$/) {
716 $log->debug("field: $f");
718 } elsif (/^\s*'([^']*)'\s*$/) {
720 $log->die("can't define subfiled before field in: $_") unless ($f);
721 $log->debug("subfield: $sf");
722 } elsif (/^\s*'([^']*)'\s*=>\s*'([^']*)'\s*$/) {
723 my ($from,$to) = ($1, $2);
725 $log->debug("transform: |$from| -> |$to|");
727 my $regex = _get_regex($sf,$from,$to);
728 push @{ $regexpes->{$f} }, {
730 file => $modify_path,
733 $log->debug("regex: $regex");
735 die "can't parse: $_";
744 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
746 =head1 COPYRIGHT & LICENSE
748 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
750 This program is free software; you can redistribute it and/or modify it
751 under the same terms as Perl itself.
755 1; # End of WebPAC::Input