9 use base qw/WebPAC::Common/;
10 use Data::Dump qw/dump/;
11 use Encode qw/from_to/;
15 WebPAC::Input - read different file formats into WebPAC
23 our $VERSION = '0.18';
27 This module implements input as database which have fixed and known
28 I<size> while indexing and single unique numeric identifier for database
29 position ranging from 1 to I<size>.
31 Simply, something that is indexed by unmber from 1 .. I<size>.
33 Examples of such databases are CDS/ISIS files, MARC files, lines in
36 Specific file formats are implemented using low-level interface modules,
37 located in C<WebPAC::Input::*> namespace which export C<open_db>,
38 C<fetch_rec> and optional C<init> functions.
40 Perhaps a little code snippet.
44 my $db = WebPAC::Input->new(
45 module => 'WebPAC::Input::ISIS',
48 $db->open( path => '/path/to/database' );
49 print "database size: ",$db->size,"\n";
50 while (my $rec = $db->fetch) {
51 # do something with $rec
60 Create new input database object.
62 my $db = new WebPAC::Input(
63 module => 'WebPAC::Input::MARC',
64 encoding => 'ISO-8859-2',
65 recode => 'char pairs',
68 mapping => [ 'foo', 'bar', 'baz' ],
72 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
73 L<WebPAC::Input::MARC>.
75 Optional parametar C<encoding> specify application code page (which will be
76 used internally). This should probably be your terminal encoding, and by
77 default, it C<ISO-8859-2>.
79 C<recode> is optional string constisting of character or words pairs that
80 should be replaced in input stream.
82 C<no_progress_bar> disables progress bar output on C<STDOUT>
84 This function will also call low-level C<init> if it exists with same
94 my $log = $self->_get_logger;
96 $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup});
97 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup});
98 $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if ($self->{low_mem});
100 $log->logconfess("specify low-level file format module") unless ($self->{module});
101 my $module_path = $self->{module};
102 $module_path =~ s#::#/#g;
103 $module_path .= '.pm';
104 $log->debug("require low-level module $self->{module} from $module_path");
106 require $module_path;
108 $self->{'encoding'} ||= 'ISO-8859-2';
110 $self ? return $self : return undef;
115 This function will read whole database in memory and produce lookups.
117 my $store; # simple in-memory hash
120 path => '/path/to/database/file',
121 code_page => 'cp852',
125 lookup_coderef => sub {
130 900 => { '^a' => { ' : ' => '^b' } },
131 901 => { '*' => { '^b' => ' ; ' } },
133 modify_file => 'conf/modify/mapping.map',
136 $store->{ $a->{id} } = $a->{row};
140 return defined($store->{ $a->{id} }) &&
141 $store->{ $a->{id} };
146 By default, C<code_page> is assumed to be C<cp852>.
148 C<offset> is optional parametar to position at some offset before reading from database.
150 C<limit> is optional parametar to read just C<limit> records from database
152 C<stats> create optional report about usage of fields and subfields
154 C<lookup_coderef> is closure to called to save data into lookups
156 C<modify_records> specify mapping from subfields to delimiters or from
157 delimiters to subfields, as well as oprations on fields (if subfield is
160 C<modify_file> is alternative for C<modify_records> above which preserves order and offers
161 (hopefully) simplier sintax than YAML or perl (see L</modify_file_regex>). This option
162 overrides C<modify_records> if both exists for same input.
164 C<save_row> and C<load_row> are low-level implementation of store engine. Calling convention
165 is documented in example above.
167 Returns size of database, regardless of C<offset> and C<limit>
168 parametars, see also C<size>.
176 my $log = $self->_get_logger();
178 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
179 $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
180 if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
182 $log->debug( $arg->{lookup_coderef} ? '' : 'not ', "using lookup_coderef");
184 $log->logcroak("need path") if (! $arg->{'path'});
185 my $code_page = $arg->{'code_page'} || 'cp852';
187 # store data in object
188 $self->{'input_code_page'} = $code_page;
189 foreach my $v (qw/path offset limit/) {
190 $self->{$v} = $arg->{$v} if ($arg->{$v});
193 if ($arg->{load_row} || $arg->{save_row}) {
194 $log->logconfess("save_row and load_row must be defined in pair and be CODE") unless (
195 ref($arg->{load_row}) eq 'CODE' &&
196 ref($arg->{save_row}) eq 'CODE'
198 $self->{load_row} = $arg->{load_row};
199 $self->{save_row} = $arg->{save_row};
200 $log->debug("using load_row and save_row instead of in-memory hash");
207 if ($self->{recode}) {
208 my @r = split(/\s/, $self->{recode});
210 $log->logwarn("recode needs even number of elements (some number of valid pairs)");
215 $recode_map->{$from} = $to;
218 $recode_regex = join '|' => keys %{ $recode_map };
220 $log->debug("using recode regex: $recode_regex");
226 if (my $p = $arg->{modify_file}) {
227 $log->debug("using modify_file $p");
228 $rec_regex = $self->modify_file_regexps( $p );
229 } elsif (my $h = $arg->{modify_records}) {
230 $log->debug("using modify_records ", sub { dump( $h ) });
231 $rec_regex = $self->modify_record_regexps(%{ $h });
233 $log->debug("rec_regex: ", sub { dump($rec_regex) }) if ($rec_regex);
235 my $class = $self->{module} || $log->logconfess("can't get low-level module name!");
237 my $ll_db = $class->new(
238 path => $arg->{path},
239 input_config => $arg->{input_config},
241 # my ($l,$f_nr) = @_;
242 # return unless defined($l);
243 # from_to($l, $code_page, $self->{'encoding'});
244 # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
250 unless (defined($ll_db)) {
251 $log->logwarn("can't open database $arg->{path}, skipping...");
255 my $size = $ll_db->size;
258 $log->logwarn("no records in database $arg->{path}, skipping...");
265 if (my $s = $self->{offset}) {
266 $log->debug("skipping to MFN $s");
269 $self->{offset} = $from_rec;
272 if ($self->{limit}) {
273 $log->debug("limiting to ",$self->{limit}," records");
274 $to_rec = $from_rec + $self->{limit} - 1;
275 $to_rec = $size if ($to_rec > $size);
278 # store size for later
279 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
281 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : '');
284 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
286 $log->debug("position: $pos\n");
288 my $rec = $ll_db->fetch_rec($pos, sub {
289 my ($l,$f_nr,$debug) = @_;
290 # return unless defined($l);
291 # return $l unless ($rec_regex && $f_nr);
293 return unless ( defined($l) && defined($f_nr) );
295 warn "-=> $f_nr ## |$l|\n" if ($debug);
296 $log->debug("-=> $f_nr ## $l");
298 # codepage conversion and recode_regex
299 from_to($l, $code_page, $self->{'encoding'});
300 $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
303 if ($rec_regex && defined($rec_regex->{$f_nr})) {
304 $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
306 foreach my $r (@{ $rec_regex->{$f_nr} }) {
308 $log->logconfess("expected regex in ", dump( $r )) unless defined($r->{regex});
309 eval '$l =~ ' . $r->{regex};
311 my $d = "|$old_l| -> |$l| "; # . $r->{regex};
312 $d .= ' +' . $r->{line} . ' ' . $r->{file} if defined($r->{line});
313 $d .= ' ' . $r->{debug} if defined($r->{debug});
314 $log->debug("MODIFY $d");
315 warn "*** $d\n" if ($debug);
318 $log->error("error applying regex: $r") if ($@);
322 $log->debug("<=- $f_nr ## |$l|");
323 warn "<=- $f_nr ## $l\n" if ($debug);
327 $log->debug(sub { dump($rec) });
330 $log->warn("record $pos empty? skipping...");
335 if ($self->{save_row}) {
336 $self->{save_row}->({
341 $self->{data}->{$pos} = $rec;
345 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
347 # update counters for statistics
348 if ($self->{stats}) {
350 # fetch clean record with regexpes applied for statistics
351 my $rec = $ll_db->fetch_rec($pos);
353 foreach my $fld (keys %{ $rec }) {
354 $self->{_stats}->{fld}->{ $fld }++;
356 $log->logdie("invalid record fild $fld, not ARRAY")
357 unless (ref($rec->{ $fld }) eq 'ARRAY');
359 foreach my $row (@{ $rec->{$fld} }) {
361 if (ref($row) eq 'HASH') {
363 foreach my $sf (keys %{ $row }) {
364 next if ($sf eq 'subfields');
365 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
366 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
367 if (ref($row->{$sf}) eq 'ARRAY');
371 $self->{_stats}->{repeatable}->{ $fld }++;
377 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
382 $self->{last_pcnt} = 0;
384 # store max mfn and return it.
385 $self->{max_pos} = $to_rec;
386 $log->debug("max_pos: $to_rec");
389 $self->{ll_db} = $ll_db;
396 Fetch next record from database. It will also displays progress bar.
398 my $rec = $isis->fetch;
400 Record from this function should probably go to C<data_structure> for
408 my $log = $self->_get_logger();
410 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
412 if ($self->{pos} == -1) {
413 $self->{pos} = $self->{offset};
418 my $mfn = $self->{pos};
420 if ($mfn > $self->{max_pos}) {
421 $self->{pos} = $self->{max_pos};
422 $log->debug("at EOF");
426 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
430 if ($self->{load_row}) {
431 $rec = $self->{load_row}->({ id => $mfn });
433 $rec = $self->{data}->{$mfn};
441 Returns current record number (MFN).
445 First record in database has position 1.
457 Returns number of records in database
461 Result from this function can be used to loop through all records
463 foreach my $mfn ( 1 ... $isis->size ) { ... }
465 because it takes into account C<offset> and C<limit>.
471 return $self->{size};
476 Seek to specified MFN in file.
480 First record in database has position 1.
488 my $log = $self->_get_logger();
490 $log->logconfess("called without pos") unless defined($pos);
493 $log->warn("seek before first record");
495 } elsif ($pos > $self->{max_pos}) {
496 $log->warn("seek beyond last record");
497 $pos = $self->{max_pos};
500 return $self->{pos} = (($pos - 1) || -1);
505 Dump statistics about field and subfield usage
514 my $log = $self->_get_logger();
516 my $s = $self->{_stats};
518 $log->warn("called stats, but there is no statistics collected");
527 die "no field in ", dump( $s->{fld} ) unless defined( $f );
528 my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
529 $max_fld = $v if ($v > $max_fld);
531 my $o = sprintf("%4s %d ~", $f, $v);
533 if (defined($s->{sf}->{$f})) {
535 $o .= sprintf(" %s:%d%s", $_,
536 $s->{sf}->{$f}->{$_}->{count},
537 $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
539 } sort keys %{ $s->{sf}->{$f} };
542 if (my $v_r = $s->{repeatable}->{$f}) {
543 $o .= " ($v_r)" if ($v_r != $v);
547 } sort { $a <=> $b } keys %{ $s->{fld} }
550 $log->debug( sub { dump($s) } );
557 Display humanly readable dump of record
564 return unless $self->{ll_db};
566 if ($self->{ll_db}->can('dump_ascii')) {
567 return $self->{ll_db}->dump_ascii( $self->{pos} );
569 return dump( $self->{ll_db}->fetch_rec( $self->{pos} ) );
575 Helper function called which create regexps to be execute on code.
577 _get_regex( 900, 'regex:[0-9]+' ,'numbers' );
578 _get_regex( 900, '^b', ' : ^b' );
580 It supports perl regexps with C<regex:> prefix to from value and has
581 additional logic to skip empty subfields.
586 my ($sf,$from,$to) = @_;
592 if ($from =~ m/^regex:(.+)$/) {
595 $from = '\Q' . $from . '\E';
598 my $need_subfield_data = '*'; # no
599 # if from is also subfield, require some data in between
600 # to correctly skip empty subfields
601 $need_subfield_data = '+' if ($from =~ m/^\\Q\^/);
603 's/\Q'. $sf .'\E([^\^]' . $need_subfield_data . '?)'. $from .'([^\^]*?)/'. $sf .'$1'. $to .'$2/';
606 's/'. $from .'/'. $to .'/g';
611 =head2 modify_record_regexps
613 Generate hash with regexpes to be applied using L<filter>.
615 my $regexpes = $input->modify_record_regexps(
616 900 => { '^a' => { ' : ' => '^b' } },
617 901 => { '*' => { '^b' => ' ; ' } },
622 sub modify_record_regexps {
624 my $modify_record = {@_};
628 my $log = $self->_get_logger();
630 foreach my $f (keys %$modify_record) {
631 $log->debug("field: $f");
633 foreach my $sf (keys %{ $modify_record->{$f} }) {
634 $log->debug("subfield: $sf");
636 foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
637 my $to = $modify_record->{$f}->{$sf}->{$from};
638 #die "no field?" unless defined($to);
639 my $d = "|$from| -> |$to|";
640 $log->debug("transform: $d");
642 my $regex = _get_regex($sf,$from,$to);
643 push @{ $regexpes->{$f} }, { regex => $regex, debug => $d };
644 $log->debug("regex: $regex");
652 =head2 modify_file_regexps
654 Generate hash with regexpes to be applied using L<filter> from
655 pseudo hash/yaml format for regex mappings.
657 It should be obvious:
664 In field I<200> find C<'^a'> and then C<' : '>, and replace it with C<'^e'>.
665 In field I<200> find C<'^a'> and then C<' = '>, and replace it with C<'^d'>.
667 my $regexpes = $input->modify_file_regexps( 'conf/modify/common.pl' );
669 On undef path it will just return.
673 sub modify_file_regexps {
676 my $modify_path = shift || return;
678 my $log = $self->_get_logger();
682 CORE::open(my $fh, $modify_path) || $log->logdie("can't open modify file $modify_path: $!");
688 next if (/^#/ || /^\s*$/);
690 if (/^\s*(\d+)\s*$/) {
692 $log->debug("field: $f");
694 } elsif (/^\s*'([^']*)'\s*$/) {
696 $log->die("can't define subfiled before field in: $_") unless ($f);
697 $log->debug("subfield: $sf");
698 } elsif (/^\s*'([^']*)'\s*=>\s*'([^']*)'\s*$/) {
699 my ($from,$to) = ($1, $2);
701 $log->debug("transform: |$from| -> |$to|");
703 my $regex = _get_regex($sf,$from,$to);
704 push @{ $regexpes->{$f} }, {
706 file => $modify_path,
709 $log->debug("regex: $regex");
718 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
720 =head1 COPYRIGHT & LICENSE
722 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
724 This program is free software; you can redistribute it and/or modify it
725 under the same terms as Perl itself.
729 1; # End of WebPAC::Input