9 use base qw/WebPAC::Common/;
10 use Data::Dump qw/dump/;
11 use Encode qw/from_to/;
15 WebPAC::Input - read different file formats into WebPAC
23 our $VERSION = '0.18';
27 This module implements input as database which have fixed and known
28 I<size> while indexing and single unique numeric identifier for database
29 position ranging from 1 to I<size>.
31 Simply, something that is indexed by unmber from 1 .. I<size>.
33 Examples of such databases are CDS/ISIS files, MARC files, lines in
36 Specific file formats are implemented using low-level interface modules,
37 located in C<WebPAC::Input::*> namespace which export C<open_db>,
38 C<fetch_rec> and optional C<init> functions.
40 Perhaps a little code snippet.
44 my $db = WebPAC::Input->new(
45 module => 'WebPAC::Input::ISIS',
48 $db->open( path => '/path/to/database' );
49 print "database size: ",$db->size,"\n";
50 while (my $rec = $db->fetch) {
51 # do something with $rec
60 Create new input database object.
62 my $db = new WebPAC::Input(
63 module => 'WebPAC::Input::MARC',
64 encoding => 'ISO-8859-2',
65 recode => 'char pairs',
68 mapping => [ 'foo', 'bar', 'baz' ],
72 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
73 L<WebPAC::Input::MARC>.
75 Optional parametar C<encoding> specify application code page (which will be
76 used internally). This should probably be your terminal encoding, and by
77 default, it C<ISO-8859-2>.
79 C<recode> is optional string constisting of character or words pairs that
80 should be replaced in input stream.
82 C<no_progress_bar> disables progress bar output on C<STDOUT>
84 This function will also call low-level C<init> if it exists with same
94 my $log = $self->_get_logger;
96 $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup});
97 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup});
98 $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if ($self->{low_mem});
100 $log->logconfess("specify low-level file format module") unless ($self->{module});
101 my $module_path = $self->{module};
102 $module_path =~ s#::#/#g;
103 $module_path .= '.pm';
104 $log->debug("require low-level module $self->{module} from $module_path");
106 require $module_path;
108 $self->{'encoding'} ||= 'ISO-8859-2';
110 $self ? return $self : return undef;
115 This function will read whole database in memory and produce lookups.
117 my $store; # simple in-memory hash
120 path => '/path/to/database/file',
121 code_page => 'cp852',
125 lookup_coderef => sub {
130 900 => { '^a' => { ' : ' => '^b' } },
131 901 => { '*' => { '^b' => ' ; ' } },
133 modify_file => 'conf/modify/mapping.map',
136 $store->{ $a->{id} } = $a->{row};
140 return defined($store->{ $a->{id} }) &&
141 $store->{ $a->{id} };
146 By default, C<code_page> is assumed to be C<cp852>.
148 C<offset> is optional parametar to position at some offset before reading from database.
150 C<limit> is optional parametar to read just C<limit> records from database
152 C<stats> create optional report about usage of fields and subfields
154 C<lookup_coderef> is closure to called to save data into lookups
156 C<modify_records> specify mapping from subfields to delimiters or from
157 delimiters to subfields, as well as oprations on fields (if subfield is
160 C<modify_file> is alternative for C<modify_records> above which preserves order and offers
161 (hopefully) simplier sintax than YAML or perl (see L</modify_file_regex>). This option
162 overrides C<modify_records> if both exists for same input.
164 C<save_row> and C<load_row> are low-level implementation of store engine. Calling convention
165 is documented in example above.
167 Returns size of database, regardless of C<offset> and C<limit>
168 parametars, see also C<size>.
176 my $log = $self->_get_logger();
178 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
179 $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
180 if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
182 $log->debug( $arg->{lookup_coderef} ? '' : 'not ', "using lookup_coderef");
184 $log->logcroak("need path") if (! $arg->{'path'});
185 my $code_page = $arg->{'code_page'} || 'cp852';
187 # store data in object
188 $self->{'input_code_page'} = $code_page;
189 foreach my $v (qw/path offset limit/) {
190 $self->{$v} = $arg->{$v} if ($arg->{$v});
193 if ($arg->{load_row} || $arg->{save_row}) {
194 $log->logconfess("save_row and load_row must be defined in pair and be CODE") unless (
195 ref($arg->{load_row}) eq 'CODE' &&
196 ref($arg->{save_row}) eq 'CODE'
198 $self->{load_row} = $arg->{load_row};
199 $self->{save_row} = $arg->{save_row};
200 $log->debug("using load_row and save_row instead of in-memory hash");
207 if ($self->{recode}) {
208 my @r = split(/\s/, $self->{recode});
210 $log->logwarn("recode needs even number of elements (some number of valid pairs)");
215 $recode_map->{$from} = $to;
218 $recode_regex = join '|' => keys %{ $recode_map };
220 $log->debug("using recode regex: $recode_regex");
226 if (my $p = $arg->{modify_file}) {
227 $log->debug("using modify_file $p");
228 $rec_regex = $self->modify_file_regexps( $p );
229 } elsif (my $h = $arg->{modify_records}) {
230 $log->debug("using modify_records ", sub { dump( $h ) });
231 $rec_regex = $self->modify_record_regexps(%{ $h });
233 $log->debug("rec_regex: ", sub { dump($rec_regex) }) if ($rec_regex);
235 my $class = $self->{module} || $log->logconfess("can't get low-level module name!");
237 my $ll_db = $class->new(
238 path => $arg->{path},
239 input_config => $arg->{input_config} || $self->{input_config},
241 # my ($l,$f_nr) = @_;
242 # return unless defined($l);
243 # from_to($l, $code_page, $self->{'encoding'});
244 # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
250 unless (defined($ll_db)) {
251 $log->logwarn("can't open database $arg->{path}, skipping...");
255 my $size = $ll_db->size;
258 $log->logwarn("no records in database $arg->{path}, skipping...");
265 if (my $s = $self->{offset}) {
266 $log->debug("skipping to MFN $s");
269 $self->{offset} = $from_rec;
272 if ($self->{limit}) {
273 $log->debug("limiting to ",$self->{limit}," records");
274 $to_rec = $from_rec + $self->{limit} - 1;
275 $to_rec = $size if ($to_rec > $size);
278 # store size for later
279 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
281 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : '');
284 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
286 $log->debug("position: $pos\n");
288 my $rec = $ll_db->fetch_rec($pos, sub {
289 my ($l,$f_nr,$debug) = @_;
290 # return unless defined($l);
291 # return $l unless ($rec_regex && $f_nr);
293 return unless ( defined($l) && defined($f_nr) );
295 warn "-=> $f_nr ## |$l|\n" if ($debug);
296 $log->debug("-=> $f_nr ## $l");
298 # codepage conversion and recode_regex
299 from_to($l, $code_page, $self->{'encoding'});
300 $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
303 if ($rec_regex && defined($rec_regex->{$f_nr})) {
304 $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
306 foreach my $r (@{ $rec_regex->{$f_nr} }) {
308 $log->logconfess("expected regex in ", dump( $r )) unless defined($r->{regex});
309 eval '$l =~ ' . $r->{regex};
311 my $d = "|$old_l| -> |$l| "; # . $r->{regex};
312 $d .= ' +' . $r->{line} . ' ' . $r->{file} if defined($r->{line});
313 $d .= ' ' . $r->{debug} if defined($r->{debug});
314 $log->debug("MODIFY $d");
315 warn "*** $d\n" if ($debug);
318 $log->error("error applying regex: $r") if ($@);
322 $log->debug("<=- $f_nr ## |$l|");
323 warn "<=- $f_nr ## $l\n" if ($debug);
327 $log->debug(sub { dump($rec) });
330 $log->warn("record $pos empty? skipping...");
335 if ($self->{save_row}) {
336 $self->{save_row}->({
341 $self->{data}->{$pos} = $rec;
345 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
347 # update counters for statistics
348 if ($self->{stats}) {
350 # fetch clean record with regexpes applied for statistics
351 my $rec = $ll_db->fetch_rec($pos);
353 foreach my $fld (keys %{ $rec }) {
354 $self->{_stats}->{fld}->{ $fld }++;
356 $log->logdie("invalid record fild $fld, not ARRAY")
357 unless (ref($rec->{ $fld }) eq 'ARRAY');
359 foreach my $row (@{ $rec->{$fld} }) {
361 if (ref($row) eq 'HASH') {
363 foreach my $sf (keys %{ $row }) {
364 next if ($sf eq 'subfields');
365 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
366 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
367 if (ref($row->{$sf}) eq 'ARRAY');
371 $self->{_stats}->{repeatable}->{ $fld }++;
377 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
382 $self->{last_pcnt} = 0;
384 # store max mfn and return it.
385 $self->{max_pos} = $to_rec;
386 $log->debug("max_pos: $to_rec");
389 $self->{ll_db} = $ll_db;
396 Fetch next record from database. It will also displays progress bar.
398 my $rec = $isis->fetch;
400 Record from this function should probably go to C<data_structure> for
408 my $log = $self->_get_logger();
410 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
412 if ($self->{pos} == -1) {
413 $self->{pos} = $self->{offset};
418 my $mfn = $self->{pos};
420 if ($mfn > $self->{max_pos}) {
421 $self->{pos} = $self->{max_pos};
422 $log->debug("at EOF");
426 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
430 if ($self->{load_row}) {
431 $rec = $self->{load_row}->({ id => $mfn });
433 $rec = $self->{data}->{$mfn};
441 Returns current record number (MFN).
445 First record in database has position 1.
457 Returns number of records in database
461 Result from this function can be used to loop through all records
463 foreach my $mfn ( 1 ... $isis->size ) { ... }
465 because it takes into account C<offset> and C<limit>.
471 return $self->{size};
476 Seek to specified MFN in file.
480 First record in database has position 1.
488 my $log = $self->_get_logger();
490 $log->logconfess("called without pos") unless defined($pos);
493 $log->warn("seek before first record");
495 } elsif ($pos > $self->{max_pos}) {
496 $log->warn("seek beyond last record");
497 $pos = $self->{max_pos};
500 return $self->{pos} = (($pos - 1) || -1);
505 Dump statistics about field and subfield usage
514 my $log = $self->_get_logger();
516 my $s = $self->{_stats};
518 $log->warn("called stats, but there is no statistics collected");
527 die "no field in ", dump( $s->{fld} ) unless defined( $f );
528 my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
529 $max_fld = $v if ($v > $max_fld);
531 my $o = sprintf("%4s %d ~", $f, $v);
533 if (defined($s->{sf}->{$f})) {
534 my @subfields = keys %{ $s->{sf}->{$f} };
535 warn "$f has subfields ",dump( @subfields );
537 $o .= sprintf(" %s:%d%s", $_,
538 $s->{sf}->{$f}->{$_}->{count},
539 $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
543 sort( grep( /^i[12]$/, @subfields ) ),
545 sort( grep( !/^i[12]$/, @subfields ) ),
549 if (my $v_r = $s->{repeatable}->{$f}) {
550 $o .= " ($v_r)" if ($v_r != $v);
554 } sort { $a <=> $b } keys %{ $s->{fld} }
557 $log->debug( sub { dump($s) } );
564 Display humanly readable dump of record
571 return unless $self->{ll_db};
573 if ($self->{ll_db}->can('dump_ascii')) {
574 return $self->{ll_db}->dump_ascii( $self->{pos} );
576 return dump( $self->{ll_db}->fetch_rec( $self->{pos} ) );
582 Helper function called which create regexps to be execute on code.
584 _get_regex( 900, 'regex:[0-9]+' ,'numbers' );
585 _get_regex( 900, '^b', ' : ^b' );
587 It supports perl regexps with C<regex:> prefix to from value and has
588 additional logic to skip empty subfields.
593 my ($sf,$from,$to) = @_;
599 if ($from =~ m/^regex:(.+)$/) {
602 $from = '\Q' . $from . '\E';
605 my $need_subfield_data = '*'; # no
606 # if from is also subfield, require some data in between
607 # to correctly skip empty subfields
608 $need_subfield_data = '+' if ($from =~ m/^\\Q\^/);
610 's/\Q'. $sf .'\E([^\^]' . $need_subfield_data . '?)'. $from .'([^\^]*?)/'. $sf .'$1'. $to .'$2/';
613 's/'. $from .'/'. $to .'/g';
618 =head2 modify_record_regexps
620 Generate hash with regexpes to be applied using L<filter>.
622 my $regexpes = $input->modify_record_regexps(
623 900 => { '^a' => { ' : ' => '^b' } },
624 901 => { '*' => { '^b' => ' ; ' } },
629 sub modify_record_regexps {
631 my $modify_record = {@_};
635 my $log = $self->_get_logger();
637 foreach my $f (keys %$modify_record) {
638 $log->debug("field: $f");
640 foreach my $sf (keys %{ $modify_record->{$f} }) {
641 $log->debug("subfield: $sf");
643 foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
644 my $to = $modify_record->{$f}->{$sf}->{$from};
645 #die "no field?" unless defined($to);
646 my $d = "|$from| -> |$to|";
647 $log->debug("transform: $d");
649 my $regex = _get_regex($sf,$from,$to);
650 push @{ $regexpes->{$f} }, { regex => $regex, debug => $d };
651 $log->debug("regex: $regex");
659 =head2 modify_file_regexps
661 Generate hash with regexpes to be applied using L<filter> from
662 pseudo hash/yaml format for regex mappings.
664 It should be obvious:
671 In field I<200> find C<'^a'> and then C<' : '>, and replace it with C<'^e'>.
672 In field I<200> find C<'^a'> and then C<' = '>, and replace it with C<'^d'>.
674 my $regexpes = $input->modify_file_regexps( 'conf/modify/common.pl' );
676 On undef path it will just return.
680 sub modify_file_regexps {
683 my $modify_path = shift || return;
685 my $log = $self->_get_logger();
689 CORE::open(my $fh, $modify_path) || $log->logdie("can't open modify file $modify_path: $!");
695 next if (/^#/ || /^\s*$/);
697 if (/^\s*(\d+)\s*$/) {
699 $log->debug("field: $f");
701 } elsif (/^\s*'([^']*)'\s*$/) {
703 $log->die("can't define subfiled before field in: $_") unless ($f);
704 $log->debug("subfield: $sf");
705 } elsif (/^\s*'([^']*)'\s*=>\s*'([^']*)'\s*$/) {
706 my ($from,$to) = ($1, $2);
708 $log->debug("transform: |$from| -> |$to|");
710 my $regex = _get_regex($sf,$from,$to);
711 push @{ $regexpes->{$f} }, {
713 file => $modify_path,
716 $log->debug("regex: $regex");
725 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
727 =head1 COPYRIGHT & LICENSE
729 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
731 This program is free software; you can redistribute it and/or modify it
732 under the same terms as Perl itself.
736 1; # End of WebPAC::Input