9 use base qw/WebPAC::Common/;
10 use Data::Dump qw/dump/;
11 use Encode qw/from_to/;
15 WebPAC::Input - read different file formats into WebPAC
23 our $VERSION = '0.18';
27 This module implements input as database which have fixed and known
28 I<size> while indexing and single unique numeric identifier for database
29 position ranging from 1 to I<size>.
31 Simply, something that is indexed by unmber from 1 .. I<size>.
33 Examples of such databases are CDS/ISIS files, MARC files, lines in
36 Specific file formats are implemented using low-level interface modules,
37 located in C<WebPAC::Input::*> namespace which export C<open_db>,
38 C<fetch_rec> and optional C<init> functions.
40 Perhaps a little code snippet.
44 my $db = WebPAC::Input->new(
45 module => 'WebPAC::Input::ISIS',
48 $db->open( path => '/path/to/database' );
49 print "database size: ",$db->size,"\n";
50 while (my $rec = $db->fetch) {
51 # do something with $rec
60 Create new input database object.
62 my $db = new WebPAC::Input(
63 module => 'WebPAC::Input::MARC',
64 encoding => 'ISO-8859-2',
65 recode => 'char pairs',
69 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
70 L<WebPAC::Input::MARC>.
72 Optional parametar C<encoding> specify application code page (which will be
73 used internally). This should probably be your terminal encoding, and by
74 default, it C<ISO-8859-2>.
76 C<recode> is optional string constisting of character or words pairs that
77 should be replaced in input stream.
79 C<no_progress_bar> disables progress bar output on C<STDOUT>
81 This function will also call low-level C<init> if it exists with same
91 my $log = $self->_get_logger;
93 $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup});
94 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup});
95 $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if ($self->{low_mem});
97 $log->logconfess("specify low-level file format module") unless ($self->{module});
98 my $module_path = $self->{module};
99 $module_path =~ s#::#/#g;
100 $module_path .= '.pm';
101 $log->debug("require low-level module $self->{module} from $module_path");
103 require $module_path;
105 $self->{'encoding'} ||= 'ISO-8859-2';
107 $self ? return $self : return undef;
112 This function will read whole database in memory and produce lookups.
114 my $store; # simple in-memory hash
117 path => '/path/to/database/file',
118 code_page => 'cp852',
122 lookup_coderef => sub {
127 900 => { '^a' => { ' : ' => '^b' } },
128 901 => { '*' => { '^b' => ' ; ' } },
130 modify_file => 'conf/modify/mapping.map',
133 $store->{ $a->{id} } = $a->{row};
137 return defined($store->{ $a->{id} }) &&
138 $store->{ $a->{id} };
143 By default, C<code_page> is assumed to be C<cp852>.
145 C<offset> is optional parametar to position at some offset before reading from database.
147 C<limit> is optional parametar to read just C<limit> records from database
149 C<stats> create optional report about usage of fields and subfields
151 C<lookup_coderef> is closure to called to save data into lookups
153 C<modify_records> specify mapping from subfields to delimiters or from
154 delimiters to subfields, as well as oprations on fields (if subfield is
157 C<modify_file> is alternative for C<modify_records> above which preserves order and offers
158 (hopefully) simplier sintax than YAML or perl (see L</modify_file_regex>). This option
159 overrides C<modify_records> if both exists for same input.
161 C<save_row> and C<load_row> are low-level implementation of store engine. Calling convention
162 is documented in example above.
164 Returns size of database, regardless of C<offset> and C<limit>
165 parametars, see also C<size>.
173 my $log = $self->_get_logger();
175 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
176 $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
177 if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
179 $log->debug( $arg->{lookup_coderef} ? '' : 'not ', "using lookup_coderef");
181 $log->logcroak("need path") if (! $arg->{'path'});
182 my $code_page = $arg->{'code_page'} || 'cp852';
184 # store data in object
185 $self->{'input_code_page'} = $code_page;
186 foreach my $v (qw/path offset limit/) {
187 $self->{$v} = $arg->{$v} if ($arg->{$v});
190 if ($arg->{load_row} || $arg->{save_row}) {
191 $log->logconfess("save_row and load_row must be defined in pair and be CODE") unless (
192 ref($arg->{load_row}) eq 'CODE' &&
193 ref($arg->{save_row}) eq 'CODE'
195 $self->{load_row} = $arg->{load_row};
196 $self->{save_row} = $arg->{save_row};
197 $log->debug("using load_row and save_row instead of in-memory hash");
204 if ($self->{recode}) {
205 my @r = split(/\s/, $self->{recode});
207 $log->logwarn("recode needs even number of elements (some number of valid pairs)");
212 $recode_map->{$from} = $to;
215 $recode_regex = join '|' => keys %{ $recode_map };
217 $log->debug("using recode regex: $recode_regex");
223 if (my $p = $arg->{modify_file}) {
224 $log->debug("using modify_file $p");
225 $rec_regex = $self->modify_file_regexps( $p );
226 } elsif (my $h = $arg->{modify_records}) {
227 $log->debug("using modify_records ", sub { dump( $h ) });
228 $rec_regex = $self->modify_record_regexps(%{ $h });
230 $log->debug("rec_regex: ", sub { dump($rec_regex) }) if ($rec_regex);
232 my $class = $self->{module} || $log->logconfess("can't get low-level module name!");
234 my $ll_db = $class->new(
235 path => $arg->{path},
237 # my ($l,$f_nr) = @_;
238 # return unless defined($l);
239 # from_to($l, $code_page, $self->{'encoding'});
240 # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
246 unless (defined($ll_db)) {
247 $log->logwarn("can't open database $arg->{path}, skipping...");
251 my $size = $ll_db->size;
254 $log->logwarn("no records in database $arg->{path}, skipping...");
261 if (my $s = $self->{offset}) {
262 $log->debug("skipping to MFN $s");
265 $self->{offset} = $from_rec;
268 if ($self->{limit}) {
269 $log->debug("limiting to ",$self->{limit}," records");
270 $to_rec = $from_rec + $self->{limit} - 1;
271 $to_rec = $size if ($to_rec > $size);
274 # store size for later
275 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
277 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : '');
280 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
282 $log->debug("position: $pos\n");
284 my $rec = $ll_db->fetch_rec($pos, sub {
285 my ($l,$f_nr,$debug) = @_;
286 # return unless defined($l);
287 # return $l unless ($rec_regex && $f_nr);
289 return unless ( defined($l) && defined($f_nr) );
291 warn "-=> $f_nr ## |$l|\n" if ($debug);
292 $log->debug("-=> $f_nr ## $l");
294 # codepage conversion and recode_regex
295 from_to($l, $code_page, $self->{'encoding'});
296 $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
299 if ($rec_regex && defined($rec_regex->{$f_nr})) {
300 $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
302 foreach my $r (@{ $rec_regex->{$f_nr} }) {
304 $log->logconfess("expected regex in ", dump( $r )) unless defined($r->{regex});
305 eval '$l =~ ' . $r->{regex};
307 my $d = "|$old_l| -> |$l| "; # . $r->{regex};
308 $d .= ' +' . $r->{line} . ' ' . $r->{file} if defined($r->{line});
309 $d .= ' ' . $r->{debug} if defined($r->{debug});
310 $log->debug("MODIFY $d");
311 warn "*** $d\n" if ($debug);
314 $log->error("error applying regex: $r") if ($@);
318 $log->debug("<=- $f_nr ## |$l|");
319 warn "<=- $f_nr ## $l\n" if ($debug);
323 $log->debug(sub { dump($rec) });
326 $log->warn("record $pos empty? skipping...");
331 if ($self->{save_row}) {
332 $self->{save_row}->({
337 $self->{data}->{$pos} = $rec;
341 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
343 # update counters for statistics
344 if ($self->{stats}) {
346 # fetch clean record with regexpes applied for statistics
347 my $rec = $ll_db->fetch_rec($pos);
349 foreach my $fld (keys %{ $rec }) {
350 $self->{_stats}->{fld}->{ $fld }++;
352 $log->logdie("invalid record fild $fld, not ARRAY")
353 unless (ref($rec->{ $fld }) eq 'ARRAY');
355 foreach my $row (@{ $rec->{$fld} }) {
357 if (ref($row) eq 'HASH') {
359 foreach my $sf (keys %{ $row }) {
360 next if ($sf eq 'subfields');
361 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
362 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
363 if (ref($row->{$sf}) eq 'ARRAY');
367 $self->{_stats}->{repeatable}->{ $fld }++;
373 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
378 $self->{last_pcnt} = 0;
380 # store max mfn and return it.
381 $self->{max_pos} = $to_rec;
382 $log->debug("max_pos: $to_rec");
385 $self->{ll_db} = $ll_db;
392 Fetch next record from database. It will also displays progress bar.
394 my $rec = $isis->fetch;
396 Record from this function should probably go to C<data_structure> for
404 my $log = $self->_get_logger();
406 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
408 if ($self->{pos} == -1) {
409 $self->{pos} = $self->{offset};
414 my $mfn = $self->{pos};
416 if ($mfn > $self->{max_pos}) {
417 $self->{pos} = $self->{max_pos};
418 $log->debug("at EOF");
422 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
426 if ($self->{load_row}) {
427 $rec = $self->{load_row}->({ id => $mfn });
429 $rec = $self->{data}->{$mfn};
437 Returns current record number (MFN).
441 First record in database has position 1.
453 Returns number of records in database
457 Result from this function can be used to loop through all records
459 foreach my $mfn ( 1 ... $isis->size ) { ... }
461 because it takes into account C<offset> and C<limit>.
467 return $self->{size};
472 Seek to specified MFN in file.
476 First record in database has position 1.
484 my $log = $self->_get_logger();
486 $log->logconfess("called without pos") unless defined($pos);
489 $log->warn("seek before first record");
491 } elsif ($pos > $self->{max_pos}) {
492 $log->warn("seek beyond last record");
493 $pos = $self->{max_pos};
496 return $self->{pos} = (($pos - 1) || -1);
501 Dump statistics about field and subfield usage
510 my $log = $self->_get_logger();
512 my $s = $self->{_stats};
514 $log->warn("called stats, but there is no statistics collected");
523 die "no field in ", dump( $s->{fld} ) unless defined( $f );
524 my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
525 $max_fld = $v if ($v > $max_fld);
527 my $o = sprintf("%4s %d ~", $f, $v);
529 if (defined($s->{sf}->{$f})) {
531 $o .= sprintf(" %s:%d%s", $_,
532 $s->{sf}->{$f}->{$_}->{count},
533 $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
535 } sort keys %{ $s->{sf}->{$f} };
538 if (my $v_r = $s->{repeatable}->{$f}) {
539 $o .= " ($v_r)" if ($v_r != $v);
543 } sort { $a <=> $b } keys %{ $s->{fld} }
546 $log->debug( sub { dump($s) } );
553 Display humanly readable dump of record
560 return unless $self->{ll_db};
562 if ($self->{ll_db}->can('dump_rec')) {
563 return $self->{ll_db}->dump_ascii( $self->{pos} );
565 return dump( $self->{ll_db}->fetch_rec( $self->{pos} ) );
571 Helper function called which create regexps to be execute on code.
573 _get_regex( 900, 'regex:[0-9]+' ,'numbers' );
574 _get_regex( 900, '^b', ' : ^b' );
576 It supports perl regexps with C<regex:> prefix to from value and has
577 additional logic to skip empty subfields.
582 my ($sf,$from,$to) = @_;
588 if ($from =~ m/^regex:(.+)$/) {
591 $from = '\Q' . $from . '\E';
594 my $need_subfield_data = '*'; # no
595 # if from is also subfield, require some data in between
596 # to correctly skip empty subfields
597 $need_subfield_data = '+' if ($from =~ m/^\\Q\^/);
599 's/\Q'. $sf .'\E([^\^]' . $need_subfield_data . '?)'. $from .'([^\^]*?)/'. $sf .'$1'. $to .'$2/';
602 's/'. $from .'/'. $to .'/g';
607 =head2 modify_record_regexps
609 Generate hash with regexpes to be applied using L<filter>.
611 my $regexpes = $input->modify_record_regexps(
612 900 => { '^a' => { ' : ' => '^b' } },
613 901 => { '*' => { '^b' => ' ; ' } },
618 sub modify_record_regexps {
620 my $modify_record = {@_};
624 my $log = $self->_get_logger();
626 foreach my $f (keys %$modify_record) {
627 $log->debug("field: $f");
629 foreach my $sf (keys %{ $modify_record->{$f} }) {
630 $log->debug("subfield: $sf");
632 foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
633 my $to = $modify_record->{$f}->{$sf}->{$from};
634 #die "no field?" unless defined($to);
635 my $d = "|$from| -> |$to|";
636 $log->debug("transform: $d");
638 my $regex = _get_regex($sf,$from,$to);
639 push @{ $regexpes->{$f} }, { regex => $regex, debug => $d };
640 $log->debug("regex: $regex");
648 =head2 modify_file_regexps
650 Generate hash with regexpes to be applied using L<filter> from
651 pseudo hash/yaml format for regex mappings.
653 It should be obvious:
660 In field I<200> find C<'^a'> and then C<' : '>, and replace it with C<'^e'>.
661 In field I<200> find C<'^a'> and then C<' = '>, and replace it with C<'^d'>.
663 my $regexpes = $input->modify_file_regexps( 'conf/modify/common.pl' );
665 On undef path it will just return.
669 sub modify_file_regexps {
672 my $modify_path = shift || return;
674 my $log = $self->_get_logger();
678 CORE::open(my $fh, $modify_path) || $log->logdie("can't open modify file $modify_path: $!");
684 next if (/^#/ || /^\s*$/);
686 if (/^\s*(\d+)\s*$/) {
688 $log->debug("field: $f");
690 } elsif (/^\s*'([^']*)'\s*$/) {
692 $log->die("can't define subfiled before field in: $_") unless ($f);
693 $log->debug("subfield: $sf");
694 } elsif (/^\s*'([^']*)'\s*=>\s*'([^']*)'\s*$/) {
695 my ($from,$to) = ($1, $2);
697 $log->debug("transform: |$from| -> |$to|");
699 my $regex = _get_regex($sf,$from,$to);
700 push @{ $regexpes->{$f} }, {
702 file => $modify_path,
705 $log->debug("regex: $regex");
714 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
716 =head1 COPYRIGHT & LICENSE
718 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
720 This program is free software; you can redistribute it and/or modify it
721 under the same terms as Perl itself.
725 1; # End of WebPAC::Input