X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=lib%2FWebPAC%2FInput.pm;h=df2e052ce4a3a4b6e9c78addcf9c956a72ba09aa;hb=aa692c48ec45de3a07334778ed9e06ac48703e9e;hp=37d51662506df71584898fdb487fad9cb91d04bf;hpb=6119950d4d7d8522b78dd380afc29efa89065fcb;p=webpac2 diff --git a/lib/WebPAC/Input.pm b/lib/WebPAC/Input.pm index 37d5166..df2e052 100644 --- a/lib/WebPAC/Input.pm +++ b/lib/WebPAC/Input.pm @@ -3,24 +3,21 @@ package WebPAC::Input; use warnings; use strict; -use blib; +use lib 'lib'; use WebPAC::Common; use base qw/WebPAC::Common/; use Data::Dump qw/dump/; -use Encode qw/from_to/; +use Encode qw/decode from_to/; +use YAML; =head1 NAME WebPAC::Input - read different file formats into WebPAC -=head1 VERSION - -Version 0.18 - =cut -our $VERSION = '0.18'; +our $VERSION = '0.19'; =head1 SYNOPSIS @@ -61,7 +58,6 @@ Create new input database object. my $db = new WebPAC::Input( module => 'WebPAC::Input::MARC', - encoding => 'ISO-8859-2', recode => 'char pairs', no_progress_bar => 1, input_config => { @@ -72,10 +68,6 @@ Create new input database object. C is low-level file format module. See L and L. -Optional parametar C specify application code page (which will be -used internally). This should probably be your terminal encoding, and by -default, it C. - C is optional string constisting of character or words pairs that should be replaced in input stream. @@ -93,9 +85,10 @@ sub new { my $log = $self->_get_logger; - $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup}); - $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup}); - $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if ($self->{low_mem}); + $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page}; + $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding}; + $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if $self->{lookup}; + $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if $self->{low_mem}; $log->logconfess("specify low-level file format module") unless ($self->{module}); my $module_path = $self->{module}; @@ -105,8 +98,6 @@ sub new { require $module_path; - $self->{'encoding'} ||= 'ISO-8859-2'; - $self ? return $self : return undef; } @@ -118,7 +109,8 @@ This function will read whole database in memory and produce lookups. $input->open( path => '/path/to/database/file', - code_page => 'cp852', + input_encoding => 'cp852', + strict_encoding => 0, limit => 500, offset => 6000, stats => 1, @@ -143,9 +135,9 @@ This function will read whole database in memory and produce lookups. ); -By default, C is assumed to be C. +By default, C is assumed to be C. -C is optional parametar to position at some offset before reading from database. +C is optional parametar to skip records at beginning. C is optional parametar to read just C records from database @@ -164,6 +156,8 @@ overrides C if both exists for same input. C and C are low-level implementation of store engine. Calling convention is documented in example above. +C should really default to 1, but it doesn't for now. + Returns size of database, regardless of C and C parametars, see also C. @@ -174,7 +168,10 @@ sub open { my $arg = {@_}; my $log = $self->_get_logger(); + $log->debug( "arguments: ",dump( $arg )); + $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding}; + $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page}; $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup}); $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef})) if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE'); @@ -182,13 +179,10 @@ sub open { $log->debug( $arg->{lookup_coderef} ? '' : 'not ', "using lookup_coderef"); $log->logcroak("need path") if (! $arg->{'path'}); - my $code_page = $arg->{'code_page'} || 'cp852'; + my $input_encoding = $arg->{'input_encoding'} || $self->{'input_encoding'} || 'cp852'; # store data in object - $self->{'input_code_page'} = $code_page; - foreach my $v (qw/path offset limit/) { - $self->{$v} = $arg->{$v} if ($arg->{$v}); - } + $self->{$_} = $arg->{$_} foreach grep { defined $arg->{$_} } qw(path offset limit); if ($arg->{load_row} || $arg->{save_row}) { $log->logconfess("save_row and load_row must be defined in pair and be CODE") unless ( @@ -212,6 +206,7 @@ sub open { while (@r) { my $from = shift @r; my $to = shift @r; + $from =~ s/^\\x([0-9a-f]{2})/chr(hex($1))/eig; $recode_map->{$from} = $to; } @@ -234,19 +229,24 @@ sub open { my $class = $self->{module} || $log->logconfess("can't get low-level module name!"); + $arg->{$_} = $self->{$_} foreach qw(offset limit); + my $ll_db = $class->new( path => $arg->{path}, input_config => $arg->{input_config} || $self->{input_config}, # filter => sub { # my ($l,$f_nr) = @_; # return unless defined($l); -# from_to($l, $code_page, $self->{'encoding'}); +# $l = decode($input_encoding, $l); # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map); # return $l; # }, %{ $arg }, ); + # save for dump and input_module + $self->{ll_db} = $ll_db; + unless (defined($ll_db)) { $log->logwarn("can't open database $arg->{path}, skipping..."); return; @@ -263,10 +263,10 @@ sub open { my $to_rec = $size; if (my $s = $self->{offset}) { - $log->debug("skipping to MFN $s"); - $from_rec = $s; + $log->debug("offset $s records"); + $from_rec = $s + 1; } else { - $self->{offset} = $from_rec; + $self->{offset} = $from_rec - 1; } if ($self->{limit}) { @@ -275,16 +275,22 @@ sub open { $to_rec = $size if ($to_rec > $size); } - # store size for later - $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0; + my $strict_encoding = $arg->{strict_encoding} || $self->{strict_encoding}; ## FIXME should be 1 really - $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : ''); + $log->info("processing ", $self->{size} || 'all', "/$size records [$from_rec-$to_rec]", + " encoding $input_encoding ", $strict_encoding ? ' [strict]' : '', + $self->{stats} ? ' [stats]' : '', + ); + + $self->{size} = 0; # read database for (my $pos = $from_rec; $pos <= $to_rec; $pos++) { $log->debug("position: $pos\n"); + $self->{size}++; # XXX I could move this more down if I didn't want empty records... + my $rec = $ll_db->fetch_rec($pos, sub { my ($l,$f_nr,$debug) = @_; # return unless defined($l); @@ -292,11 +298,13 @@ sub open { return unless ( defined($l) && defined($f_nr) ); + my $marc_subfields = $l =~ s/\x1F(\w)/\^$1/g; # fix MARC subfiled delimiters to ^ + warn "-=> $f_nr ## |$l|\n" if ($debug); $log->debug("-=> $f_nr ## $l"); # codepage conversion and recode_regex - from_to($l, $code_page, $self->{'encoding'}); + $l = decode($input_encoding, $l, 1); $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map); # apply regexps @@ -315,10 +323,12 @@ sub open { warn "*** $d\n" if ($debug); } - $log->error("error applying regex: $r") if ($@); + $log->error("error applying regex: ",dump($r), $@) if $@; } } + $l =~ s/\^(\w)/\x1F$1/g if $marc_subfields; + $log->debug("<=- $f_nr ## |$l|"); warn "<=- $f_nr ## $l\n" if ($debug); return $l; @@ -385,12 +395,11 @@ sub open { $self->{max_pos} = $to_rec; $log->debug("max_pos: $to_rec"); - # save for dump - $self->{ll_db} = $ll_db; - return $size; } +sub input_module { $_[0]->{ll_db} } + =head2 fetch Fetch next record from database. It will also displays progress bar. @@ -410,7 +419,7 @@ sub fetch { $log->logconfess("it seems that you didn't load database!") unless ($self->{pos}); if ($self->{pos} == -1) { - $self->{pos} = $self->{offset}; + $self->{pos} = $self->{offset} + 1; } else { $self->{pos}++; } @@ -468,7 +477,7 @@ because it takes into account C and C. sub size { my $self = shift; - return $self->{size}; + return $self->{size}; # FIXME this is buggy if open is called multiple times! } =head2 seek @@ -561,6 +570,10 @@ sub stats { $log->debug( sub { dump($s) } ); + my $path = 'var/stats.yml'; + YAML::DumpFile( $path, $s ); + $log->info( 'created ', $path, ' with ', -s $path, ' bytes' ); + return $out; } @@ -691,7 +704,7 @@ sub modify_file_regexps { my $regexpes; - CORE::open(my $fh, $modify_path) || $log->logdie("can't open modify file $modify_path: $!"); + CORE::open(my $fh, '<:encoding(utf-8)', $modify_path) || $log->logdie("can't open modify file $modify_path: $!"); my ($f,$sf); @@ -719,6 +732,8 @@ sub modify_file_regexps { line => $., }; $log->debug("regex: $regex"); + } else { + die "can't parse: $_"; } }