From 640203e47fc3ef06a8b73b01bdb9ee76a2baa569 Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Sat, 2 Aug 2008 23:46:41 +0000 Subject: [PATCH] Make cleanup of encodings, moving webpac closer to having internal utf-8 representation. This will break current code, but is really neceserry step toward checking input encoding for validity git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@1100 07558da8-63fa-0310-ba24-9fe276d99e06 --- Makefile.PL | 9 +++++--- conf/log.conf | 2 +- lib/WebPAC/Input.pm | 47 ++++++++++++++++++++------------------- lib/WebPAC/Input/Excel.pm | 3 +++ run.pl | 8 +++---- t/2-input-excel.t | 34 +++++++++++++--------------- t/5-output-excel.t | 4 ++-- t/6-unit.t | 3 +-- 8 files changed, 57 insertions(+), 53 deletions(-) diff --git a/Makefile.PL b/Makefile.PL index 4d26f1f..5c0807f 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -13,7 +13,7 @@ requires 'File::Slurp'; requires 'Log::Log4perl' => '1.02'; requires 'Cwd'; requires 'Storable'; -requires 'DBM::Deep'; +#requires 'DBM::Deep'; # 'Template'; requires 'Time::HiRes'; requires 'File::Temp'; @@ -28,7 +28,6 @@ requires 'MARC::Record' => '2.0'; requires 'Data::Dump'; requires 'MARC::Lint' => '1.43'; requires 'Business::ISBN' => '2.02'; # WebPAC::Normalize::ISBN -requires 'Proc::Queue'; requires 'PPI'; requires 'XML::LibXML'; requires 'Pod::Usage'; @@ -60,9 +59,13 @@ features( 'Jifty', ], 'WebPAC::Output::Excel' => [ - -default => 0, + -default => 1, 'Spreadsheet::WriteExcel' => 2.14, ], + 'Parallel execution (probably broken)' => [ + -default => 0, + 'Proc::Queue', + ], ); build_requires 'Test::More'; diff --git a/conf/log.conf b/conf/log.conf index abbd490..56768ea 100644 --- a/conf/log.conf +++ b/conf/log.conf @@ -42,7 +42,7 @@ log4perl.rootLogger=INFO, LOG, SCREEN #log4perl.logger.WebPAC.Input.ISI=DEBUG #log4perl.logger.WebPAC.Input.XML=DEBUG #log4perl.logger.WebPAC.Input.PDF=DEBUG -log4perl.logger.WebPAC.Input.Excel=DEBUG +#log4perl.logger.WebPAC.Input.Excel=DEBUG #log4perl.logger.WebPAC.Normalize.=DEBUG #log4perl.logger.WebPAC.Normalize.new=DEBUG diff --git a/lib/WebPAC/Input.pm b/lib/WebPAC/Input.pm index 37d5166..218a969 100644 --- a/lib/WebPAC/Input.pm +++ b/lib/WebPAC/Input.pm @@ -8,19 +8,15 @@ use blib; use WebPAC::Common; use base qw/WebPAC::Common/; use Data::Dump qw/dump/; -use Encode qw/from_to/; +use Encode qw/decode from_to/; =head1 NAME WebPAC::Input - read different file formats into WebPAC -=head1 VERSION - -Version 0.18 - =cut -our $VERSION = '0.18'; +our $VERSION = '0.19'; =head1 SYNOPSIS @@ -61,7 +57,6 @@ Create new input database object. my $db = new WebPAC::Input( module => 'WebPAC::Input::MARC', - encoding => 'ISO-8859-2', recode => 'char pairs', no_progress_bar => 1, input_config => { @@ -72,10 +67,6 @@ Create new input database object. C is low-level file format module. See L and L. -Optional parametar C specify application code page (which will be -used internally). This should probably be your terminal encoding, and by -default, it C. - C is optional string constisting of character or words pairs that should be replaced in input stream. @@ -93,9 +84,10 @@ sub new { my $log = $self->_get_logger; - $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup}); - $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup}); - $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if ($self->{low_mem}); + $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page}; + $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding}; + $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if $self->{lookup}; + $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if $self->{low_mem}; $log->logconfess("specify low-level file format module") unless ($self->{module}); my $module_path = $self->{module}; @@ -105,8 +97,6 @@ sub new { require $module_path; - $self->{'encoding'} ||= 'ISO-8859-2'; - $self ? return $self : return undef; } @@ -118,7 +108,8 @@ This function will read whole database in memory and produce lookups. $input->open( path => '/path/to/database/file', - code_page => 'cp852', + input_encoding => 'cp852', + strict_encoding => 0, limit => 500, offset => 6000, stats => 1, @@ -143,7 +134,7 @@ This function will read whole database in memory and produce lookups. ); -By default, C is assumed to be C. +By default, C is assumed to be C. C is optional parametar to position at some offset before reading from database. @@ -164,6 +155,8 @@ overrides C if both exists for same input. C and C are low-level implementation of store engine. Calling convention is documented in example above. +C should really default to 1, but it doesn't for now. + Returns size of database, regardless of C and C parametars, see also C. @@ -174,7 +167,10 @@ sub open { my $arg = {@_}; my $log = $self->_get_logger(); + $log->debug( "arguments: ",dump( $arg )); + $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding}; + $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page}; $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup}); $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef})) if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE'); @@ -182,10 +178,9 @@ sub open { $log->debug( $arg->{lookup_coderef} ? '' : 'not ', "using lookup_coderef"); $log->logcroak("need path") if (! $arg->{'path'}); - my $code_page = $arg->{'code_page'} || 'cp852'; + my $input_encoding = $arg->{'input_encoding'} || $self->{'input_encoding'} || 'cp852'; # store data in object - $self->{'input_code_page'} = $code_page; foreach my $v (qw/path offset limit/) { $self->{$v} = $arg->{$v} if ($arg->{$v}); } @@ -240,7 +235,7 @@ sub open { # filter => sub { # my ($l,$f_nr) = @_; # return unless defined($l); -# from_to($l, $code_page, $self->{'encoding'}); +# $l = decode($input_encoding, $l); # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map); # return $l; # }, @@ -278,7 +273,12 @@ sub open { # store size for later $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0; - $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : ''); + my $strict_encoding = $arg->{strict_encoding} || $self->{strict_encoding}; ## FIXME should be 1 really + + $log->info("processing $self->{size}/$size records [$from_rec-$to_rec]", + " encoding $input_encoding ", $strict_encoding ? ' [strict]' : '', + $self->{stats} ? ' [stats]' : '', + ); # read database for (my $pos = $from_rec; $pos <= $to_rec; $pos++) { @@ -296,7 +296,8 @@ sub open { $log->debug("-=> $f_nr ## $l"); # codepage conversion and recode_regex - from_to($l, $code_page, $self->{'encoding'}); +# $l = decode($input_encoding, $l, 1); + from_to( $l, $input_encoding, 'utf-8', 1 ); $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map); # apply regexps diff --git a/lib/WebPAC/Input/Excel.pm b/lib/WebPAC/Input/Excel.pm index 72d6095..92c4e37 100644 --- a/lib/WebPAC/Input/Excel.pm +++ b/lib/WebPAC/Input/Excel.pm @@ -141,6 +141,9 @@ sub size { my $self = shift; return $self->{size}; } + +sub default_encoding { 'UTF-16' } + =head1 AUTHOR Dobrica Pavlinusic, C<< >> diff --git a/run.pl b/run.pl index cd392f0..7aa499c 100755 --- a/run.pl +++ b/run.pl @@ -23,7 +23,6 @@ use Data::Dump qw/dump/; use Storable qw/dclone/; use Pod::Usage qw/pod2usage/; -use Proc::Queue size => 1; use POSIX ":sys_wait_h"; # imports WNOHANG =head1 NAME @@ -207,6 +206,8 @@ my $start_t = time(); my @links; if ($parallel) { + eval 'use Proc::Queue size => 1;'; + die $@ if $@; $log->info("Using $parallel processes for speedup"); Proc::Queue::size($parallel); } @@ -349,7 +350,7 @@ foreach my $database ( sort keys %{ $config->databases } ) { $log->info("database $database doesn't have inputs defined"); } ); - if ( -e 'out/debug' ) { # fixme flag? + if ( -e 'out/debug' ) { # FIXME flag? my $out; foreach my $i ( @inputs ) { warn dump( $i ); @@ -399,7 +400,6 @@ foreach my $database ( sort keys %{ $config->databases } ) { my $input_db = new WebPAC::Input( module => $input_module, - encoding => $config->webpac('webpac_encoding'), limit => $limit || $input->{limit}, offset => $offset, recode => $input->{recode}, @@ -444,7 +444,7 @@ foreach my $database ( sort keys %{ $config->databases } ) { my $maxmfn = $input_db->open( path => $input->{path}, - code_page => $input->{encoding}, # database encoding + input_encoding => $input->{encoding}, # database encoding lookup_coderef => $lookup_coderef, lookup => $lookup_jar, %{ $input }, diff --git a/t/2-input-excel.t b/t/2-input-excel.t index b46ebf2..962f48c 100755 --- a/t/2-input-excel.t +++ b/t/2-input-excel.t @@ -3,7 +3,7 @@ use strict; use blib; -use Test::More tests => 45; +use Test::More tests => 49; BEGIN { use_ok( 'WebPAC::Test' ); @@ -13,25 +13,24 @@ use_ok( 'WebPAC::Input' ); my $module = 'WebPAC::Input::Excel'; diag "testing with $module"; -our ($input,$db,$size); +ok(my $input = new WebPAC::Input( + module => $module, + no_progress_bar => 1, + %LOG, +), "new"); -sub open_xls { +our ($db,$size); - ok($input = new WebPAC::Input( - module => $module, - no_progress_bar => 1, - %LOG, - ), "new $module"); +sub open_xls { + my $args = shift; - my $path = shift; - $path = "$abs_path/data/$path"; - ok(my $db = $input->open( path => $path ), "open $path"); + $args->{path} = "$abs_path/data/" . $args->{path} || die "no path?"; + ok(my $db = $input->open( %$args ), "open " . dump( $args )); ok($size = $input->size, "size $size"); return $db; } - -open_xls('excel_95.xls'); +open_xls({ path => 'excel_95.xls' }); foreach my $mfn ( 1 ... $size ) { my $rec = $input->fetch; @@ -46,8 +45,7 @@ foreach my $mfn ( 1 ... $size ) { diag "rec: ", dump($rec), "\n" if $debug; } -open_xls('stara-signatura.xls'); - -foreach my $mfn ( 20 .. 23 ) { - diag dump( $input->fetch( $mfn ) ) if $debug; -} +open_xls({ path => 'stara-signatura.xls', input_encoding => 'utf-16' }); +ok( $input->seek( 790 ), 'seek 790'); +ok( my $rec = $input->fetch, 'fetch' ); +diag dump( $rec ); diff --git a/t/5-output-excel.t b/t/5-output-excel.t index 490fc17..e6c8bb0 100755 --- a/t/5-output-excel.t +++ b/t/5-output-excel.t @@ -45,7 +45,7 @@ cmp_ok( $out->path, 'eq', $path, 'path' ); diag dump( @expected ) if $debug; -ok(my $input = WebPAC::Input->new( module => 'WebPAC::Input::Excel', no_progress_bar => 1, %LOG ), 'new input' ); +ok(my $input = WebPAC::Input->new( module => 'WebPAC::Input::Excel', no_progress_bar => 1, encoding => 'utf-16', %LOG ), 'new input' ); ok(my $db = $input->open( path => $path ), "input->open $path"); cmp_ok( $input->size, '==', $#expected, "same size" ); @@ -54,5 +54,5 @@ foreach my $mfn ( 1 ... $input->size ) { my $ds = shift @expected; my $rec = $input->fetch; diag dump( $ds, $rec ) if $debug; - is_deeply( $ds, $rec, "$mfn same?" ); + is_deeply( $rec, $ds, "$mfn same?" ); } diff --git a/t/6-unit.t b/t/6-unit.t index ab27783..a9a6935 100755 --- a/t/6-unit.t +++ b/t/6-unit.t @@ -39,14 +39,13 @@ my $lookup_file = "$abs_path../conf/lookup/isis.pm"; ok(my $isis = new WebPAC::Input( module => 'WebPAC::Input::ISIS', - code_page => 'ISO-8859-2', # application encoding limit => 100, no_progress_bar => 1, ), "new Input::ISIS"); ok(my $maxmfn = $isis->open( path => $isis_file, - code_page => 'cp852', # database encoding + input_encoding => 'cp852', # database encoding lookup_coderef => sub { my $rec = shift || return; ok($rec, 'lookup_coderef has rec'); -- 2.20.1