requires 'Log::Log4perl' => '1.02';
requires 'Cwd';
requires 'Storable';
-requires 'DBM::Deep';
+#requires 'DBM::Deep';
# 'Template';
requires 'Time::HiRes';
requires 'File::Temp';
requires 'Data::Dump';
requires 'MARC::Lint' => '1.43';
requires 'Business::ISBN' => '2.02'; # WebPAC::Normalize::ISBN
-requires 'Proc::Queue';
requires 'PPI';
requires 'XML::LibXML';
requires 'Pod::Usage';
'Jifty',
],
'WebPAC::Output::Excel' => [
- -default => 0,
+ -default => 1,
'Spreadsheet::WriteExcel' => 2.14,
],
+ 'Parallel execution (probably broken)' => [
+ -default => 0,
+ 'Proc::Queue',
+ ],
);
build_requires 'Test::More';
#log4perl.logger.WebPAC.Input.ISI=DEBUG
#log4perl.logger.WebPAC.Input.XML=DEBUG
#log4perl.logger.WebPAC.Input.PDF=DEBUG
-log4perl.logger.WebPAC.Input.Excel=DEBUG
+#log4perl.logger.WebPAC.Input.Excel=DEBUG
#log4perl.logger.WebPAC.Normalize.=DEBUG
#log4perl.logger.WebPAC.Normalize.new=DEBUG
use WebPAC::Common;
use base qw/WebPAC::Common/;
use Data::Dump qw/dump/;
-use Encode qw/from_to/;
+use Encode qw/decode from_to/;
=head1 NAME
WebPAC::Input - read different file formats into WebPAC
-=head1 VERSION
-
-Version 0.18
-
=cut
-our $VERSION = '0.18';
+our $VERSION = '0.19';
=head1 SYNOPSIS
my $db = new WebPAC::Input(
module => 'WebPAC::Input::MARC',
- encoding => 'ISO-8859-2',
recode => 'char pairs',
no_progress_bar => 1,
input_config => {
C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
L<WebPAC::Input::MARC>.
-Optional parametar C<encoding> specify application code page (which will be
-used internally). This should probably be your terminal encoding, and by
-default, it C<ISO-8859-2>.
-
C<recode> is optional string constisting of character or words pairs that
should be replaced in input stream.
my $log = $self->_get_logger;
- $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup});
- $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup});
- $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if ($self->{low_mem});
+ $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page};
+ $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding};
+ $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if $self->{lookup};
+ $log->logconfess("low_mem argument is not suppored any more. rewrite it to load_row and save_row") if $self->{low_mem};
$log->logconfess("specify low-level file format module") unless ($self->{module});
my $module_path = $self->{module};
require $module_path;
- $self->{'encoding'} ||= 'ISO-8859-2';
-
$self ? return $self : return undef;
}
$input->open(
path => '/path/to/database/file',
- code_page => 'cp852',
+ input_encoding => 'cp852',
+ strict_encoding => 0,
limit => 500,
offset => 6000,
stats => 1,
);
-By default, C<code_page> is assumed to be C<cp852>.
+By default, C<input_encoding> is assumed to be C<cp852>.
C<offset> is optional parametar to position at some offset before reading from database.
C<save_row> and C<load_row> are low-level implementation of store engine. Calling convention
is documented in example above.
+C<strict_encoding> should really default to 1, but it doesn't for now.
+
Returns size of database, regardless of C<offset> and C<limit>
parametars, see also C<size>.
my $arg = {@_};
my $log = $self->_get_logger();
+ $log->debug( "arguments: ",dump( $arg ));
+ $log->logconfess("encoding argument is not suppored any more.") if $self->{encoding};
+ $log->logconfess("code_page argument is not suppored any more.") if $self->{code_page};
$log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
$log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
$log->debug( $arg->{lookup_coderef} ? '' : 'not ', "using lookup_coderef");
$log->logcroak("need path") if (! $arg->{'path'});
- my $code_page = $arg->{'code_page'} || 'cp852';
+ my $input_encoding = $arg->{'input_encoding'} || $self->{'input_encoding'} || 'cp852';
# store data in object
- $self->{'input_code_page'} = $code_page;
foreach my $v (qw/path offset limit/) {
$self->{$v} = $arg->{$v} if ($arg->{$v});
}
# filter => sub {
# my ($l,$f_nr) = @_;
# return unless defined($l);
-# from_to($l, $code_page, $self->{'encoding'});
+# $l = decode($input_encoding, $l);
# $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
# return $l;
# },
# store size for later
$self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
- $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : '');
+ my $strict_encoding = $arg->{strict_encoding} || $self->{strict_encoding}; ## FIXME should be 1 really
+
+ $log->info("processing $self->{size}/$size records [$from_rec-$to_rec]",
+ " encoding $input_encoding ", $strict_encoding ? ' [strict]' : '',
+ $self->{stats} ? ' [stats]' : '',
+ );
# read database
for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
$log->debug("-=> $f_nr ## $l");
# codepage conversion and recode_regex
- from_to($l, $code_page, $self->{'encoding'});
+# $l = decode($input_encoding, $l, 1);
+ from_to( $l, $input_encoding, 'utf-8', 1 );
$l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
# apply regexps
my $self = shift;
return $self->{size};
}
+
+sub default_encoding { 'UTF-16' }
+
=head1 AUTHOR
Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
use Storable qw/dclone/;
use Pod::Usage qw/pod2usage/;
-use Proc::Queue size => 1;
use POSIX ":sys_wait_h"; # imports WNOHANG
=head1 NAME
my @links;
if ($parallel) {
+ eval 'use Proc::Queue size => 1;';
+ die $@ if $@;
$log->info("Using $parallel processes for speedup");
Proc::Queue::size($parallel);
}
$log->info("database $database doesn't have inputs defined");
} );
- if ( -e 'out/debug' ) { # fixme flag?
+ if ( -e 'out/debug' ) { # FIXME flag?
my $out;
foreach my $i ( @inputs ) {
warn dump( $i );
my $input_db = new WebPAC::Input(
module => $input_module,
- encoding => $config->webpac('webpac_encoding'),
limit => $limit || $input->{limit},
offset => $offset,
recode => $input->{recode},
my $maxmfn = $input_db->open(
path => $input->{path},
- code_page => $input->{encoding}, # database encoding
+ input_encoding => $input->{encoding}, # database encoding
lookup_coderef => $lookup_coderef,
lookup => $lookup_jar,
%{ $input },
use strict;
use blib;
-use Test::More tests => 45;
+use Test::More tests => 49;
BEGIN {
use_ok( 'WebPAC::Test' );
my $module = 'WebPAC::Input::Excel';
diag "testing with $module";
-our ($input,$db,$size);
+ok(my $input = new WebPAC::Input(
+ module => $module,
+ no_progress_bar => 1,
+ %LOG,
+), "new");
-sub open_xls {
+our ($db,$size);
- ok($input = new WebPAC::Input(
- module => $module,
- no_progress_bar => 1,
- %LOG,
- ), "new $module");
+sub open_xls {
+ my $args = shift;
- my $path = shift;
- $path = "$abs_path/data/$path";
- ok(my $db = $input->open( path => $path ), "open $path");
+ $args->{path} = "$abs_path/data/" . $args->{path} || die "no path?";
+ ok(my $db = $input->open( %$args ), "open " . dump( $args ));
ok($size = $input->size, "size $size");
return $db;
}
-
-open_xls('excel_95.xls');
+open_xls({ path => 'excel_95.xls' });
foreach my $mfn ( 1 ... $size ) {
my $rec = $input->fetch;
diag "rec: ", dump($rec), "\n" if $debug;
}
-open_xls('stara-signatura.xls');
-
-foreach my $mfn ( 20 .. 23 ) {
- diag dump( $input->fetch( $mfn ) ) if $debug;
-}
+open_xls({ path => 'stara-signatura.xls', input_encoding => 'utf-16' });
+ok( $input->seek( 790 ), 'seek 790');
+ok( my $rec = $input->fetch, 'fetch' );
+diag dump( $rec );
diag dump( @expected ) if $debug;
-ok(my $input = WebPAC::Input->new( module => 'WebPAC::Input::Excel', no_progress_bar => 1, %LOG ), 'new input' );
+ok(my $input = WebPAC::Input->new( module => 'WebPAC::Input::Excel', no_progress_bar => 1, encoding => 'utf-16', %LOG ), 'new input' );
ok(my $db = $input->open( path => $path ), "input->open $path");
cmp_ok( $input->size, '==', $#expected, "same size" );
my $ds = shift @expected;
my $rec = $input->fetch;
diag dump( $ds, $rec ) if $debug;
- is_deeply( $ds, $rec, "$mfn same?" );
+ is_deeply( $rec, $ds, "$mfn same?" );
}
ok(my $isis = new WebPAC::Input(
module => 'WebPAC::Input::ISIS',
- code_page => 'ISO-8859-2', # application encoding
limit => 100,
no_progress_bar => 1,
), "new Input::ISIS");
ok(my $maxmfn = $isis->open(
path => $isis_file,
- code_page => 'cp852', # database encoding
+ input_encoding => 'cp852', # database encoding
lookup_coderef => sub {
my $rec = shift || return;
ok($rec, 'lookup_coderef has rec');