From: Dobrica Pavlinusic Date: Tue, 20 Nov 2007 09:30:56 +0000 (+0000) Subject: r1645@llin: dpavlin | 2007-11-19 23:05:23 +0100 X-Git-Url: http://git.rot13.org/?p=webpac2;a=commitdiff_plain;h=a9d71ffece93d473c02e612d5b9593c2c7778ee0 r1645@llin: dpavlin | 2007-11-19 23:05:23 +0100 added experimenal (still not working) WebPAC::Input::PDF git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@1054 07558da8-63fa-0310-ba24-9fe276d99e06 --- diff --git a/Makefile.PL b/Makefile.PL index c39e84c..ad6aca0 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -46,6 +46,13 @@ features( ], ); +features( + 'WebPAC::Input::PDF' => [ + -default => 0, + recommends('CAM::PDF'), + ], +); + features( 'WebPAC::Output::Estraier' => [ -default => 0, diff --git a/conf/log.conf b/conf/log.conf index 507930e..a9eedcc 100644 --- a/conf/log.conf +++ b/conf/log.conf @@ -41,6 +41,7 @@ log4perl.rootLogger=INFO, LOG, SCREEN #log4perl.logger.WebPAC.Input.DBF=DEBUG #log4perl.logger.WebPAC.Input.ISI=DEBUG #log4perl.logger.WebPAC.Input.XML=DEBUG +log4perl.logger.WebPAC.Input.PDF=DEBUG #log4perl.logger.WebPAC.Normalize.=DEBUG #log4perl.logger.WebPAC.Normalize.new=DEBUG diff --git a/lib/WebPAC/Input/PDF.pm b/lib/WebPAC/Input/PDF.pm new file mode 100644 index 0000000..91b9920 --- /dev/null +++ b/lib/WebPAC/Input/PDF.pm @@ -0,0 +1,180 @@ +package WebPAC::Input::PDF; + +use warnings; +use strict; + +use WebPAC::Input; +use base qw/WebPAC::Common/; + +use CAM::PDF; +use Carp qw/confess/; + +use Data::Dump qw/dump/; + +=head1 NAME + +WebPAC::Input::PDF - try to parse PDF tabular data + +=head1 SYNOPSIS + +Open PBF file in PDF export fromat + + my $input = new WebPAC::Input::PDF( + path => '/path/to/file.pdf', + ); + +=head1 FUNCTIONS + +=head2 new + +Returns new low-level input API object + + my $input = new WebPAC::Input::PDF( + path => '/path/to/file.pdf' + filter => sub { + my ($l,$field_nr) = @_; + # do something with $l which is line of input file + return $l; + }, + } + +Options: + +=over 4 + +=item path + +path to PDF file + +=back + +=cut + +my $verbose = 1; + +sub new { + my $class = shift; + my $self = {@_}; + bless($self, $class); + + my $arg = {@_}; + + my $log = $self->_get_logger(); + + my $file = $arg->{path} || $log->logide("need path"); + + my $doc = CAM::PDF->new($file) || $log->logdie( $CAM::PDF::errstr ); + + my $pages = $doc->numPages(); + + $log->info("opend $file with $pages pages"); + + my @lines; + + foreach my $p ( 1 .. $pages ) { + my $tree = $doc->getPageContentTree($p); + if ($tree) { + my $out; + + confess "expect array for blocks" unless ref($tree->{blocks}) eq 'ARRAY'; + + foreach my $blocks ( @{ $tree->{blocks} } ) { + foreach my $block ( $blocks ) { + next unless defined $block->{value}; + foreach my $value ( $block->{value} ) { + confess "expect array for value" unless ref($value) eq 'ARRAY'; + foreach my $v ( @$value ) { + next unless defined $v->{args}; +#warn "## v ",ref($v),dump( $v ); + my @data; + foreach my $args ( $v->{args} ) { +#warn "## args ",ref($args),dump( $args ); + confess "expect array for args" unless ref($args) eq 'ARRAY'; + foreach my $a ( @$args ) { + if ( $a->{type} eq 'array' ) { +#warn "## a ",ref($a),dump( $a ); + foreach my $av ( @{ $a->{value} } ) { + next unless $av->{type} eq 'string'; +#warn "## av ",ref($av),dump( $av ); + push @data, $av->{value}; + } + } elsif ( $a->{type} eq 'string' ) { + push @data, $a->{value}; + } + } + next unless @data; + warn "data $#data = ",dump(@data); + ## FIXME data specific! + if ( $#data == 4 ) { + push @lines, [ @data ]; + } elsif ( $#data == 0 && $#lines >= 0 ) { + my $v = shift @data; + warn "add $#lines to ",dump( $lines[ $#lines ]->[4] ); + $lines[ $#lines ]->[4] = $lines[ $#lines ]->[4] . ' ' . $v; + warn "added to ",dump( $lines[ $#lines ] ); + } else { + $log->warn("ignored: ",dump( @data )); + } + } + } + } + } + } + } + } + + $self->size( $#lines ); + + $log->debug("loaded ", $self->size, " records", sub { dump( @lines ) }); + + $self ? return $self : return undef; +} + +=head2 fetch_rec + +Return record with ID C<$mfn> from database + + my $rec = $input->fetch_rec( $mfn, $filter_coderef ); + +=cut + +sub fetch_rec { + my $self = shift; + + my ( $mfn, $filter_coderef ) = @_; + + return $self->{_rec}->[$mfn-1]; +} + + +=head2 size + +Return number of records in database + + my $size = $input->size; + +=cut + +sub size { + my $self = shift; + return $#{$self->{_rec}} + 1; +} + +=head1 SEE ALSO + +L is only sane source of document format which Google could find... + +=head1 AUTHOR + +Dobrica Pavlinusic, C<< >> + +=head1 COPYRIGHT & LICENSE + +Copyright 2007 Dobrica Pavlinusic, All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it +under the same terms as Perl itself. + +=cut + +1; # End of WebPAC::Input::PDF diff --git a/t/2-input-pdf.t b/t/2-input-pdf.t new file mode 100755 index 0000000..c0098ba --- /dev/null +++ b/t/2-input-pdf.t @@ -0,0 +1,39 @@ +#!/usr/bin/perl -w + +use strict; +use blib; + +use Test::More tests => 9; + +BEGIN { +use_ok( 'WebPAC::Test' ); +use_ok( 'WebPAC::Input' ); +} + +my $module = 'WebPAC::Input::PDF'; +diag "testing with $module"; + +ok(my $input = new WebPAC::Input( + module => $module, + no_progress_bar => 1, + %LOG +), "new"); + +ok(my $db = $input->open( + path => "$abs_path/data/UFOReport1998.pdf" +), "open"); +ok(my $size = $input->size, "size"); + +foreach my $mfn ( 1 ... $size ) { + my $rec = $input->fetch; + if ($mfn <= 10 || $mfn == 20) { + ok($rec, "fetch $mfn"); + } else { + ok(! $rec, "empty $mfn"); + } + + cmp_ok($input->pos, '==', $mfn, "pos $mfn"); + + diag "rec: ", dump($rec), "\n" if $debug; +} +