r1645@llin: dpavlin | 2007-11-19 23:05:23 +0100
authorDobrica Pavlinusic <dpavlin@rot13.org>
Tue, 20 Nov 2007 09:30:56 +0000 (09:30 +0000)
committerDobrica Pavlinusic <dpavlin@rot13.org>
Tue, 20 Nov 2007 09:30:56 +0000 (09:30 +0000)
 added experimenal (still not working) WebPAC::Input::PDF

git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@1054 07558da8-63fa-0310-ba24-9fe276d99e06

Makefile.PL
conf/log.conf
lib/WebPAC/Input/PDF.pm [new file with mode: 0644]
t/2-input-pdf.t [new file with mode: 0755]

index c39e84c..ad6aca0 100644 (file)
@@ -46,6 +46,13 @@ features(
        ],
 );
 
+features(
+       'WebPAC::Input::PDF' => [
+               -default => 0,
+               recommends('CAM::PDF'),
+       ],
+);
+
 features(
        'WebPAC::Output::Estraier' => [
                -default => 0,
index 507930e..a9eedcc 100644 (file)
@@ -41,6 +41,7 @@ log4perl.rootLogger=INFO, LOG, SCREEN
 #log4perl.logger.WebPAC.Input.DBF=DEBUG
 #log4perl.logger.WebPAC.Input.ISI=DEBUG
 #log4perl.logger.WebPAC.Input.XML=DEBUG
+log4perl.logger.WebPAC.Input.PDF=DEBUG
 
 #log4perl.logger.WebPAC.Normalize.=DEBUG
 #log4perl.logger.WebPAC.Normalize.new=DEBUG
diff --git a/lib/WebPAC/Input/PDF.pm b/lib/WebPAC/Input/PDF.pm
new file mode 100644 (file)
index 0000000..91b9920
--- /dev/null
@@ -0,0 +1,180 @@
+package WebPAC::Input::PDF;
+
+use warnings;
+use strict;
+
+use WebPAC::Input;
+use base qw/WebPAC::Common/;
+
+use CAM::PDF;
+use Carp qw/confess/;
+
+use Data::Dump qw/dump/;
+
+=head1 NAME
+
+WebPAC::Input::PDF - try to parse PDF tabular data
+
+=head1 SYNOPSIS
+
+Open PBF file in PDF export fromat
+
+ my $input = new WebPAC::Input::PDF(
+       path => '/path/to/file.pdf',
+ );
+
+=head1 FUNCTIONS
+
+=head2 new
+
+Returns new low-level input API object
+
+  my $input = new WebPAC::Input::PDF(
+       path => '/path/to/file.pdf'
+       filter => sub {
+               my ($l,$field_nr) = @_;
+               # do something with $l which is line of input file
+               return $l;
+       },
+  }
+
+Options:
+
+=over 4
+
+=item path
+
+path to PDF file
+
+=back
+
+=cut
+
+my $verbose = 1;
+
+sub new {
+       my $class = shift;
+       my $self = {@_};
+       bless($self, $class);
+
+       my $arg = {@_};
+
+       my $log = $self->_get_logger();
+
+       my $file = $arg->{path} || $log->logide("need path");
+
+       my $doc = CAM::PDF->new($file) || $log->logdie( $CAM::PDF::errstr );
+
+       my $pages = $doc->numPages();
+
+       $log->info("opend $file with $pages pages");
+
+       my @lines;
+
+       foreach my $p ( 1 .. $pages ) {
+               my $tree = $doc->getPageContentTree($p);
+               if ($tree) {
+                       my $out;
+
+                       confess "expect array for blocks" unless ref($tree->{blocks}) eq 'ARRAY';
+
+                       foreach my $blocks ( @{ $tree->{blocks} } ) {
+                               foreach my $block ( $blocks ) {
+                                       next unless defined $block->{value};
+                                       foreach my $value ( $block->{value} ) {
+                                       confess "expect array for value" unless ref($value) eq 'ARRAY';
+                                               foreach my $v ( @$value ) {
+                                                       next unless defined $v->{args};
+#warn "## v ",ref($v),dump( $v );
+                                                       my @data;
+                                                       foreach my $args ( $v->{args} ) {
+#warn "## args ",ref($args),dump( $args );
+                                                               confess "expect array for args" unless ref($args) eq 'ARRAY';
+                                                               foreach my $a ( @$args ) {
+                                                                       if ( $a->{type} eq 'array' ) {
+#warn "## a ",ref($a),dump( $a );
+                                                                               foreach my $av ( @{ $a->{value} } ) {
+                                                                                       next unless $av->{type} eq 'string';
+#warn "## av ",ref($av),dump( $av );
+                                                                                       push @data, $av->{value};
+                                                                               }
+                                                                       } elsif ( $a->{type} eq 'string' ) {
+                                                                               push @data, $a->{value};
+                                                                       }
+                                                               }
+                                                               next unless @data;
+                                                               warn "data $#data = ",dump(@data);
+                                                               ## FIXME data specific!
+                                                               if ( $#data == 4 ) {
+                                                                       push @lines, [ @data ];
+                                                               } elsif ( $#data == 0 && $#lines >= 0 ) {
+                                                                       my $v = shift @data;
+                                                                       warn "add $#lines to ",dump( $lines[ $#lines ]->[4] );
+                                                                       $lines[ $#lines ]->[4] = $lines[ $#lines ]->[4] . ' ' . $v;
+                                                                       warn "added to ",dump( $lines[ $#lines ] );
+                                                               } else {
+                                                                       $log->warn("ignored: ",dump( @data ));
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+       $self->size( $#lines );
+
+       $log->debug("loaded ", $self->size, " records", sub { dump( @lines ) });
+
+       $self ? return $self : return undef;
+}
+
+=head2 fetch_rec
+
+Return record with ID C<$mfn> from database
+
+  my $rec = $input->fetch_rec( $mfn, $filter_coderef );
+
+=cut
+
+sub fetch_rec {
+       my $self = shift;
+
+       my ( $mfn, $filter_coderef ) = @_;
+
+       return $self->{_rec}->[$mfn-1];
+}
+
+
+=head2 size
+
+Return number of records in database
+
+  my $size = $input->size;
+
+=cut
+
+sub size {
+       my $self = shift;
+       return $#{$self->{_rec}} + 1;
+}
+
+=head1 SEE ALSO
+
+L<http://isibasic.com/help/helpprn.html> is only sane source of document format which Google could find...
+=head1 AUTHOR
+
+Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2007 Dobrica Pavlinusic, All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
+
+1; # End of WebPAC::Input::PDF
diff --git a/t/2-input-pdf.t b/t/2-input-pdf.t
new file mode 100755 (executable)
index 0000000..c0098ba
--- /dev/null
@@ -0,0 +1,39 @@
+#!/usr/bin/perl -w
+
+use strict;
+use blib;
+
+use Test::More tests => 9;
+
+BEGIN {
+use_ok( 'WebPAC::Test' );
+use_ok( 'WebPAC::Input' );
+}
+
+my $module = 'WebPAC::Input::PDF';
+diag "testing with $module";
+
+ok(my $input = new WebPAC::Input(
+       module => $module,
+       no_progress_bar => 1,
+       %LOG
+), "new");
+
+ok(my $db = $input->open(
+       path => "$abs_path/data/UFOReport1998.pdf"
+), "open");
+ok(my $size = $input->size, "size");
+
+foreach my $mfn ( 1 ... $size ) {
+       my $rec = $input->fetch;
+       if ($mfn <= 10 || $mfn == 20) {
+               ok($rec, "fetch $mfn");
+       } else {
+               ok(! $rec, "empty $mfn");
+       }
+
+       cmp_ok($input->pos, '==', $mfn, "pos $mfn");
+
+       diag "rec: ", dump($rec), "\n" if $debug;
+}
+