--- /dev/null
+package WebPAC::Input::PDF;
+
+use warnings;
+use strict;
+
+use WebPAC::Input;
+use base qw/WebPAC::Common/;
+
+use CAM::PDF;
+use Carp qw/confess/;
+
+use Data::Dump qw/dump/;
+
+=head1 NAME
+
+WebPAC::Input::PDF - try to parse PDF tabular data
+
+=head1 SYNOPSIS
+
+Open PBF file in PDF export fromat
+
+ my $input = new WebPAC::Input::PDF(
+ path => '/path/to/file.pdf',
+ );
+
+=head1 FUNCTIONS
+
+=head2 new
+
+Returns new low-level input API object
+
+ my $input = new WebPAC::Input::PDF(
+ path => '/path/to/file.pdf'
+ filter => sub {
+ my ($l,$field_nr) = @_;
+ # do something with $l which is line of input file
+ return $l;
+ },
+ }
+
+Options:
+
+=over 4
+
+=item path
+
+path to PDF file
+
+=back
+
+=cut
+
+my $verbose = 1;
+
+sub new {
+ my $class = shift;
+ my $self = {@_};
+ bless($self, $class);
+
+ my $arg = {@_};
+
+ my $log = $self->_get_logger();
+
+ my $file = $arg->{path} || $log->logide("need path");
+
+ my $doc = CAM::PDF->new($file) || $log->logdie( $CAM::PDF::errstr );
+
+ my $pages = $doc->numPages();
+
+ $log->info("opend $file with $pages pages");
+
+ my @lines;
+
+ foreach my $p ( 1 .. $pages ) {
+ my $tree = $doc->getPageContentTree($p);
+ if ($tree) {
+ my $out;
+
+ confess "expect array for blocks" unless ref($tree->{blocks}) eq 'ARRAY';
+
+ foreach my $blocks ( @{ $tree->{blocks} } ) {
+ foreach my $block ( $blocks ) {
+ next unless defined $block->{value};
+ foreach my $value ( $block->{value} ) {
+ confess "expect array for value" unless ref($value) eq 'ARRAY';
+ foreach my $v ( @$value ) {
+ next unless defined $v->{args};
+#warn "## v ",ref($v),dump( $v );
+ my @data;
+ foreach my $args ( $v->{args} ) {
+#warn "## args ",ref($args),dump( $args );
+ confess "expect array for args" unless ref($args) eq 'ARRAY';
+ foreach my $a ( @$args ) {
+ if ( $a->{type} eq 'array' ) {
+#warn "## a ",ref($a),dump( $a );
+ foreach my $av ( @{ $a->{value} } ) {
+ next unless $av->{type} eq 'string';
+#warn "## av ",ref($av),dump( $av );
+ push @data, $av->{value};
+ }
+ } elsif ( $a->{type} eq 'string' ) {
+ push @data, $a->{value};
+ }
+ }
+ next unless @data;
+ warn "data $#data = ",dump(@data);
+ ## FIXME data specific!
+ if ( $#data == 4 ) {
+ push @lines, [ @data ];
+ } elsif ( $#data == 0 && $#lines >= 0 ) {
+ my $v = shift @data;
+ warn "add $#lines to ",dump( $lines[ $#lines ]->[4] );
+ $lines[ $#lines ]->[4] = $lines[ $#lines ]->[4] . ' ' . $v;
+ warn "added to ",dump( $lines[ $#lines ] );
+ } else {
+ $log->warn("ignored: ",dump( @data ));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ $self->size( $#lines );
+
+ $log->debug("loaded ", $self->size, " records", sub { dump( @lines ) });
+
+ $self ? return $self : return undef;
+}
+
+=head2 fetch_rec
+
+Return record with ID C<$mfn> from database
+
+ my $rec = $input->fetch_rec( $mfn, $filter_coderef );
+
+=cut
+
+sub fetch_rec {
+ my $self = shift;
+
+ my ( $mfn, $filter_coderef ) = @_;
+
+ return $self->{_rec}->[$mfn-1];
+}
+
+
+=head2 size
+
+Return number of records in database
+
+ my $size = $input->size;
+
+=cut
+
+sub size {
+ my $self = shift;
+ return $#{$self->{_rec}} + 1;
+}
+
+=head1 SEE ALSO
+
+L<http://isibasic.com/help/helpprn.html> is only sane source of document format which Google could find...
+
+=head1 AUTHOR
+
+Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2007 Dobrica Pavlinusic, All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
+
+1; # End of WebPAC::Input::PDF
--- /dev/null
+#!/usr/bin/perl -w
+
+use strict;
+use blib;
+
+use Test::More tests => 9;
+
+BEGIN {
+use_ok( 'WebPAC::Test' );
+use_ok( 'WebPAC::Input' );
+}
+
+my $module = 'WebPAC::Input::PDF';
+diag "testing with $module";
+
+ok(my $input = new WebPAC::Input(
+ module => $module,
+ no_progress_bar => 1,
+ %LOG
+), "new");
+
+ok(my $db = $input->open(
+ path => "$abs_path/data/UFOReport1998.pdf"
+), "open");
+ok(my $size = $input->size, "size");
+
+foreach my $mfn ( 1 ... $size ) {
+ my $rec = $input->fetch;
+ if ($mfn <= 10 || $mfn == 20) {
+ ok($rec, "fetch $mfn");
+ } else {
+ ok(! $rec, "empty $mfn");
+ }
+
+ cmp_ok($input->pos, '==', $mfn, "pos $mfn");
+
+ diag "rec: ", dump($rec), "\n" if $debug;
+}
+