1 package WebPAC::Input::PDF;
7 use base qw/WebPAC::Common/;
12 use Data::Dump qw/dump/;
16 WebPAC::Input::PDF - try to parse PDF tabular data
20 Open PDF file in PDF export format
22 my $input = new WebPAC::Input::PDF(
23 path => '/path/to/file.pdf',
30 Returns new low-level input API object
32 my $input = new WebPAC::Input::PDF(
33 path => '/path/to/file.pdf'
35 my ($l,$field_nr) = @_;
36 # do something with $l, which is a line of the input file
62 my $log = $self->_get_logger();
64 my $file = $arg->{path} || $log->logdie("need path"); # path is mandatory: abort through the logger when it is missing
66 my $doc = CAM::PDF->new($file) || $log->logdie( $CAM::PDF::errstr );
68 my $pages = $doc->numPages();
70 $log->info("opend $file with $pages pages");
74 foreach my $p ( 1 .. $pages ) {
75 my $tree = $doc->getPageContentTree($p);
79 confess "expect array for blocks" unless ref($tree->{blocks}) eq 'ARRAY';
81 foreach my $blocks ( @{ $tree->{blocks} } ) {
82 foreach my $block ( $blocks ) {
83 next unless defined $block->{value};
84 foreach my $value ( $block->{value} ) {
85 confess "expect array for value" unless ref($value) eq 'ARRAY';
86 foreach my $v ( @$value ) {
87 next unless defined $v->{args};
88 #warn "## v ",ref($v),dump( $v );
90 foreach my $args ( $v->{args} ) {
91 #warn "## args ",ref($args),dump( $args );
92 confess "expect array for args" unless ref($args) eq 'ARRAY';
93 foreach my $a ( @$args ) {
94 if ( $a->{type} eq 'array' ) {
95 #warn "## a ",ref($a),dump( $a );
96 foreach my $av ( @{ $a->{value} } ) {
97 next unless $av->{type} eq 'string';
98 #warn "## av ",ref($av),dump( $av );
99 push @data, $av->{value};
101 } elsif ( $a->{type} eq 'string' ) {
102 push @data, $a->{value};
106 warn "data $#data = ",dump(@data);
107 ## FIXME data specific!
109 push @lines, [ @data ];
110 } elsif ( $#data == 0 && $#lines >= 0 ) {
112 warn "add $#lines to ",dump( $lines[ $#lines ]->[4] );
113 $lines[ $#lines ]->[4] = $lines[ $#lines ]->[4] . ' ' . $v;
114 warn "added to ",dump( $lines[ $#lines ] );
116 $log->warn("ignored: ",dump( @data ));
126 $self->{_lines} = \@lines;
128 $log->debug("loaded ", $self->size, " records", sub { dump( @lines ) });
130 $self ? return $self : return undef;
135 Return record with ID C<$mfn> from database
137 my $rec = $input->fetch_rec( $mfn, $filter_coderef );
139 Records are returned as fields C<A>, C<B> and so on...
141 Last supported column is C<ZZ>.
148 my ( $mfn, $filter_coderef ) = @_;
154 my $line = $self->{_lines}->[ $mfn - 1 ] || return;
155 confess "expected ARRAY for _lines $mfn" unless ref($line) eq 'ARRAY';
157 # warn "## line = ",dump( $line );
161 foreach my $e ( @$line ) {
164 # FIXME what about columns > ZZ
167 } elsif ( $col eq 'ZZ' ) {
168 $self->_get_logger()->logwarn("ignoring colums above ZZ (original ", $#$line + 1, " > $c max columns)");
170 } elsif ( $col =~ m/([A-Z])Z$/ ) {
177 # warn "## rec = ",dump( $rec );
185 Return number of records in database
187 my $size = $input->size;
193 return $#{$self->{_lines}} + 1;
198 L<http://isibasic.com/help/helpprn.html> is the only sane source on the document format that Google could find...
202 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
204 =head1 COPYRIGHT & LICENSE
206 Copyright 2007 Dobrica Pavlinusic, All Rights Reserved.
208 This program is free software; you can redistribute it and/or modify it
209 under the same terms as Perl itself.
213 1; # End of WebPAC::Input::PDF