r1650@llin: dpavlin | 2007-11-20 11:07:57 +0100
[webpac2] / lib / WebPAC / Input / PDF.pm
1 package WebPAC::Input::PDF;
2
3 use warnings;
4 use strict;
5
6 use WebPAC::Input;
7 use base qw/WebPAC::Common/;
8
9 use CAM::PDF;
10 use Carp qw/confess/;
11
12 use Data::Dump qw/dump/;
13
14 =head1 NAME
15
16 WebPAC::Input::PDF - try to parse PDF tabular data
17
18 =head1 SYNOPSIS
19
20 Open PBF file in PDF export fromat
21
22  my $input = new WebPAC::Input::PDF(
23         path => '/path/to/file.pdf',
24  );
25
26 =head1 FUNCTIONS
27
28 =head2 new
29
30 Returns new low-level input API object
31
32   my $input = new WebPAC::Input::PDF(
33         path => '/path/to/file.pdf'
34         filter => sub {
35                 my ($l,$field_nr) = @_;
36                 # do something with $l which is line of input file
37                 return $l;
38         },
39   }
40
41 Options:
42
43 =over 4
44
45 =item path
46
47 path to PDF file
48
49 =back
50
51 =cut
52
53 my $verbose = 1;
54
55 sub new {
56         my $class = shift;
57         my $self = {@_};
58         bless($self, $class);
59
60         my $arg = {@_};
61
62         my $log = $self->_get_logger();
63
64         my $file = $arg->{path} || $log->logide("need path");
65
66         my $doc = CAM::PDF->new($file) || $log->logdie( $CAM::PDF::errstr );
67
68         my $pages = $doc->numPages();
69
70         $log->info("opend $file with $pages pages");
71
72         my @lines = ();
73
74         foreach my $p ( 1 .. $pages ) {
75                 my $tree = $doc->getPageContentTree($p);
76                 if ($tree) {
77                         my $out;
78
79                         confess "expect array for blocks" unless ref($tree->{blocks}) eq 'ARRAY';
80
81                         foreach my $blocks ( @{ $tree->{blocks} } ) {
82                                 foreach my $block ( $blocks ) {
83                                         next unless defined $block->{value};
84                                         foreach my $value ( $block->{value} ) {
85                                         confess "expect array for value" unless ref($value) eq 'ARRAY';
86                                                 foreach my $v ( @$value ) {
87                                                         next unless defined $v->{args};
88 #warn "## v ",ref($v),dump( $v );
89                                                         my @data;
90                                                         foreach my $args ( $v->{args} ) {
91 #warn "## args ",ref($args),dump( $args );
92                                                                 confess "expect array for args" unless ref($args) eq 'ARRAY';
93                                                                 foreach my $a ( @$args ) {
94                                                                         if ( $a->{type} eq 'array' ) {
95 #warn "## a ",ref($a),dump( $a );
96                                                                                 foreach my $av ( @{ $a->{value} } ) {
97                                                                                         next unless $av->{type} eq 'string';
98 #warn "## av ",ref($av),dump( $av );
99                                                                                         push @data, $av->{value};
100                                                                                 }
101                                                                         } elsif ( $a->{type} eq 'string' ) {
102                                                                                 push @data, $a->{value};
103                                                                         }
104                                                                 }
105                                                                 next unless @data;
106                                                                 warn "data $#data = ",dump(@data);
107                                                                 ## FIXME data specific!
108                                                                 if ( $#data == 4 ) {
109                                                                         push @lines, [ @data ];
110                                                                 } elsif ( $#data == 0 && $#lines >= 0 ) {
111                                                                         my $v = shift @data;
112                                                                         warn "add $#lines to ",dump( $lines[ $#lines ]->[4] );
113                                                                         $lines[ $#lines ]->[4] = $lines[ $#lines ]->[4] . ' ' . $v;
114                                                                         warn "added to ",dump( $lines[ $#lines ] );
115                                                                 } else {
116                                                                         $log->warn("ignored: ",dump( @data ));
117                                                                 }
118                                                         }
119                                                 }
120                                         }
121                                 }
122                         }
123                 }
124         }
125
126         $self->{_lines} = \@lines;
127
128         $log->debug("loaded ", $self->size, " records", sub { dump( @lines ) });
129
130         $self ? return $self : return undef;
131 }
132
133 =head2 fetch_rec
134
135 Return record with ID C<$mfn> from database
136
137   my $rec = $input->fetch_rec( $mfn, $filter_coderef );
138
139 Records are returned as field C<A>, C<B> and so on...
140
141 Last supported column is C<ZZ>.
142
143 =cut
144
145 sub fetch_rec {
146         my $self = shift;
147
148         my ( $mfn, $filter_coderef ) = @_;
149
150         my $rec = {
151                 '000' => [ $mfn ],
152         };
153
154         my $line = $self->{_lines}->[ $mfn - 1 ] || return;
155         confess "expected ARRAY for _lines $mfn" unless ref($line) eq 'ARRAY';
156
157 #       warn "## line = ",dump( $line );
158
159         my $col = 'A';
160         my $c = 0;
161         foreach my $e ( @$line ) {
162                 $rec->{$col} = $e;
163                 $c++;
164                 # FIXME what about columns > ZZ
165                 if ( $col eq 'Z' ) {
166                         $col .= 'AA';
167                 } elsif ( $col eq 'ZZ' ) {
168                         $self->_get_logger()->logwarn("ignoring colums above ZZ (original ", $#$line + 1, " > $c max columns)");
169                         last;
170                 } elsif ( $col =~ m/([A-Z])Z$/ ) {
171                         $col .= $1++ . 'A';
172                 } else {
173                         $col++;
174                 }
175         }
176
177 #       warn "## rec = ",dump( $rec );
178
179         return $rec;
180 }
181
182
183 =head2 size
184
185 Return number of records in database
186
187   my $size = $input->size;
188
189 =cut
190
191 sub size {
192         my $self = shift;
193         return $#{$self->{_lines}} + 1;
194 }
195
196 =head1 SEE ALSO
197
198 L<http://isibasic.com/help/helpprn.html> is only sane source of document format which Google could find...
199  
200 =head1 AUTHOR
201
202 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
203
204 =head1 COPYRIGHT & LICENSE
205
206 Copyright 2007 Dobrica Pavlinusic, All Rights Reserved.
207
208 This program is free software; you can redistribute it and/or modify it
209 under the same terms as Perl itself.
210
211 =cut
212
213 1; # End of WebPAC::Input::PDF