r1137@llin: dpavlin | 2006-11-05 15:51:19 +0100
[webpac2] / lib / WebPAC / Input / Gutenberg.pm
1 package WebPAC::Input::Gutenberg;
2
3 use warnings;
4 use strict;
5
6 use WebPAC::Input;
7 use base qw/WebPAC::Common/;
8 use XML::LibXML;
9 use Data::Dump qw/dump/;
10 use Encode qw/encode_utf8/;
11
12 =head1 NAME
13
14 WebPAC::Input::Gutenberg - support for RDF catalog data from Project Gutenberg
15
16 =head1 VERSION
17
18 Version 0.01
19
20 =cut
21
22 our $VERSION = '0.01';
23
24
25 =head1 SYNOPSIS
26
27 Read catalog data from Project Gutemberg (uncompressed!) and create
28 pseudo-MARC records from them.
29
30  my $ll_db = new WebPAC::Input::Gutenberg(
31         path => '/path/to/catalog.rdf',
32  );
33
34 =head1 FUNCTIONS
35
36 =head2 new
37
38 Returns new low-level input API object
39
40   my $ll_db = new WebPAC::Input::Gutenberg(
41         path => '/path/to/catalog.rdf'
42         filter => sub {
43                 my ($l,$field_nr) = @_;
44                 # do something with $l which is line of input file
45                 return $l;
46         },
47   }
48
49 Options:
50
51 =over 4
52
53 =item path
54
55 path to Project Gutenberg RDF catalog file
56
57 =back
58
59 =cut
60
61 sub new {
62         my $class = shift;
63         my $self = {@_};
64         bless($self, $class);
65
66         my $arg = {@_};
67
68         my $log = $self->_get_logger();
69
70         $log->info("opening Project Gutenberg RDF catalog '$arg->{path}'");
71
72         my $parser = XML::LibXML->new ();
73         $parser->keep_blanks (0);
74         my $doc = $parser->parse_file( $arg->{path} );
75
76         $log->info("parsing over, finding book nodes");
77         my $booknodes = $doc->findnodes ('/rdf:RDF/pgterms:etext');
78
79         $log->logdie("can't find any book nodes in RDF '$arg->{path}'") unless ($booknodes->size > 0);
80
81         my $mapping = [
82                 [ 'dc:title//text()',           '200', 'a' ],
83                 [ 'dc:creator//text()',         '700', 'a' ],
84                 [ 'dc:alternative//text()',     '740', 'a' ],
85                 [ 'dc:subject//text()',         '650', 'a' ],
86                 [ 'dc:contributor//text()',     '700', 'a' ],
87                 [ 'dc:created//text()',         '533', 'd' ],
88                 [ 'dc:description//text()',     '500', 'a' ],
89                 [ 'dc:language//text()',        '041', 'a' ],
90         ];
91
92         $log->info("found ", $booknodes->size, " book nodes, processing");
93
94         my $mfn = 1;
95
96         foreach my $booknode (@$booknodes) {
97
98                 # this is a book description node
99                 my $etext_no = $booknode->getAttribute ('ID');
100                 $etext_no =~ s/^etext//;
101
102                 my $row = {
103                         '001' => [ $etext_no ],
104                 };
105
106                 foreach my $m ( @$mapping ) {
107                         my ($xpath,$f,$sf) = @$m;
108
109                         foreach my $v ($booknode->findnodes($xpath)) {
110                                 push @{ $row->{$f} }, '^' . $sf . encode_utf8( $v->textContent );
111                         }
112
113                         $log->debug("using $xpath to fill $f^$sf ==> ", dump( $row->{$f} )) if (defined( $row->{$f} ));
114                 }
115
116                 $self->{_rows}->{ $mfn } = $row;
117                 $log->debug("created row $mfn ", dump( $row ));
118
119                 $mfn++;
120         }
121         $booknodes = undef; # release some memory
122
123         $self->{size} = $mfn - 1;
124
125         $log->info("created ", $self->{size}, " records for ", $arg->{path});
126
127         $self ? return $self : return undef;
128 }
129
130 =head2 fetch_rec
131
132 Return record with ID C<$mfn> from database
133
134   my $rec = $ll_db->fetch_rec( $mfn, $filter_coderef );
135
136 =cut
137
138 sub fetch_rec {
139         my $self = shift;
140
141         my ($mfn, $filter_coderef) = @_;
142
143         my $rec = $self->_to_hash(
144                 mfn => $mfn,
145                 row => $self->{_rows}->{$mfn},
146                 hash_filter => $filter_coderef,
147         );
148
149         my $log = $self->_get_logger();
150         $log->debug("fetch_rec($mfn) = ", dump($rec));
151
152         return $rec;
153 }
154
155 =head2 size
156
157 Return number of records in database
158
159   my $size = $ll_db->size;
160
161 =cut
162
163 sub size {
164         my $self = shift;
165         return $self->{size};
166 }
167
168 =head2 _to_hash
169
170 Return hash from row. Taken from L<Biblio::Isis>
171
172   my $rec = $ll_db->_to_hash(
173         mfn => $mfn;
174         $row
175   );
176
177 =cut
178
179 sub _to_hash {
180         my $self = shift;
181
182         my $arg = {@_};
183
184         my $log = $self->_get_logger();
185
186         my $hash_filter = $arg->{hash_filter};
187         my $mfn = $arg->{mfn} || $log->logconfess("need mfn in arguments");
188         my $row = $arg->{row} || $log->logconfess("need row in arguments");
189
190         # init record to include MFN as field 000
191         my $rec = { '000' => [ $mfn ] };
192
193         foreach my $f_nr (keys %{$row}) {
194                 foreach my $l (@{$row->{$f_nr}}) {
195
196                         # filter output
197                         $l = $hash_filter->($l, $f_nr) if ($hash_filter);
198                         next unless defined($l);
199
200                         my $val;
201                         my $r_sf;       # repeatable subfields in this record
202
203                         # has subfields?
204                         if ($l =~ m/\^/) {
205                                 foreach my $t (split(/\^/,$l)) {
206                                         next if (! $t);
207                                         my ($sf,$v) = (substr($t,0,1), substr($t,1));
208                                         next unless (defined($v) && $v ne '');
209
210                                         if (ref( $val->{$sf} ) eq 'ARRAY') {
211
212                                                 push @{ $val->{$sf} }, $v;
213
214                                                 # record repeatable subfield it it's offset
215                                                 push @{ $val->{subfields} }, ( $sf, $#{ $val->{$sf} } );
216                                                 $r_sf->{$sf}++;
217
218                                         } elsif (defined( $val->{$sf} )) {
219
220                                                 # convert scalar field to array
221                                                 $val->{$sf} = [ $val->{$sf}, $v ];
222
223                                                 push @{ $val->{subfields} }, ( $sf, 1 );
224                                                 $r_sf->{$sf}++;
225
226                                         } else {
227                                                 $val->{$sf} = $v;
228                                                 push @{ $val->{subfields} }, ( $sf, 0 );
229                                         }
230                                 }
231                         } else {
232                                 $val = $l;
233                         }
234
235                         push @{$rec->{$f_nr}}, $val;
236                 }
237         }
238
239         return $rec;
240 }
241
242 =head1 AUTHOR
243
244 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
245
246 =head1 COPYRIGHT & LICENSE
247
248 Copyright 2006 Dobrica Pavlinusic, All Rights Reserved.
249
250 This program is free software; you can redistribute it and/or modify it
251 under the same terms as Perl itself.
252
253 =cut
254
255 1; # End of WebPAC::Input::Gutenberg