improvements to WebPAC::Normalize::XML
[webpac2] / lib / WebPAC / Normalize / XML.pm
1 package WebPAC::Normalize::XML;
2
3 use warnings;
4 use strict;
5
6 use base qw/WebPAC::Common/;
7 use Storable;
8 use XML::Simple;
9 use Data::Dumper;
10
11 =head1 NAME
12
13 WebPAC::Normalize::XML - apply XML normalisaton rules
14
15 =head1 VERSION
16
17 Version 0.01
18
19 =cut
20
21 our $VERSION = '0.01';
22
23 =head1 SYNOPSIS
24
25 This module uses C<conf/normalize/*.xml> files to perform normalisation
26 from input records
27
28 =cut
29
30 =head1 FUNCTIONS
31
32 =head2 new
33
34 Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
35 parse it.
36
37  my $n = new WebPAC::Normalize::XML(
38         tag => 'isis',
39         xml_file => '/path/to/conf/normalize/isis.xml',
40         cache_data_structure => './cache/ds/',
41         lookup_regex => $lookup->regex,
42  }
43
44 C<tag> defines tag to use within C<xml_file>
45
46 C<xml_file> defines path to normalize XML.
47
48 Optional parameter C<cache_data_structure> defines path to directory
49 in which cache file for C<data_structure> call will be created.
50
51 Recommended parametar C<lookup_regex> specify ... 
52
53 =cut
54
55 sub new {
56         my $class = shift;
57         my $self = {@_};
58         bless($self, $class);
59
60         $self->setup_cache_dir( $self->{'cache_data_structure'} );
61
62         my $log = $self->_get_logger();
63
64         foreach my $req (qw/tag xml_file/) {
65                 $log->logconfess("need argument $req") unless $self->{$req};
66         }
67
68         my $f = 
69
70         my $xml_file = $self->{'xml_file'};
71
72         $log->info("using $xml_file tag <",$self->{'tag'},">");
73
74         $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);
75
76         $self->{'import_xml_file'} = $xml_file;
77
78         $self->{'import_xml'} = XMLin($f,
79                 ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ],
80         );
81
82         $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
83
84         return $self;
85 }
86
87 =head2 setup_cache_dir
88
89 Check if specified cache directory exist, and if not, disable caching.
90
91  $setup_cache_dir('./cache/ds/');
92
93 If you pass false or zero value to this function, it will disable
94 cacheing.
95
96 =cut
97
98 sub setup_cache_dir {
99         my $self = shift;
100
101         my $dir = shift;
102
103         my $log = $self->_get_logger();
104
105         if ($dir) {
106                 my $msg;
107                 if (! -e $dir) {
108                         $msg = "doesn't exist";
109                 } elsif (! -d $dir) {
110                         $msg = "is not directory";
111                 } elsif (! -w $dir) {
112                         $msg = "not writable";
113                 }
114
115                 if ($msg) {
116                         undef $self->{'cache_data_structure'};
117                         $log->warn("cache_data_structure $dir $msg, disabling...");
118                 } else {
119                         $log->debug("using cache dir $dir");
120                 }
121         } else {
122                 $log->debug("disabling cache");
123                 undef $self->{'cache_data_structure'};
124         }
125 }
126
127
128 =head2 data_structure
129
130 Create in-memory data structure which represents normalized layout from
131 C<conf/normalize/*.xml>.
132
133 This structures are used to produce output.
134
135  my @ds = $webpac->data_structure($rec);
136
137 B<Note: historical oddity follows>
138
139 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
140 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
141 C<< <headline> >> tag.
142
143 =cut
144
145 sub data_structure {
146         my $self = shift;
147
148         my $log = $self->_get_logger();
149
150         my $rec = shift;
151         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
152
153         my $cache_file;
154
155         if (my $cache_path = $self->{'cache_data_structure'}) {
156                 my $id = $rec->{'000'};
157                 $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
158                 unless (defined($id)) {
159                         $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
160                         undef $self->{'cache_data_structure'};
161                 } else {
162                         $cache_file = "$cache_path/$id";
163                         if (-r $cache_file) {
164                                 my $ds_ref = retrieve($cache_file);
165                                 if ($ds_ref) {
166                                         $log->debug("cache hit: $cache_file");
167                                         my $ok = 1;
168                                         foreach my $f (qw(current_filename headline)) {
169                                                 if ($ds_ref->{$f}) {
170                                                         $self->{$f} = $ds_ref->{$f};
171                                                 } else {
172                                                         $ok = 0;
173                                                 }
174                                         };
175                                         if ($ok && $ds_ref->{'ds'}) {
176                                                 return @{ $ds_ref->{'ds'} };
177                                         } else {
178                                                 $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
179                                                 undef $self->{'cache_data_structure'};
180                                         }
181                                 }
182                         }
183                 }
184         }
185
186         undef $self->{'currnet_filename'};
187         undef $self->{'headline'};
188
189         my @sorted_tags;
190         if ($self->{tags_by_order}) {
191                 @sorted_tags = @{$self->{tags_by_order}};
192         } else {
193                 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
194                 $self->{tags_by_order} = \@sorted_tags;
195         }
196
197         my @ds;
198
199         $log->debug("tags: ",sub { join(", ",@sorted_tags) });
200
201         foreach my $field (@sorted_tags) {
202
203                 my $row;
204
205 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
206
207                 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
208                         my $format = $tag->{'value'} || $tag->{'content'};
209
210                         $log->debug("format: $format");
211
212                         my @v;
213                         if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
214                                 @v = $self->fill_in_to_arr($rec,$format);
215                         } else {
216                                 @v = $self->parse_to_arr($rec,$format);
217                         }
218                         next if (! @v);
219
220                         if ($tag->{'sort'}) {
221                                 @v = $self->sort_arr(@v);
222                         }
223
224                         # use format?
225                         if ($tag->{'format_name'}) {
226                                 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
227                         }
228
229                         if ($field eq 'filename') {
230                                 $self->{'current_filename'} = join('',@v);
231                                 $log->debug("filename: ",$self->{'current_filename'});
232                         } elsif ($field eq 'headline') {
233                                 $self->{'headline'} .= join('',@v);
234                                 $log->debug("headline: ",$self->{'headline'});
235                                 next; # don't return headline in data_structure!
236                         }
237
238                         # delimiter will join repeatable fields
239                         if ($tag->{'delimiter'}) {
240                                 @v = ( join($tag->{'delimiter'}, @v) );
241                         }
242
243                         # default types 
244                         my @types = qw(display swish);
245                         # override by type attribute
246                         @types = ( $tag->{'type'} ) if ($tag->{'type'});
247
248                         foreach my $type (@types) {
249                                 # append to previous line?
250                                 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
251                                 if ($tag->{'append'}) {
252
253                                         # I will delimit appended part with
254                                         # delimiter (or ,)
255                                         my $d = $tag->{'delimiter'};
256                                         # default delimiter
257                                         $d ||= " ";
258
259                                         my $last = pop @{$row->{$type}};
260                                         $d = "" if (! $last);
261                                         $last .= $d . join($d, @v);
262                                         push @{$row->{$type}}, $last;
263
264                                 } else {
265                                         push @{$row->{$type}}, @v;
266                                 }
267                         }
268
269
270                 }
271
272                 if ($row) {
273                         $row->{'tag'} = $field;
274
275                         # TODO: name_sigular, name_plural
276                         my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
277                         $row->{'name'} = $name ? $self->_x($name) : $field;
278
279                         # post-sort all values in field
280                         if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
281                                 $log->warn("sort at field tag not implemented");
282                         }
283
284                         push @ds, $row;
285
286                         $log->debug("row $field: ",sub { Dumper($row) });
287                 }
288
289         }
290
291         if ($cache_file) {
292                 store {
293                         ds => \@ds,
294                         current_filename => $self->{'current_filename'},
295                         headline => $self->{'headline'},
296                 }, $cache_file;
297                 $log->debug("created storable cache file $cache_file");
298         }
299
300         return @ds;
301
302 }
303
304 =head2 apply_format
305
306 Apply format specified in tag with C<format_name="name"> and
307 C<format_delimiter=";;">.
308
309  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
310
311 Formats can contain C<lookup{...}> if you need them.
312
313 =cut
314
315 sub apply_format {
316         my $self = shift;
317
318         my ($name,$delimiter,$data) = @_;
319
320         my $log = $self->_get_logger();
321
322         if (! $self->{'import_xml'}->{'format'}->{$name}) {
323                 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
324                 return $data;
325         }
326
327         $log->warn("no delimiter for format $name") if (! $delimiter);
328
329         my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
330
331         my @data = split(/\Q$delimiter\E/, $data);
332
333         my $out = sprintf($format, @data);
334         $log->debug("using format $name [$format] on $data to produce: $out");
335
336         if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
337                 return $self->lookup($out);
338         } else {
339                 return $out;
340         }
341
342 }
343
344
345 =head1 AUTHOR
346
347 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
348
349 =head1 COPYRIGHT & LICENSE
350
351 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
352
353 This program is free software; you can redistribute it and/or modify it
354 under the same terms as Perl itself.
355
356 =cut
357
358 1; # End of WebPAC::Normalize::XML