1 package WebPAC::Normalize::XML;
6 use base qw/WebPAC::Common/;
13 WebPAC::Normalize::XML - apply XML normalisaton rules
21 our $VERSION = '0.01';
25 This module uses C<conf/normalize/*.xml> files to perform normalisation
34 Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
37 my $n = new WebPAC::Normalize::XML(
39 xml_file => '/path/to/conf/normalize/isis.xml',
40 cache_data_structure => './cache/ds/',
41 lookup_regex => $lookup->regex,
44 C<tag> defines tag to use within C<xml_file>
46 C<xml_file> defines path to normalize XML.
48 Optional parameter C<cache_data_structure> defines path to directory
49 in which cache file for C<data_structure> call will be created.
51 Recommended parametar C<lookup_regex> specify ...
60 $self->setup_cache_dir( $self->{'cache_data_structure'} );
62 my $log = $self->_get_logger();
64 foreach my $req (qw/tag xml_file/) {
65 $log->logconfess("need argument $req") unless $self->{$req};
70 my $xml_file = $self->{'xml_file'};
72 $log->info("using $xml_file tag <",$self->{'tag'},">");
74 $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);
76 $self->{'import_xml_file'} = $xml_file;
78 $self->{'import_xml'} = XMLin($f,
79 ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ],
82 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
87 =head2 setup_cache_dir
89 Check if specified cache directory exist, and if not, disable caching.
91 $setup_cache_dir('./cache/ds/');
93 If you pass false or zero value to this function, it will disable
103 my $log = $self->_get_logger();
108 $msg = "doesn't exist";
109 } elsif (! -d $dir) {
110 $msg = "is not directory";
111 } elsif (! -w $dir) {
112 $msg = "not writable";
116 undef $self->{'cache_data_structure'};
117 $log->warn("cache_data_structure $dir $msg, disabling...");
119 $log->debug("using cache dir $dir");
122 $log->debug("disabling cache");
123 undef $self->{'cache_data_structure'};
128 =head2 data_structure
130 Create in-memory data structure which represents normalized layout from
131 C<conf/normalize/*.xml>.
133 This structures are used to produce output.
135 my @ds = $webpac->data_structure($rec);
137 B<Note: historical oddity follows>
139 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
140 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
141 C<< <headline> >> tag.
148 my $log = $self->_get_logger();
151 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
155 if (my $cache_path = $self->{'cache_data_structure'}) {
156 my $id = $rec->{'000'};
157 $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
158 unless (defined($id)) {
159 $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
160 undef $self->{'cache_data_structure'};
162 $cache_file = "$cache_path/$id";
163 if (-r $cache_file) {
164 my $ds_ref = retrieve($cache_file);
166 $log->debug("cache hit: $cache_file");
168 foreach my $f (qw(current_filename headline)) {
170 $self->{$f} = $ds_ref->{$f};
175 if ($ok && $ds_ref->{'ds'}) {
176 return @{ $ds_ref->{'ds'} };
178 $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
179 undef $self->{'cache_data_structure'};
186 undef $self->{'currnet_filename'};
187 undef $self->{'headline'};
190 if ($self->{tags_by_order}) {
191 @sorted_tags = @{$self->{tags_by_order}};
193 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
194 $self->{tags_by_order} = \@sorted_tags;
199 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
201 foreach my $field (@sorted_tags) {
205 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
207 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
208 my $format = $tag->{'value'} || $tag->{'content'};
210 $log->debug("format: $format");
213 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
214 @v = $self->fill_in_to_arr($rec,$format);
216 @v = $self->parse_to_arr($rec,$format);
220 if ($tag->{'sort'}) {
221 @v = $self->sort_arr(@v);
225 if ($tag->{'format_name'}) {
226 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
229 if ($field eq 'filename') {
230 $self->{'current_filename'} = join('',@v);
231 $log->debug("filename: ",$self->{'current_filename'});
232 } elsif ($field eq 'headline') {
233 $self->{'headline'} .= join('',@v);
234 $log->debug("headline: ",$self->{'headline'});
235 next; # don't return headline in data_structure!
238 # delimiter will join repeatable fields
239 if ($tag->{'delimiter'}) {
240 @v = ( join($tag->{'delimiter'}, @v) );
244 my @types = qw(display swish);
245 # override by type attribute
246 @types = ( $tag->{'type'} ) if ($tag->{'type'});
248 foreach my $type (@types) {
249 # append to previous line?
250 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
251 if ($tag->{'append'}) {
253 # I will delimit appended part with
255 my $d = $tag->{'delimiter'};
259 my $last = pop @{$row->{$type}};
260 $d = "" if (! $last);
261 $last .= $d . join($d, @v);
262 push @{$row->{$type}}, $last;
265 push @{$row->{$type}}, @v;
273 $row->{'tag'} = $field;
275 # TODO: name_sigular, name_plural
276 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
277 $row->{'name'} = $name ? $self->_x($name) : $field;
279 # post-sort all values in field
280 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
281 $log->warn("sort at field tag not implemented");
286 $log->debug("row $field: ",sub { Dumper($row) });
294 current_filename => $self->{'current_filename'},
295 headline => $self->{'headline'},
297 $log->debug("created storable cache file $cache_file");
306 Apply format specified in tag with C<format_name="name"> and
307 C<format_delimiter=";;">.
309 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
311 Formats can contain C<lookup{...}> if you need them.
318 my ($name,$delimiter,$data) = @_;
320 my $log = $self->_get_logger();
322 if (! $self->{'import_xml'}->{'format'}->{$name}) {
323 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
327 $log->warn("no delimiter for format $name") if (! $delimiter);
329 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
331 my @data = split(/\Q$delimiter\E/, $data);
333 my $out = sprintf($format, @data);
334 $log->debug("using format $name [$format] on $data to produce: $out");
336 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
337 return $self->lookup($out);
347 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
349 =head1 COPYRIGHT & LICENSE
351 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
353 This program is free software; you can redistribute it and/or modify it
354 under the same terms as Perl itself.
358 1; # End of WebPAC::Normalize::XML