--- /dev/null
+package WebPAC::Normalise::XML;
+
+use warnings;
+use strict;
+
+use base qw/WebPAC::Common/;
+
+=head1 NAME
+
+WebPAC::Normalise::XML - apply XML normalisaton rules
+
+=head1 VERSION
+
+Version 0.01
+
+=cut
+
+our $VERSION = '0.01';
+
+=head1 SYNOPSIS
+
+This module uses C<conf/normalize/*.xml> files to perform normalisation
+from input records
+
+ use WebPAC::Normalise::XML;
+
+ my $foo = WebPAC::Normalise::XML->new();
+ ...
+
+=cut
+
+# mapping between data type and tag which specify
+# format in XML file
+my %type2tag = (
+ 'isis' => 'isis',
+# 'excel' => 'column',
+# 'marc' => 'marc',
+# 'feed' => 'feed'
+);
+
+
+=head1 EXPORT
+
+A list of functions that can be exported. You can delete this section
+if you don't export anything, such as for a purely object-oriented module.
+
+=head1 FUNCTIONS
+
+=head2 new
+
+Create new instance of WebPAC using configuration specified by C<config_file>.
+
+ my $n = new WebPAC::Normalize::XML(
+ cache_data_structure => './cache/ds/',
+ }
+
+Optional parameter C<cache_data_structure> defines path to directory
+in which cache file for C<data_structure> call will be created.
+
+=cut
+
+sub new {
+ my $class = shift;
+ my $self = {@_};
+ bless($self, $class);
+
+ $self->setup_cache_dir( $self->{'cache_data_structure'} );
+
+ return $self;
+}
+
+=head2 open_import_xml
+
+Read file from C<import_xml/> directory and parse it.
+
+ $webpac->open_import_xml(type => 'isis');
+
+=cut
+
+sub open_import_xml {
+ my $self = shift;
+
+ my $log = $self->_get_logger();
+
+ my $arg = {@_};
+ $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
+
+ $self->{'type'} = $arg->{'type'};
+
+ my $type_base = $arg->{'type'};
+ $type_base =~ s/_.*$//g;
+
+ $self->{'tag'} = $type2tag{$type_base};
+
+ $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
+
+ my $f = "./import_xml/".$self->{'type'}.".xml";
+ $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
+
+ $log->info("reading '$f'");
+
+ $self->{'import_xml_file'} = $f;
+
+ $self->{'import_xml'} = XMLin($f,
+ ForceArray => [ $self->{'tag'}, 'config', 'format' ],
+ );
+
+ $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
+
+}
+
+=head2 setup_cache_dir
+
+Check if specified cache directory exist, and if not, disable caching.
+
+ $setup_cache_dir('./cache/ds/');
+
+If you pass false or zero value to this function, it will disable
+cacheing.
+
+=cut
+
+sub setup_cache_dir {
+ my $self = shift;
+
+ my $dir = shift;
+
+ my $log = $self->_get_logger();
+
+ if ($dir) {
+ my $msg;
+ if (! -e $dir) {
+ $msg = "doesn't exist";
+ } elsif (! -d $dir) {
+ $msg = "is not directory";
+ } elsif (! -w $dir) {
+ $msg = "not writable";
+ }
+
+ if ($msg) {
+ undef $self->{'cache_data_structure'};
+ $log->warn("cache_data_structure $dir $msg, disabling...");
+ } else {
+ $log->debug("using cache dir $dir");
+ }
+ } else {
+ $log->debug("disabling cache");
+ undef $self->{'cache_data_structure'};
+ }
+}
+
+
+=head2 data_structure
+
+Create in-memory data structure which represents layout from C<import_xml>.
+It is used later to produce output.
+
+ my @ds = $webpac->data_structure($rec);
+
+This method will also set C<$webpac->{'currnet_filename'}> if there is
+<filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
+<headline> tag.
+
+=cut
+
+sub data_structure {
+ my $self = shift;
+
+ my $log = $self->_get_logger();
+
+ my $rec = shift;
+ $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+ my $cache_file;
+
+ if (my $cache_path = $self->{'cache_data_structure'}) {
+ my $id = $rec->{'000'};
+ $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
+ unless (defined($id)) {
+ $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
+ undef $self->{'cache_data_structure'};
+ } else {
+ $cache_file = "$cache_path/$id";
+ if (-r $cache_file) {
+ my $ds_ref = retrieve($cache_file);
+ if ($ds_ref) {
+ $log->debug("cache hit: $cache_file");
+ my $ok = 1;
+ foreach my $f (qw(current_filename headline)) {
+ if ($ds_ref->{$f}) {
+ $self->{$f} = $ds_ref->{$f};
+ } else {
+ $ok = 0;
+ }
+ };
+ if ($ok && $ds_ref->{'ds'}) {
+ return @{ $ds_ref->{'ds'} };
+ } else {
+ $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
+ undef $self->{'cache_data_structure'};
+ }
+ }
+ }
+ }
+ }
+
+ undef $self->{'currnet_filename'};
+ undef $self->{'headline'};
+
+ my @sorted_tags;
+ if ($self->{tags_by_order}) {
+ @sorted_tags = @{$self->{tags_by_order}};
+ } else {
+ @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
+ $self->{tags_by_order} = \@sorted_tags;
+ }
+
+ my @ds;
+
+ $log->debug("tags: ",sub { join(", ",@sorted_tags) });
+
+ foreach my $field (@sorted_tags) {
+
+ my $row;
+
+#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
+
+ foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
+ my $format = $tag->{'value'} || $tag->{'content'};
+
+ $log->debug("format: $format");
+
+ my @v;
+ if ($format =~ /$LOOKUP_REGEX/o) {
+ @v = $self->fill_in_to_arr($rec,$format);
+ } else {
+ @v = $self->parse_to_arr($rec,$format);
+ }
+ next if (! @v);
+
+ if ($tag->{'sort'}) {
+ @v = $self->sort_arr(@v);
+ }
+
+ # use format?
+ if ($tag->{'format_name'}) {
+ @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
+ }
+
+ if ($field eq 'filename') {
+ $self->{'current_filename'} = join('',@v);
+ $log->debug("filename: ",$self->{'current_filename'});
+ } elsif ($field eq 'headline') {
+ $self->{'headline'} .= join('',@v);
+ $log->debug("headline: ",$self->{'headline'});
+ next; # don't return headline in data_structure!
+ }
+
+ # delimiter will join repeatable fields
+ if ($tag->{'delimiter'}) {
+ @v = ( join($tag->{'delimiter'}, @v) );
+ }
+
+ # default types
+ my @types = qw(display swish);
+ # override by type attribute
+ @types = ( $tag->{'type'} ) if ($tag->{'type'});
+
+ foreach my $type (@types) {
+ # append to previous line?
+ $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
+ if ($tag->{'append'}) {
+
+ # I will delimit appended part with
+ # delimiter (or ,)
+ my $d = $tag->{'delimiter'};
+ # default delimiter
+ $d ||= " ";
+
+ my $last = pop @{$row->{$type}};
+ $d = "" if (! $last);
+ $last .= $d . join($d, @v);
+ push @{$row->{$type}}, $last;
+
+ } else {
+ push @{$row->{$type}}, @v;
+ }
+ }
+
+
+ }
+
+ if ($row) {
+ $row->{'tag'} = $field;
+
+ # TODO: name_sigular, name_plural
+ my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
+ $row->{'name'} = $name ? $self->_x($name) : $field;
+
+ # post-sort all values in field
+ if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
+ $log->warn("sort at field tag not implemented");
+ }
+
+ push @ds, $row;
+
+ $log->debug("row $field: ",sub { Dumper($row) });
+ }
+
+ }
+
+ if ($cache_file) {
+ store {
+ ds => \@ds,
+ current_filename => $self->{'current_filename'},
+ headline => $self->{'headline'},
+ }, $cache_file;
+ $log->debug("created storable cache file $cache_file");
+ }
+
+ return @ds;
+
+}
+
+
+=head1 AUTHOR
+
+Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
+
+1; # End of WebPAC::Normalise::XML