little cleanup and first cut into WebPAC::Normalize::XML
authorDobrica Pavlinusic <dpavlin@rot13.org>
Sat, 16 Jul 2005 16:48:35 +0000 (16:48 +0000)
committerDobrica Pavlinusic <dpavlin@rot13.org>
Sat, 16 Jul 2005 16:48:35 +0000 (16:48 +0000)
git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@8 07558da8-63fa-0310-ba24-9fe276d99e06

lib/WebPAC/Input/ISIS.pm
lib/WebPAC/Manual.pod
lib/WebPAC/Normalize/XML.pm [new file with mode: 0644]
lib/WebPAC/Output/CDBI.pm [new file with mode: 0644]
lib/WebPAC/Output/ClassDBI.pm [deleted file]
t/2-input-isis.t [new file with mode: 0755]

index 2ef8194..0022237 100644 (file)
@@ -202,6 +202,9 @@ Fetch next record from database. It will also displays progress bar.
 
  my $rec = $webpac->fetch_rec;
 
+You should rearly have the need to call this function directly. Instead use
+C<fetch_data_structure> which returns normalised data.
+
 =cut
 
 sub fetch_rec {
@@ -234,6 +237,22 @@ sub fetch_rec {
        }
 }
 
+=head2 fetch_data_structure
+
+Fetch data structure of next record from database.
+
+ my @ds = $webpac->fetch_data_structure;
+
+=cut
+
+sub fetch_data_structure {
+       my $self = shift;
+
+       return $self->data_structure(
+               $self->fetch_rec(@_)
+       );
+}
+
 =head2 mfn
 
 Returns current record number (MFN).
index 97aa975..cdf2608 100644 (file)
@@ -17,7 +17,7 @@ of your data:
   1   | apply import normalisation rules (xml)
       V
    intermidiate                this data is re-formatted source data converted
-     data              to chunks based on tag names from import_xml
+     data              to chunks based on tag names from config/input/*.xml
       |
   2   | apply output filter (TT2)
       V
@@ -37,10 +37,11 @@ in webpac. You can split or merge data from input records, apply filters
 (perl subroutines), use lookups within same source file or do simple
 evaluations while producing output.
 
-All that is controlled with C<import_xml> configuration file. You will want
-to create fine-grained chunks of data (like separate first and last name),
-which will later be used to produce output. You can think of conversation
-process as application of C<import_xml> recepie on every input record.
+All that is controlled with C<config/input/*.xml> configuration file. You
+will want to create fine-grained chunks of data (like separate first and
+last name), which will later be used to produce output. You can think of
+conversation process as application of C<config/input/*.xml> recepie on
+every input record.
 
 Each tag within recepie is creating one new records as long as there are
 fields in input format (which can be repeatable) that satisfy at least one
diff --git a/lib/WebPAC/Normalize/XML.pm b/lib/WebPAC/Normalize/XML.pm
new file mode 100644 (file)
index 0000000..0a9c497
--- /dev/null
@@ -0,0 +1,339 @@
+package WebPAC::Normalise::XML;
+
+use warnings;
+use strict;
+
+use base qw/WebPAC::Common/;
+
+=head1 NAME
+
+WebPAC::Normalise::XML - apply XML normalisaton rules
+
+=head1 VERSION
+
+Version 0.01
+
+=cut
+
+our $VERSION = '0.01';
+
+=head1 SYNOPSIS
+
+This module uses C<conf/normalize/*.xml> files to perform normalisation
+from input records
+
+    use WebPAC::Normalise::XML;
+
+    my $foo = WebPAC::Normalise::XML->new();
+    ...
+
+=cut
+
+# mapping between data type and tag which specify
+# format in XML file
+my %type2tag = (
+       'isis' => 'isis',
+#      'excel' => 'column',
+#      'marc' => 'marc',
+#      'feed' => 'feed'
+);
+
+
+=head1 EXPORT
+
+A list of functions that can be exported.  You can delete this section
+if you don't export anything, such as for a purely object-oriented module.
+
+=head1 FUNCTIONS
+
+=head2 new
+
+Create new instance of WebPAC using configuration specified by C<config_file>.
+
+ my $n = new WebPAC::Normalize::XML(
+       cache_data_structure => './cache/ds/',
+ }
+
+Optional parameter C<cache_data_structure> defines path to directory
+in which cache file for C<data_structure> call will be created.
+
+=cut
+
+sub new {
+       my $class = shift;
+        my $self = {@_};
+        bless($self, $class);
+
+       $self->setup_cache_dir( $self->{'cache_data_structure'} );
+
+       return $self;
+}
+
+=head2 open_import_xml
+
+Read file from C<import_xml/> directory and parse it.
+
+ $webpac->open_import_xml(type => 'isis');
+
+=cut
+
+sub open_import_xml {
+       my $self = shift;
+
+       my $log = $self->_get_logger();
+
+       my $arg = {@_};
+       $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
+
+       $self->{'type'} = $arg->{'type'};
+
+       my $type_base = $arg->{'type'};
+       $type_base =~ s/_.*$//g;
+
+       $self->{'tag'} = $type2tag{$type_base};
+
+       $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
+
+       my $f = "./import_xml/".$self->{'type'}.".xml";
+       $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
+
+       $log->info("reading '$f'");
+
+       $self->{'import_xml_file'} = $f;
+
+       $self->{'import_xml'} = XMLin($f,
+               ForceArray => [ $self->{'tag'}, 'config', 'format' ],
+       );
+
+       $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
+
+}
+
+=head2 setup_cache_dir
+
+Check if specified cache directory exist, and if not, disable caching.
+
+ $setup_cache_dir('./cache/ds/');
+
+If you pass false or zero value to this function, it will disable
+cacheing.
+
+=cut
+
+sub setup_cache_dir {
+       my $self = shift;
+
+       my $dir = shift;
+
+       my $log = $self->_get_logger();
+
+       if ($dir) {
+               my $msg;
+               if (! -e $dir) {
+                       $msg = "doesn't exist";
+               } elsif (! -d $dir) {
+                       $msg = "is not directory";
+               } elsif (! -w $dir) {
+                       $msg = "not writable";
+               }
+
+               if ($msg) {
+                       undef $self->{'cache_data_structure'};
+                       $log->warn("cache_data_structure $dir $msg, disabling...");
+               } else {
+                       $log->debug("using cache dir $dir");
+               }
+       } else {
+               $log->debug("disabling cache");
+               undef $self->{'cache_data_structure'};
+       }
+}
+
+
+=head2 data_structure
+
+Create in-memory data structure which represents layout from C<import_xml>.
+It is used later to produce output.
+
+ my @ds = $webpac->data_structure($rec);
+
+This method will also set C<$webpac->{'currnet_filename'}> if there is
+<filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
+<headline> tag.
+
+=cut
+
+sub data_structure {
+       my $self = shift;
+
+       my $log = $self->_get_logger();
+
+       my $rec = shift;
+       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+       my $cache_file;
+
+       if (my $cache_path = $self->{'cache_data_structure'}) {
+               my $id = $rec->{'000'};
+               $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
+               unless (defined($id)) {
+                       $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
+                       undef $self->{'cache_data_structure'};
+               } else {
+                       $cache_file = "$cache_path/$id";
+                       if (-r $cache_file) {
+                               my $ds_ref = retrieve($cache_file);
+                               if ($ds_ref) {
+                                       $log->debug("cache hit: $cache_file");
+                                       my $ok = 1;
+                                       foreach my $f (qw(current_filename headline)) {
+                                               if ($ds_ref->{$f}) {
+                                                       $self->{$f} = $ds_ref->{$f};
+                                               } else {
+                                                       $ok = 0;
+                                               }
+                                       };
+                                       if ($ok && $ds_ref->{'ds'}) {
+                                               return @{ $ds_ref->{'ds'} };
+                                       } else {
+                                               $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
+                                               undef $self->{'cache_data_structure'};
+                                       }
+                               }
+                       }
+               }
+       }
+
+       undef $self->{'currnet_filename'};
+       undef $self->{'headline'};
+
+       my @sorted_tags;
+       if ($self->{tags_by_order}) {
+               @sorted_tags = @{$self->{tags_by_order}};
+       } else {
+               @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
+               $self->{tags_by_order} = \@sorted_tags;
+       }
+
+       my @ds;
+
+       $log->debug("tags: ",sub { join(", ",@sorted_tags) });
+
+       foreach my $field (@sorted_tags) {
+
+               my $row;
+
+#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
+
+               foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
+                       my $format = $tag->{'value'} || $tag->{'content'};
+
+                       $log->debug("format: $format");
+
+                       my @v;
+                       if ($format =~ /$LOOKUP_REGEX/o) {
+                               @v = $self->fill_in_to_arr($rec,$format);
+                       } else {
+                               @v = $self->parse_to_arr($rec,$format);
+                       }
+                       next if (! @v);
+
+                       if ($tag->{'sort'}) {
+                               @v = $self->sort_arr(@v);
+                       }
+
+                       # use format?
+                       if ($tag->{'format_name'}) {
+                               @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
+                       }
+
+                       if ($field eq 'filename') {
+                               $self->{'current_filename'} = join('',@v);
+                               $log->debug("filename: ",$self->{'current_filename'});
+                       } elsif ($field eq 'headline') {
+                               $self->{'headline'} .= join('',@v);
+                               $log->debug("headline: ",$self->{'headline'});
+                               next; # don't return headline in data_structure!
+                       }
+
+                       # delimiter will join repeatable fields
+                       if ($tag->{'delimiter'}) {
+                               @v = ( join($tag->{'delimiter'}, @v) );
+                       }
+
+                       # default types 
+                       my @types = qw(display swish);
+                       # override by type attribute
+                       @types = ( $tag->{'type'} ) if ($tag->{'type'});
+
+                       foreach my $type (@types) {
+                               # append to previous line?
+                               $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
+                               if ($tag->{'append'}) {
+
+                                       # I will delimit appended part with
+                                       # delimiter (or ,)
+                                       my $d = $tag->{'delimiter'};
+                                       # default delimiter
+                                       $d ||= " ";
+
+                                       my $last = pop @{$row->{$type}};
+                                       $d = "" if (! $last);
+                                       $last .= $d . join($d, @v);
+                                       push @{$row->{$type}}, $last;
+
+                               } else {
+                                       push @{$row->{$type}}, @v;
+                               }
+                       }
+
+
+               }
+
+               if ($row) {
+                       $row->{'tag'} = $field;
+
+                       # TODO: name_sigular, name_plural
+                       my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
+                       $row->{'name'} = $name ? $self->_x($name) : $field;
+
+                       # post-sort all values in field
+                       if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
+                               $log->warn("sort at field tag not implemented");
+                       }
+
+                       push @ds, $row;
+
+                       $log->debug("row $field: ",sub { Dumper($row) });
+               }
+
+       }
+
+       if ($cache_file) {
+               store {
+                       ds => \@ds,
+                       current_filename => $self->{'current_filename'},
+                       headline => $self->{'headline'},
+               }, $cache_file;
+               $log->debug("created storable cache file $cache_file");
+       }
+
+       return @ds;
+
+}
+
+
+=head1 AUTHOR
+
+Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
+
+1; # End of WebPAC::Normalise::XML
diff --git a/lib/WebPAC/Output/CDBI.pm b/lib/WebPAC/Output/CDBI.pm
new file mode 100644 (file)
index 0000000..52268b2
--- /dev/null
@@ -0,0 +1,73 @@
+package WebPAC::Output::CDBI;
+
+use warnings;
+use strict;
+
+=head1 NAME
+
+WebPAC::Output::CDBI - The great new WebPAC::Output::CDBI!
+
+=head1 VERSION
+
+Version 0.01
+
+=cut
+
+our $VERSION = '0.01';
+
+=head1 SYNOPSIS
+
+Quick summary of what the module does.
+
+Perhaps a little code snippet.
+
+    use WebPAC::Output::CDBI;
+
+    my $foo = WebPAC::Output::CDBI->new();
+    ...
+
+=head1 EXPORT
+
+A list of functions that can be exported.  You can delete this section
+if you don't export anything, such as for a purely object-oriented module.
+
+=head1 FUNCTIONS
+
+=head2 function1
+
+=cut
+
+sub function1 {
+}
+
+=head2 function2
+
+=cut
+
+sub function2 {
+}
+
+=head1 AUTHOR
+
+Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
+
+=head1 BUGS
+
+Please report any bugs or feature requests to
+C<bug-webpac-output-classdbi@rt.cpan.org>, or through the web interface at
+L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WebPAC>.
+I will be notified, and then you'll automatically be notified of progress on
+your bug as I make changes.
+
+=head1 ACKNOWLEDGEMENTS
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
+
+1; # End of WebPAC::Output::CDBI
diff --git a/lib/WebPAC/Output/ClassDBI.pm b/lib/WebPAC/Output/ClassDBI.pm
deleted file mode 100644 (file)
index 6de6dfc..0000000
+++ /dev/null
@@ -1,73 +0,0 @@
-package WebPAC::Output::ClassDBI;
-
-use warnings;
-use strict;
-
-=head1 NAME
-
-WebPAC::Output::ClassDBI - The great new WebPAC::Output::ClassDBI!
-
-=head1 VERSION
-
-Version 0.01
-
-=cut
-
-our $VERSION = '0.01';
-
-=head1 SYNOPSIS
-
-Quick summary of what the module does.
-
-Perhaps a little code snippet.
-
-    use WebPAC::Output::ClassDBI;
-
-    my $foo = WebPAC::Output::ClassDBI->new();
-    ...
-
-=head1 EXPORT
-
-A list of functions that can be exported.  You can delete this section
-if you don't export anything, such as for a purely object-oriented module.
-
-=head1 FUNCTIONS
-
-=head2 function1
-
-=cut
-
-sub function1 {
-}
-
-=head2 function2
-
-=cut
-
-sub function2 {
-}
-
-=head1 AUTHOR
-
-Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
-
-=head1 BUGS
-
-Please report any bugs or feature requests to
-C<bug-webpac-output-classdbi@rt.cpan.org>, or through the web interface at
-L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WebPAC>.
-I will be notified, and then you'll automatically be notified of progress on
-your bug as I make changes.
-
-=head1 ACKNOWLEDGEMENTS
-
-=head1 COPYRIGHT & LICENSE
-
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it
-under the same terms as Perl itself.
-
-=cut
-
-1; # End of WebPAC::Output::ClassDBI
diff --git a/t/2-input-isis.t b/t/2-input-isis.t
new file mode 100755 (executable)
index 0000000..61fd3e3
--- /dev/null
@@ -0,0 +1,16 @@
+#!/usr/bin/perl -w
+
+use Test::More tests => 4;
+use Test::Exception;
+use blib;
+use strict;
+
+BEGIN {
+use_ok( 'WebPAC::Input::ISIS' );
+}
+
+ok(my $isis = new WebPAC::Input::ISIS(), "new");
+
+throws_ok { $isis->open() } qr/filename/, "need filename";
+
+throws_ok { $isis->open( filename => '/dev/null' ) } qr/can't find database/ , "open";