data_source seems to work
authorDobrica Pavlinusic <dpavlin@rot13.org>
Sat, 16 Jul 2005 23:56:14 +0000 (23:56 +0000)
committerDobrica Pavlinusic <dpavlin@rot13.org>
Sat, 16 Jul 2005 23:56:14 +0000 (23:56 +0000)
git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@13 07558da8-63fa-0310-ba24-9fe276d99e06

lib/WebPAC/Common.pm
lib/WebPAC/Normalize.pm
lib/WebPAC/Normalize/XML.pm
t/3-normalize-xml.t

index efc6cff..728997d 100644 (file)
@@ -269,45 +269,6 @@ sub _eval {
        return $ret || undef;
 }
 
-=head2 _sort_by_order
-
-Sort xml tags data structure accoding to C<order=""> attribute.
-
-=cut
-
-sub _sort_by_order {
-       my $self = shift;
-
-       my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
-               $self->{'import_xml'}->{'indexer'}->{$a};
-       my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
-               $self->{'import_xml'}->{'indexer'}->{$b};
-
-       return $va <=> $vb;
-}
-
-=head2 _x
-
-Convert string from UTF-8 to code page defined in C<import_xml>.
-
- my $text = $webpac->_x('utf8 text');
-
-Default application code page is C<ISO-8859-2>. You will probably want to
-change that when creating new instance of object based on this one.
-
-=cut
-
-sub _x {
-       my $self = shift;
-       my $utf8 = shift || return;
-
-       # create UTF-8 convertor for import_xml files
-       $self->{'utf2cp'} ||= Text::Iconv->new('UTF-8' ,$self->{'code_page'} || 'ISO-8859-2');
-
-       return $self->{'utf2cp'}->convert($utf8) ||
-               $self->_get_logger()->logwarn("can't convert '$utf8'");
-}
-
 =head2 _init_logger
 
 This function will init C<Log::Log4perl> using provided configuration file.
index 0257caf..1dacb2f 100644 (file)
@@ -2,6 +2,7 @@ package WebPAC::Normalize;
 
 use warnings;
 use strict;
+use Data::Dumper;
 
 =head1 NAME
 
@@ -22,13 +23,501 @@ normalisation front-ends.
 
 =head1 FUNCTIONS
 
-=head2 none_yet
+=head2 new
+
+Create new normalisation object
+
+  my $n = new WebPAC::Normalize::Something(
+       cache_data_structure => './cache/ds/',
+       lookup_regex => $lookup->regex,
+  );
+
+Optional parameter C<cache_data_structure> defines path to directory
+in which cache file for C<data_structure> call will be created.
+
+Recommended parametar C<lookup_regex> is used to enable parsing of lookups
+in structures.
+
+=cut
+
+sub new {
+       my $class = shift;
+        my $self = {@_};
+        bless($self, $class);
+
+       $self->setup_cache_dir( $self->{'cache_data_structure'} );
+
+       $self ? return $self : return undef;
+}
+
+=head2 setup_cache_dir
+
+Check if specified cache directory exist, and if not, disable caching.
+
+ $setup_cache_dir('./cache/ds/');
+
+If you pass false or zero value to this function, it will disable
+cacheing.
+
+=cut
+
+sub setup_cache_dir {
+       my $self = shift;
+
+       my $dir = shift;
+
+       my $log = $self->_get_logger();
+
+       if ($dir) {
+               my $msg;
+               if (! -e $dir) {
+                       $msg = "doesn't exist";
+               } elsif (! -d $dir) {
+                       $msg = "is not directory";
+               } elsif (! -w $dir) {
+                       $msg = "not writable";
+               }
+
+               if ($msg) {
+                       undef $self->{'cache_data_structure'};
+                       $log->warn("cache_data_structure $dir $msg, disabling...");
+               } else {
+                       $log->debug("using cache dir $dir");
+               }
+       } else {
+               $log->debug("disabling cache");
+               undef $self->{'cache_data_structure'};
+       }
+}
+
+
+=head2 data_structure
+
+Create in-memory data structure which represents normalized layout from
+C<conf/normalize/*.xml>.
+
+This structures are used to produce output.
+
+ my @ds = $webpac->data_structure($rec);
+
+B<Note: historical oddity follows>
+
+This method will also set C<< $webpac->{'currnet_filename'} >> if there is
+C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
+C<< <headline> >> tag.
+
+=cut
+
+sub data_structure {
+       my $self = shift;
+
+       my $log = $self->_get_logger();
+
+       my $rec = shift;
+       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+       my $cache_file;
+
+       if (my $cache_path = $self->{'cache_data_structure'}) {
+               my $id = $rec->{'000'};
+               $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
+               unless (defined($id)) {
+                       $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
+                       undef $self->{'cache_data_structure'};
+               } else {
+                       $cache_file = "$cache_path/$id";
+                       if (-r $cache_file) {
+                               my $ds_ref = retrieve($cache_file);
+                               if ($ds_ref) {
+                                       $log->debug("cache hit: $cache_file");
+                                       my $ok = 1;
+                                       foreach my $f (qw(current_filename headline)) {
+                                               if ($ds_ref->{$f}) {
+                                                       $self->{$f} = $ds_ref->{$f};
+                                               } else {
+                                                       $ok = 0;
+                                               }
+                                       };
+                                       if ($ok && $ds_ref->{'ds'}) {
+                                               return @{ $ds_ref->{'ds'} };
+                                       } else {
+                                               $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
+                                               undef $self->{'cache_data_structure'};
+                                       }
+                               }
+                       }
+               }
+       }
+
+       undef $self->{'currnet_filename'};
+       undef $self->{'headline'};
+
+       my @sorted_tags;
+       if ($self->{tags_by_order}) {
+               @sorted_tags = @{$self->{tags_by_order}};
+       } else {
+               @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
+               $self->{tags_by_order} = \@sorted_tags;
+       }
+
+       my @ds;
+
+       $log->debug("tags: ",sub { join(", ",@sorted_tags) });
+
+       foreach my $field (@sorted_tags) {
+
+               my $row;
+
+#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
+
+               foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
+                       my $format = $tag->{'value'} || $tag->{'content'};
+
+                       $log->debug("format: $format");
+
+                       my @v;
+                       if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
+                               @v = $self->fill_in_to_arr($rec,$format);
+                       } else {
+                               @v = $self->parse_to_arr($rec,$format);
+                       }
+                       next if (! @v);
+
+                       if ($tag->{'sort'}) {
+                               @v = $self->sort_arr(@v);
+                       }
+
+                       # use format?
+                       if ($tag->{'format_name'}) {
+                               @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
+                       }
+
+                       if ($field eq 'filename') {
+                               $self->{'current_filename'} = join('',@v);
+                               $log->debug("filename: ",$self->{'current_filename'});
+                       } elsif ($field eq 'headline') {
+                               $self->{'headline'} .= join('',@v);
+                               $log->debug("headline: ",$self->{'headline'});
+                               next; # don't return headline in data_structure!
+                       }
+
+                       # delimiter will join repeatable fields
+                       if ($tag->{'delimiter'}) {
+                               @v = ( join($tag->{'delimiter'}, @v) );
+                       }
+
+                       # default types 
+                       my @types = qw(display swish);
+                       # override by type attribute
+                       @types = ( $tag->{'type'} ) if ($tag->{'type'});
+
+                       foreach my $type (@types) {
+                               # append to previous line?
+                               $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
+                               if ($tag->{'append'}) {
+
+                                       # I will delimit appended part with
+                                       # delimiter (or ,)
+                                       my $d = $tag->{'delimiter'};
+                                       # default delimiter
+                                       $d ||= " ";
+
+                                       my $last = pop @{$row->{$type}};
+                                       $d = "" if (! $last);
+                                       $last .= $d . join($d, @v);
+                                       push @{$row->{$type}}, $last;
+
+                               } else {
+                                       push @{$row->{$type}}, @v;
+                               }
+                       }
+
+
+               }
+
+               if ($row) {
+                       $row->{'tag'} = $field;
+
+                       # TODO: name_sigular, name_plural
+                       my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
+                       $row->{'name'} = $name ? $self->_x($name) : $field;
+
+                       # post-sort all values in field
+                       if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
+                               $log->warn("sort at field tag not implemented");
+                       }
+
+                       push @ds, $row;
+
+                       $log->debug("row $field: ",sub { Dumper($row) });
+               }
+
+       }
+
+       if ($cache_file) {
+               store {
+                       ds => \@ds,
+                       current_filename => $self->{'current_filename'},
+                       headline => $self->{'headline'},
+               }, $cache_file;
+               $log->debug("created storable cache file $cache_file");
+       }
+
+       return @ds;
+
+}
+
+=head2 apply_format
+
+Apply format specified in tag with C<format_name="name"> and
+C<format_delimiter=";;">.
+
+ my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+
+Formats can contain C<lookup{...}> if you need them.
 
 =cut
 
-sub none_yet {
+sub apply_format {
+       my $self = shift;
+
+       my ($name,$delimiter,$data) = @_;
+
+       my $log = $self->_get_logger();
+
+       if (! $self->{'import_xml'}->{'format'}->{$name}) {
+               $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+               return $data;
+       }
+
+       $log->warn("no delimiter for format $name") if (! $delimiter);
+
+       my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+
+       my @data = split(/\Q$delimiter\E/, $data);
+
+       my $out = sprintf($format, @data);
+       $log->debug("using format $name [$format] on $data to produce: $out");
+
+       if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
+               return $self->lookup($out);
+       } else {
+               return $out;
+       }
+
 }
 
+=head2 parse
+
+Perform smart parsing of string, skipping delimiters for fields which aren't
+defined. It can also eval code in format starting with C<eval{...}> and
+return output or nothing depending on eval code.
+
+ my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
+
+=cut
+
+sub parse {
+       my $self = shift;
+
+       my ($rec, $format_utf8, $i) = @_;
+
+       return if (! $format_utf8);
+
+       my $log = $self->_get_logger();
+
+       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+       $i = 0 if (! $i);
+
+       my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
+
+       my @out;
+
+       $log->debug("format: $format");
+
+       my $eval_code;
+       # remove eval{...} from beginning
+       $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+
+       my $filter_name;
+       # remove filter{...} from beginning
+       $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+
+       my $prefix;
+       my $all_found=0;
+
+       while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
+
+               my $del = $1 || '';
+               $prefix ||= $del if ($all_found == 0);
+
+               # repeatable index
+               my $r = $i;
+               $r = 0 if (lc("$2") eq 's');
+
+               my $found = 0;
+               my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
+
+               if ($found) {
+                       push @out, $del;
+                       push @out, $tmp;
+                       $all_found += $found;
+               }
+       }
+
+       return if (! $all_found);
+
+       my $out = join('',@out);
+
+       if ($out) {
+               # add rest of format (suffix)
+               $out .= $format;
+
+               # add prefix if not there
+               $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+
+               $log->debug("result: $out");
+       }
+
+       if ($eval_code) {
+               my $eval = $self->fill_in($rec,$eval_code,$i) || return;
+               $log->debug("about to eval{$eval} format: $out");
+               return if (! $self->_eval($eval));
+       }
+       
+       if ($filter_name && $self->{'filter'}->{$filter_name}) {
+               $log->debug("about to filter{$filter_name} format: $out");
+               $out = $self->{'filter'}->{$filter_name}->($out);
+               return unless(defined($out));
+               $log->debug("filter result: $out");
+       }
+
+       return $out;
+}
+
+=head2 parse_to_arr
+
+Similar to C<parse>, but returns array of all repeatable fields
+
+ my @arr = $webpac->parse_to_arr($rec,'v250^a');
+
+=cut
+
+sub parse_to_arr {
+       my $self = shift;
+
+       my ($rec, $format_utf8) = @_;
+
+       my $log = $self->_get_logger();
+
+       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+       return if (! $format_utf8);
+
+       my $i = 0;
+       my @arr;
+
+       while (my $v = $self->parse($rec,$format_utf8,$i++)) {
+               push @arr, $v;
+       }
+
+       $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
+
+       return @arr;
+}
+
+=head2 fill_in_to_arr
+
+Similar to C<fill_in>, but returns array of all repeatable fields. Usable
+for fields which have lookups, so they shouldn't be parsed but rather
+C<fill_id>ed.
+
+ my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
+
+=cut
+
+sub fill_in_to_arr {
+       my $self = shift;
+
+       my ($rec, $format_utf8) = @_;
+
+       my $log = $self->_get_logger();
+
+       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+       return if (! $format_utf8);
+
+       my $i = 0;
+       my @arr;
+
+       while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
+               push @arr, @v;
+       }
+
+       $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
+
+       return @arr;
+}
+
+=head2 sort_arr
+
+Sort array ignoring case and html in data
+
+ my @sorted = $webpac->sort_arr(@unsorted);
+
+=cut
+
+sub sort_arr {
+       my $self = shift;
+
+       my $log = $self->_get_logger();
+
+       # FIXME add Schwartzian Transformation?
+
+       my @sorted = sort {
+               $a =~ s#<[^>]+/*>##;
+               $b =~ s#<[^>]+/*>##;
+               lc($b) cmp lc($a)
+       } @_;
+       $log->debug("sorted values: ",sub { join(", ",@sorted) });
+
+       return @sorted;
+}
+
+
+=head2 _sort_by_order
+
+Sort xml tags data structure accoding to C<order=""> attribute.
+
+=cut
+
+sub _sort_by_order {
+       my $self = shift;
+
+       my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
+               $self->{'import_xml'}->{'indexer'}->{$a};
+       my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
+               $self->{'import_xml'}->{'indexer'}->{$b};
+
+       return $va <=> $vb;
+}
+
+=head2 _x
+
+Convert strings from C<conf/normalize> encoding into application specific
+(optinally specified using C<code_page> to C<new> constructor.
+
+ my $text = $n->_x('normalize text string');
+
+This is a stub so that other modules doesn't have to implement it.
+
+=cut
+
+sub _x {
+       my $self = shift;
+       return shift;
+}
+
+
 =head1 AUTHOR
 
 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
index 24c1c1d..929b775 100644 (file)
@@ -3,10 +3,11 @@ package WebPAC::Normalize::XML;
 use warnings;
 use strict;
 
-use base qw/WebPAC::Common/;
+use base qw/WebPAC::Common WebPAC::Normalize/;
 use Storable;
 use XML::Simple;
 use Data::Dumper;
+use Text::Iconv;
 
 =head1 NAME
 
@@ -29,45 +30,36 @@ from input records
 
 =head1 FUNCTIONS
 
-=head2 new
+=head2 open
 
 Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
 parse it.
 
- my $n = new WebPAC::Normalize::XML(
+ my $n = new WebPAC::Normalize::XML;
+ $n->open(
        tag => 'isis',
        xml_file => '/path/to/conf/normalize/isis.xml',
-       cache_data_structure => './cache/ds/',
-       lookup_regex => $lookup->regex,
- }
+ );
 
 C<tag> defines tag to use within C<xml_file>
 
 C<xml_file> defines path to normalize XML.
 
-Optional parameter C<cache_data_structure> defines path to directory
-in which cache file for C<data_structure> call will be created.
-
-Recommended parametar C<lookup_regex> specify ... 
-
 =cut
 
-sub new {
-       my $class = shift;
-        my $self = {@_};
-        bless($self, $class);
+sub open {
+       my $self = shift;
 
-       $self->setup_cache_dir( $self->{'cache_data_structure'} );
+       my $arg = {@_};
 
        my $log = $self->_get_logger();
 
        foreach my $req (qw/tag xml_file/) {
-               $log->logconfess("need argument $req") unless $self->{$req};
+               $log->logconfess("need argument $req") unless $arg->{$req};
        }
 
-       my $f = 
-
-       my $xml_file = $self->{'xml_file'};
+       $self->{'tag'} = $arg->{'tag'};
+       my $xml_file = $arg->{'xml_file'};
 
        $log->info("using $xml_file tag <",$self->{'tag'},">");
 
@@ -75,7 +67,7 @@ sub new {
 
        $self->{'import_xml_file'} = $xml_file;
 
-       $self->{'import_xml'} = XMLin($f,
+       $self->{'import_xml'} = XMLin($xml_file,
                ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ],
        );
 
@@ -84,261 +76,27 @@ sub new {
        return $self;
 }
 
-=head2 setup_cache_dir
-
-Check if specified cache directory exist, and if not, disable caching.
-
- $setup_cache_dir('./cache/ds/');
-
-If you pass false or zero value to this function, it will disable
-cacheing.
-
-=cut
-
-sub setup_cache_dir {
-       my $self = shift;
-
-       my $dir = shift;
-
-       my $log = $self->_get_logger();
-
-       if ($dir) {
-               my $msg;
-               if (! -e $dir) {
-                       $msg = "doesn't exist";
-               } elsif (! -d $dir) {
-                       $msg = "is not directory";
-               } elsif (! -w $dir) {
-                       $msg = "not writable";
-               }
-
-               if ($msg) {
-                       undef $self->{'cache_data_structure'};
-                       $log->warn("cache_data_structure $dir $msg, disabling...");
-               } else {
-                       $log->debug("using cache dir $dir");
-               }
-       } else {
-               $log->debug("disabling cache");
-               undef $self->{'cache_data_structure'};
-       }
-}
-
-
-=head2 data_structure
-
-Create in-memory data structure which represents normalized layout from
-C<conf/normalize/*.xml>.
-
-This structures are used to produce output.
-
- my @ds = $webpac->data_structure($rec);
-
-B<Note: historical oddity follows>
-
-This method will also set C<< $webpac->{'currnet_filename'} >> if there is
-C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
-C<< <headline> >> tag.
-
-=cut
-
-sub data_structure {
-       my $self = shift;
-
-       my $log = $self->_get_logger();
-
-       my $rec = shift;
-       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
-
-       my $cache_file;
-
-       if (my $cache_path = $self->{'cache_data_structure'}) {
-               my $id = $rec->{'000'};
-               $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
-               unless (defined($id)) {
-                       $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
-                       undef $self->{'cache_data_structure'};
-               } else {
-                       $cache_file = "$cache_path/$id";
-                       if (-r $cache_file) {
-                               my $ds_ref = retrieve($cache_file);
-                               if ($ds_ref) {
-                                       $log->debug("cache hit: $cache_file");
-                                       my $ok = 1;
-                                       foreach my $f (qw(current_filename headline)) {
-                                               if ($ds_ref->{$f}) {
-                                                       $self->{$f} = $ds_ref->{$f};
-                                               } else {
-                                                       $ok = 0;
-                                               }
-                                       };
-                                       if ($ok && $ds_ref->{'ds'}) {
-                                               return @{ $ds_ref->{'ds'} };
-                                       } else {
-                                               $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
-                                               undef $self->{'cache_data_structure'};
-                                       }
-                               }
-                       }
-               }
-       }
-
-       undef $self->{'currnet_filename'};
-       undef $self->{'headline'};
-
-       my @sorted_tags;
-       if ($self->{tags_by_order}) {
-               @sorted_tags = @{$self->{tags_by_order}};
-       } else {
-               @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
-               $self->{tags_by_order} = \@sorted_tags;
-       }
-
-       my @ds;
-
-       $log->debug("tags: ",sub { join(", ",@sorted_tags) });
-
-       foreach my $field (@sorted_tags) {
 
-               my $row;
+=head2 _x
 
-#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
+Convert string from UTF-8 to code page defined in C<import_xml>.
 
-               foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
-                       my $format = $tag->{'value'} || $tag->{'content'};
+ my $text = $n->_x('utf8 text');
 
-                       $log->debug("format: $format");
-
-                       my @v;
-                       if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
-                               @v = $self->fill_in_to_arr($rec,$format);
-                       } else {
-                               @v = $self->parse_to_arr($rec,$format);
-                       }
-                       next if (! @v);
-
-                       if ($tag->{'sort'}) {
-                               @v = $self->sort_arr(@v);
-                       }
-
-                       # use format?
-                       if ($tag->{'format_name'}) {
-                               @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
-                       }
-
-                       if ($field eq 'filename') {
-                               $self->{'current_filename'} = join('',@v);
-                               $log->debug("filename: ",$self->{'current_filename'});
-                       } elsif ($field eq 'headline') {
-                               $self->{'headline'} .= join('',@v);
-                               $log->debug("headline: ",$self->{'headline'});
-                               next; # don't return headline in data_structure!
-                       }
-
-                       # delimiter will join repeatable fields
-                       if ($tag->{'delimiter'}) {
-                               @v = ( join($tag->{'delimiter'}, @v) );
-                       }
-
-                       # default types 
-                       my @types = qw(display swish);
-                       # override by type attribute
-                       @types = ( $tag->{'type'} ) if ($tag->{'type'});
-
-                       foreach my $type (@types) {
-                               # append to previous line?
-                               $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
-                               if ($tag->{'append'}) {
-
-                                       # I will delimit appended part with
-                                       # delimiter (or ,)
-                                       my $d = $tag->{'delimiter'};
-                                       # default delimiter
-                                       $d ||= " ";
-
-                                       my $last = pop @{$row->{$type}};
-                                       $d = "" if (! $last);
-                                       $last .= $d . join($d, @v);
-                                       push @{$row->{$type}}, $last;
-
-                               } else {
-                                       push @{$row->{$type}}, @v;
-                               }
-                       }
-
-
-               }
-
-               if ($row) {
-                       $row->{'tag'} = $field;
-
-                       # TODO: name_sigular, name_plural
-                       my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
-                       $row->{'name'} = $name ? $self->_x($name) : $field;
-
-                       # post-sort all values in field
-                       if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
-                               $log->warn("sort at field tag not implemented");
-                       }
-
-                       push @ds, $row;
-
-                       $log->debug("row $field: ",sub { Dumper($row) });
-               }
-
-       }
-
-       if ($cache_file) {
-               store {
-                       ds => \@ds,
-                       current_filename => $self->{'current_filename'},
-                       headline => $self->{'headline'},
-               }, $cache_file;
-               $log->debug("created storable cache file $cache_file");
-       }
-
-       return @ds;
-
-}
-
-=head2 apply_format
-
-Apply format specified in tag with C<format_name="name"> and
-C<format_delimiter=";;">.
-
- my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
-
-Formats can contain C<lookup{...}> if you need them.
+Default application code page is C<ISO-8859-2>. You will probably want to
+change that when creating new instance of object based on this one.
 
 =cut
 
-sub apply_format {
+sub _x {
        my $self = shift;
+       my $utf8 = shift || return;
 
-       my ($name,$delimiter,$data) = @_;
-
-       my $log = $self->_get_logger();
-
-       if (! $self->{'import_xml'}->{'format'}->{$name}) {
-               $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
-               return $data;
-       }
-
-       $log->warn("no delimiter for format $name") if (! $delimiter);
-
-       my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
-
-       my @data = split(/\Q$delimiter\E/, $data);
-
-       my $out = sprintf($format, @data);
-       $log->debug("using format $name [$format] on $data to produce: $out");
-
-       if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
-               return $self->lookup($out);
-       } else {
-               return $out;
-       }
+       # create UTF-8 convertor for import_xml files
+       $self->{'utf2cp'} ||= Text::Iconv->new('UTF-8' ,$self->{'code_page'} || 'ISO-8859-2');
 
+       return $self->{'utf2cp'}->convert($utf8) ||
+               $self->_get_logger()->logwarn("can't convert '$utf8'");
 }
 
 
index 85f7bb0..8247c2d 100755 (executable)
@@ -1,6 +1,6 @@
 #!/usr/bin/perl -w
 
-use Test::More tests => 6;
+use Test::More tests => 8;
 use Test::Exception;
 use Cwd qw/abs_path/;
 use blib;
@@ -14,13 +14,24 @@ ok(my $abs_path = abs_path($0), "abs_path");
 $abs_path =~ s#/[^/]*$#/#;
 diag "abs_path: $abs_path";
 
-throws_ok { new WebPAC::Normalize::XML() } qr/tag/, "new without tag";
-throws_ok { new WebPAC::Normalize::XML( tag => 'isis' ) } qr/xml_file/, "new without xml_file";
-throws_ok { new WebPAC::Normalize::XML( tag => 'isis', xml_file => 'foo' ) } qr/file.*doesn't exist/, "new with invalid xml_file";
-ok(my $isis = new WebPAC::Normalize::XML(
+ok(my $n = new WebPAC::Normalize::XML( debug => 1 ), "new");
+
+
+throws_ok { $n->open() } qr/tag/, "open without tag";
+throws_ok { $n->open( tag => 'isis' ) } qr/xml_file/, "open without xml_file";
+throws_ok { $n->open( tag => 'isis', xml_file => 'foo' ) } qr/file.*doesn't exist/, "open with invalid xml_file";
+ok( $n->open(
        tag => 'isis',
        xml_file => "$abs_path../conf/normalize/isis.xml",
-       debug => 1,
-), "new");
+), "open");
+
+my $rec = {
+       '000' => [ '001' ],
+       '800' => [ 'foo' ],
+       '900' => [ 'bar' ],
+};
 
+ok(my @ds = $n->data_structure( $rec ), "data_structure");
 
+use Data::Dumper;
+diag Dumper(\@ds);