changes to support UTF-8 encoding from
[webpac] / parse_format.pm
index 464767f..e348dde 100644 (file)
@@ -3,7 +3,6 @@
 # parse_format(...)
 #
 
-
 sub parse_format {
        my $type = shift || die "parset_format must be called with type!";
        my $format = shift || die "parse_format must be called with format!";
@@ -37,61 +36,154 @@ sub parse_iso_format {
        my $out;
        my $out_swish;
 
-       my $prefix = "";
-       if ($format =~ s/^([^\d]+)//) {
-               $prefix = $1;
-       }
-
        my $display;
        my $swish;
 
        sub cnv_cp {
-               my $tmp = shift;
+               my $codepage = shift;
+               my $tmp = shift || return;
                if ($codepage) {
-                       $tmp = $codepage->convert($tmp) || print STDERR "$1$2 = '$tmp' can't convert";
+                       $tmp = $codepage->convert($tmp) || print STDERR "iso: '$tmp' can't convert\n";
                }
                return $tmp;
        }
 
-       while ($format) {
-#print STDERR "\n#### $format";
-               # this is EBSCO special to support numeric subfield in
-               # form of 856#3
-               if ($format =~ s/^(\d\d\d)#*(\w?)//) {
-                       my $tmp = get_sf($row,$1,$2,$i);
-                       if ($tmp) {
-                               $display .= $prefix.cnv_cp($tmp);
-                               $swish .= $tmp." ";
-#print STDERR " == $tmp";
+       # if format doesn't exist, store it in cache
+       if (! defined($cache->{format}->{$format})) {
+#              print STDERR "parsing format for '$format'\n";
+               my @fmt;
+
+               my $f = $format;
+
+               my $eval;
+               $eval = $1 if ($f =~ s/^eval{([^}]+?)}//);
+
+               if ($f =~ s/^([^\d]+)//) {
+                       if ($f) {       # there is more to parse
+                               push @fmt,$1;
+                       } else {
+                               @fmt = ('',$1,undef,'');
+#print STDERR "just one field: $1\n";
                        }
-                       $prefix = "";
-               # this might be our local scpeciality -- fields 10 and 11
-               # (as opposed to 010 and 011) so they are strictly listed
-               # here
-               } elsif ($format =~ s/^(1[01])//) {
-                       my $tmp = get_sf($row,$1,undef,$i);
-                       if ($tmp) {
-                               $display .= $prefix.cnv_cp($tmp);
+               } else {
+                       push @fmt,'';
+               }
+
+               while ($f) {
+#      print STDERR "\n#### $f";
+                       # this is EBSCO special to support numeric subfield in
+                       # form of 856#3
+                       if ($f =~ s/^(\d\d\d)#*(\w?)//) {
+                               push @fmt,$1;
+                               if ($2) {
+                                       push @fmt,$2;
+                               } else {
+                                       push @fmt,undef;
+                               }
+                       # this might be our local speciality -- fields 10 and 11
+                       # (as opposed to 010 and 011) so they are strictly listed
+                       # here
+                       } elsif ($f =~ s/^(1[01]\w?)//) {
+                               push @fmt,$1;
+                               push @fmt,undef;
+                       } elsif ($f =~ s/^mfn//i) {
+                               push @fmt,'mfn';
+                               push @fmt,'';
+                       } elsif ($f =~ s/^([^\d]+)(\d{0,3})/$2/) {
+                               # still prefix?
+                               if ($#fmt == 0) {
+                                       $fmt[0] .= $1;
+                               } else {
+                                       push @fmt,$1;
+                               }
+                       } elsif ($f =~ s/^([^\d]+\d{0,2})//) {
+                               if ($#fmt == 0) {
+                                       $fmt[0] .= $1;
+                               } else {
+                                       push @fmt,$1;
+                               }
+                       } elsif ($f =~ s/^(\d{1,2})//) {
+                               if ($#fmt == 0) {
+                                       $fmt[0] .= $1;
+                               } else {
+                                       push @fmt,$1;
+                               }
+                       } else {
+                               print STDERR "unparsed format: $f\n";
+                               $f = "";
+                       }
+               }
+               push @fmt,'' if ($#fmt % 3 != 0);       # add empty suffix
+
+               $cache->{format_eval}->{$format} = $eval; # store eval string (if any)
+
+               $cache->{format}->{$format} = \@fmt;
+               
+#              print STDERR "storing format for '$format': [",join("|",@fmt),"]\n";
+#              print STDERR "storing format for '$format':",Dumper(@fmt),"\n";
+#              print STDERR Dumper($cache->{format}->{$format});
+       }
+
+       # now produce actual record
+       my $tmp = $cache->{format}->{$format} || die "no format cache for '$format'";
+       my @fmt = @{$tmp};
+#      print STDERR "using format for '$format':",Dumper(@fmt),"\n";
+#      print STDERR "tmp ",Dumper($tmp);
+#      print STDERR "cache: ",Dumper($cache->{format}->{$format});
+
+       # prefix
+       my $prefix = shift @fmt;
+       my $sufix;
+       while($#fmt > 1) {
+               my $f = shift @fmt || die "BUG: field name can't be empty!";
+               my $sf = shift @fmt;
+
+               if ($f eq 'mfn' && $i == 0) {
+                       $display .= $sufix if ($display);
+                       $display .= $row->{mfn};
+               } else {
+                       my $val = &$func($row,$f,$sf,$i);
+                       if ($val) {
+#                              print STDERR "val: $val\n";
+                               my $tmp = cnv_cp($codepage,$val);
+                               if ($display) {
+                                       $display .= $sufix.$tmp;
+                               } else {
+                                       $display = $tmp;
+                               }
                                $swish .= $tmp." ";
                        }
-                       $prefix = "";
-               } elsif ($format =~ s/^mfn//i) {
-                       $display .= $prefix . $row->{mfn};
-                       $prefix = "";
-               } elsif ($format =~ s/^([^\d]+)(\d{0,3})/$2/) {
-                       $prefix .= $1 if ($display);
-               } elsif ($format =~ s/^([^\d]+\d{0,2})//) {
-                       $prefix .= $1 if ($display);
-               } elsif ($format =~ s/^(\d{1,2})//) {
-                       $prefix .= $1 if ($display);
+               }
+               $sufix = shift @fmt;
+       }
+       $display = $prefix.$display.$sufix if ($display);
+
+       my $eval = $cache->{format_eval}->{$format};
+       if ($eval) {
+               sub fld2str {
+                       my ($func,$row,$f,$sf,$i) = @_;
+#print STDERR "## in fld2str\n";
+                       my $tmp = $codepage->convert(&$func($row,$f,$sf,$i)) ||  $codepage->convert(&$func($row,$f,$sf,0)) || '';
+                       return "'$tmp'";
+               }
+
+               $eval =~ s/v(\d+)\^(\w*)/fld2str($func,$row,$1,$2,$i)/eg;
+#print STDERR "## eval: $eval\n";
+               if (eval "$eval") {
+                       die "eval error: eval{$eval}: $@" if ($@);
+                       return ($swish,$display);
                } else {
-                       print STDERR "unparsed format: $format\n";
-                       $prefix .= $format;
-                       $format = "";
+                       die "eval error: eval{$eval}: $@" if ($@);
+                       return (undef,undef);
                }
        }
-       # add suffix
-       $display .= $prefix if ($display);
+
+       if (@fmt) {
+               print STDERR "format left unused: [",join("|",@fmt),"]\n";
+               print STDERR "format: [",join("|",@{$tmp}),"]\n";
+       }
+
+#      print STDERR "format: {",$format || '',"} display: {",$display || '',"} swish: {",$swish || '',"}\n";
 
        return ($swish,$display);
 }
@@ -102,7 +194,16 @@ sub parse_excel_format {
        my $format = shift;
        my $row = shift;
        my $i = shift;
-       my $codepage = shift;
+       #my $codepage = shift;
+       #
+       # data already comes in utf-8 due to change in
+       # SpreadSheet::ParseExcel::FmtDefault line 69 from
+       #       return pack('C*', unpack('n*', $sTxt));
+       # to following which returns utf-8:
+       #       return pack('U*', unpack('n*', $sTxt));
+       #
+
+       return if ($i > 0);     # Excel doesn't support repeatable fields
 
        my $out;
        my $out_swish;
@@ -121,9 +222,6 @@ sub parse_excel_format {
 #print STDERR "--$1-> $format -[",length($format),"] ";
                        if ($row->{$1}) {
                                my $tmp = $row->{$1};
-                               if ($codepage) {
-                                       $tmp = $codepage->convert($tmp) || warn "excel: $1 '$tmp' can't convert";
-                               }
                                $display .= $prefix . $tmp;
                                $swish .= $tmp." ";
 #print STDERR " == $tmp";
@@ -132,7 +230,7 @@ sub parse_excel_format {
                } elsif ($format =~ s/^([^A-Z\|]+)(\|[A-Z]{1,2}\|)/$2/) {
                        $prefix .= $1 if ($display);
                } else {
-                       print STDERR "unparsed format: $format\n";
+                       #print STDERR "unparsed format: $format\n";
                        $prefix .= $format;
                        $format = "";
                }
@@ -152,6 +250,10 @@ sub parse_feed_format {
        my $i = shift;
        my $codepage = shift;
 
+       # XXX feed doesn't support repeatable fields, but it really
+       # should. This is a bug. It should be fixed!
+       return if ($i > 0);
+
        my $out;
        my $out_swish;