hopefully fix utf-8 encoding when sending it to swish
author Dobrica Pavlinusic <dpavlin@rot13.org>
Wed, 22 Jul 2009 12:53:07 +0000 (12:53 +0000)
committer Dobrica Pavlinusic <dpavlin@rot13.org>
Wed, 22 Jul 2009 12:53:07 +0000 (12:53 +0000)
- remove accented characters before indexing (FIXME: done anyway by swish?)
- convert data to JSON with utf8 encoding
- remove explicit encode
- use bytes to calculate length

git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@1247 07558da8-63fa-0310-ba24-9fe276d99e06

lib/WebPAC/Output/SWISH.pm

index be512ef..6339329 100644 (file)
@@ -18,7 +18,8 @@ use File::Path qw/mkpath/;
 use Data::Dump qw/dump/;
 use YAML;
 use JSON;
-use Encode qw/encode/;
+#use Encode qw/encode encode_utf8 is_utf8/;
+use Text::Unaccent::PurePerl qw/unac_string/;
 
 
 =head1 NAME
@@ -169,23 +170,27 @@ sub add {
                next if ! $vals;
 
                $vals =~ s/($escape_re)/$escape{$1}/gs;
+               $data->{$tag} = $vals;
+               $vals = unac_string( $vals );
+
                # BW & EW are our markers for tag boundry
                $xml .= qq{<$tag><![CDATA[BW $vals EW]]></$tag>};
+#              $xml .= qq{<!-- } . is_utf8( $vals ) . qq{!>};
 
                $self->{stats}->{attr}->{$tag}++;
                $self->{stats}->{input}->{ $self->input }->{$tag}++;
 
-               $data->{$tag} = $vals;
        }
 
        # serialize to JSON instead of YAML because we will loose whitespace
-       $data = to_json($data);
+       $data = to_json($data, {utf8=>1});
        $xml .= qq{<data><![CDATA[$data]]></data>};
 
        $xml .= qq{</all>\n};
 
-       $xml = encode('utf-8', $xml);
+#      $xml = encode('utf-8', $xml);
 
+       use bytes;
        my $len = length($xml);
 
        my $fh = $self->{_swish_fh} || die "_swish_fh missing";