Bug fixing charsets Using Unicode::Normalize
author Henri-Damien LAURENT <henridamien.laurent@biblibre.com>
Thu, 9 Apr 2009 13:16:20 +0000 (15:16 +0200)
committer Henri-Damien LAURENT <henridamien.laurent@biblibre.com>
Mon, 13 Jul 2009 10:42:37 +0000 (12:42 +0200)
C4/Biblio.pm
C4/Charset.pm

index 336df15..05729ff 100644 (file)
@@ -1125,7 +1125,7 @@ sub GetXmlBiblio {
       $dbh->prepare("SELECT marcxml FROM biblioitems WHERE biblionumber=? ");
     $sth->execute($biblionumber);
     my ($marcxml) = $sth->fetchrow;
-    return $marcxml;
+    return Normalize_String($marcxml);
 }
 
 =head2 GetCOinSBiblio
@@ -3386,7 +3386,7 @@ sub ModBiblioMarc {
     $sth =
       $dbh->prepare(
         "UPDATE biblioitems SET marc=?,marcxml=? WHERE biblionumber=?");
-    $sth->execute( $record->as_usmarc(), $record->as_xml_record($encoding),
+    $sth->execute( $record->as_usmarc(), Normalize_String($record->as_xml_record($encoding)),
         $biblionumber );
     $sth->finish;
     ModZebra($biblionumber,"specialUpdate","biblioserver",$oldRecord,$record);
index bbeef24..541ebce 100644 (file)
@@ -22,6 +22,7 @@ use warnings;
 
 use MARC::Charset qw/marc8_to_utf8/;
 use Text::Iconv;
+use Unicode::Normalize;
 
 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
 
@@ -35,6 +36,7 @@ BEGIN {
         MarcToUTF8Record
         SetMarcUnicodeFlag
         StripNonXmlChars
+               Normalize_String
     );
 }
 
@@ -110,6 +112,30 @@ sub IsStringUTF8ish {
     return utf8::decode($str);
 }
 
+
+=head2 Normalize_String
+
+=over 4
+
+my $string_normalized = Normalize_String($string);
+
+=back
+
+Returns C<$string> converted to Unicode Normalization Form C (NFC); a string that does not pass C<IsStringUTF8ish> is returned unchanged.
+
+
+=cut
+
+sub Normalize_String {    # Return $string in Unicode NFC when it passes IsStringUTF8ish; otherwise return it unchanged.
+    my $string = shift;
+       if (IsStringUTF8ish($string)){    # gate: only normalize input that the UTF-8 check accepts
+               return NFC($string);    # NOTE(review): NFC receives $string exactly as passed in -- confirm it holds decoded characters rather than raw UTF-8 octets
+       }
+       else {
+               return $string;    # input that fails the check is handed back untouched
+       }
+}
+
 =head2 MarcToUTF8Record
 
 =over 4
@@ -215,7 +241,7 @@ sub MarcToUTF8Record {
             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
         } else {
             # assume any other character encoding is for Text::Iconv
-            @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
+            @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
         }
 
         if (@errors) {
@@ -455,6 +481,7 @@ sub _marc_marc8_to_utf8 {
                     # that the resulting string is UTF-8.
                     utf8::upgrade($utf8sf);
                 }
+                           $utf8sf=NFC($utf8sf);
                 push @converted_subfields, $subfield->[0], $utf8sf;
             }
 
@@ -503,6 +530,7 @@ sub _marc_iso5426_to_utf8 {
             my @converted_subfields;
             foreach my $subfield ($field->subfields()) {
                 my $utf8sf = char_decode5426($subfield->[1]);
+                           $utf8sf=NFC($utf8sf);
                 push @converted_subfields, $subfield->[0], $utf8sf;
             }
 
@@ -573,6 +601,7 @@ sub _marc_to_utf8_via_text_iconv {
                     push @converted_subfields, $subfield->[0], $converted_value;
                 } else {
                     $converted_value = $subfield->[1];
+                               $converted_value=NFC($converted_value);
                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
                     push @converted_subfields, $subfield->[0], $converted_value;
                 }
@@ -620,6 +649,7 @@ sub _marc_to_utf8_replacement_char {
             my @converted_subfields;
             foreach my $subfield ($field->subfields()) {
                 my $value = $subfield->[1];
+                               $value=NFC($value);
                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
                 push @converted_subfields, $subfield->[0], $value;
             }