Bug fixing charsets Using Unicode::Normalize

[koha.git] / C4 / Charset.pm
diff --git a/C4/Charset.pm b/C4/Charset.pm

index 2e2e831..541ebce 100644 (file)
--- a/C4/Charset.pm
+++ b/C4/Charset.pm
@@ -18,14 +18,17 @@ package C4::Charset;
  # Suite 330, Boston, MA  02111-1307 USA
  
  use strict;
+use warnings;
+
  use MARC::Charset qw/marc8_to_utf8/;
  use Text::Iconv;
+use Unicode::Normalize;
  
  use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  
  BEGIN {
      # set the version for version checking
-    $VERSION = 3.00;
+    $VERSION = 3.01;
      require Exporter;
      @ISA    = qw(Exporter);
      @EXPORT = qw(
@@ -33,6 +36,7 @@ BEGIN {
          MarcToUTF8Record
          SetMarcUnicodeFlag
          StripNonXmlChars
+               Normalize_String
      );
  }
  
@@ -108,6 +112,30 @@ sub IsStringUTF8ish {
      return utf8::decode($str);
  }
  
+
+=head2 Normalize_String
+
+=over 4
+
+my $$string_normalized = Normalize_String($string);
+
+=back
+
+Returns normalized string C<$string> in C Form 
+
+
+=cut
+
+sub Normalize_String {
+    my $string = shift;
+       if (IsStringUTF8ish($string)){
+               return NFC($string);
+       }
+       else {
+               return $string;
+       }
+}
+
  =head2 MarcToUTF8Record
  
  =over 4
@@ -153,7 +181,23 @@ sub MarcToUTF8Record {
          $marc =~ s/^\s+//;
          $marc =~ s/\s+$//;
          $marc_blob_is_utf8 = IsStringUTF8ish($marc);
-        $marc_record = MARC::Record->new_from_usmarc($marc);
+        eval {
+            $marc_record = MARC::Record->new_from_usmarc($marc);
+        };
+        if ($@) {
+            # if we fail the first time, one likely problem
+            # is that we have a MARC21 record that says that it's
+            # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
+            # We'll try parsing it again.
+            substr($marc, 9, 1) = ' ';
+            eval {
+                $marc_record = MARC::Record->new_from_usmarc($marc);
+            };
+            if ($@) {
+                # it's hopeless; return an empty MARC::Record
+                return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
+            }
+        }
      }
  
      # If we do not know the source encoding, try some guesses
@@ -197,7 +241,7 @@ sub MarcToUTF8Record {
              @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
          } else {
              # assume any other character encoding is for Text::Iconv
-            @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
+            @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
          }
  
          if (@errors) {
@@ -228,7 +272,7 @@ any actual character conversion.
  
  sub SetMarcUnicodeFlag {
      my $marc_record = shift;
-    my $marc_flavour = shift;
+    my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
  
      $marc_record->encoding('UTF-8');
      if ($marc_flavour eq 'MARC21') {
@@ -238,9 +282,17 @@ sub SetMarcUnicodeFlag {
      } elsif ($marc_flavour eq "UNIMARC") {
          if (my $field = $marc_record->field('100')) {
              my $sfa = $field->subfield('a');
-            substr($sfa, 26, 4) = '5050';
+            
+            my $subflength = 36;
+            # fix the length of the field
+            $sfa = substr $sfa, 0, $subflength if (length($sfa) > $subflength);
+            $sfa = sprintf( "%-*s", 35, $sfa ) if (length($sfa) < $subflength);
+            
+            substr($sfa, 26, 4) = '50  ';
              $field->update('a' => $sfa);
          }
+    } else {
+        warn "Unrecognized marcflavour: $marc_flavour";
      }
  }
  
@@ -419,6 +471,17 @@ sub _marc_marc8_to_utf8 {
              my @converted_subfields;
              foreach my $subfield ($field->subfields()) {
                  my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
+                unless (IsStringUTF8ish($utf8sf)) {
+                    # Because of a bug in MARC::Charset 0.98, if the string
+                    # has (a) one or more diacritics that (b) are only in character positions
+                    # 128 to 255 inclusive, the resulting converted string is not in
+                    # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
+                    # occurs, upgrade the string in place.  Moral of the story seems to be
+                    # that pack("U", ...) is better than chr(...) if you need to guarantee
+                    # that the resulting string is UTF-8.
+                    utf8::upgrade($utf8sf);
+                }
+                           $utf8sf=NFC($utf8sf);
                  push @converted_subfields, $subfield->[0], $utf8sf;
              }
  
@@ -467,6 +530,7 @@ sub _marc_iso5426_to_utf8 {
              my @converted_subfields;
              foreach my $subfield ($field->subfields()) {
                  my $utf8sf = char_decode5426($subfield->[1]);
+                           $utf8sf=NFC($utf8sf);
                  push @converted_subfields, $subfield->[0], $utf8sf;
              }
  
@@ -537,6 +601,7 @@ sub _marc_to_utf8_via_text_iconv {
                      push @converted_subfields, $subfield->[0], $converted_value;
                  } else {
                      $converted_value = $subfield->[1];
+                               $converted_value=NFC($converted_value);
                      $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
                      push @converted_subfields, $subfield->[0], $converted_value;
                  }
@@ -584,6 +649,7 @@ sub _marc_to_utf8_replacement_char {
              my @converted_subfields;
              foreach my $subfield ($field->subfields()) {
                  my $value = $subfield->[1];
+                               $value=NFC($value);
                  $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
                  push @converted_subfields, $subfield->[0], $value;
              }
@@ -610,9 +676,7 @@ Converts a string from ISO-5426 to UTF-8.
  
  =cut
  
-sub char_decode5426 {
-    my ( $string) = @_;
-    my $result;
+
  my %chars;
  $chars{0xb0}=0x0101;#3/0ayn[ain]
  $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
@@ -985,6 +1049,11 @@ $chars{0xda20}=0x02cc; #
          # 5/14 right half of ligature sign
          # 5/15 right half of double tilde
  #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
+
+sub char_decode5426 {
+    my ( $string) = @_;
+    my $result;
+
      my @data = unpack("C*", $string);
      my @characters;
      my $length=scalar(@data);