Fix for Bug 6458 - incorrect parsing result in translation processing

[koha.git] / C4 / Charset.pm
diff --git a/C4/Charset.pm b/C4/Charset.pm

index 6c10309..a4e6b71 100644 (file)
--- a/C4/Charset.pm
+++ b/C4/Charset.pm
@@ -33,6 +33,7 @@ BEGIN {
      require Exporter;
      @ISA    = qw(Exporter);
      @EXPORT = qw(
+        NormalizeString
          IsStringUTF8ish
          MarcToUTF8Record
          SetUTF8Flag
@@ -47,7 +48,7 @@ C4::Charset - utilities for handling character set conversions.
  
  =head1 SYNOPSIS
  
-use C4::Charset;
+  use C4::Charset;
  
  =head1 DESCRIPTION
  
@@ -76,16 +77,12 @@ on how to deal with the situation.
  
  =head2 IsStringUTF8ish
  
-=over 4
-
-my $is_utf8 = IsStringUTF8ish($str);
-
-=back
+  my $is_utf8 = IsStringUTF8ish($str);
  
  Determines if C<$str> is valid UTF-8.  This can mean
  one of two things:
  
-=over 2
+=over
  
  =item *
  
@@ -115,11 +112,7 @@ sub IsStringUTF8ish {
  
  =head2 SetUTF8Flag
  
-=over 4
-
-my $marc_record = SetUTF8Flag($marc_record);
-
-=back
+  my $marc_record = SetUTF8Flag($marc_record, $nfd);
  
  This function sets the PERL UTF8 flag for data.
  It is required when using new_from_usmarc 
@@ -127,6 +120,8 @@ since MARC::File::USMARC does not handle PERL UTF8 setting.
  When editing unicode marc records fields and subfields, you
  would end up in double encoding without using this function. 
  
+If C<$nfd> is set, string normalization will use NFD instead of NFC.
+
  FIXME
  In my opinion, this function belongs to MARC::Record and not
  to this package.
@@ -135,13 +130,13 @@ But since it handles charset, and MARC::Record, it finds its way in that package
  =cut
  
  sub SetUTF8Flag{
-       my ($record)=@_;
+       my ($record, $nfd)=@_;
         return unless ($record && $record->fields());
         foreach my $field ($record->fields()){
                 if ($field->tag()>=10){
                         my @subfields;
                         foreach my $subfield ($field->subfields()){
-                               push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
+                               push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
                         }
                         my $newfield=MARC::Field->new(
                                                         $field->tag(),
@@ -156,30 +151,28 @@ sub SetUTF8Flag{
  
  =head2 NormalizeString
  
-=over 4
+    my $normalized_string=NormalizeString($string,$nfd,$transform);
  
-    my $normalized_string=NormalizeString($string);
+Given a string
+nfd : If you want to set NFD and not NFC
+transform : If you expect all the signs to be removed
  
-=back
+Sets the PERL UTF8 Flag on your initial data if need be
+and applies cleaning if required
+
+Returns a UTF-8 string normalized to NFC, or to NFD if C<$nfd> is set
+
+Sample code :
+   my $string=NormalizeString ("l'ornithoptère");
+   #results into ornithoptère in NFC form and sets UTF8 Flag
  
-       Given 
-           a string
-        nfc : If you want to set NFC and not NFD
-        transform : If you expect all the signs to be removed
-    Sets the PERL UTF8 Flag on your initial data if need be
-    and applies cleaning if required 
-    
-       Returns a utf8 NFD normalized string
-       
-       Sample code :
-       my $string=NormalizeString ("l'ornithoptère");
-    #results into ornithoptère in NFD form and sets UTF8 Flag
  =cut
  
+
  sub NormalizeString{
-       my ($string,$nfc,$transform)=@_;
+       my ($string,$nfd,$transform)=@_;
         utf8::decode($string) unless (utf8::is_utf8($string));
-       if ($nfc){
+       if ($nfd){
                 $string= NFD($string);
         }
         else {
@@ -196,11 +189,8 @@ sub NormalizeString{
  
  =head2 MarcToUTF8Record
  
-=over 4
-
-($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
-
-=back
+  ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
+                                       $marc_flavour [, $source_encoding]);
  
  Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an 
  optional source encoding, return a C<MARC::Record> that is 
@@ -260,20 +250,20 @@ sub MarcToUTF8Record {
      # If we do not know the source encoding, try some guesses
      # as follows:
      #   1. Record is UTF-8 already.
-    #   2. If MARC flavor is MARC21, then
+    #   2. If MARC flavor is MARC21 or NORMARC, then
      #      a. record is MARC-8
      #      b. record is ISO-8859-1
      #   3. If MARC flavor is UNIMARC, then
      if (not defined $source_encoding) {
          if ($marc_blob_is_utf8) {
-            # note that for MARC21 we are not bothering to check
+            # note that for MARC21/NORMARC we are not bothering to check
              # if the Leader/09 is set to 'a' or not -- because
              # of problems with various ILSs (including Koha in the
              # past, alas), this just is not trustworthy.
              SetMarcUnicodeFlag($marc_record, $marc_flavour);
              return $marc_record, 'UTF-8', [];
          } else {
-            if ($marc_flavour eq 'MARC21') {
+            if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
                  return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
              } elsif ($marc_flavour =~/UNIMARC/) {
                  return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
@@ -313,11 +303,7 @@ sub MarcToUTF8Record {
  
  =head2 SetMarcUnicodeFlag
  
-=over 4
-
-SetMarcUnicodeFlag($marc_record, $marc_flavour);
-
-=back
+  SetMarcUnicodeFlag($marc_record, $marc_flavour);
  
  Set both the internal MARC::Record encoding flag
  and the appropriate Leader/09 (MARC21) or 
@@ -332,7 +318,7 @@ sub SetMarcUnicodeFlag {
      my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
  
      $marc_record->encoding('UTF-8');
-    if ($marc_flavour eq 'MARC21') {
+    if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
          my $leader = $marc_record->leader();
          substr($leader, 9, 1) = 'a';
          $marc_record->leader($leader); 
@@ -364,11 +350,7 @@ sub SetMarcUnicodeFlag {
  
  =head2 StripNonXmlChars
  
-=over 4
-
-my $new_str = StripNonXmlChars($old_str);
-
-=back
+  my $new_str = StripNonXmlChars($old_str);
  
  Given a string, return a copy with the
  characters that are illegal in XML 
@@ -404,11 +386,7 @@ sub StripNonXmlChars {
  
  =head2 _default_marc21_charconv_to_utf8
  
-=over 4
-
-my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
-
-=back
+  my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
  
  Converts a C<MARC::Record> of unknown character set to UTF-8,
  first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
@@ -450,11 +428,7 @@ sub _default_marc21_charconv_to_utf8 {
  
  =head2 _default_unimarc_charconv_to_utf8
  
-=over 4
-
-my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
-
-=back
+  my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
  
  Converts a C<MARC::Record> of unknown character set to UTF-8,
  first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
@@ -494,11 +468,7 @@ sub _default_unimarc_charconv_to_utf8 {
  
  =head2 _marc_marc8_to_utf8
  
-=over 4
-
-my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
-
-=back
+  my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
  
  Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
  If the conversion fails for some reason, an
@@ -569,11 +539,7 @@ sub _marc_marc8_to_utf8 {
  
  =head2 _marc_iso5426_to_utf8
  
-=over 4
-
-my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
-
-=back
+  my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
  
  Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
  If the conversion fails for some reason, an
@@ -615,11 +581,7 @@ sub _marc_iso5426_to_utf8 {
  
  =head2 _marc_to_utf8_via_text_iconv 
  
-=over 4
-
-my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
-
-=back
+  my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
  
  Convert a C<MARC::Record> to UTF-8 in-place using the
  C<Text::Iconv> CPAN module.  Any source encoding accepted
@@ -688,11 +650,7 @@ sub _marc_to_utf8_via_text_iconv {
  
  =head2 _marc_to_utf8_replacement_char 
  
-=over 4
-
-_marc_to_utf8_replacement_char($marc_record, $marc_flavour);
-
-=back
+  _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
  
  Convert a C<MARC::Record> to UTF-8 in-place, adopting the 
  unsatisfactory method of replacing all non-ASCII (e.g.,
@@ -731,11 +689,7 @@ sub _marc_to_utf8_replacement_char {
  
  =head2 char_decode5426
  
-=over 4
-
-my $utf8string = char_decode5426($iso_5426_string);
-
-=back
+  my $utf8string = char_decode5426($iso_5426_string);
  
  Converts a string from ISO-5426 to UTF-8.
  
@@ -751,11 +705,14 @@ $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
  $chars{0xb3}=0x00e7;#latin small letter c with cedilla
  # $chars{0xb4}='è';
  $chars{0xb4}=0x00e8;
+$chars{0xbd}=0x02b9;
+$chars{0xbe}=0x02ba;
  # $chars{0xb5}='é';
  $chars{0xb5}=0x00e9;
  $chars{0x97}=0x003c;#less-than sign
  $chars{0x98}=0x003e;#greater-than sign
-$chars{0xfa}=0x0153;#oe
+$chars{0xfa}=0x0153; #oe
+$chars{0xea}=0x0152; #OE
  $chars{0x81d1}=0x00b0;
  
  ####
@@ -1171,7 +1128,7 @@ sub char_decode5426 {
  
  =head1 AUTHOR
  
-Koha Development Team <info@koha.org>
+Koha Development Team <http://koha-community.org/>
  
  Galen Charlton <galen.charlton@liblime.com>