X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=C4%2FCharset.pm;h=c294b361331489a29eb771456d415fea95b8f7a0;hb=223bf0099fcb837fb6bc1c719517e175c43f6ffc;hp=54a1322ca06b3abfb3f8aaaa0d9a2965bf8da9d6;hpb=c86f5df43109244bfadda66fd899588ced658e83;p=koha.git

diff --git a/C4/Charset.pm b/C4/Charset.pm
index 54a1322ca0..c294b36133 100644
--- a/C4/Charset.pm
+++ b/C4/Charset.pm
@@ -13,25 +13,33 @@ package C4::Charset;
 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
 #
-# You should have received a copy of the GNU General Public License along with
-# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
-# Suite 330, Boston, MA  02111-1307 USA
+# You should have received a copy of the GNU General Public License along
+# with Koha; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
 use strict;
+use warnings;
+
 use MARC::Charset qw/marc8_to_utf8/;
 use Text::Iconv;
+use C4::Debug;
+use Unicode::Normalize;
 
 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
 
 BEGIN {
     # set the version for version checking
-    $VERSION = 3.00;
+    $VERSION = 3.07.00.049;
     require Exporter;
     @ISA    = qw(Exporter);
     @EXPORT = qw(
+        NormalizeString
         IsStringUTF8ish
         MarcToUTF8Record
+        SetUTF8Flag
         SetMarcUnicodeFlag
+        StripNonXmlChars
+        nsb_clean
     );
 }
 
@@ -41,7 +49,7 @@ C4::Charset - utilities for handling character set conversions.
 
 =head1 SYNOPSIS
 
-use C4::Charset;
+  use C4::Charset;
 
 =head1 DESCRIPTION
 
@@ -70,16 +78,12 @@ on how to deal with the situation.
 
 =head2 IsStringUTF8ish
 
-=over 4
-
-my $is_utf8 = IsStringUTF8ish($str);
-
-=back
+  my $is_utf8 = IsStringUTF8ish($str);
 
 Determines if C<$str> is valid UTF-8.  This can mean
 one of two things:
 
-=over 2
+=over
 
 =item *
 
@@ -107,13 +111,87 @@ sub IsStringUTF8ish {
     return utf8::decode($str);
 }
 
-=head2 MarcToUTF8Record
+=head2 SetUTF8Flag
 
-=over 4
+  my $marc_record = SetUTF8Flag($marc_record, $nfd);
 
-($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
+This function sets the PERL UTF8 flag for data.
+It is required when using new_from_usmarc 
+since MARC::File::USMARC does not handle PERL UTF8 setting.
+When editing unicode marc records fields and subfields, you
+would end up in double encoding without using this function. 
 
-=back
+If $nfd is set, string normalization will use NFD instead of NFC
+
+FIXME
+In my opinion, this function belongs to MARC::Record and not
+to this package.
+But since it handles charset, and MARC::Record, it finds its way in that package
+
+=cut
+
+sub SetUTF8Flag{
+	my ($record, $nfd)=@_;
+	return unless ($record && $record->fields());
+	foreach my $field ($record->fields()){
+		if ($field->tag()>=10){
+			my @subfields;
+			foreach my $subfield ($field->subfields()){
+				push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
+			}
+			my $newfield=MARC::Field->new(
+							$field->tag(),
+							$field->indicator(1),
+							$field->indicator(2),
+							@subfields
+						);
+			$field->replace_with($newfield);
+		}
+	}
+}
+
+=head2 NormalizeString
+
+    my $normalized_string=NormalizeString($string,$nfd,$transform);
+
+Given a string
+nfd : If you want to set NFD and not NFC
+transform : If you expect all the signs to be removed
+
+Sets the PERL UTF8 Flag on your initial data if need be
+and applies cleaning if required
+
+Returns a utf8 NFC normalized string
+
+Sample code :
+   my $string=NormalizeString ("l'ornithoptÃ¨re");
+   #results into ornithoptÃ¨re in NFC form and sets UTF8 Flag
+
+=cut
+
+
+sub NormalizeString{
+	my ($string,$nfd,$transform)=@_;
+	utf8::decode($string) unless (utf8::is_utf8($string));
+	if ($nfd){
+		$string= NFD($string);
+	}
+	else {
+		$string=NFC($string);
+	}
+	if ($transform){
+    $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
+	#removing one letter words "d'" "l'"  was changed into "d " "l " 
+    $string=~s/\b\S\b//g;
+    $string=~s/\s+$//g;
+	}
+    return $string; 
+}
+
+=head2 MarcToUTF8Record
+
+  ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, 
+					$marc_flavour, [, $source_encoding]);
 
 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an 
 optional source encoding, return a C<MARC::Record> that is 
@@ -135,7 +213,6 @@ sub MarcToUTF8Record {
     my $marc = shift;
     my $marc_flavour = shift;
     my $source_encoding = shift;
-
     my $marc_record;
     my $marc_blob_is_utf8 = 0;
     if (ref($marc) eq 'MARC::Record') {
@@ -152,28 +229,44 @@ sub MarcToUTF8Record {
         $marc =~ s/^\s+//;
         $marc =~ s/\s+$//;
         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
-        $marc_record = MARC::Record->new_from_usmarc($marc);
+        eval {
+            $marc_record = MARC::Record->new_from_usmarc($marc);
+        };
+        if ($@) {
+            # if we fail the first time, one likely problem
+            # is that we have a MARC21 record that says that it's
+            # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
+            # We'll try parsing it again.
+            substr($marc, 9, 1) = ' ';
+            eval {
+                $marc_record = MARC::Record->new_from_usmarc($marc);
+            };
+            if ($@) {
+                # it's hopeless; return an empty MARC::Record
+                return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
+            }
+        }
     }
 
     # If we do not know the source encoding, try some guesses
     # as follows:
     #   1. Record is UTF-8 already.
-    #   2. If MARC flavor is MARC21, then
+    #   2. If MARC flavor is MARC21 or NORMARC, then
     #      a. record is MARC-8
     #      b. record is ISO-8859-1
     #   3. If MARC flavor is UNIMARC, then
     if (not defined $source_encoding) {
         if ($marc_blob_is_utf8) {
-            # note that for MARC21 we are not bothering to check
+            # note that for MARC21/NORMARC we are not bothering to check
             # if the Leader/09 is set to 'a' or not -- because
             # of problems with various ILSs (including Koha in the
             # past, alas), this just is not trustworthy.
             SetMarcUnicodeFlag($marc_record, $marc_flavour);
             return $marc_record, 'UTF-8', [];
         } else {
-            if ($marc_flavour eq 'MARC21') {
+            if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
-            } elsif ($marc_flavour eq 'UNIMARC') {
+            } elsif ($marc_flavour =~/UNIMARC/) {
                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
             } else {
                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
@@ -196,7 +289,7 @@ sub MarcToUTF8Record {
             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
         } else {
             # assume any other character encoding is for Text::Iconv
-            @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
+            @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
         }
 
         if (@errors) {
@@ -211,11 +304,7 @@ sub MarcToUTF8Record {
 
 =head2 SetMarcUnicodeFlag
 
-=over 4
-
-SetMarcUnicodeFlag($marc_record, $marc_flavour);
-
-=back
+  SetMarcUnicodeFlag($marc_record, $marc_flavour);
 
 Set both the internal MARC::Record encoding flag
 and the appropriate Leader/09 (MARC21) or 
@@ -227,34 +316,113 @@ any actual character conversion.
 
 sub SetMarcUnicodeFlag {
     my $marc_record = shift;
-    my $marc_flavour = shift;
+    my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 
     $marc_record->encoding('UTF-8');
-    if ($marc_flavour eq 'MARC21') {
+    if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
         my $leader = $marc_record->leader();
         substr($leader, 9, 1) = 'a';
         $marc_record->leader($leader); 
-    } elsif ($marc_flavour eq "UNIMARC") {
-        if (my $field = $marc_record->field('100')) {
-            my $sfa = $field->subfield('a');
-            substr($sfa, 26, 4) = '5050';
-            $field->update('a' => $sfa);
+    } elsif ($marc_flavour =~/UNIMARC/) {
+        my $string; 
+		my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
+		$string=$marc_record->subfield( 100, "a" );
+        if (defined $string && length($string)==$subflength) { 
+			$string = substr $string, 0,$subflength if (length($string)>$subflength);
+        } 
+        else { 
+            $string = POSIX::strftime( "%Y%m%d", localtime ); 
+            $string =~ s/\-//g; 
+            $string = sprintf( "%-*s", $subflength, $string ); 
+        } 
+        substr( $string, $encodingposition, 8, "frey50  " ); 
+        if ( $marc_record->subfield( 100, "a" ) ) { 
+			$marc_record->field('100')->update(a=>$string);
+		}
+		else {
+            $marc_record->insert_grouped_field( 
+                MARC::Field->new( 100, '', '', "a" => $string ) ); 
         }
+		$debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
+    } else {
+        warn "Unrecognized marcflavour: $marc_flavour";
     }
 }
 
+=head2 StripNonXmlChars
 
+  my $new_str = StripNonXmlChars($old_str);
 
-=head1 INTERNAL FUNCTIONS
+Given a string, return a copy with the
+characters that are illegal in XML 
+removed.
+
+This function exists to work around a problem
+that can occur with badly-encoded MARC records.
+Specifically, if a UTF-8 MARC record also
+has excape (\x1b) characters, MARC::File::XML
+will let the escape characters pass through
+when as_xml() or as_xml_record() is called.  The
+problem is that the escape character is not
+legal in well-formed XML documents, so when
+MARC::File::XML attempts to parse such a record,
+the XML parser will fail.
+
+Stripping such characters will allow a 
+MARC::Record->new_from_xml()
+to work, at the possible risk of some data loss.
+
+=cut
+
+sub StripNonXmlChars {
+    my $str = shift;
+    if (!defined($str) || $str eq ""){
+        return "";
+    }
+    $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
+    return $str;
+}
 
-=head2 _default_marc21_charconv_to_utf8
+
+
+=head2 nsb_clean
 
 =over 4
 
-my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
+nsb_clean($string);
 
 =back
 
+Removes Non Sorting Block characters
+
+=cut
+sub nsb_clean {
+    my $NSB  = '\x88' ;        # NSB : begin Non Sorting Block
+    my $NSE  = '\x89' ;        # NSE : Non Sorting Block end
+    my $NSB2 = '\x98' ;        # NSB : begin Non Sorting Block
+    my $NSE2 = '\x9C' ;        # NSE : Non Sorting Block end
+    my $C2   = '\xC2' ;        # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
+
+    # handles non sorting blocks
+    my ($string) = @_ ;
+    $_ = $string ;
+    s/$NSB//g ;
+    s/$NSE//g ;
+    s/$NSB2//g ;
+    s/$NSE2//g ;
+    s/$C2//g ;
+    $string = $_ ;
+
+    return($string) ;
+}
+
+
+=head1 INTERNAL FUNCTIONS
+
+=head2 _default_marc21_charconv_to_utf8
+
+  my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
+
 Converts a C<MARC::Record> of unknown character set to UTF-8,
 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 to UTF-8, then a default conversion that replaces each non-ASCII
@@ -295,11 +463,7 @@ sub _default_marc21_charconv_to_utf8 {
 
 =head2 _default_unimarc_charconv_to_utf8
 
-=over 4
-
-my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
-
-=back
+  my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 
 Converts a C<MARC::Record> of unknown character set to UTF-8,
 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
@@ -339,11 +503,7 @@ sub _default_unimarc_charconv_to_utf8 {
 
 =head2 _marc_marc8_to_utf8
 
-=over 4
-
-my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
-
-=back
+  my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 
 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 If the conversion fails for some reason, an
@@ -385,6 +545,16 @@ sub _marc_marc8_to_utf8 {
             my @converted_subfields;
             foreach my $subfield ($field->subfields()) {
                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
+                unless (IsStringUTF8ish($utf8sf)) {
+                    # Because of a bug in MARC::Charset 0.98, if the string
+                    # has (a) one or more diacritics that (b) are only in character positions
+                    # 128 to 255 inclusive, the resulting converted string is not in
+                    # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
+                    # occurs, upgrade the string in place.  Moral of the story seems to be
+                    # that pack("U", ...) is better than chr(...) if you need to guarantee
+                    # that the resulting string is UTF-8.
+                    utf8::upgrade($utf8sf);
+                }
                 push @converted_subfields, $subfield->[0], $utf8sf;
             }
 
@@ -404,11 +574,7 @@ sub _marc_marc8_to_utf8 {
 
 =head2 _marc_iso5426_to_utf8
 
-=over 4
-
-my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
-
-=back
+  my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 
 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 If the conversion fails for some reason, an
@@ -450,11 +616,7 @@ sub _marc_iso5426_to_utf8 {
 
 =head2 _marc_to_utf8_via_text_iconv 
 
-=over 4
-
-my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
-
-=back
+  my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 
 Convert a C<MARC::Record> to UTF-8 in-place using the
 C<Text::Iconv> CPAN module.  Any source encoding accepted
@@ -523,11 +685,7 @@ sub _marc_to_utf8_via_text_iconv {
 
 =head2 _marc_to_utf8_replacement_char 
 
-=over 4
-
-_marc_to_utf8_replacement_char($marc_record, $marc_flavour);
-
-=back
+  _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 
 Convert a C<MARC::Record> to UTF-8 in-place, adopting the 
 unsatisfactory method of replacing all non-ASCII (e.g.,
@@ -566,19 +724,13 @@ sub _marc_to_utf8_replacement_char {
 
 =head2 char_decode5426
 
-=over 4
-
-my $utf8string = char_decode5426($iso_5426_string);
-
-=back
+  my $utf8string = char_decode5426($iso_5426_string);
 
 Converts a string from ISO-5426 to UTF-8.
 
 =cut
 
-sub char_decode5426 {
-    my ( $string) = @_;
-    my $result;
+
 my %chars;
 $chars{0xb0}=0x0101;#3/0ayn[ain]
 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
@@ -588,11 +740,14 @@ $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 # $chars{0xb4}='Ã¨';
 $chars{0xb4}=0x00e8;
+$chars{0xbd}=0x02b9;
+$chars{0xbe}=0x02ba;
 # $chars{0xb5}='Ã©';
 $chars{0xb5}=0x00e9;
 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
-$chars{0xfa}=0x0153;#oe
+$chars{0xfa}=0x0153; #oe
+$chars{0xea}=0x0152; #oe
 $chars{0x81d1}=0x00b0;
 
 ####
@@ -951,6 +1106,11 @@ $chars{0xda20}=0x02cc; #
         # 5/14 right half of ligature sign
         # 5/15 right half of double tilde
 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
+
+sub char_decode5426 {
+    my ( $string) = @_;
+    my $result;
+
     my @data = unpack("C*", $string);
     my @characters;
     my $length=scalar(@data);
@@ -1003,7 +1163,7 @@ $chars{0xda20}=0x02cc; #
 
 =head1 AUTHOR
 
-Koha Development Team <info@koha.org>
+Koha Development Team <http://koha-community.org/>
 
 Galen Charlton <galen.charlton@liblime.com>