X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=C4%2FCharset.pm;h=4dff14c9ba64b32680b5b6e0760fbfdf0d6ea8b9;hb=00cf699c82aeea46ef5a72bf344fd306430c5aba;hp=e39637acf3d94cf2329aec4142ecfd8e266e296f;hpb=9d1e7f43e15b869afc3fccd80c1545170cc84ea0;p=koha.git

diff --git a/C4/Charset.pm b/C4/Charset.pm
index e39637acf3..4dff14c9ba 100644
--- a/C4/Charset.pm
+++ b/C4/Charset.pm
@@ -4,18 +4,18 @@ package C4::Charset;
 #
 # This file is part of Koha.
 #
-# Koha is free software; you can redistribute it and/or modify it under the
-# terms of the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any later
-# version.
+# Koha is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
 #
-# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
-# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+# Koha is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
 #
-# You should have received a copy of the GNU General Public License along with
-# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
-# Suite 330, Boston, MA  02111-1307 USA
+# You should have received a copy of the GNU General Public License
+# along with Koha; if not, see <http://www.gnu.org/licenses>.
 
 use strict;
 use warnings;
@@ -24,30 +24,34 @@ use MARC::Charset qw/marc8_to_utf8/;
 use Text::Iconv;
 use C4::Debug;
 use Unicode::Normalize;
+use Encode qw( decode encode is_utf8 );
 
-use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
+use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
 
 BEGIN {
-    # set the version for version checking
-    $VERSION = 3.01;
     require Exporter;
     @ISA    = qw(Exporter);
     @EXPORT = qw(
+        NormalizeString
         IsStringUTF8ish
         MarcToUTF8Record
         SetUTF8Flag
         SetMarcUnicodeFlag
         StripNonXmlChars
+        nsb_clean
+        SanitizeRecord
     );
 }
 
+=encoding UTF-8
+
 =head1 NAME
 
 C4::Charset - utilities for handling character set conversions.
 
 =head1 SYNOPSIS
 
-use C4::Charset;
+  use C4::Charset;
 
 =head1 DESCRIPTION
 
@@ -76,16 +80,12 @@ on how to deal with the situation.
 
 =head2 IsStringUTF8ish
 
-=over 4
-
-my $is_utf8 = IsStringUTF8ish($str);
-
-=back
+  my $is_utf8 = IsStringUTF8ish($str);
 
 Determines if C<$str> is valid UTF-8.  This can mean
 one of two things:
 
-=over 2
+=over
 
 =item *
 
@@ -109,17 +109,13 @@ will assume that this situation occur does not very often.
 sub IsStringUTF8ish {
     my $str = shift;
 
-    return 1 if utf8::is_utf8($str);
-    return utf8::decode($str);
+    return 1 if Encode::is_utf8($str);    # already flagged by Perl as a character string
+    return utf8::decode( $str );          # raw octets: true only if they decode as UTF-8 (works on the local copy)
 }
 
 =head2 SetUTF8Flag
 
-=over 4
-
-my $marc_record = SetUTF8Flag($marc_record);
-
-=back
+  my $marc_record = SetUTF8Flag($marc_record, $nfd);
 
 This function sets the PERL UTF8 flag for data.
 It is required when using new_from_usmarc 
@@ -127,6 +123,8 @@ since MARC::File::USMARC does not handle PERL UTF8 setting.
 When editing unicode marc records fields and subfields, you
 would end up in double encoding without using this function. 
 
+If C<$nfd> is set, string normalization will use NFD instead of NFC.
+
 FIXME
 In my opinion, this function belongs to MARC::Record and not
 to this package.
@@ -135,50 +133,53 @@ But since it handles charset, and MARC::Record, it finds its way in that package
 =cut
 
 sub SetUTF8Flag{
-	my ($record)=@_;
-	return unless ($record && $record->fields());
-	foreach my $field ($record->fields()){
-		if ($field->tag()>=10){
-			my @subfields;
-			foreach my $subfield ($field->subfields()){
-				push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
-			}
-			my $newfield=MARC::Field->new(
-							$field->tag(),
-							$field->indicator(1),
-							$field->indicator(2),
-							@subfields
-						);
-			$field->replace_with($newfield);
-		}
-	}
+    my ( $record, $nfd ) = @_;
+    return unless ( $record && $record->fields() );
+    foreach my $field ( $record->fields() ) {
+        next unless $field->tag() >= 10;    # only tags 010 and up are rebuilt
+        # Normalize every subfield value (NFD when $nfd is true, NFC otherwise).
+        my @subfields = map { ( $_->[0], NormalizeString( $_->[1], $nfd ) ) }
+          $field->subfields();
+        # eval so that one field that cannot be rebuilt is reported without
+        # aborting the processing of the remaining fields.
+        eval {
+            my $newfield = MARC::Field->new(
+                $field->tag(), $field->indicator(1), $field->indicator(2),
+                @subfields
+            );
+            $field->replace_with($newfield);
+        };
+        warn "ERROR occurred in SetUTF8Flag $@" if $@;
+    }
+    # Hand back the record (modified in place), as the POD synopsis shows.
+    return $record;
 }
 
 =head2 NormalizeString
 
-=over 4
+    my $normalized_string=NormalizeString($string,$nfd,$transform);
 
-    my $normalized_string=NormalizeString($string);
+Given a string, plus two optional flags:
+nfd : set it to normalize to NFD instead of the default NFC
+transform : set it if you expect all the signs to be removed
+
+Sets the Perl UTF8 flag on your initial data if need be
+and applies cleaning if required.
+
+Returns a UTF-8 string normalized to NFC (or to NFD if nfd is set).
+
+Sample code :
+   my $string=NormalizeString ("l'ornithoptère");
+   #results into ornithoptère in NFC form and sets UTF8 Flag
 
-=back
-	Given 
-	    a string
-        nfc : If you want to set NFC and not NFD
-        transform : If you expect all the signs to be removed
-    Sets the PERL UTF8 Flag on your initial data if need be
-    and applies cleaning if required 
-    
-	Returns a utf8 NFD normalized string
-	
-	Sample code :
-	my $string=NormalizeString ("l'ornithoptÃ¨re");
-    #results into ornithoptÃ¨re in NFD form and sets UTF8 Flag
 =cut
 
+
 sub NormalizeString{
-	my ($string,$nfc,$transform)=@_;
-	utf8::decode($string) unless (utf8::is_utf8($string));
-	if ($nfc){
+	my ($string,$nfd,$transform)=@_;
+    return $string unless defined($string); # force scalar context return.
+    $string = Encode::decode('UTF-8', $string) unless (Encode::is_utf8($string));
+	if ($nfd){
 		$string= NFD($string);
 	}
 	else {
@@ -195,11 +196,8 @@ sub NormalizeString{
 
 =head2 MarcToUTF8Record
 
-=over 4
-
-($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
-
-=back
+  ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
+                                        $marc_flavour [, $source_encoding]);
 
 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an 
 optional source encoding, return a C<MARC::Record> that is 
@@ -259,20 +257,20 @@ sub MarcToUTF8Record {
     # If we do not know the source encoding, try some guesses
     # as follows:
     #   1. Record is UTF-8 already.
-    #   2. If MARC flavor is MARC21, then
+    #   2. If MARC flavor is MARC21 or NORMARC, then
     #      a. record is MARC-8
     #      b. record is ISO-8859-1
     #   3. If MARC flavor is UNIMARC, then
     if (not defined $source_encoding) {
         if ($marc_blob_is_utf8) {
-            # note that for MARC21 we are not bothering to check
+            # note that for MARC21/NORMARC we are not bothering to check
             # if the Leader/09 is set to 'a' or not -- because
             # of problems with various ILSs (including Koha in the
             # past, alas), this just is not trustworthy.
             SetMarcUnicodeFlag($marc_record, $marc_flavour);
             return $marc_record, 'UTF-8', [];
         } else {
-            if ($marc_flavour eq 'MARC21') {
+            if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
             } elsif ($marc_flavour =~/UNIMARC/) {
                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
@@ -312,11 +310,7 @@ sub MarcToUTF8Record {
 
 =head2 SetMarcUnicodeFlag
 
-=over 4
-
-SetMarcUnicodeFlag($marc_record, $marc_flavour);
-
-=back
+  SetMarcUnicodeFlag($marc_record, $marc_flavour);
 
 Set both the internal MARC::Record encoding flag
 and the appropriate Leader/09 (MARC21) or 
@@ -331,13 +325,16 @@ sub SetMarcUnicodeFlag {
     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 
     $marc_record->encoding('UTF-8');
-    if ($marc_flavour eq 'MARC21') {
+    if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
         my $leader = $marc_record->leader();
         substr($leader, 9, 1) = 'a';
         $marc_record->leader($leader); 
     } elsif ($marc_flavour =~/UNIMARC/) {
+        require C4::Context;
+	my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
+        $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
         my $string; 
-		my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
+		my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
 		$string=$marc_record->subfield( 100, "a" );
         if (defined $string && length($string)==$subflength) { 
 			$string = substr $string, 0,$subflength if (length($string)>$subflength);
@@ -345,9 +342,10 @@ sub SetMarcUnicodeFlag {
         else { 
             $string = POSIX::strftime( "%Y%m%d", localtime ); 
             $string =~ s/\-//g; 
-            $string = sprintf( "%-*s", $subflength, $string ); 
+            $string = sprintf( "%-*s", $subflength, $string );
+	    substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
         } 
-        substr( $string, $encodingposition, 8, "frey50  " ); 
+        substr( $string, $encodingposition, 3, "y50" );
         if ( $marc_record->subfield( 100, "a" ) ) { 
 			$marc_record->field('100')->update(a=>$string);
 		}
@@ -355,7 +353,7 @@ sub SetMarcUnicodeFlag {
             $marc_record->insert_grouped_field( 
                 MARC::Field->new( 100, '', '', "a" => $string ) ); 
         }
-		$debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
+		$debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
     } else {
         warn "Unrecognized marcflavour: $marc_flavour";
     }
@@ -363,11 +361,7 @@ sub SetMarcUnicodeFlag {
 
 =head2 StripNonXmlChars
 
-=over 4
-
-my $new_str = StripNonXmlChars($old_str);
-
-=back
+  my $new_str = StripNonXmlChars($old_str);
 
 Given a string, return a copy with the
 characters that are illegal in XML 
@@ -399,16 +393,106 @@ sub StripNonXmlChars {
     return $str;
 }
 
-=head1 INTERNAL FUNCTIONS
 
-=head2 _default_marc21_charconv_to_utf8
+
+=head2 nsb_clean
 
 =over 4
 
-my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
+nsb_clean($string);
 
 =back
 
+Returns a copy of C<$string> without the Non Sorting Block delimiter characters (the text between them is kept).
+
+=cut
+sub nsb_clean {
+    # Non Sorting Block delimiters, in the two numberings handled here:
+    # 0x88 / 0x98 open a block, 0x89 / 0x9C close it.  0xC2 is the UTF-8 lead
+    # byte of U+0080..U+00BF, so it precedes a delimiter in undecoded octets.
+    my ($string) = @_;
+
+    # Nothing to clean; also avoids "uninitialized value" warnings.
+    return $string unless defined $string;
+
+    # Substitute on the lexical copy, never on the global $_: inside
+    # for/map/grep $_ is aliased to the caller's data, which would be altered.
+    $string =~ s/\xC2?[\x88\x98]//g;    # block start markers
+    $string =~ s/\xC2?[\x89\x9C]//g;    # block end markers
+
+    return $string;
+}
+
+
+=head2 SanitizeRecord
+
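+  my ( $record, $record_modified ) = SanitizeRecord( $marcrecord, $biblionumber );
+
+Sanitize a record.
+This routine is called in the maintenance script misc/maintenance/sanitize_records.pl.
+It replaces '&amp;', '&amp;amp;', ... by '&', except in the subfield mapped to biblioitems.url, which is not cleaned.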
+
+=cut
+
+sub SanitizeRecord {
+    my ( $record, $biblionumber ) = @_;
+    my $record_modified = 0;
+
+    # Subfield mapped to biblioitems.url: its value is never cleaned.
+    my $frameworkcode = C4::Biblio::GetFrameworkCode($biblionumber);
+    my ( $url_field, $url_subfield ) =
+      C4::Biblio::GetMarcFromKohaField( 'biblioitems.url', $frameworkcode );
+    my $has_url_mapping = defined $url_field && defined $url_subfield;
+
+    foreach my $field ( $record->fields() ) {
+        if ( $field->is_control_field() ) {
+            my $value           = $field->data();
+            my $sanitized_value = _clean_ampersand($value);
+            $record_modified = 1 if $sanitized_value ne $value;
+            $field->update($sanitized_value);
+            next;
+        }
+
+        my $is_url_field = $has_url_mapping && $url_field eq $field->tag();
+        my @new_subfields;
+        foreach my $subfield ( $field->subfields() ) {
+            my ( $code, $value ) = @{$subfield};
+            my $sanitized_value = $value;
+
+            # The URL subfield is kept verbatim, but it must still be pushed:
+            # leaving it out would drop it when the field is rebuilt below.
+            $sanitized_value = _clean_ampersand($value)
+              unless $is_url_field && $url_subfield eq $code;
+            push @new_subfields, $code => $sanitized_value;
+            $record_modified = 1 if $sanitized_value ne $value;
+        }
+        next unless @new_subfields;
+
+        # If the field cannot be rebuilt, warn and keep the existing one.
+        my $new_field = eval {
+            MARC::Field->new(
+                $field->tag(),        $field->indicator(1),
+                $field->indicator(2), @new_subfields
+            );
+        };
+        if ($@) { warn "error : $@"; }
+        else    { $field->replace_with($new_field); }
+    }
+    return $record, $record_modified;
+}
+
+sub _clean_ampersand {
+    my $text = shift;    # private copy: the caller's value is not touched
+    $text =~ s/(&)(amp;)+/$1/g;    # '&amp;', '&amp;amp;', ... all become '&'
+    return $text;
+}
+
+=head1 INTERNAL FUNCTIONS
+
+=head2 _default_marc21_charconv_to_utf8
+
+  my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
+
 Converts a C<MARC::Record> of unknown character set to UTF-8,
 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 to UTF-8, then a default conversion that replaces each non-ASCII
@@ -449,11 +533,7 @@ sub _default_marc21_charconv_to_utf8 {
 
 =head2 _default_unimarc_charconv_to_utf8
 
-=over 4
-
-my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
-
-=back
+  my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 
 Converts a C<MARC::Record> of unknown character set to UTF-8,
 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
@@ -493,11 +573,7 @@ sub _default_unimarc_charconv_to_utf8 {
 
 =head2 _marc_marc8_to_utf8
 
-=over 4
-
-my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
-
-=back
+  my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 
 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 If the conversion fails for some reason, an
@@ -547,7 +623,7 @@ sub _marc_marc8_to_utf8 {
                     # occurs, upgrade the string in place.  Moral of the story seems to be
                     # that pack("U", ...) is better than chr(...) if you need to guarantee
                     # that the resulting string is UTF-8.
-                    utf8::upgrade($utf8sf);
+                    $utf8sf = Encode::encode('UTF-8', $utf8sf);
                 }
                 push @converted_subfields, $subfield->[0], $utf8sf;
             }
@@ -568,11 +644,7 @@ sub _marc_marc8_to_utf8 {
 
 =head2 _marc_iso5426_to_utf8
 
-=over 4
-
-my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
-
-=back
+  my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 
 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 If the conversion fails for some reason, an
@@ -614,11 +686,7 @@ sub _marc_iso5426_to_utf8 {
 
 =head2 _marc_to_utf8_via_text_iconv 
 
-=over 4
-
-my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
-
-=back
+  my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 
 Convert a C<MARC::Record> to UTF-8 in-place using the
 C<Text::Iconv> CPAN module.  Any source encoding accepted
@@ -687,11 +755,7 @@ sub _marc_to_utf8_via_text_iconv {
 
 =head2 _marc_to_utf8_replacement_char 
 
-=over 4
-
-_marc_to_utf8_replacement_char($marc_record, $marc_flavour);
-
-=back
+  _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 
 Convert a C<MARC::Record> to UTF-8 in-place, adopting the 
 unsatisfactory method of replacing all non-ASCII (e.g.,
@@ -699,7 +763,7 @@ where the eight bit is set) octet with the Unicode
 replacement character.  This is meant as a last-ditch
 method, and would be best used as part of a UI that
 lets a cataloguer pick various character conversions
-until he or she finds the right one.
+until they find the right one.
 
 =cut
 
@@ -730,11 +794,7 @@ sub _marc_to_utf8_replacement_char {
 
 =head2 char_decode5426
 
-=over 4
-
-my $utf8string = char_decode5426($iso_5426_string);
-
-=back
+  my $utf8string = char_decode5426($iso_5426_string);
 
 Converts a string from ISO-5426 to UTF-8.
 
@@ -742,6 +802,9 @@ Converts a string from ISO-5426 to UTF-8.
 
 
 my %chars;
+
+####
+## 0xb
 $chars{0xb0}=0x0101;#3/0ayn[ain]
 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 #$chars{0xb2}=0x00e0;#'Ã ';
@@ -752,10 +815,45 @@ $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 $chars{0xb4}=0x00e8;
 # $chars{0xb5}='Ã©';
 $chars{0xb5}=0x00e9;
+$chars{0xb6}=0x2021; # double dagger
+$chars{0xb7}=0x00b7; # middle dot
+$chars{0xb8}=0x2033; # double prime
+$chars{0xb9}=0x2019; # right single quotation mark
+$chars{0xba}=0x201d; # right double quotation mark
+$chars{0xbb}=0x00bb; # right-pointing double angle quotation mark
+$chars{0xbc}=0x266f; # music sharp sign
+$chars{0xbd}=0x02b9; # modifier letter prime
+$chars{0xbe}=0x02ba; # modifier letter double prime
+$chars{0xbf}=0x00bf; # inverted question mark
+
+####
+## 0xe
+$chars{0xe1}=0x00c6; # latin capital letter ae
+$chars{0xe2}=0x0110; # latin capital letter d with stroke
+$chars{0xe6}=0x0132; # latin capital ligature ij
+$chars{0xe8}=0x0141; # latin capital letter l with stroke
+$chars{0xe9}=0x00d8; # latin capital letter o with stroke
+$chars{0xea}=0x0152; # latin capital ligature oe
+$chars{0xec}=0x00de; # latin capital letter thorn
+
+####
+## 0xf
+$chars{0xf1}=0x00e6; # latin small letter ae
+$chars{0xf2}=0x0111; # latin small letter d with stroke
+$chars{0xf3}=0x00f0; # latin small letter eth
+$chars{0xf5}=0x0131; # latin small letter dotless i
+$chars{0xf6}=0x0133; # latin small ligature ij
+$chars{0xf8}=0x0142; # latin small letter l with stroke
+$chars{0xf9}=0x00f8; # latin small letter o with stroke
+$chars{0xfa}=0x0153; # latin small ligature oe
+$chars{0xfb}=0x00df; # latin small letter sharp s
+$chars{0xfc}=0x00fe; # latin small letter thorn
+
+####
+## Others
 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
-$chars{0xfa}=0x0153;#oe
-$chars{0x81d1}=0x00b0;
+#$chars{0x81d1}=0x00b0; # FIXME useless
 
 ####
 ## combined characters iso5426
@@ -1057,8 +1155,8 @@ $chars{0xd375}=0x0173; # small u with ogonek
 $chars{0xd441}=0x1e00; # capital a with ring below
 $chars{0xd461}=0x1e01; # small a with ring below
         # 5/5 half circle below
-$chars{0xf948}=0x1e2a; # capital h with breve below
-$chars{0xf968}=0x1e2b; # small h with breve below
+$chars{0xd548}=0x1e2a; # capital h with breve below
+$chars{0xd568}=0x1e2b; # small h with breve below
         # 5/6 dot below
 $chars{0xd641}=0x1ea0; # capital a with dot below
 $chars{0xd642}=0x1e04; # capital b with dot below
@@ -1170,7 +1268,7 @@ sub char_decode5426 {
 
 =head1 AUTHOR
 
-Koha Development Team <info@koha.org>
+Koha Development Team <http://koha-community.org/>
 
 Galen Charlton <galen.charlton@liblime.com>