X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=C4%2FCharset.pm;h=4dff14c9ba64b32680b5b6e0760fbfdf0d6ea8b9;hb=00cf699c82aeea46ef5a72bf344fd306430c5aba;hp=712bd85ca862d79fe0a16af0a19568df9b9893c8;hpb=4117b293f633597e74e3dda9e1e52810d31ad23d;p=koha.git diff --git a/C4/Charset.pm b/C4/Charset.pm index 712bd85ca8..4dff14c9ba 100644 --- a/C4/Charset.pm +++ b/C4/Charset.pm @@ -4,18 +4,18 @@ package C4::Charset; # # This file is part of Koha. # -# Koha is free software; you can redistribute it and/or modify it under the -# terms of the GNU General Public License as published by the Free Software -# Foundation; either version 2 of the License, or (at your option) any later -# version. +# Koha is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. # -# Koha is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR -# A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# Koha is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. # -# You should have received a copy of the GNU General Public License along -# with Koha; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# You should have received a copy of the GNU General Public License +# along with Koha; if not, see . use strict; use warnings; @@ -24,12 +24,11 @@ use MARC::Charset qw/marc8_to_utf8/; use Text::Iconv; use C4::Debug; use Unicode::Normalize; +use Encode qw( decode encode is_utf8 ); -use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); +use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); BEGIN { - # set the version for version checking - $VERSION = 3.01; require Exporter; @ISA = qw(Exporter); @EXPORT = qw( @@ -39,9 +38,13 @@ BEGIN { SetUTF8Flag SetMarcUnicodeFlag StripNonXmlChars + nsb_clean + SanitizeRecord ); } +=encoding UTF-8 + =head1 NAME C4::Charset - utilities for handling character set conversions. @@ -106,13 +109,13 @@ will assume that this situation occur does not very often. sub IsStringUTF8ish { my $str = shift; - return 1 if utf8::is_utf8($str); - return utf8::decode($str); + return 1 if Encode::is_utf8($str); + return utf8::decode( $str ); } =head2 SetUTF8Flag - my $marc_record = SetUTF8Flag($marc_record); + my $marc_record = SetUTF8Flag($marc_record, $nfd); This function sets the PERL UTF8 flag for data. It is required when using new_from_usmarc @@ -120,6 +123,8 @@ since MARC::File::USMARC does not handle PERL UTF8 setting. When editing unicode marc records fields and subfields, you would end up in double encoding without using this function. +If $nfd is set, string normalization will use NFD instead of NFC + FIXME In my opinion, this function belongs to MARC::Record and not to this package. @@ -128,23 +133,26 @@ But since it handles charset, and MARC::Record, it finds its way in that package =cut sub SetUTF8Flag{ - my ($record)=@_; - return unless ($record && $record->fields()); - foreach my $field ($record->fields()){ - if ($field->tag()>=10){ - my @subfields; - foreach my $subfield ($field->subfields()){ - push @subfields,($$subfield[0],NormalizeString($$subfield[1])); - } - my $newfield=MARC::Field->new( - $field->tag(), - $field->indicator(1), - $field->indicator(2), - @subfields - ); - $field->replace_with($newfield); - } - } + my ($record, $nfd)=@_; + return unless ($record && $record->fields()); + foreach my $field ($record->fields()){ + if ($field->tag()>=10){ + my @subfields; + foreach my $subfield ($field->subfields()){ + push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd)); + } + eval { + my $newfield=MARC::Field->new( + $field->tag(), + $field->indicator(1), + $field->indicator(2), + @subfields + ); + $field->replace_with($newfield); + }; + warn "ERROR occurred in SetUTF8Flag $@" if $@; + } + } } =head2 NormalizeString @@ -169,7 +177,8 @@ Sample code : sub NormalizeString{ my ($string,$nfd,$transform)=@_; - utf8::decode($string) unless (utf8::is_utf8($string)); + return $string unless defined($string); # force scalar context return. + $string = Encode::decode('UTF-8', $string) unless (Encode::is_utf8($string)); if ($nfd){ $string= NFD($string); } @@ -248,20 +257,20 @@ sub MarcToUTF8Record { # If we do not know the source encoding, try some guesses # as follows: # 1. Record is UTF-8 already. - # 2. If MARC flavor is MARC21, then + # 2. If MARC flavor is MARC21 or NORMARC, then # a. record is MARC-8 # b. record is ISO-8859-1 # 3. If MARC flavor is UNIMARC, then if (not defined $source_encoding) { if ($marc_blob_is_utf8) { - # note that for MARC21 we are not bothering to check + # note that for MARC21/NORMARC we are not bothering to check # if the Leader/09 is set to 'a' or not -- because # of problems with various ILSs (including Koha in the # past, alas), this just is not trustworthy. SetMarcUnicodeFlag($marc_record, $marc_flavour); return $marc_record, 'UTF-8', []; } else { - if ($marc_flavour eq 'MARC21') { + if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') { return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour); } elsif ($marc_flavour =~/UNIMARC/) { return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour); @@ -316,13 +325,16 @@ sub SetMarcUnicodeFlag { my $marc_flavour = shift; # || C4::Context->preference("marcflavour"); $marc_record->encoding('UTF-8'); - if ($marc_flavour eq 'MARC21') { + if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') { my $leader = $marc_record->leader(); substr($leader, 9, 1) = 'a'; $marc_record->leader($leader); } elsif ($marc_flavour =~/UNIMARC/) { + require C4::Context; + my $defaultlanguage = C4::Context->preference("UNIMARCField100Language"); + $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3); my $string; - my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22)); + my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25)); $string=$marc_record->subfield( 100, "a" ); if (defined $string && length($string)==$subflength) { $string = substr $string, 0,$subflength if (length($string)>$subflength); @@ -330,9 +342,10 @@ sub SetMarcUnicodeFlag { else { $string = POSIX::strftime( "%Y%m%d", localtime ); $string =~ s/\-//g; - $string = sprintf( "%-*s", $subflength, $string ); + $string = sprintf( "%-*s", $subflength, $string ); + substr ( $string, ($encodingposition - 3), 3, $defaultlanguage); } - substr( $string, $encodingposition, 8, "frey50 " ); + substr( $string, $encodingposition, 3, "y50" ); if ( $marc_record->subfield( 100, "a" ) ) { $marc_record->field('100')->update(a=>$string); } @@ -340,7 +353,7 @@ sub SetMarcUnicodeFlag { $marc_record->insert_grouped_field( MARC::Field->new( 100, '', '', "a" => $string ) ); } - $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 ); + $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 ); } else { warn "Unrecognized marcflavour: $marc_flavour"; } @@ -380,6 +393,100 @@ sub StripNonXmlChars { return $str; } + + +=head2 nsb_clean + +=over 4 + +nsb_clean($string); + +=back + +Removes Non Sorting Block characters + +=cut +sub nsb_clean { + my $NSB = '\x88' ; # NSB : begin Non Sorting Block + my $NSE = '\x89' ; # NSE : Non Sorting Block end + my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block + my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end + my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE + + # handles non sorting blocks + my ($string) = @_ ; + $_ = $string ; + s/($C2){0,1}($NSB|$NSB2)//g ; + s/($C2){0,1}($NSE|$NSE2)//g ; + $string = $_ ; + + return($string) ; +} + + +=head2 SanitizeRecord + +SanitizeRecord($marcrecord); + +Sanitize a record +This routine is called in the maintenance script misc/maintenance/sanitize_records.pl. +It cleans any string with '&...', replacing it by '&' + +=cut + +sub SanitizeRecord { + my ( $record, $biblionumber ) = @_; + my $string; + my $record_modified = 0; + my $frameworkcode = C4::Biblio::GetFrameworkCode($biblionumber); + my ( $url_field, $url_subfield ) = + C4::Biblio::GetMarcFromKohaField( 'biblioitems.url', $frameworkcode ); + foreach my $field ( $record->fields() ) { + if ( $field->is_control_field() ) { + my $value = $field->data(); + my $sanitized_value = _clean_ampersand($value); + $record_modified = 1 if $sanitized_value ne $value; + $field->update($sanitized_value); + } + else { + my @subfields = $field->subfields(); + my @new_subfields; + foreach my $subfield (@subfields) { + next + if $url_field eq $field->tag() + and $url_subfield eq $subfield->[0]; + my $value = $subfield->[1]; + my $sanitized_value = _clean_ampersand($value); + push @new_subfields, $subfield->[0] => $sanitized_value; + $record_modified = 1 if $sanitized_value ne $value; + } + if ( scalar(@new_subfields) > 0 ) { + my $new_field = eval { + MARC::Field->new( + $field->tag(), $field->indicator(1), + $field->indicator(2), @new_subfields + ); + }; + if ($@) { + warn "error : $@"; + } + else { + $field->replace_with($new_field); + } + + } + } + } + + return $record, $record_modified; +} + +sub _clean_ampersand { + my ($string) = @_; + $string =~ s/(&)(amp;)+/$1/g; + return $string; +} + =head1 INTERNAL FUNCTIONS =head2 _default_marc21_charconv_to_utf8 @@ -516,7 +623,7 @@ sub _marc_marc8_to_utf8 { # occurs, upgrade the string in place. Moral of the story seems to be # that pack("U", ...) is better than chr(...) if you need to guarantee # that the resulting string is UTF-8. - utf8::upgrade($utf8sf); + $utf8sf = Encode::encode('UTF-8', $utf8sf); } push @converted_subfields, $subfield->[0], $utf8sf; } @@ -656,7 +763,7 @@ where the eight bit is set) octet with the Unicode replacement character. This is meant as a last-ditch method, and would be best used as part of a UI that lets a cataloguer pick various character conversions -until he or she finds the right one. +until they find the right one. =cut @@ -695,6 +802,9 @@ Converts a string from ISO-5426 to UTF-8. my %chars; + +#### +## 0xb $chars{0xb0}=0x0101;#3/0ayn[ain] $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove] #$chars{0xb2}=0x00e0;#'à'; @@ -703,15 +813,47 @@ $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark # $chars{0xb4}='è'; $chars{0xb4}=0x00e8; -$chars{0xbd}=0x02b9; -$chars{0xbe}=0x02ba; # $chars{0xb5}='é'; $chars{0xb5}=0x00e9; +$chars{0xb6}=0x2021; # double dagger +$chars{0xb7}=0x00b7; # middle dot +$chars{0xb8}=0x2033; # double prime +$chars{0xb9}=0x2019; # right single quotation mark +$chars{0xba}=0x201d; # right double quotation mark +$chars{0xbb}=0x00bb; # right-pointing double angle quotation mark +$chars{0xbc}=0x266f; # music sharp sign +$chars{0xbd}=0x02b9; # modifier letter prime +$chars{0xbe}=0x02ba; # modifier letter double prime +$chars{0xbf}=0x00bf; # inverted question mark + +#### +## 0xe +$chars{0xe1}=0x00c6; # latin capital letter ae +$chars{0xe2}=0x0110; # latin capital letter d with stroke +$chars{0xe6}=0x0132; # latin capital ligature ij +$chars{0xe8}=0x0141; # latin capital letter l with stroke +$chars{0xe9}=0x00d8; # latin capital letter o with stroke +$chars{0xea}=0x0152; # latin capital ligature oe +$chars{0xec}=0x00de; # latin capital letter thorn + +#### +## 0xf +$chars{0xf1}=0x00e6; # latin small letter ae +$chars{0xf2}=0x0111; # latin small letter d with stroke +$chars{0xf3}=0x00f0; # latin small letter eth +$chars{0xf5}=0x0131; # latin small letter dotless i +$chars{0xf6}=0x0133; # latin small ligature ij +$chars{0xf8}=0x0142; # latin small letter l with stroke +$chars{0xf9}=0x00f8; # latin small letter o with stroke +$chars{0xfa}=0x0153; # latin small ligature oe +$chars{0xfb}=0x00df; # latin small letter sharp s +$chars{0xfc}=0x00fe; # latin small letter thorn + +#### +## Others $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark -$chars{0xfa}=0x0153; #oe -$chars{0xea}=0x0152; #oe -$chars{0x81d1}=0x00b0; +#$chars{0x81d1}=0x00b0; # FIXME useless #### ## combined characters iso5426 @@ -1013,8 +1155,8 @@ $chars{0xd375}=0x0173; # small u with ogonek $chars{0xd441}=0x1e00; # capital a with ring below $chars{0xd461}=0x1e01; # small a with ring below # 5/5 half circle below -$chars{0xf948}=0x1e2a; # capital h with breve below -$chars{0xf968}=0x1e2b; # small h with breve below +$chars{0xd548}=0x1e2a; # capital h with breve below +$chars{0xd568}=0x1e2b; # small h with breve below # 5/6 dot below $chars{0xd641}=0x1ea0; # capital a with dot below $chars{0xd642}=0x1e04; # capital b with dot below