X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=C4%2FCharset.pm;h=c294b361331489a29eb771456d415fea95b8f7a0;hb=223bf0099fcb837fb6bc1c719517e175c43f6ffc;hp=54a1322ca06b3abfb3f8aaaa0d9a2965bf8da9d6;hpb=c86f5df43109244bfadda66fd899588ced658e83;p=koha.git diff --git a/C4/Charset.pm b/C4/Charset.pm index 54a1322ca0..c294b36133 100644 --- a/C4/Charset.pm +++ b/C4/Charset.pm @@ -13,25 +13,33 @@ package C4::Charset; # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. See the GNU General Public License for more details. # -# You should have received a copy of the GNU General Public License along with -# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place, -# Suite 330, Boston, MA 02111-1307 USA +# You should have received a copy of the GNU General Public License along +# with Koha; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. use strict; +use warnings; + use MARC::Charset qw/marc8_to_utf8/; use Text::Iconv; +use C4::Debug; +use Unicode::Normalize; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); BEGIN { # set the version for version checking - $VERSION = 3.00; + $VERSION = 3.07.00.049; require Exporter; @ISA = qw(Exporter); @EXPORT = qw( + NormalizeString IsStringUTF8ish MarcToUTF8Record + SetUTF8Flag SetMarcUnicodeFlag + StripNonXmlChars + nsb_clean ); } @@ -41,7 +49,7 @@ C4::Charset - utilities for handling character set conversions. =head1 SYNOPSIS -use C4::Charset; + use C4::Charset; =head1 DESCRIPTION @@ -70,16 +78,12 @@ on how to deal with the situation. =head2 IsStringUTF8ish -=over 4 - -my $is_utf8 = IsStringUTF8ish($str); - -=back + my $is_utf8 = IsStringUTF8ish($str); Determines if C<$str> is valid UTF-8. This can mean one of two things: -=over 2 +=over =item * @@ -107,13 +111,87 @@ sub IsStringUTF8ish { return utf8::decode($str); } -=head2 MarcToUTF8Record +=head2 SetUTF8Flag -=over 4 + my $marc_record = SetUTF8Flag($marc_record, $nfd); -($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]); +This function sets the PERL UTF8 flag for data. +It is required when using new_from_usmarc +since MARC::File::USMARC does not handle PERL UTF8 setting. +When editing unicode marc records fields and subfields, you +would end up in double encoding without using this function. -=back +If $nfd is set, string normalization will use NFD instead of NFC + +FIXME +In my opinion, this function belongs to MARC::Record and not +to this package. +But since it handles charset, and MARC::Record, it finds its way in that package + +=cut + +sub SetUTF8Flag{ + my ($record, $nfd)=@_; + return unless ($record && $record->fields()); + foreach my $field ($record->fields()){ + if ($field->tag()>=10){ + my @subfields; + foreach my $subfield ($field->subfields()){ + push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd)); + } + my $newfield=MARC::Field->new( + $field->tag(), + $field->indicator(1), + $field->indicator(2), + @subfields + ); + $field->replace_with($newfield); + } + } +} + +=head2 NormalizeString + + my $normalized_string=NormalizeString($string,$nfd,$transform); + +Given a string +nfd : If you want to set NFD and not NFC +transform : If you expect all the signs to be removed + +Sets the PERL UTF8 Flag on your initial data if need be +and applies cleaning if required + +Returns a utf8 NFC normalized string + +Sample code : + my $string=NormalizeString ("l'ornithoptère"); + #results into ornithoptère in NFC form and sets UTF8 Flag + +=cut + + +sub NormalizeString{ + my ($string,$nfd,$transform)=@_; + utf8::decode($string) unless (utf8::is_utf8($string)); + if ($nfd){ + $string= NFD($string); + } + else { + $string=NFC($string); + } + if ($transform){ + $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g; + #removing one letter words "d'" "l'" was changed into "d " "l " + $string=~s/\b\S\b//g; + $string=~s/\s+$//g; + } + return $string; +} + +=head2 MarcToUTF8Record + + ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, + $marc_flavour, [, $source_encoding]); Given a MARC blob or a C, the MARC flavour, and an optional source encoding, return a C that is @@ -135,7 +213,6 @@ sub MarcToUTF8Record { my $marc = shift; my $marc_flavour = shift; my $source_encoding = shift; - my $marc_record; my $marc_blob_is_utf8 = 0; if (ref($marc) eq 'MARC::Record') { @@ -152,28 +229,44 @@ sub MarcToUTF8Record { $marc =~ s/^\s+//; $marc =~ s/\s+$//; $marc_blob_is_utf8 = IsStringUTF8ish($marc); - $marc_record = MARC::Record->new_from_usmarc($marc); + eval { + $marc_record = MARC::Record->new_from_usmarc($marc); + }; + if ($@) { + # if we fail the first time, one likely problem + # is that we have a MARC21 record that says that it's + # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters. + # We'll try parsing it again. + substr($marc, 9, 1) = ' '; + eval { + $marc_record = MARC::Record->new_from_usmarc($marc); + }; + if ($@) { + # it's hopeless; return an empty MARC::Record + return MARC::Record->new(), 'failed', ['could not parse MARC blob']; + } + } } # If we do not know the source encoding, try some guesses # as follows: # 1. Record is UTF-8 already. - # 2. If MARC flavor is MARC21, then + # 2. If MARC flavor is MARC21 or NORMARC, then # a. record is MARC-8 # b. record is ISO-8859-1 # 3. If MARC flavor is UNIMARC, then if (not defined $source_encoding) { if ($marc_blob_is_utf8) { - # note that for MARC21 we are not bothering to check + # note that for MARC21/NORMARC we are not bothering to check # if the Leader/09 is set to 'a' or not -- because # of problems with various ILSs (including Koha in the # past, alas), this just is not trustworthy. SetMarcUnicodeFlag($marc_record, $marc_flavour); return $marc_record, 'UTF-8', []; } else { - if ($marc_flavour eq 'MARC21') { + if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') { return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour); - } elsif ($marc_flavour eq 'UNIMARC') { + } elsif ($marc_flavour =~/UNIMARC/) { return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour); } else { return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour); @@ -196,7 +289,7 @@ sub MarcToUTF8Record { @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour); } else { # assume any other character encoding is for Text::Iconv - @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1'); + @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding); } if (@errors) { @@ -211,11 +304,7 @@ sub MarcToUTF8Record { =head2 SetMarcUnicodeFlag -=over 4 - -SetMarcUnicodeFlag($marc_record, $marc_flavour); - -=back + SetMarcUnicodeFlag($marc_record, $marc_flavour); Set both the internal MARC::Record encoding flag and the appropriate Leader/09 (MARC21) or @@ -227,34 +316,113 @@ any actual character conversion. sub SetMarcUnicodeFlag { my $marc_record = shift; - my $marc_flavour = shift; + my $marc_flavour = shift; # || C4::Context->preference("marcflavour"); $marc_record->encoding('UTF-8'); - if ($marc_flavour eq 'MARC21') { + if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') { my $leader = $marc_record->leader(); substr($leader, 9, 1) = 'a'; $marc_record->leader($leader); - } elsif ($marc_flavour eq "UNIMARC") { - if (my $field = $marc_record->field('100')) { - my $sfa = $field->subfield('a'); - substr($sfa, 26, 4) = '5050'; - $field->update('a' => $sfa); + } elsif ($marc_flavour =~/UNIMARC/) { + my $string; + my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22)); + $string=$marc_record->subfield( 100, "a" ); + if (defined $string && length($string)==$subflength) { + $string = substr $string, 0,$subflength if (length($string)>$subflength); + } + else { + $string = POSIX::strftime( "%Y%m%d", localtime ); + $string =~ s/\-//g; + $string = sprintf( "%-*s", $subflength, $string ); + } + substr( $string, $encodingposition, 8, "frey50 " ); + if ( $marc_record->subfield( 100, "a" ) ) { + $marc_record->field('100')->update(a=>$string); + } + else { + $marc_record->insert_grouped_field( + MARC::Field->new( 100, '', '', "a" => $string ) ); } + $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 ); + } else { + warn "Unrecognized marcflavour: $marc_flavour"; } } +=head2 StripNonXmlChars + my $new_str = StripNonXmlChars($old_str); -=head1 INTERNAL FUNCTIONS +Given a string, return a copy with the +characters that are illegal in XML +removed. + +This function exists to work around a problem +that can occur with badly-encoded MARC records. +Specifically, if a UTF-8 MARC record also +has excape (\x1b) characters, MARC::File::XML +will let the escape characters pass through +when as_xml() or as_xml_record() is called. The +problem is that the escape character is not +legal in well-formed XML documents, so when +MARC::File::XML attempts to parse such a record, +the XML parser will fail. + +Stripping such characters will allow a +MARC::Record->new_from_xml() +to work, at the possible risk of some data loss. + +=cut + +sub StripNonXmlChars { + my $str = shift; + if (!defined($str) || $str eq ""){ + return ""; + } + $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g; + return $str; +} -=head2 _default_marc21_charconv_to_utf8 + + +=head2 nsb_clean =over 4 -my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record); +nsb_clean($string); =back +Removes Non Sorting Block characters + +=cut +sub nsb_clean { + my $NSB = '\x88' ; # NSB : begin Non Sorting Block + my $NSE = '\x89' ; # NSE : Non Sorting Block end + my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block + my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end + my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE + + # handles non sorting blocks + my ($string) = @_ ; + $_ = $string ; + s/$NSB//g ; + s/$NSE//g ; + s/$NSB2//g ; + s/$NSE2//g ; + s/$C2//g ; + $string = $_ ; + + return($string) ; +} + + +=head1 INTERNAL FUNCTIONS + +=head2 _default_marc21_charconv_to_utf8 + + my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record); + Converts a C of unknown character set to UTF-8, first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1 to UTF-8, then a default conversion that replaces each non-ASCII @@ -295,11 +463,7 @@ sub _default_marc21_charconv_to_utf8 { =head2 _default_unimarc_charconv_to_utf8 -=over 4 - -my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record); - -=back + my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record); Converts a C of unknown character set to UTF-8, first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1 @@ -339,11 +503,7 @@ sub _default_unimarc_charconv_to_utf8 { =head2 _marc_marc8_to_utf8 -=over 4 - -my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding); - -=back + my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding); Convert a C to UTF-8 in-place from MARC-8. If the conversion fails for some reason, an @@ -385,6 +545,16 @@ sub _marc_marc8_to_utf8 { my @converted_subfields; foreach my $subfield ($field->subfields()) { my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]); + unless (IsStringUTF8ish($utf8sf)) { + # Because of a bug in MARC::Charset 0.98, if the string + # has (a) one or more diacritics that (b) are only in character positions + # 128 to 255 inclusive, the resulting converted string is not in + # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that + # occurs, upgrade the string in place. Moral of the story seems to be + # that pack("U", ...) is better than chr(...) if you need to guarantee + # that the resulting string is UTF-8. + utf8::upgrade($utf8sf); + } push @converted_subfields, $subfield->[0], $utf8sf; } @@ -404,11 +574,7 @@ sub _marc_marc8_to_utf8 { =head2 _marc_iso5426_to_utf8 -=over 4 - -my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding); - -=back + my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding); Convert a C to UTF-8 in-place from ISO-5426. If the conversion fails for some reason, an @@ -450,11 +616,7 @@ sub _marc_iso5426_to_utf8 { =head2 _marc_to_utf8_via_text_iconv -=over 4 - -my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding); - -=back + my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding); Convert a C to UTF-8 in-place using the C CPAN module. Any source encoding accepted @@ -523,11 +685,7 @@ sub _marc_to_utf8_via_text_iconv { =head2 _marc_to_utf8_replacement_char -=over 4 - -_marc_to_utf8_replacement_char($marc_record, $marc_flavour); - -=back + _marc_to_utf8_replacement_char($marc_record, $marc_flavour); Convert a C to UTF-8 in-place, adopting the unsatisfactory method of replacing all non-ASCII (e.g., @@ -566,19 +724,13 @@ sub _marc_to_utf8_replacement_char { =head2 char_decode5426 -=over 4 - -my $utf8string = char_decode5426($iso_5426_string); - -=back + my $utf8string = char_decode5426($iso_5426_string); Converts a string from ISO-5426 to UTF-8. =cut -sub char_decode5426 { - my ( $string) = @_; - my $result; + my %chars; $chars{0xb0}=0x0101;#3/0ayn[ain] $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove] @@ -588,11 +740,14 @@ $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark # $chars{0xb4}='è'; $chars{0xb4}=0x00e8; +$chars{0xbd}=0x02b9; +$chars{0xbe}=0x02ba; # $chars{0xb5}='é'; $chars{0xb5}=0x00e9; $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark -$chars{0xfa}=0x0153;#oe +$chars{0xfa}=0x0153; #oe +$chars{0xea}=0x0152; #oe $chars{0x81d1}=0x00b0; #### @@ -951,6 +1106,11 @@ $chars{0xda20}=0x02cc; # # 5/14 right half of ligature sign # 5/15 right half of double tilde # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars; + +sub char_decode5426 { + my ( $string) = @_; + my $result; + my @data = unpack("C*", $string); my @characters; my $length=scalar(@data); @@ -1003,7 +1163,7 @@ $chars{0xda20}=0x02cc; # =head1 AUTHOR -Koha Development Team +Koha Development Team Galen Charlton