C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with Koha; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Debug;
  26 use Unicode::Normalize;
  27
  28 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  29
  30 BEGIN {
  31     # set the version for version checking
  32     $VERSION = 3.07.00.049;
  33     require Exporter;
  34     @ISA    = qw(Exporter);
  35     @EXPORT = qw(
  36         NormalizeString
  37         IsStringUTF8ish
  38         MarcToUTF8Record
  39         SetUTF8Flag
  40         SetMarcUnicodeFlag
  41         StripNonXmlChars
  42         nsb_clean
  43     );
  44 }
  45
  46 =head1 NAME
  47
  48 C4::Charset - utilities for handling character set conversions.
  49
  50 =head1 SYNOPSIS
  51
  52   use C4::Charset;
  53
  54 =head1 DESCRIPTION
  55
  56 This module contains routines for dealing with character set
  57 conversions, particularly for MARC records.
  58
  59 A variety of character encodings are in use by various MARC
  60 standards, and even more character encodings are used by
  61 non-standard MARC records.  The various MARC formats generally
  62 do not do a good job of advertising a given record's character
  63 encoding, and even when a record does advertise its encoding,
  64 e.g., via the Leader/09, experience has shown that one cannot
  65 trust it.
  66
  67 Ultimately, all MARC records are stored in Koha in UTF-8 and
  68 must be converted from whatever the source character encoding is.
  69 The goal of this module is to ensure that these conversions
  70 take place accurately.  When a character conversion cannot take
  71 place, or at least not accurately, the module was provide
  72 enough information to allow user-facing code to inform the user
  73 on how to deal with the situation.
  74
  75 =cut
  76
  77 =head1 FUNCTIONS
  78
  79 =head2 IsStringUTF8ish
  80
  81   my $is_utf8 = IsStringUTF8ish($str);
  82
  83 Determines if C<$str> is valid UTF-8.  This can mean
  84 one of two things:
  85
  86 =over
  87
  88 =item *
  89
  90 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  91
  92 =item *
  93
  94 The Perl UTF-8 flag is B<not> set, but the octets contain
  95 valid UTF-8.
  96
  97 =back
  98
  99 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 100 because in one could be presented with a MARC blob that is
 101 not actually in UTF-8 but whose sequence of octets appears to be
 102 valid UTF-8.  The rest of the MARC character conversion functions
 103 will assume that this situation occur does not very often.
 104
 105 =cut
 106
 107 sub IsStringUTF8ish {
 108     my $str = shift;
 109
 110     return 1 if utf8::is_utf8($str);
 111     return utf8::decode($str);
 112 }
 113
 114 =head2 SetUTF8Flag
 115
 116   my $marc_record = SetUTF8Flag($marc_record, $nfd);
 117
 118 This function sets the PERL UTF8 flag for data.
 119 It is required when using new_from_usmarc
 120 since MARC::File::USMARC does not handle PERL UTF8 setting.
 121 When editing unicode marc records fields and subfields, you
 122 would end up in double encoding without using this function.
 123
 124 If $nfd is set, string normalization will use NFD instead of NFC
 125
 126 FIXME
 127 In my opinion, this function belongs to MARC::Record and not
 128 to this package.
 129 But since it handles charset, and MARC::Record, it finds its way in that package
 130
 131 =cut
 132
 133 sub SetUTF8Flag{
 134     my ($record, $nfd)=@_;
 135     return unless ($record && $record->fields());
 136     foreach my $field ($record->fields()){
 137         if ($field->tag()>=10){
 138             my @subfields;
 139             foreach my $subfield ($field->subfields()){
 140                 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
 141             }
 142             eval {
 143                 my $newfield=MARC::Field->new(
 144                             $field->tag(),
 145                             $field->indicator(1),
 146                             $field->indicator(2),
 147                             @subfields
 148                         );
 149                 $field->replace_with($newfield);
 150             };
 151             warn "ERROR occurred in SetUTF8Flag $@" if $@;
 152         }
 153     }
 154 }
 155
 156 =head2 NormalizeString
 157
 158     my $normalized_string=NormalizeString($string,$nfd,$transform);
 159
 160 Given a string
 161 nfd : If you want to set NFD and not NFC
 162 transform : If you expect all the signs to be removed
 163
 164 Sets the PERL UTF8 Flag on your initial data if need be
 165 and applies cleaning if required
 166
 167 Returns a utf8 NFC normalized string
 168
 169 Sample code :
 170    my $string=NormalizeString ("l'ornithoptère");
 171    #results into ornithoptère in NFC form and sets UTF8 Flag
 172
 173 =cut
 174
 175
 176 sub NormalizeString{
 177         my ($string,$nfd,$transform)=@_;
 178         utf8::decode($string) unless (utf8::is_utf8($string));
 179         if ($nfd){
 180                 $string= NFD($string);
 181         }
 182         else {
 183                 $string=NFC($string);
 184         }
 185         if ($transform){
 186     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 187         #removing one letter words "d'" "l'"  was changed into "d " "l "
 188     $string=~s/\b\S\b//g;
 189     $string=~s/\s+$//g;
 190         }
 191     return $string;
 192 }
 193
 194 =head2 MarcToUTF8Record
 195
 196   ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
 197                                         $marc_flavour, [, $source_encoding]);
 198
 199 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 200 optional source encoding, return a C<MARC::Record> that is
 201 converted to UTF-8.
 202
 203 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 204 is not guaranteed to have been converted correctly.  Specifically,
 205 if C<$converted_from> is 'failed', the MARC record returned failed
 206 character conversion and had each of its non-ASCII octets changed
 207 to the Unicode replacement character.
 208
 209 If the source encoding was not specified, this routine will
 210 try to guess it; the character encoding used for a successful
 211 conversion is returned in C<$converted_from>.
 212
 213 =cut
 214
 215 sub MarcToUTF8Record {
 216     my $marc = shift;
 217     my $marc_flavour = shift;
 218     my $source_encoding = shift;
 219     my $marc_record;
 220     my $marc_blob_is_utf8 = 0;
 221     if (ref($marc) eq 'MARC::Record') {
 222         my $marc_blob = $marc->as_usmarc();
 223         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 224         $marc_record = $marc;
 225     } else {
 226         # dealing with a MARC blob
 227
 228         # remove any ersatz whitespace from the beginning and
 229         # end of the MARC blob -- these can creep into MARC
 230         # files produced by several sources -- caller really
 231         # should be doing this, however
 232         $marc =~ s/^\s+//;
 233         $marc =~ s/\s+$//;
 234         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 235         eval {
 236             $marc_record = MARC::Record->new_from_usmarc($marc);
 237         };
 238         if ($@) {
 239             # if we fail the first time, one likely problem
 240             # is that we have a MARC21 record that says that it's
 241             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 242             # We'll try parsing it again.
 243             substr($marc, 9, 1) = ' ';
 244             eval {
 245                 $marc_record = MARC::Record->new_from_usmarc($marc);
 246             };
 247             if ($@) {
 248                 # it's hopeless; return an empty MARC::Record
 249                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 250             }
 251         }
 252     }
 253
 254     # If we do not know the source encoding, try some guesses
 255     # as follows:
 256     #   1. Record is UTF-8 already.
 257     #   2. If MARC flavor is MARC21 or NORMARC, then
 258     #      a. record is MARC-8
 259     #      b. record is ISO-8859-1
 260     #   3. If MARC flavor is UNIMARC, then
 261     if (not defined $source_encoding) {
 262         if ($marc_blob_is_utf8) {
 263             # note that for MARC21/NORMARC we are not bothering to check
 264             # if the Leader/09 is set to 'a' or not -- because
 265             # of problems with various ILSs (including Koha in the
 266             # past, alas), this just is not trustworthy.
 267             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 268             return $marc_record, 'UTF-8', [];
 269         } else {
 270             if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 271                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 272             } elsif ($marc_flavour =~/UNIMARC/) {
 273                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 274             } else {
 275                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 276             }
 277         }
 278     } else {
 279         # caller knows the character encoding
 280         my $original_marc_record = $marc_record->clone();
 281         my @errors;
 282         if ($source_encoding =~ /utf-?8/i) {
 283             if ($marc_blob_is_utf8) {
 284                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 285                 return $marc_record, 'UTF-8', [];
 286             } else {
 287                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 288             }
 289         } elsif ($source_encoding =~ /marc-?8/i) {
 290             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 291         } elsif ($source_encoding =~ /5426/) {
 292             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 293         } else {
 294             # assume any other character encoding is for Text::Iconv
 295             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 296         }
 297
 298         if (@errors) {
 299             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 300             return $original_marc_record, 'failed', \@errors;
 301         } else {
 302             return $marc_record, $source_encoding, [];
 303         }
 304     }
 305
 306 }
 307
 308 =head2 SetMarcUnicodeFlag
 309
 310   SetMarcUnicodeFlag($marc_record, $marc_flavour);
 311
 312 Set both the internal MARC::Record encoding flag
 313 and the appropriate Leader/09 (MARC21) or
 314 100/26-29 (UNIMARC) to indicate that the record
 315 is in UTF-8.  Note that this does B<not> do
 316 any actual character conversion.
 317
 318 =cut
 319
 320 sub SetMarcUnicodeFlag {
 321     my $marc_record = shift;
 322     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 323
 324     $marc_record->encoding('UTF-8');
 325     if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 326         my $leader = $marc_record->leader();
 327         substr($leader, 9, 1) = 'a';
 328         $marc_record->leader($leader);
 329     } elsif ($marc_flavour =~/UNIMARC/) {
 330         my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
 331         $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
 332         my $string;
 333                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
 334                 $string=$marc_record->subfield( 100, "a" );
 335         if (defined $string && length($string)==$subflength) {
 336                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 337         }
 338         else {
 339             $string = POSIX::strftime( "%Y%m%d", localtime );
 340             $string =~ s/\-//g;
 341             $string = sprintf( "%-*s", $subflength, $string );
 342             substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
 343         }
 344         substr( $string, $encodingposition, 3, "y50" );
 345         if ( $marc_record->subfield( 100, "a" ) ) {
 346                         $marc_record->field('100')->update(a=>$string);
 347                 }
 348                 else {
 349             $marc_record->insert_grouped_field(
 350                 MARC::Field->new( 100, '', '', "a" => $string ) );
 351         }
 352                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
 353     } else {
 354         warn "Unrecognized marcflavour: $marc_flavour";
 355     }
 356 }
 357
 358 =head2 StripNonXmlChars
 359
 360   my $new_str = StripNonXmlChars($old_str);
 361
 362 Given a string, return a copy with the
 363 characters that are illegal in XML
 364 removed.
 365
 366 This function exists to work around a problem
 367 that can occur with badly-encoded MARC records.
 368 Specifically, if a UTF-8 MARC record also
 369 has excape (\x1b) characters, MARC::File::XML
 370 will let the escape characters pass through
 371 when as_xml() or as_xml_record() is called.  The
 372 problem is that the escape character is not
 373 legal in well-formed XML documents, so when
 374 MARC::File::XML attempts to parse such a record,
 375 the XML parser will fail.
 376
 377 Stripping such characters will allow a
 378 MARC::Record->new_from_xml()
 379 to work, at the possible risk of some data loss.
 380
 381 =cut
 382
 383 sub StripNonXmlChars {
 384     my $str = shift;
 385     if (!defined($str) || $str eq ""){
 386         return "";
 387     }
 388     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 389     return $str;
 390 }
 391
 392
 393
 394 =head2 nsb_clean
 395
 396 =over 4
 397
 398 nsb_clean($string);
 399
 400 =back
 401
 402 Removes Non Sorting Block characters
 403
 404 =cut
 405 sub nsb_clean {
 406     my $NSB  = '\x88' ;        # NSB : begin Non Sorting Block
 407     my $NSE  = '\x89' ;        # NSE : Non Sorting Block end
 408     my $NSB2 = '\x98' ;        # NSB : begin Non Sorting Block
 409     my $NSE2 = '\x9C' ;        # NSE : Non Sorting Block end
 410     my $C2   = '\xC2' ;        # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
 411
 412     # handles non sorting blocks
 413     my ($string) = @_ ;
 414     $_ = $string ;
 415     s/$NSB//g ;
 416     s/$NSE//g ;
 417     s/$NSB2//g ;
 418     s/$NSE2//g ;
 419     s/$C2//g ;
 420     $string = $_ ;
 421
 422     return($string) ;
 423 }
 424
 425
 426 =head1 INTERNAL FUNCTIONS
 427
 428 =head2 _default_marc21_charconv_to_utf8
 429
 430   my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 431
 432 Converts a C<MARC::Record> of unknown character set to UTF-8,
 433 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 434 to UTF-8, then a default conversion that replaces each non-ASCII
 435 character with the replacement character.
 436
 437 The C<$guessed_charset> return value contains the character set
 438 that resulted in a conversion to valid UTF-8; note that
 439 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 440 this is 'failed'.
 441
 442 =cut
 443
 444 sub _default_marc21_charconv_to_utf8 {
 445     my $marc_record = shift;
 446     my $marc_flavour = shift;
 447
 448     my $trial_marc8 = $marc_record->clone();
 449     my @all_errors = ();
 450     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 451     unless (@errors) {
 452         return $trial_marc8, 'MARC-8', [];
 453     }
 454     push @all_errors, @errors;
 455
 456     my $trial_8859_1 = $marc_record->clone();
 457     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 458     unless (@errors) {
 459         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 460                                                 # instead if we wanted to report details
 461                                                 # of the failed attempt at MARC-8 => UTF-8
 462     }
 463     push @all_errors, @errors;
 464
 465     my $default_converted = $marc_record->clone();
 466     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 467     return $default_converted, 'failed', \@all_errors;
 468 }
 469
 470 =head2 _default_unimarc_charconv_to_utf8
 471
 472   my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 473
 474 Converts a C<MARC::Record> of unknown character set to UTF-8,
 475 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 476 to UTF-8, then a default conversion that replaces each non-ASCII
 477 character with the replacement character.
 478
 479 The C<$guessed_charset> return value contains the character set
 480 that resulted in a conversion to valid UTF-8; note that
 481 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 482 this is 'failed'.
 483
 484 =cut
 485
 486 sub _default_unimarc_charconv_to_utf8 {
 487     my $marc_record = shift;
 488     my $marc_flavour = shift;
 489
 490     my $trial_marc8 = $marc_record->clone();
 491     my @all_errors = ();
 492     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 493     unless (@errors) {
 494         return $trial_marc8, 'iso-5426';
 495     }
 496     push @all_errors, @errors;
 497
 498     my $trial_8859_1 = $marc_record->clone();
 499     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 500     unless (@errors) {
 501         return $trial_8859_1, 'iso-8859-1';
 502     }
 503     push @all_errors, @errors;
 504
 505     my $default_converted = $marc_record->clone();
 506     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 507     return $default_converted, 'failed', \@all_errors;
 508 }
 509
 510 =head2 _marc_marc8_to_utf8
 511
 512   my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 513
 514 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 515 If the conversion fails for some reason, an
 516 appropriate messages will be placed in the returned
 517 C<@errors> array.
 518
 519 =cut
 520
 521 sub _marc_marc8_to_utf8 {
 522     my $marc_record = shift;
 523     my $marc_flavour = shift;
 524
 525     my $prev_ignore = MARC::Charset->ignore_errors();
 526     MARC::Charset->ignore_errors(1);
 527
 528     # trap warnings raised by MARC::Charset
 529     my @errors = ();
 530     local $SIG{__WARN__} = sub {
 531         my $msg = $_[0];
 532         if ($msg =~ /MARC.Charset/) {
 533             # FIXME - purpose of this regexp is to strip out the
 534             # line reference to MARC/Charset.pm, but as it
 535             # exists probably won't work quite on Windows --
 536             # some sort of minimal-bunch back-tracking RE
 537             # would be helpful here
 538             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 539             push @errors, $msg;
 540         } else {
 541             # if warning doesn't come from MARC::Charset, just
 542             # pass it on
 543             warn $msg;
 544         }
 545     };
 546
 547     foreach my $field ($marc_record->fields()) {
 548         if ($field->is_control_field()) {
 549             ; # do nothing -- control fields should not contain non-ASCII characters
 550         } else {
 551             my @converted_subfields;
 552             foreach my $subfield ($field->subfields()) {
 553                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 554                 unless (IsStringUTF8ish($utf8sf)) {
 555                     # Because of a bug in MARC::Charset 0.98, if the string
 556                     # has (a) one or more diacritics that (b) are only in character positions
 557                     # 128 to 255 inclusive, the resulting converted string is not in
 558                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 559                     # occurs, upgrade the string in place.  Moral of the story seems to be
 560                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 561                     # that the resulting string is UTF-8.
 562                     utf8::upgrade($utf8sf);
 563                 }
 564                 push @converted_subfields, $subfield->[0], $utf8sf;
 565             }
 566
 567             $field->replace_with(MARC::Field->new(
 568                 $field->tag(), $field->indicator(1), $field->indicator(2),
 569                 @converted_subfields)
 570             );
 571         }
 572     }
 573
 574     MARC::Charset->ignore_errors($prev_ignore);
 575
 576     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 577
 578     return @errors;
 579 }
 580
 581 =head2 _marc_iso5426_to_utf8
 582
 583   my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 584
 585 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 586 If the conversion fails for some reason, an
 587 appropriate messages will be placed in the returned
 588 C<@errors> array.
 589
 590 FIXME - is ISO-5426 equivalent enough to MARC-8
 591 that C<MARC::Charset> can be used instead?
 592
 593 =cut
 594
 595 sub _marc_iso5426_to_utf8 {
 596     my $marc_record = shift;
 597     my $marc_flavour = shift;
 598
 599     my @errors = ();
 600
 601     foreach my $field ($marc_record->fields()) {
 602         if ($field->is_control_field()) {
 603             ; # do nothing -- control fields should not contain non-ASCII characters
 604         } else {
 605             my @converted_subfields;
 606             foreach my $subfield ($field->subfields()) {
 607                 my $utf8sf = char_decode5426($subfield->[1]);
 608                 push @converted_subfields, $subfield->[0], $utf8sf;
 609             }
 610
 611             $field->replace_with(MARC::Field->new(
 612                 $field->tag(), $field->indicator(1), $field->indicator(2),
 613                 @converted_subfields)
 614             );
 615         }
 616     }
 617
 618     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 619
 620     return @errors;
 621 }
 622
 623 =head2 _marc_to_utf8_via_text_iconv
 624
 625   my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 626
 627 Convert a C<MARC::Record> to UTF-8 in-place using the
 628 C<Text::Iconv> CPAN module.  Any source encoding accepted
 629 by the user's iconv installation should work.  If
 630 the source encoding is not recognized on the user's
 631 server or the conversion fails for some reason,
 632 appropriate messages will be placed in the returned
 633 C<@errors> array.
 634
 635 =cut
 636
 637 sub _marc_to_utf8_via_text_iconv {
 638     my $marc_record = shift;
 639     my $marc_flavour = shift;
 640     my $source_encoding = shift;
 641
 642     my @errors = ();
 643     my $decoder;
 644     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 645     if ($@) {
 646         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 647         return @errors;
 648     }
 649
 650     my $prev_raise_error = Text::Iconv->raise_error();
 651     Text::Iconv->raise_error(1);
 652
 653     foreach my $field ($marc_record->fields()) {
 654         if ($field->is_control_field()) {
 655             ; # do nothing -- control fields should not contain non-ASCII characters
 656         } else {
 657             my @converted_subfields;
 658             foreach my $subfield ($field->subfields()) {
 659                 my $converted_value;
 660                 my $conversion_ok = 1;
 661                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 662                 if ($@) {
 663                     $conversion_ok = 0;
 664                     push @errors, $@;
 665                 } elsif (not defined $converted_value) {
 666                     $conversion_ok = 0;
 667                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 668                 }
 669
 670                 if ($conversion_ok) {
 671                     push @converted_subfields, $subfield->[0], $converted_value;
 672                 } else {
 673                     $converted_value = $subfield->[1];
 674                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 675                     push @converted_subfields, $subfield->[0], $converted_value;
 676                 }
 677             }
 678
 679             $field->replace_with(MARC::Field->new(
 680                 $field->tag(), $field->indicator(1), $field->indicator(2),
 681                 @converted_subfields)
 682             );
 683         }
 684     }
 685
 686     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 687     Text::Iconv->raise_error($prev_raise_error);
 688
 689     return @errors;
 690 }
 691
 692 =head2 _marc_to_utf8_replacement_char
 693
 694   _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 695
 696 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 697 unsatisfactory method of replacing all non-ASCII (e.g.,
 698 where the eight bit is set) octet with the Unicode
 699 replacement character.  This is meant as a last-ditch
 700 method, and would be best used as part of a UI that
 701 lets a cataloguer pick various character conversions
 702 until he or she finds the right one.
 703
 704 =cut
 705
 706 sub _marc_to_utf8_replacement_char {
 707     my $marc_record = shift;
 708     my $marc_flavour = shift;
 709
 710     foreach my $field ($marc_record->fields()) {
 711         if ($field->is_control_field()) {
 712             ; # do nothing -- control fields should not contain non-ASCII characters
 713         } else {
 714             my @converted_subfields;
 715             foreach my $subfield ($field->subfields()) {
 716                 my $value = $subfield->[1];
 717                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 718                 push @converted_subfields, $subfield->[0], $value;
 719             }
 720
 721             $field->replace_with(MARC::Field->new(
 722                 $field->tag(), $field->indicator(1), $field->indicator(2),
 723                 @converted_subfields)
 724             );
 725         }
 726     }
 727
 728     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 729 }
 730
 731 =head2 char_decode5426
 732
 733   my $utf8string = char_decode5426($iso_5426_string);
 734
 735 Converts a string from ISO-5426 to UTF-8.
 736
 737 =cut
 738
 739
 740 my %chars;
 741 $chars{0xb0}=0x0101;#3/0ayn[ain]
 742 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 743 #$chars{0xb2}=0x00e0;#'à';
 744 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 745 #$chars{0xb3}=0x00e7;#'ç';
 746 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 747 # $chars{0xb4}='è';
 748 $chars{0xb4}=0x00e8;
 749 $chars{0xbd}=0x02b9;
 750 $chars{0xbe}=0x02ba;
 751 # $chars{0xb5}='é';
 752 $chars{0xb5}=0x00e9;
 753 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 754 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 755 $chars{0xfa}=0x0153; #oe
 756 $chars{0xea}=0x0152; #oe
 757 $chars{0x81d1}=0x00b0;
 758
 759 ####
 760 ## combined characters iso5426
 761
 762 $chars{0xc041}=0x1ea2; # capital a with hook above
 763 $chars{0xc045}=0x1eba; # capital e with hook above
 764 $chars{0xc049}=0x1ec8; # capital i with hook above
 765 $chars{0xc04f}=0x1ece; # capital o with hook above
 766 $chars{0xc055}=0x1ee6; # capital u with hook above
 767 $chars{0xc059}=0x1ef6; # capital y with hook above
 768 $chars{0xc061}=0x1ea3; # small a with hook above
 769 $chars{0xc065}=0x1ebb; # small e with hook above
 770 $chars{0xc069}=0x1ec9; # small i with hook above
 771 $chars{0xc06f}=0x1ecf; # small o with hook above
 772 $chars{0xc075}=0x1ee7; # small u with hook above
 773 $chars{0xc079}=0x1ef7; # small y with hook above
 774
 775         # 4/1 grave accent
 776 $chars{0xc141}=0x00c0; # capital a with grave accent
 777 $chars{0xc145}=0x00c8; # capital e with grave accent
 778 $chars{0xc149}=0x00cc; # capital i with grave accent
 779 $chars{0xc14f}=0x00d2; # capital o with grave accent
 780 $chars{0xc155}=0x00d9; # capital u with grave accent
 781 $chars{0xc157}=0x1e80; # capital w with grave
 782 $chars{0xc159}=0x1ef2; # capital y with grave
 783 $chars{0xc161}=0x00e0; # small a with grave accent
 784 $chars{0xc165}=0x00e8; # small e with grave accent
 785 $chars{0xc169}=0x00ec; # small i with grave accent
 786 $chars{0xc16f}=0x00f2; # small o with grave accent
 787 $chars{0xc175}=0x00f9; # small u with grave accent
 788 $chars{0xc177}=0x1e81; # small w with grave
 789 $chars{0xc179}=0x1ef3; # small y with grave
 790         # 4/2 acute accent
 791 $chars{0xc241}=0x00c1; # capital a with acute accent
 792 $chars{0xc243}=0x0106; # capital c with acute accent
 793 $chars{0xc245}=0x00c9; # capital e with acute accent
 794 $chars{0xc247}=0x01f4; # capital g with acute
 795 $chars{0xc249}=0x00cd; # capital i with acute accent
 796 $chars{0xc24b}=0x1e30; # capital k with acute
 797 $chars{0xc24c}=0x0139; # capital l with acute accent
 798 $chars{0xc24d}=0x1e3e; # capital m with acute
 799 $chars{0xc24e}=0x0143; # capital n with acute accent
 800 $chars{0xc24f}=0x00d3; # capital o with acute accent
 801 $chars{0xc250}=0x1e54; # capital p with acute
 802 $chars{0xc252}=0x0154; # capital r with acute accent
 803 $chars{0xc253}=0x015a; # capital s with acute accent
 804 $chars{0xc255}=0x00da; # capital u with acute accent
 805 $chars{0xc257}=0x1e82; # capital w with acute
 806 $chars{0xc259}=0x00dd; # capital y with acute accent
 807 $chars{0xc25a}=0x0179; # capital z with acute accent
 808 $chars{0xc261}=0x00e1; # small a with acute accent
 809 $chars{0xc263}=0x0107; # small c with acute accent
 810 $chars{0xc265}=0x00e9; # small e with acute accent
 811 $chars{0xc267}=0x01f5; # small g with acute
 812 $chars{0xc269}=0x00ed; # small i with acute accent
 813 $chars{0xc26b}=0x1e31; # small k with acute
 814 $chars{0xc26c}=0x013a; # small l with acute accent
 815 $chars{0xc26d}=0x1e3f; # small m with acute
 816 $chars{0xc26e}=0x0144; # small n with acute accent
 817 $chars{0xc26f}=0x00f3; # small o with acute accent
 818 $chars{0xc270}=0x1e55; # small p with acute
 819 $chars{0xc272}=0x0155; # small r with acute accent
 820 $chars{0xc273}=0x015b; # small s with acute accent
 821 $chars{0xc275}=0x00fa; # small u with acute accent
 822 $chars{0xc277}=0x1e83; # small w with acute
 823 $chars{0xc279}=0x00fd; # small y with acute accent
 824 $chars{0xc27a}=0x017a; # small z with acute accent
 825 $chars{0xc2e1}=0x01fc; # capital ae with acute
 826 $chars{0xc2f1}=0x01fd; # small ae with acute
 827        # 4/3 circumflex accent
 828 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 829 $chars{0xc343}=0x0108; # capital c with circumflex
 830 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 831 $chars{0xc347}=0x011c; # capital g with circumflex
 832 $chars{0xc348}=0x0124; # capital h with circumflex
 833 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 834 $chars{0xc34a}=0x0134; # capital j with circumflex
 835 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 836 $chars{0xc353}=0x015c; # capital s with circumflex
 837 $chars{0xc355}=0x00db; # capital u with circumflex
 838 $chars{0xc357}=0x0174; # capital w with circumflex
 839 $chars{0xc359}=0x0176; # capital y with circumflex
 840 $chars{0xc35a}=0x1e90; # capital z with circumflex
 841 $chars{0xc361}=0x00e2; # small a with circumflex accent
 842 $chars{0xc363}=0x0109; # small c with circumflex
 843 $chars{0xc365}=0x00ea; # small e with circumflex accent
 844 $chars{0xc367}=0x011d; # small g with circumflex
 845 $chars{0xc368}=0x0125; # small h with circumflex
 846 $chars{0xc369}=0x00ee; # small i with circumflex accent
 847 $chars{0xc36a}=0x0135; # small j with circumflex
 848 $chars{0xc36e}=0x00f1; # small n with tilde
 849 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 850 $chars{0xc373}=0x015d; # small s with circumflex
 851 $chars{0xc375}=0x00fb; # small u with circumflex
 852 $chars{0xc377}=0x0175; # small w with circumflex
 853 $chars{0xc379}=0x0177; # small y with circumflex
 854 $chars{0xc37a}=0x1e91; # small z with circumflex
 855         # 4/4 tilde
 856 $chars{0xc441}=0x00c3; # capital a with tilde
 857 $chars{0xc445}=0x1ebc; # capital e with tilde
 858 $chars{0xc449}=0x0128; # capital i with tilde
 859 $chars{0xc44e}=0x00d1; # capital n with tilde
 860 $chars{0xc44f}=0x00d5; # capital o with tilde
 861 $chars{0xc455}=0x0168; # capital u with tilde
 862 $chars{0xc456}=0x1e7c; # capital v with tilde
 863 $chars{0xc459}=0x1ef8; # capital y with tilde
 864 $chars{0xc461}=0x00e3; # small a with tilde
 865 $chars{0xc465}=0x1ebd; # small e with tilde
 866 $chars{0xc469}=0x0129; # small i with tilde
 867 $chars{0xc46e}=0x00f1; # small n with tilde
 868 $chars{0xc46f}=0x00f5; # small o with tilde
 869 $chars{0xc475}=0x0169; # small u with tilde
 870 $chars{0xc476}=0x1e7d; # small v with tilde
 871 $chars{0xc479}=0x1ef9; # small y with tilde
 872     # 4/5 macron
 873 $chars{0xc541}=0x0100; # capital a with macron
 874 $chars{0xc545}=0x0112; # capital e with macron
 875 $chars{0xc547}=0x1e20; # capital g with macron
 876 $chars{0xc549}=0x012a; # capital i with macron
 877 $chars{0xc54f}=0x014c; # capital o with macron
 878 $chars{0xc555}=0x016a; # capital u with macron
 879 $chars{0xc561}=0x0101; # small a with macron
 880 $chars{0xc565}=0x0113; # small e with macron
 881 $chars{0xc567}=0x1e21; # small g with macron
 882 $chars{0xc569}=0x012b; # small i with macron
 883 $chars{0xc56f}=0x014d; # small o with macron
 884 $chars{0xc575}=0x016b; # small u with macron
 885 $chars{0xc572}=0x0159; # small r with macron
 886 $chars{0xc5e1}=0x01e2; # capital ae with macron
 887 $chars{0xc5f1}=0x01e3; # small ae with macron
 888         # 4/6 breve
 889 $chars{0xc641}=0x0102; # capital a with breve
 890 $chars{0xc645}=0x0114; # capital e with breve
 891 $chars{0xc647}=0x011e; # capital g with breve
 892 $chars{0xc649}=0x012c; # capital i with breve
 893 $chars{0xc64f}=0x014e; # capital o with breve
 894 $chars{0xc655}=0x016c; # capital u with breve
 895 $chars{0xc661}=0x0103; # small a with breve
 896 $chars{0xc665}=0x0115; # small e with breve
 897 $chars{0xc667}=0x011f; # small g with breve
 898 $chars{0xc669}=0x012d; # small i with breve
 899 $chars{0xc66f}=0x014f; # small o with breve
 900 $chars{0xc675}=0x016d; # small u with breve
 901         # 4/7 dot above
 902 $chars{0xc7b0}=0x01e1; # Ain with dot above
 903 $chars{0xc742}=0x1e02; # capital b with dot above
 904 $chars{0xc743}=0x010a; # capital c with dot above
 905 $chars{0xc744}=0x1e0a; # capital d with dot above
 906 $chars{0xc745}=0x0116; # capital e with dot above
 907 $chars{0xc746}=0x1e1e; # capital f with dot above
 908 $chars{0xc747}=0x0120; # capital g with dot above
 909 $chars{0xc748}=0x1e22; # capital h with dot above
 910 $chars{0xc749}=0x0130; # capital i with dot above
 911 $chars{0xc74d}=0x1e40; # capital m with dot above
 912 $chars{0xc74e}=0x1e44; # capital n with dot above
 913 $chars{0xc750}=0x1e56; # capital p with dot above
 914 $chars{0xc752}=0x1e58; # capital r with dot above
 915 $chars{0xc753}=0x1e60; # capital s with dot above
 916 $chars{0xc754}=0x1e6a; # capital t with dot above
 917 $chars{0xc757}=0x1e86; # capital w with dot above
 918 $chars{0xc758}=0x1e8a; # capital x with dot above
 919 $chars{0xc759}=0x1e8e; # capital y with dot above
 920 $chars{0xc75a}=0x017b; # capital z with dot above
 921 $chars{0xc761}=0x0227; # small b with dot above
 922 $chars{0xc762}=0x1e03; # small b with dot above
 923 $chars{0xc763}=0x010b; # small c with dot above
 924 $chars{0xc764}=0x1e0b; # small d with dot above
 925 $chars{0xc765}=0x0117; # small e with dot above
 926 $chars{0xc766}=0x1e1f; # small f with dot above
 927 $chars{0xc767}=0x0121; # small g with dot above
 928 $chars{0xc768}=0x1e23; # small h with dot above
 929 $chars{0xc76d}=0x1e41; # small m with dot above
 930 $chars{0xc76e}=0x1e45; # small n with dot above
 931 $chars{0xc770}=0x1e57; # small p with dot above
 932 $chars{0xc772}=0x1e59; # small r with dot above
 933 $chars{0xc773}=0x1e61; # small s with dot above
 934 $chars{0xc774}=0x1e6b; # small t with dot above
 935 $chars{0xc777}=0x1e87; # small w with dot above
 936 $chars{0xc778}=0x1e8b; # small x with dot above
 937 $chars{0xc779}=0x1e8f; # small y with dot above
 938 $chars{0xc77a}=0x017c; # small z with dot above
 939         # 4/8 trema, diaresis
 940 $chars{0xc820}=0x00a8; # diaeresis
 941 $chars{0xc841}=0x00c4; # capital a with diaeresis
 942 $chars{0xc845}=0x00cb; # capital e with diaeresis
 943 $chars{0xc848}=0x1e26; # capital h with diaeresis
 944 $chars{0xc849}=0x00cf; # capital i with diaeresis
 945 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 946 $chars{0xc855}=0x00dc; # capital u with diaeresis
 947 $chars{0xc857}=0x1e84; # capital w with diaeresis
 948 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 949 $chars{0xc859}=0x0178; # capital y with diaeresis
 950 $chars{0xc861}=0x00e4; # small a with diaeresis
 951 $chars{0xc865}=0x00eb; # small e with diaeresis
 952 $chars{0xc868}=0x1e27; # small h with diaeresis
 953 $chars{0xc869}=0x00ef; # small i with diaeresis
 954 $chars{0xc86f}=0x00f6; # small o with diaeresis
 955 $chars{0xc874}=0x1e97; # small t with diaeresis
 956 $chars{0xc875}=0x00fc; # small u with diaeresis
 957 $chars{0xc877}=0x1e85; # small w with diaeresis
 958 $chars{0xc878}=0x1e8d; # small x with diaeresis
 959 $chars{0xc879}=0x00ff; # small y with diaeresis
 960         # 4/9 umlaut
 961 $chars{0xc920}=0x00a8; # [diaeresis]
 962 $chars{0xc961}=0x00e4; # a with umlaut
 963 $chars{0xc965}=0x00eb; # e with umlaut
 964 $chars{0xc969}=0x00ef; # i with umlaut
 965 $chars{0xc96f}=0x00f6; # o with umlaut
 966 $chars{0xc975}=0x00fc; # u with umlaut
 967         # 4/10 circle above
 968 $chars{0xca41}=0x00c5; # capital a with ring above
 969 $chars{0xcaad}=0x016e; # capital u with ring above
 970 $chars{0xca61}=0x00e5; # small a with ring above
 971 $chars{0xca75}=0x016f; # small u with ring above
 972 $chars{0xca77}=0x1e98; # small w with ring above
 973 $chars{0xca79}=0x1e99; # small y with ring above
 974         # 4/11 high comma off centre
 975         # 4/12 inverted high comma centred
 976         # 4/13 double acute accent
 977 $chars{0xcd4f}=0x0150; # capital o with double acute
 978 $chars{0xcd55}=0x0170; # capital u with double acute
 979 $chars{0xcd6f}=0x0151; # small o with double acute
 980 $chars{0xcd75}=0x0171; # small u with double acute
 981         # 4/14 horn
 982 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 983 $chars{0xce55}=0x01af; # latin capital letter u with horn
 984 $chars{0xce74}=0x01a1; # latin small letter o with horn
 985 $chars{0xce75}=0x01b0; # latin small letter u with horn
 986         # 4/15 caron (hacek
 987 $chars{0xcf41}=0x01cd; # capital a with caron
 988 $chars{0xcf43}=0x010c; # capital c with caron
 989 $chars{0xcf44}=0x010e; # capital d with caron
 990 $chars{0xcf45}=0x011a; # capital e with caron
 991 $chars{0xcf47}=0x01e6; # capital g with caron
 992 $chars{0xcf49}=0x01cf; # capital i with caron
 993 $chars{0xcf4b}=0x01e8; # capital k with caron
 994 $chars{0xcf4c}=0x013d; # capital l with caron
 995 $chars{0xcf4e}=0x0147; # capital n with caron
 996 $chars{0xcf4f}=0x01d1; # capital o with caron
 997 $chars{0xcf52}=0x0158; # capital r with caron
 998 $chars{0xcf53}=0x0160; # capital s with caron
 999 $chars{0xcf54}=0x0164; # capital t with caron
1000 $chars{0xcf55}=0x01d3; # capital u with caron
1001 $chars{0xcf5a}=0x017d; # capital z with caron
1002 $chars{0xcf61}=0x01ce; # small a with caron
1003 $chars{0xcf63}=0x010d; # small c with caron
1004 $chars{0xcf64}=0x010f; # small d with caron
1005 $chars{0xcf65}=0x011b; # small e with caron
1006 $chars{0xcf67}=0x01e7; # small g with caron
1007 $chars{0xcf69}=0x01d0; # small i with caron
1008 $chars{0xcf6a}=0x01f0; # small j with caron
1009 $chars{0xcf6b}=0x01e9; # small k with caron
1010 $chars{0xcf6c}=0x013e; # small l with caron
1011 $chars{0xcf6e}=0x0148; # small n with caron
1012 $chars{0xcf6f}=0x01d2; # small o with caron
1013 $chars{0xcf72}=0x0159; # small r with caron
1014 $chars{0xcf73}=0x0161; # small s with caron
1015 $chars{0xcf74}=0x0165; # small t with caron
1016 $chars{0xcf75}=0x01d4; # small u with caron
1017 $chars{0xcf7a}=0x017e; # small z with caron
1018         # 5/0 cedilla
1019 $chars{0xd020}=0x00b8; # cedilla
1020 $chars{0xd043}=0x00c7; # capital c with cedilla
1021 $chars{0xd044}=0x1e10; # capital d with cedilla
1022 $chars{0xd047}=0x0122; # capital g with cedilla
1023 $chars{0xd048}=0x1e28; # capital h with cedilla
1024 $chars{0xd04b}=0x0136; # capital k with cedilla
1025 $chars{0xd04c}=0x013b; # capital l with cedilla
1026 $chars{0xd04e}=0x0145; # capital n with cedilla
1027 $chars{0xd052}=0x0156; # capital r with cedilla
1028 $chars{0xd053}=0x015e; # capital s with cedilla
1029 $chars{0xd054}=0x0162; # capital t with cedilla
1030 $chars{0xd063}=0x00e7; # small c with cedilla
1031 $chars{0xd064}=0x1e11; # small d with cedilla
1032 $chars{0xd065}=0x0119; # small e with cedilla
1033 $chars{0xd067}=0x0123; # small g with cedilla
1034 $chars{0xd068}=0x1e29; # small h with cedilla
1035 $chars{0xd06b}=0x0137; # small k with cedilla
1036 $chars{0xd06c}=0x013c; # small l with cedilla
1037 $chars{0xd06e}=0x0146; # small n with cedilla
1038 $chars{0xd072}=0x0157; # small r with cedilla
1039 $chars{0xd073}=0x015f; # small s with cedilla
1040 $chars{0xd074}=0x0163; # small t with cedilla
1041         # 5/1 rude
1042         # 5/2 hook to left
1043         # 5/3 ogonek (hook to right
1044 $chars{0xd320}=0x02db; # ogonek
1045 $chars{0xd341}=0x0104; # capital a with ogonek
1046 $chars{0xd345}=0x0118; # capital e with ogonek
1047 $chars{0xd349}=0x012e; # capital i with ogonek
1048 $chars{0xd34f}=0x01ea; # capital o with ogonek
1049 $chars{0xd355}=0x0172; # capital u with ogonek
1050 $chars{0xd361}=0x0105; # small a with ogonek
1051 $chars{0xd365}=0x0119; # small e with ogonek
1052 $chars{0xd369}=0x012f; # small i with ogonek
1053 $chars{0xd36f}=0x01eb; # small o with ogonek
1054 $chars{0xd375}=0x0173; # small u with ogonek
1055         # 5/4 circle below
1056 $chars{0xd441}=0x1e00; # capital a with ring below
1057 $chars{0xd461}=0x1e01; # small a with ring below
1058         # 5/5 half circle below
1059 $chars{0xf948}=0x1e2a; # capital h with breve below
1060 $chars{0xf968}=0x1e2b; # small h with breve below
1061         # 5/6 dot below
1062 $chars{0xd641}=0x1ea0; # capital a with dot below
1063 $chars{0xd642}=0x1e04; # capital b with dot below
1064 $chars{0xd644}=0x1e0c; # capital d with dot below
1065 $chars{0xd645}=0x1eb8; # capital e with dot below
1066 $chars{0xd648}=0x1e24; # capital h with dot below
1067 $chars{0xd649}=0x1eca; # capital i with dot below
1068 $chars{0xd64b}=0x1e32; # capital k with dot below
1069 $chars{0xd64c}=0x1e36; # capital l with dot below
1070 $chars{0xd64d}=0x1e42; # capital m with dot below
1071 $chars{0xd64e}=0x1e46; # capital n with dot below
1072 $chars{0xd64f}=0x1ecc; # capital o with dot below
1073 $chars{0xd652}=0x1e5a; # capital r with dot below
1074 $chars{0xd653}=0x1e62; # capital s with dot below
1075 $chars{0xd654}=0x1e6c; # capital t with dot below
1076 $chars{0xd655}=0x1ee4; # capital u with dot below
1077 $chars{0xd656}=0x1e7e; # capital v with dot below
1078 $chars{0xd657}=0x1e88; # capital w with dot below
1079 $chars{0xd659}=0x1ef4; # capital y with dot below
1080 $chars{0xd65a}=0x1e92; # capital z with dot below
1081 $chars{0xd661}=0x1ea1; # small a with dot below
1082 $chars{0xd662}=0x1e05; # small b with dot below
1083 $chars{0xd664}=0x1e0d; # small d with dot below
1084 $chars{0xd665}=0x1eb9; # small e with dot below
1085 $chars{0xd668}=0x1e25; # small h with dot below
1086 $chars{0xd669}=0x1ecb; # small i with dot below
1087 $chars{0xd66b}=0x1e33; # small k with dot below
1088 $chars{0xd66c}=0x1e37; # small l with dot below
1089 $chars{0xd66d}=0x1e43; # small m with dot below
1090 $chars{0xd66e}=0x1e47; # small n with dot below
1091 $chars{0xd66f}=0x1ecd; # small o with dot below
1092 $chars{0xd672}=0x1e5b; # small r with dot below
1093 $chars{0xd673}=0x1e63; # small s with dot below
1094 $chars{0xd674}=0x1e6d; # small t with dot below
1095 $chars{0xd675}=0x1ee5; # small u with dot below
1096 $chars{0xd676}=0x1e7f; # small v with dot below
1097 $chars{0xd677}=0x1e89; # small w with dot below
1098 $chars{0xd679}=0x1ef5; # small y with dot below
1099 $chars{0xd67a}=0x1e93; # small z with dot below
1100         # 5/7 double dot below
1101 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1102 $chars{0xd775}=0x1e73; # small u with diaeresis below
1103         # 5/8 underline
1104 $chars{0xd820}=0x005f; # underline
1105         # 5/9 double underline
1106 $chars{0xd920}=0x2017; # double underline
1107         # 5/10 small low vertical bar
1108 $chars{0xda20}=0x02cc; #
1109         # 5/11 circumflex below
1110         # 5/12 (this position shall not be used)
1111         # 5/13 left half of ligature sign and of double tilde
1112         # 5/14 right half of ligature sign
1113         # 5/15 right half of double tilde
1114 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1115
1116 sub char_decode5426 {
1117     my ( $string) = @_;
1118     my $result;
1119
1120     my @data = unpack("C*", $string);
1121     my @characters;
1122     my $length=scalar(@data);
1123     for (my $i = 0; $i < scalar(@data); $i++) {
1124       my $char= $data[$i];
1125       if ($char >= 0x00 && $char <= 0x7F){
1126         #IsAscii
1127
1128           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1129       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1130         #Combined Char
1131         my $convchar ;
1132         if ($chars{$char*256+$data[$i+1]}) {
1133           $convchar= $chars{$char * 256 + $data[$i+1]};
1134           $i++;
1135 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1136         } elsif ($chars{$char})  {
1137           $convchar= $chars{$char};
1138 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1139         }else {
1140           $convchar=$char;
1141         }
1142         push @characters,$convchar;
1143       } else {
1144         my $convchar;
1145         if ($chars{$char})  {
1146           $convchar= $chars{$char};
1147 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1148         }else {
1149 #            printf "char %x $char\n",$char;
1150           $convchar=$char;
1151         }
1152         push @characters,$convchar;
1153       }
1154     }
1155     $result=pack "U*",@characters;
1156 #     $result=~s/\x01//;
1157 #     $result=~s/\x00//;
1158      $result=~s/\x0f//;
1159      $result=~s/\x1b.//;
1160      $result=~s/\x0e//;
1161      $result=~s/\x1b\x5b//;
1162 #   map{printf "%x",$_} @characters;
1163 #   printf "\n";
1164   return $result;
1165 }
1166
1167 1;
1168
1169
1170 =head1 AUTHOR
1171
1172 Koha Development Team <http://koha-community.org/>
1173
1174 Galen Charlton <galen.charlton@liblime.com>
1175
1176 =cut