C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along with
  17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  18 # Suite 330, Boston, MA  02111-1307 USA
  19
  20 use strict;
  21 use MARC::Charset qw/marc8_to_utf8/;
  22 use Text::Iconv;
  23 use C4::Debug;
  24 use Unicode::Normalize;
  25
  26 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  27
  28 BEGIN {
  29     # set the version for version checking
  30     $VERSION = 3.01;
  31     require Exporter;
  32     @ISA    = qw(Exporter);
  33     @EXPORT = qw(
  34         IsStringUTF8ish
  35         MarcToUTF8Record
  36         SetUTF8Flag
  37         SetMarcUnicodeFlag
  38         StripNonXmlChars
  39     );
  40 }
  41
  42 =head1 NAME
  43
  44 C4::Charset - utilities for handling character set conversions.
  45
  46 =head1 SYNOPSIS
  47
  48 use C4::Charset;
  49
  50 =head1 DESCRIPTION
  51
  52 This module contains routines for dealing with character set
  53 conversions, particularly for MARC records.
  54
  55 A variety of character encodings are in use by various MARC
  56 standards, and even more character encodings are used by
  57 non-standard MARC records.  The various MARC formats generally
  58 do not do a good job of advertising a given record's character
  59 encoding, and even when a record does advertise its encoding,
  60 e.g., via the Leader/09, experience has shown that one cannot
  61 trust it.
  62
  63 Ultimately, all MARC records are stored in Koha in UTF-8 and
  64 must be converted from whatever the source character encoding is.
  65 The goal of this module is to ensure that these conversions
  66 take place accurately.  When a character conversion cannot take
  67 place, or at least not accurately, the module was provide
  68 enough information to allow user-facing code to inform the user
  69 on how to deal with the situation.
  70
  71 =cut
  72
  73 =head1 FUNCTIONS
  74
  75 =head2 IsStringUTF8ish
  76
  77 =over 4
  78
  79 my $is_utf8 = IsStringUTF8ish($str);
  80
  81 =back
  82
  83 Determines if C<$str> is valid UTF-8.  This can mean
  84 one of two things:
  85
  86 =over 2
  87
  88 =item *
  89
  90 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  91
  92 =item *
  93
  94 The Perl UTF-8 flag is B<not> set, but the octets contain
  95 valid UTF-8.
  96
  97 =back
  98
  99 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 100 because in one could be presented with a MARC blob that is
 101 not actually in UTF-8 but whose sequence of octets appears to be
 102 valid UTF-8.  The rest of the MARC character conversion functions
 103 will assume that this situation occur does not very often.
 104
 105 =cut
 106
 107 sub IsStringUTF8ish {
 108     my $str = shift;
 109
 110     return 1 if utf8::is_utf8($str);
 111     return utf8::decode($str);
 112 }
 113
 114 =head2 SetUTF8Flag
 115
 116 =over 4
 117
 118 my $marc_record = SetUTF8Flag($marc_record);
 119
 120 =back
 121
 122 This function sets the PERL UTF8 flag for data.
 123 It is required when using new_from_usmarc
 124 since MARC::File::USMARC does not handle PERL UTF8 setting.
 125 When editing unicode marc records fields and subfields, you
 126 would end up in double encoding without using this function.
 127
 128 FIXME
 129 In my opinion, this function belongs to MARC::Record and not
 130 to this package.
 131 But since it handles charset, and MARC::Record, it finds its way in that package
 132
 133 =cut
 134
 135 sub SetUTF8Flag{
 136         my ($record)=@_;
 137         return unless ($record && $record->fields());
 138         foreach my $field ($record->fields()){
 139                 if ($field->tag()>=10){
 140                         my @subfields;
 141                         foreach my $subfield ($field->subfields()){
 142                                 push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
 143                         }
 144                         my $newfield=MARC::Field->new(
 145                                                         $field->tag(),
 146                                                         $field->indicator(1),
 147                                                         $field->indicator(2),
 148                                                         @subfields
 149                                                 );
 150                         $field->replace_with($newfield);
 151                 }
 152         }
 153 }
 154
 155 =head2 NormalizeString
 156
 157 =over 4
 158
 159     my $normalized_string=NormalizeString($string);
 160
 161 =back
 162         Given
 163             a string
 164         nfc : If you want to set NFC and not NFD
 165         transform : If you expect all the signs to be removed
 166     Sets the PERL UTF8 Flag on your initial data if need be
 167     and applies cleaning if required
 168
 169         Returns a utf8 NFD normalized string
 170
 171         Sample code :
 172         my $string=NormalizeString ("l'ornithoptère");
 173     #results into ornithoptère in NFD form and sets UTF8 Flag
 174 =cut
 175
 176 sub NormalizeString{
 177         my ($string,$nfc,$transform)=@_;
 178         utf8::decode($string) unless (utf8::is_utf8($string));
 179         if ($nfc){
 180                 $string= NFD($string);
 181         }
 182         else {
 183                 $string=NFC($string);
 184         }
 185         if ($transform){
 186     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 187         #removing one letter words "d'" "l'"  was changed into "d " "l "
 188     $string=~s/\b\S\b//g;
 189     $string=~s/\s+$//g;
 190         }
 191     return $string;
 192 }
 193
 194 =head2 MarcToUTF8Record
 195
 196 =over 4
 197
 198 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
 199
 200 =back
 201
 202 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 203 optional source encoding, return a C<MARC::Record> that is
 204 converted to UTF-8.
 205
 206 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 207 is not guaranteed to have been converted correctly.  Specifically,
 208 if C<$converted_from> is 'failed', the MARC record returned failed
 209 character conversion and had each of its non-ASCII octets changed
 210 to the Unicode replacement character.
 211
 212 If the source encoding was not specified, this routine will
 213 try to guess it; the character encoding used for a successful
 214 conversion is returned in C<$converted_from>.
 215
 216 =cut
 217
 218 sub MarcToUTF8Record {
 219     my $marc = shift;
 220     my $marc_flavour = shift;
 221     my $source_encoding = shift;
 222
 223     my $marc_record;
 224     my $marc_blob_is_utf8 = 0;
 225     if (ref($marc) eq 'MARC::Record') {
 226         my $marc_blob = $marc->as_usmarc();
 227         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 228         $marc_record = $marc;
 229     } else {
 230         # dealing with a MARC blob
 231
 232         # remove any ersatz whitespace from the beginning and
 233         # end of the MARC blob -- these can creep into MARC
 234         # files produced by several sources -- caller really
 235         # should be doing this, however
 236         $marc =~ s/^\s+//;
 237         $marc =~ s/\s+$//;
 238         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 239         $marc_record = MARC::Record->new_from_usmarc($marc);
 240     }
 241
 242     # If we do not know the source encoding, try some guesses
 243     # as follows:
 244     #   1. Record is UTF-8 already.
 245     #   2. If MARC flavor is MARC21, then
 246     #      a. record is MARC-8
 247     #      b. record is ISO-8859-1
 248     #   3. If MARC flavor is UNIMARC, then
 249     if (not defined $source_encoding) {
 250         if ($marc_blob_is_utf8) {
 251             # note that for MARC21 we are not bothering to check
 252             # if the Leader/09 is set to 'a' or not -- because
 253             # of problems with various ILSs (including Koha in the
 254             # past, alas), this just is not trustworthy.
 255             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 256             return $marc_record, 'UTF-8', [];
 257         } else {
 258             if ($marc_flavour eq 'MARC21') {
 259                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 260             } elsif ($marc_flavour =~/UNIMARC/) {
 261                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 262             } else {
 263                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 264             }
 265         }
 266     } else {
 267         # caller knows the character encoding
 268         my $original_marc_record = $marc_record->clone();
 269         my @errors;
 270         if ($source_encoding =~ /utf-?8/i) {
 271             if ($marc_blob_is_utf8) {
 272                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 273                 return $marc_record, 'UTF-8', [];
 274             } else {
 275                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 276             }
 277         } elsif ($source_encoding =~ /marc-?8/i) {
 278             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 279         } elsif ($source_encoding =~ /5426/) {
 280             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 281         } else {
 282             # assume any other character encoding is for Text::Iconv
 283             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
 284         }
 285
 286         if (@errors) {
 287             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 288             return $original_marc_record, 'failed', \@errors;
 289         } else {
 290             return $marc_record, $source_encoding, [];
 291         }
 292     }
 293
 294 }
 295
 296 =head2 SetMarcUnicodeFlag
 297
 298 =over 4
 299
 300 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 301
 302 =back
 303
 304 Set both the internal MARC::Record encoding flag
 305 and the appropriate Leader/09 (MARC21) or
 306 100/26-29 (UNIMARC) to indicate that the record
 307 is in UTF-8.  Note that this does B<not> do
 308 any actual character conversion.
 309
 310 =cut
 311
 312 sub SetMarcUnicodeFlag {
 313     my $marc_record = shift;
 314     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 315
 316     $marc_record->encoding('UTF-8');
 317     if ($marc_flavour eq 'MARC21') {
 318         my $leader = $marc_record->leader();
 319         substr($leader, 9, 1) = 'a';
 320         $marc_record->leader($leader);
 321     } elsif ($marc_flavour =~/UNIMARC/) {
 322         my $string;
 323                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
 324                 $string=$marc_record->subfield( 100, "a" );
 325         if (length($string)==$subflength) {
 326                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 327         }
 328         else {
 329             $string = POSIX::strftime( "%Y%m%d", localtime );
 330             $string =~ s/\-//g;
 331             $string = sprintf( "%-*s", $subflength, $string );
 332         }
 333         substr( $string, $encodingposition, 8, "frey50  " );
 334         if ( $marc_record->subfield( 100, "a" ) ) {
 335                         $marc_record->field('100')->update(a=>$string);
 336                 }
 337                 else {
 338             $marc_record->insert_grouped_field(
 339                 MARC::Field->new( 100, '', '', "a" => $string ) );
 340         }
 341                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
 342     } else {
 343         warn "Unrecognized marcflavour: $marc_flavour";
 344     }
 345 }
 346
 347 =head2 StripNonXmlChars
 348
 349 =over 4
 350
 351 my $new_str = StripNonXmlChars($old_str);
 352
 353 =back
 354
 355 Given a string, return a copy with the
 356 characters that are illegal in XML
 357 removed.
 358
 359 This function exists to work around a problem
 360 that can occur with badly-encoded MARC records.
 361 Specifically, if a UTF-8 MARC record also
 362 has excape (\x1b) characters, MARC::File::XML
 363 will let the escape characters pass through
 364 when as_xml() or as_xml_record() is called.  The
 365 problem is that the escape character is not
 366 legal in well-formed XML documents, so when
 367 MARC::File::XML attempts to parse such a record,
 368 the XML parser will fail.
 369
 370 Stripping such characters will allow a
 371 MARC::Record->new_from_xml()
 372 to work, at the possible risk of some data loss.
 373
 374 =cut
 375
 376 sub StripNonXmlChars {
 377     my $str = shift;
 378     if (!defined($str) || $str eq ""){
 379         return "";
 380     }
 381     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 382     return $str;
 383 }
 384
 385 =head1 INTERNAL FUNCTIONS
 386
 387 =head2 _default_marc21_charconv_to_utf8
 388
 389 =over 4
 390
 391 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 392
 393 =back
 394
 395 Converts a C<MARC::Record> of unknown character set to UTF-8,
 396 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 397 to UTF-8, then a default conversion that replaces each non-ASCII
 398 character with the replacement character.
 399
 400 The C<$guessed_charset> return value contains the character set
 401 that resulted in a conversion to valid UTF-8; note that
 402 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 403 this is 'failed'.
 404
 405 =cut
 406
 407 sub _default_marc21_charconv_to_utf8 {
 408     my $marc_record = shift;
 409     my $marc_flavour = shift;
 410
 411     my $trial_marc8 = $marc_record->clone();
 412     my @all_errors = ();
 413     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 414     unless (@errors) {
 415         return $trial_marc8, 'MARC-8', [];
 416     }
 417     push @all_errors, @errors;
 418
 419     my $trial_8859_1 = $marc_record->clone();
 420     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 421     unless (@errors) {
 422         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 423                                                 # instead if we wanted to report details
 424                                                 # of the failed attempt at MARC-8 => UTF-8
 425     }
 426     push @all_errors, @errors;
 427
 428     my $default_converted = $marc_record->clone();
 429     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 430     return $default_converted, 'failed', \@all_errors;
 431 }
 432
 433 =head2 _default_unimarc_charconv_to_utf8
 434
 435 =over 4
 436
 437 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 438
 439 =back
 440
 441 Converts a C<MARC::Record> of unknown character set to UTF-8,
 442 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 443 to UTF-8, then a default conversion that replaces each non-ASCII
 444 character with the replacement character.
 445
 446 The C<$guessed_charset> return value contains the character set
 447 that resulted in a conversion to valid UTF-8; note that
 448 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 449 this is 'failed'.
 450
 451 =cut
 452
 453 sub _default_unimarc_charconv_to_utf8 {
 454     my $marc_record = shift;
 455     my $marc_flavour = shift;
 456
 457     my $trial_marc8 = $marc_record->clone();
 458     my @all_errors = ();
 459     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 460     unless (@errors) {
 461         return $trial_marc8, 'iso-5426';
 462     }
 463     push @all_errors, @errors;
 464
 465     my $trial_8859_1 = $marc_record->clone();
 466     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 467     unless (@errors) {
 468         return $trial_8859_1, 'iso-8859-1';
 469     }
 470     push @all_errors, @errors;
 471
 472     my $default_converted = $marc_record->clone();
 473     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 474     return $default_converted, 'failed', \@all_errors;
 475 }
 476
 477 =head2 _marc_marc8_to_utf8
 478
 479 =over 4
 480
 481 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 482
 483 =back
 484
 485 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 486 If the conversion fails for some reason, an
 487 appropriate messages will be placed in the returned
 488 C<@errors> array.
 489
 490 =cut
 491
 492 sub _marc_marc8_to_utf8 {
 493     my $marc_record = shift;
 494     my $marc_flavour = shift;
 495
 496     my $prev_ignore = MARC::Charset->ignore_errors();
 497     MARC::Charset->ignore_errors(1);
 498
 499     # trap warnings raised by MARC::Charset
 500     my @errors = ();
 501     local $SIG{__WARN__} = sub {
 502         my $msg = $_[0];
 503         if ($msg =~ /MARC.Charset/) {
 504             # FIXME - purpose of this regexp is to strip out the
 505             # line reference to MARC/Charset.pm, but as it
 506             # exists probably won't work quite on Windows --
 507             # some sort of minimal-bunch back-tracking RE
 508             # would be helpful here
 509             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 510             push @errors, $msg;
 511         } else {
 512             # if warning doesn't come from MARC::Charset, just
 513             # pass it on
 514             warn $msg;
 515         }
 516     };
 517
 518     foreach my $field ($marc_record->fields()) {
 519         if ($field->is_control_field()) {
 520             ; # do nothing -- control fields should not contain non-ASCII characters
 521         } else {
 522             my @converted_subfields;
 523             foreach my $subfield ($field->subfields()) {
 524                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 525                 unless (IsStringUTF8ish($utf8sf)) {
 526                     # Because of a bug in MARC::Charset 0.98, if the string
 527                     # has (a) one or more diacritics that (b) are only in character positions
 528                     # 128 to 255 inclusive, the resulting converted string is not in
 529                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 530                     # occurs, upgrade the string in place.  Moral of the story seems to be
 531                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 532                     # that the resulting string is UTF-8.
 533                     utf8::upgrade($utf8sf);
 534                 }
 535                 push @converted_subfields, $subfield->[0], $utf8sf;
 536             }
 537
 538             $field->replace_with(MARC::Field->new(
 539                 $field->tag(), $field->indicator(1), $field->indicator(2),
 540                 @converted_subfields)
 541             );
 542         }
 543     }
 544
 545     MARC::Charset->ignore_errors($prev_ignore);
 546
 547     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 548
 549     return @errors;
 550 }
 551
 552 =head2 _marc_iso5426_to_utf8
 553
 554 =over 4
 555
 556 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 557
 558 =back
 559
 560 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 561 If the conversion fails for some reason, an
 562 appropriate messages will be placed in the returned
 563 C<@errors> array.
 564
 565 FIXME - is ISO-5426 equivalent enough to MARC-8
 566 that C<MARC::Charset> can be used instead?
 567
 568 =cut
 569
 570 sub _marc_iso5426_to_utf8 {
 571     my $marc_record = shift;
 572     my $marc_flavour = shift;
 573
 574     my @errors = ();
 575
 576     foreach my $field ($marc_record->fields()) {
 577         if ($field->is_control_field()) {
 578             ; # do nothing -- control fields should not contain non-ASCII characters
 579         } else {
 580             my @converted_subfields;
 581             foreach my $subfield ($field->subfields()) {
 582                 my $utf8sf = char_decode5426($subfield->[1]);
 583                 push @converted_subfields, $subfield->[0], $utf8sf;
 584             }
 585
 586             $field->replace_with(MARC::Field->new(
 587                 $field->tag(), $field->indicator(1), $field->indicator(2),
 588                 @converted_subfields)
 589             );
 590         }
 591     }
 592
 593     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 594
 595     return @errors;
 596 }
 597
 598 =head2 _marc_to_utf8_via_text_iconv
 599
 600 =over 4
 601
 602 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 603
 604 =back
 605
 606 Convert a C<MARC::Record> to UTF-8 in-place using the
 607 C<Text::Iconv> CPAN module.  Any source encoding accepted
 608 by the user's iconv installation should work.  If
 609 the source encoding is not recognized on the user's
 610 server or the conversion fails for some reason,
 611 appropriate messages will be placed in the returned
 612 C<@errors> array.
 613
 614 =cut
 615
 616 sub _marc_to_utf8_via_text_iconv {
 617     my $marc_record = shift;
 618     my $marc_flavour = shift;
 619     my $source_encoding = shift;
 620
 621     my @errors = ();
 622     my $decoder;
 623     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 624     if ($@) {
 625         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 626         return @errors;
 627     }
 628
 629     my $prev_raise_error = Text::Iconv->raise_error();
 630     Text::Iconv->raise_error(1);
 631
 632     foreach my $field ($marc_record->fields()) {
 633         if ($field->is_control_field()) {
 634             ; # do nothing -- control fields should not contain non-ASCII characters
 635         } else {
 636             my @converted_subfields;
 637             foreach my $subfield ($field->subfields()) {
 638                 my $converted_value;
 639                 my $conversion_ok = 1;
 640                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 641                 if ($@) {
 642                     $conversion_ok = 0;
 643                     push @errors, $@;
 644                 } elsif (not defined $converted_value) {
 645                     $conversion_ok = 0;
 646                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 647                 }
 648
 649                 if ($conversion_ok) {
 650                     push @converted_subfields, $subfield->[0], $converted_value;
 651                 } else {
 652                     $converted_value = $subfield->[1];
 653                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 654                     push @converted_subfields, $subfield->[0], $converted_value;
 655                 }
 656             }
 657
 658             $field->replace_with(MARC::Field->new(
 659                 $field->tag(), $field->indicator(1), $field->indicator(2),
 660                 @converted_subfields)
 661             );
 662         }
 663     }
 664
 665     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 666     Text::Iconv->raise_error($prev_raise_error);
 667
 668     return @errors;
 669 }
 670
 671 =head2 _marc_to_utf8_replacement_char
 672
 673 =over 4
 674
 675 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 676
 677 =back
 678
 679 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 680 unsatisfactory method of replacing all non-ASCII (e.g.,
 681 where the eight bit is set) octet with the Unicode
 682 replacement character.  This is meant as a last-ditch
 683 method, and would be best used as part of a UI that
 684 lets a cataloguer pick various character conversions
 685 until he or she finds the right one.
 686
 687 =cut
 688
 689 sub _marc_to_utf8_replacement_char {
 690     my $marc_record = shift;
 691     my $marc_flavour = shift;
 692
 693     foreach my $field ($marc_record->fields()) {
 694         if ($field->is_control_field()) {
 695             ; # do nothing -- control fields should not contain non-ASCII characters
 696         } else {
 697             my @converted_subfields;
 698             foreach my $subfield ($field->subfields()) {
 699                 my $value = $subfield->[1];
 700                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 701                 push @converted_subfields, $subfield->[0], $value;
 702             }
 703
 704             $field->replace_with(MARC::Field->new(
 705                 $field->tag(), $field->indicator(1), $field->indicator(2),
 706                 @converted_subfields)
 707             );
 708         }
 709     }
 710
 711     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 712 }
 713
 714 =head2 char_decode5426
 715
 716 =over 4
 717
 718 my $utf8string = char_decode5426($iso_5426_string);
 719
 720 =back
 721
 722 Converts a string from ISO-5426 to UTF-8.
 723
 724 =cut
 725
 726
 727 my %chars;
 728 # $chars{0xb0}=0x0101;#3/0ayn[ain] # commented because it seems to be wrong at use
 729 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 730 #$chars{0xb2}=0x00e0;#'à';
 731 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 732 #$chars{0xb3}=0x00e7;#'ç';
 733 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 734 # $chars{0xb4}='è';
 735 $chars{0xb4}=0x00e8;
 736 # $chars{0xb5}='é';
 737 $chars{0xb5}=0x00e9;
 738 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 739 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 740 $chars{0xfa}=0x0153;#oe
 741 $chars{0x81d1}=0x00b0;
 742
 743 ####
 744 ## combined characters iso5426
 745
 746 $chars{0xc041}=0x1ea2; # capital a with hook above
 747 $chars{0xc045}=0x1eba; # capital e with hook above
 748 $chars{0xc049}=0x1ec8; # capital i with hook above
 749 $chars{0xc04f}=0x1ece; # capital o with hook above
 750 $chars{0xc055}=0x1ee6; # capital u with hook above
 751 $chars{0xc059}=0x1ef6; # capital y with hook above
 752 $chars{0xc061}=0x1ea3; # small a with hook above
 753 $chars{0xc065}=0x1ebb; # small e with hook above
 754 $chars{0xc069}=0x1ec9; # small i with hook above
 755 $chars{0xc06f}=0x1ecf; # small o with hook above
 756 $chars{0xc075}=0x1ee7; # small u with hook above
 757 $chars{0xc079}=0x1ef7; # small y with hook above
 758
 759         # 4/1 grave accent
 760 $chars{0xc141}=0x00c0; # capital a with grave accent
 761 $chars{0xc145}=0x00c8; # capital e with grave accent
 762 $chars{0xc149}=0x00cc; # capital i with grave accent
 763 $chars{0xc14f}=0x00d2; # capital o with grave accent
 764 $chars{0xc155}=0x00d9; # capital u with grave accent
 765 $chars{0xc157}=0x1e80; # capital w with grave
 766 $chars{0xc159}=0x1ef2; # capital y with grave
 767 $chars{0xc161}=0x00e0; # small a with grave accent
 768 $chars{0xc165}=0x00e8; # small e with grave accent
 769 $chars{0xc169}=0x00ec; # small i with grave accent
 770 $chars{0xc16f}=0x00f2; # small o with grave accent
 771 $chars{0xc175}=0x00f9; # small u with grave accent
 772 $chars{0xc177}=0x1e81; # small w with grave
 773 $chars{0xc179}=0x1ef3; # small y with grave
 774         # 4/2 acute accent
 775 $chars{0xc241}=0x00c1; # capital a with acute accent
 776 $chars{0xc243}=0x0106; # capital c with acute accent
 777 $chars{0xc245}=0x00c9; # capital e with acute accent
 778 $chars{0xc247}=0x01f4; # capital g with acute
 779 $chars{0xc249}=0x00cd; # capital i with acute accent
 780 $chars{0xc24b}=0x1e30; # capital k with acute
 781 $chars{0xc24c}=0x0139; # capital l with acute accent
 782 $chars{0xc24d}=0x1e3e; # capital m with acute
 783 $chars{0xc24e}=0x0143; # capital n with acute accent
 784 $chars{0xc24f}=0x00d3; # capital o with acute accent
 785 $chars{0xc250}=0x1e54; # capital p with acute
 786 $chars{0xc252}=0x0154; # capital r with acute accent
 787 $chars{0xc253}=0x015a; # capital s with acute accent
 788 $chars{0xc255}=0x00da; # capital u with acute accent
 789 $chars{0xc257}=0x1e82; # capital w with acute
 790 $chars{0xc259}=0x00dd; # capital y with acute accent
 791 $chars{0xc25a}=0x0179; # capital z with acute accent
 792 $chars{0xc261}=0x00e1; # small a with acute accent
 793 $chars{0xc263}=0x0107; # small c with acute accent
 794 $chars{0xc265}=0x00e9; # small e with acute accent
 795 $chars{0xc267}=0x01f5; # small g with acute
 796 $chars{0xc269}=0x00ed; # small i with acute accent
 797 $chars{0xc26b}=0x1e31; # small k with acute
 798 $chars{0xc26c}=0x013a; # small l with acute accent
 799 $chars{0xc26d}=0x1e3f; # small m with acute
 800 $chars{0xc26e}=0x0144; # small n with acute accent
 801 $chars{0xc26f}=0x00f3; # small o with acute accent
 802 $chars{0xc270}=0x1e55; # small p with acute
 803 $chars{0xc272}=0x0155; # small r with acute accent
 804 $chars{0xc273}=0x015b; # small s with acute accent
 805 $chars{0xc275}=0x00fa; # small u with acute accent
 806 $chars{0xc277}=0x1e83; # small w with acute
 807 $chars{0xc279}=0x00fd; # small y with acute accent
 808 $chars{0xc27a}=0x017a; # small z with acute accent
 809 $chars{0xc2e1}=0x01fc; # capital ae with acute
 810 $chars{0xc2f1}=0x01fd; # small ae with acute
 811        # 4/3 circumflex accent
 812 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 813 $chars{0xc343}=0x0108; # capital c with circumflex
 814 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 815 $chars{0xc347}=0x011c; # capital g with circumflex
 816 $chars{0xc348}=0x0124; # capital h with circumflex
 817 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 818 $chars{0xc34a}=0x0134; # capital j with circumflex
 819 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 820 $chars{0xc353}=0x015c; # capital s with circumflex
 821 $chars{0xc355}=0x00db; # capital u with circumflex
 822 $chars{0xc357}=0x0174; # capital w with circumflex
 823 $chars{0xc359}=0x0176; # capital y with circumflex
 824 $chars{0xc35a}=0x1e90; # capital z with circumflex
 825 $chars{0xc361}=0x00e2; # small a with circumflex accent
 826 $chars{0xc363}=0x0109; # small c with circumflex
 827 $chars{0xc365}=0x00ea; # small e with circumflex accent
 828 $chars{0xc367}=0x011d; # small g with circumflex
 829 $chars{0xc368}=0x0125; # small h with circumflex
 830 $chars{0xc369}=0x00ee; # small i with circumflex accent
 831 $chars{0xc36a}=0x0135; # small j with circumflex
 832 $chars{0xc36e}=0x00f1; # small n with tilde
 833 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 834 $chars{0xc373}=0x015d; # small s with circumflex
 835 $chars{0xc375}=0x00fb; # small u with circumflex
 836 $chars{0xc377}=0x0175; # small w with circumflex
 837 $chars{0xc379}=0x0177; # small y with circumflex
 838 $chars{0xc37a}=0x1e91; # small z with circumflex
 839         # 4/4 tilde
 840 $chars{0xc441}=0x00c3; # capital a with tilde
 841 $chars{0xc445}=0x1ebc; # capital e with tilde
 842 $chars{0xc449}=0x0128; # capital i with tilde
 843 $chars{0xc44e}=0x00d1; # capital n with tilde
 844 $chars{0xc44f}=0x00d5; # capital o with tilde
 845 $chars{0xc455}=0x0168; # capital u with tilde
 846 $chars{0xc456}=0x1e7c; # capital v with tilde
 847 $chars{0xc459}=0x1ef8; # capital y with tilde
 848 $chars{0xc461}=0x00e3; # small a with tilde
 849 $chars{0xc465}=0x1ebd; # small e with tilde
 850 $chars{0xc469}=0x0129; # small i with tilde
 851 $chars{0xc46e}=0x00f1; # small n with tilde
 852 $chars{0xc46f}=0x00f5; # small o with tilde
 853 $chars{0xc475}=0x0169; # small u with tilde
 854 $chars{0xc476}=0x1e7d; # small v with tilde
 855 $chars{0xc479}=0x1ef9; # small y with tilde
 856     # 4/5 macron
 857 $chars{0xc541}=0x0100; # capital a with macron
 858 $chars{0xc545}=0x0112; # capital e with macron
 859 $chars{0xc547}=0x1e20; # capital g with macron
 860 $chars{0xc549}=0x012a; # capital i with macron
 861 $chars{0xc54f}=0x014c; # capital o with macron
 862 $chars{0xc555}=0x016a; # capital u with macron
 863 $chars{0xc561}=0x0101; # small a with macron
 864 $chars{0xc565}=0x0113; # small e with macron
 865 $chars{0xc567}=0x1e21; # small g with macron
 866 $chars{0xc569}=0x012b; # small i with macron
 867 $chars{0xc56f}=0x014d; # small o with macron
 868 $chars{0xc575}=0x016b; # small u with macron
 869 $chars{0xc572}=0x0159; # small r with macron
 870 $chars{0xc5e1}=0x01e2; # capital ae with macron
 871 $chars{0xc5f1}=0x01e3; # small ae with macron
 872         # 4/6 breve
 873 $chars{0xc641}=0x0102; # capital a with breve
 874 $chars{0xc645}=0x0114; # capital e with breve
 875 $chars{0xc647}=0x011e; # capital g with breve
 876 $chars{0xc649}=0x012c; # capital i with breve
 877 $chars{0xc64f}=0x014e; # capital o with breve
 878 $chars{0xc655}=0x016c; # capital u with breve
 879 $chars{0xc661}=0x0103; # small a with breve
 880 $chars{0xc665}=0x0115; # small e with breve
 881 $chars{0xc667}=0x011f; # small g with breve
 882 $chars{0xc669}=0x012d; # small i with breve
 883 $chars{0xc66f}=0x014f; # small o with breve
 884 $chars{0xc675}=0x016d; # small u with breve
 885         # 4/7 dot above
 886 $chars{0xc7b0}=0x01e1; # Ain with dot above
 887 $chars{0xc742}=0x1e02; # capital b with dot above
 888 $chars{0xc743}=0x010a; # capital c with dot above
 889 $chars{0xc744}=0x1e0a; # capital d with dot above
 890 $chars{0xc745}=0x0116; # capital e with dot above
 891 $chars{0xc746}=0x1e1e; # capital f with dot above
 892 $chars{0xc747}=0x0120; # capital g with dot above
 893 $chars{0xc748}=0x1e22; # capital h with dot above
 894 $chars{0xc749}=0x0130; # capital i with dot above
 895 $chars{0xc74d}=0x1e40; # capital m with dot above
 896 $chars{0xc74e}=0x1e44; # capital n with dot above
 897 $chars{0xc750}=0x1e56; # capital p with dot above
 898 $chars{0xc752}=0x1e58; # capital r with dot above
 899 $chars{0xc753}=0x1e60; # capital s with dot above
 900 $chars{0xc754}=0x1e6a; # capital t with dot above
 901 $chars{0xc757}=0x1e86; # capital w with dot above
 902 $chars{0xc758}=0x1e8a; # capital x with dot above
 903 $chars{0xc759}=0x1e8e; # capital y with dot above
 904 $chars{0xc75a}=0x017b; # capital z with dot above
 905 $chars{0xc761}=0x0227; # small b with dot above
 906 $chars{0xc762}=0x1e03; # small b with dot above
 907 $chars{0xc763}=0x010b; # small c with dot above
 908 $chars{0xc764}=0x1e0b; # small d with dot above
 909 $chars{0xc765}=0x0117; # small e with dot above
 910 $chars{0xc766}=0x1e1f; # small f with dot above
 911 $chars{0xc767}=0x0121; # small g with dot above
 912 $chars{0xc768}=0x1e23; # small h with dot above
 913 $chars{0xc76d}=0x1e41; # small m with dot above
 914 $chars{0xc76e}=0x1e45; # small n with dot above
 915 $chars{0xc770}=0x1e57; # small p with dot above
 916 $chars{0xc772}=0x1e59; # small r with dot above
 917 $chars{0xc773}=0x1e61; # small s with dot above
 918 $chars{0xc774}=0x1e6b; # small t with dot above
 919 $chars{0xc777}=0x1e87; # small w with dot above
 920 $chars{0xc778}=0x1e8b; # small x with dot above
 921 $chars{0xc779}=0x1e8f; # small y with dot above
 922 $chars{0xc77a}=0x017c; # small z with dot above
 923         # 4/8 trema, diaresis
 924 $chars{0xc820}=0x00a8; # diaeresis
 925 $chars{0xc841}=0x00c4; # capital a with diaeresis
 926 $chars{0xc845}=0x00cb; # capital e with diaeresis
 927 $chars{0xc848}=0x1e26; # capital h with diaeresis
 928 $chars{0xc849}=0x00cf; # capital i with diaeresis
 929 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 930 $chars{0xc855}=0x00dc; # capital u with diaeresis
 931 $chars{0xc857}=0x1e84; # capital w with diaeresis
 932 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 933 $chars{0xc859}=0x0178; # capital y with diaeresis
 934 $chars{0xc861}=0x00e4; # small a with diaeresis
 935 $chars{0xc865}=0x00eb; # small e with diaeresis
 936 $chars{0xc868}=0x1e27; # small h with diaeresis
 937 $chars{0xc869}=0x00ef; # small i with diaeresis
 938 $chars{0xc86f}=0x00f6; # small o with diaeresis
 939 $chars{0xc874}=0x1e97; # small t with diaeresis
 940 $chars{0xc875}=0x00fc; # small u with diaeresis
 941 $chars{0xc877}=0x1e85; # small w with diaeresis
 942 $chars{0xc878}=0x1e8d; # small x with diaeresis
 943 $chars{0xc879}=0x00ff; # small y with diaeresis
 944         # 4/9 umlaut
 945 $chars{0xc920}=0x00a8; # [diaeresis]
 946 $chars{0xc961}=0x00e4; # a with umlaut
 947 $chars{0xc965}=0x00eb; # e with umlaut
 948 $chars{0xc969}=0x00ef; # i with umlaut
 949 $chars{0xc96f}=0x00f6; # o with umlaut
 950 $chars{0xc975}=0x00fc; # u with umlaut
 951         # 4/10 circle above
 952 $chars{0xca41}=0x00c5; # capital a with ring above
 953 $chars{0xcaad}=0x016e; # capital u with ring above
 954 $chars{0xca61}=0x00e5; # small a with ring above
 955 $chars{0xca75}=0x016f; # small u with ring above
 956 $chars{0xca77}=0x1e98; # small w with ring above
 957 $chars{0xca79}=0x1e99; # small y with ring above
 958         # 4/11 high comma off centre
 959         # 4/12 inverted high comma centred
 960         # 4/13 double acute accent
 961 $chars{0xcd4f}=0x0150; # capital o with double acute
 962 $chars{0xcd55}=0x0170; # capital u with double acute
 963 $chars{0xcd6f}=0x0151; # small o with double acute
 964 $chars{0xcd75}=0x0171; # small u with double acute
 965         # 4/14 horn
 966 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 967 $chars{0xce55}=0x01af; # latin capital letter u with horn
 968 $chars{0xce74}=0x01a1; # latin small letter o with horn
 969 $chars{0xce75}=0x01b0; # latin small letter u with horn
 970         # 4/15 caron (hacek
 971 $chars{0xcf41}=0x01cd; # capital a with caron
 972 $chars{0xcf43}=0x010c; # capital c with caron
 973 $chars{0xcf44}=0x010e; # capital d with caron
 974 $chars{0xcf45}=0x011a; # capital e with caron
 975 $chars{0xcf47}=0x01e6; # capital g with caron
 976 $chars{0xcf49}=0x01cf; # capital i with caron
 977 $chars{0xcf4b}=0x01e8; # capital k with caron
 978 $chars{0xcf4c}=0x013d; # capital l with caron
 979 $chars{0xcf4e}=0x0147; # capital n with caron
 980 $chars{0xcf4f}=0x01d1; # capital o with caron
 981 $chars{0xcf52}=0x0158; # capital r with caron
 982 $chars{0xcf53}=0x0160; # capital s with caron
 983 $chars{0xcf54}=0x0164; # capital t with caron
 984 $chars{0xcf55}=0x01d3; # capital u with caron
 985 $chars{0xcf5a}=0x017d; # capital z with caron
 986 $chars{0xcf61}=0x01ce; # small a with caron
 987 $chars{0xcf63}=0x010d; # small c with caron
 988 $chars{0xcf64}=0x010f; # small d with caron
 989 $chars{0xcf65}=0x011b; # small e with caron
 990 $chars{0xcf67}=0x01e7; # small g with caron
 991 $chars{0xcf69}=0x01d0; # small i with caron
 992 $chars{0xcf6a}=0x01f0; # small j with caron
 993 $chars{0xcf6b}=0x01e9; # small k with caron
 994 $chars{0xcf6c}=0x013e; # small l with caron
 995 $chars{0xcf6e}=0x0148; # small n with caron
 996 $chars{0xcf6f}=0x01d2; # small o with caron
 997 $chars{0xcf72}=0x0159; # small r with caron
 998 $chars{0xcf73}=0x0161; # small s with caron
 999 $chars{0xcf74}=0x0165; # small t with caron
1000 $chars{0xcf75}=0x01d4; # small u with caron
1001 $chars{0xcf7a}=0x017e; # small z with caron
1002         # 5/0 cedilla
1003 $chars{0xd020}=0x00b8; # cedilla
1004 $chars{0xd043}=0x00c7; # capital c with cedilla
1005 $chars{0xd044}=0x1e10; # capital d with cedilla
1006 $chars{0xd047}=0x0122; # capital g with cedilla
1007 $chars{0xd048}=0x1e28; # capital h with cedilla
1008 $chars{0xd04b}=0x0136; # capital k with cedilla
1009 $chars{0xd04c}=0x013b; # capital l with cedilla
1010 $chars{0xd04e}=0x0145; # capital n with cedilla
1011 $chars{0xd052}=0x0156; # capital r with cedilla
1012 $chars{0xd053}=0x015e; # capital s with cedilla
1013 $chars{0xd054}=0x0162; # capital t with cedilla
1014 $chars{0xd063}=0x00e7; # small c with cedilla
1015 $chars{0xd064}=0x1e11; # small d with cedilla
1016 $chars{0xd065}=0x0119; # small e with cedilla
1017 $chars{0xd067}=0x0123; # small g with cedilla
1018 $chars{0xd068}=0x1e29; # small h with cedilla
1019 $chars{0xd06b}=0x0137; # small k with cedilla
1020 $chars{0xd06c}=0x013c; # small l with cedilla
1021 $chars{0xd06e}=0x0146; # small n with cedilla
1022 $chars{0xd072}=0x0157; # small r with cedilla
1023 $chars{0xd073}=0x015f; # small s with cedilla
1024 $chars{0xd074}=0x0163; # small t with cedilla
1025         # 5/1 rude
1026         # 5/2 hook to left
1027         # 5/3 ogonek (hook to right
1028 $chars{0xd320}=0x02db; # ogonek
1029 $chars{0xd341}=0x0104; # capital a with ogonek
1030 $chars{0xd345}=0x0118; # capital e with ogonek
1031 $chars{0xd349}=0x012e; # capital i with ogonek
1032 $chars{0xd34f}=0x01ea; # capital o with ogonek
1033 $chars{0xd355}=0x0172; # capital u with ogonek
1034 $chars{0xd361}=0x0105; # small a with ogonek
1035 $chars{0xd365}=0x0119; # small e with ogonek
1036 $chars{0xd369}=0x012f; # small i with ogonek
1037 $chars{0xd36f}=0x01eb; # small o with ogonek
1038 $chars{0xd375}=0x0173; # small u with ogonek
1039         # 5/4 circle below
1040 $chars{0xd441}=0x1e00; # capital a with ring below
1041 $chars{0xd461}=0x1e01; # small a with ring below
1042         # 5/5 half circle below
1043 $chars{0xf948}=0x1e2a; # capital h with breve below
1044 $chars{0xf968}=0x1e2b; # small h with breve below
1045         # 5/6 dot below
1046 $chars{0xd641}=0x1ea0; # capital a with dot below
1047 $chars{0xd642}=0x1e04; # capital b with dot below
1048 $chars{0xd644}=0x1e0c; # capital d with dot below
1049 $chars{0xd645}=0x1eb8; # capital e with dot below
1050 $chars{0xd648}=0x1e24; # capital h with dot below
1051 $chars{0xd649}=0x1eca; # capital i with dot below
1052 $chars{0xd64b}=0x1e32; # capital k with dot below
1053 $chars{0xd64c}=0x1e36; # capital l with dot below
1054 $chars{0xd64d}=0x1e42; # capital m with dot below
1055 $chars{0xd64e}=0x1e46; # capital n with dot below
1056 $chars{0xd64f}=0x1ecc; # capital o with dot below
1057 $chars{0xd652}=0x1e5a; # capital r with dot below
1058 $chars{0xd653}=0x1e62; # capital s with dot below
1059 $chars{0xd654}=0x1e6c; # capital t with dot below
1060 $chars{0xd655}=0x1ee4; # capital u with dot below
1061 $chars{0xd656}=0x1e7e; # capital v with dot below
1062 $chars{0xd657}=0x1e88; # capital w with dot below
1063 $chars{0xd659}=0x1ef4; # capital y with dot below
1064 $chars{0xd65a}=0x1e92; # capital z with dot below
1065 $chars{0xd661}=0x1ea1; # small a with dot below
1066 $chars{0xd662}=0x1e05; # small b with dot below
1067 $chars{0xd664}=0x1e0d; # small d with dot below
1068 $chars{0xd665}=0x1eb9; # small e with dot below
1069 $chars{0xd668}=0x1e25; # small h with dot below
1070 $chars{0xd669}=0x1ecb; # small i with dot below
1071 $chars{0xd66b}=0x1e33; # small k with dot below
1072 $chars{0xd66c}=0x1e37; # small l with dot below
1073 $chars{0xd66d}=0x1e43; # small m with dot below
1074 $chars{0xd66e}=0x1e47; # small n with dot below
1075 $chars{0xd66f}=0x1ecd; # small o with dot below
1076 $chars{0xd672}=0x1e5b; # small r with dot below
1077 $chars{0xd673}=0x1e63; # small s with dot below
1078 $chars{0xd674}=0x1e6d; # small t with dot below
1079 $chars{0xd675}=0x1ee5; # small u with dot below
1080 $chars{0xd676}=0x1e7f; # small v with dot below
1081 $chars{0xd677}=0x1e89; # small w with dot below
1082 $chars{0xd679}=0x1ef5; # small y with dot below
1083 $chars{0xd67a}=0x1e93; # small z with dot below
1084         # 5/7 double dot below
1085 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1086 $chars{0xd775}=0x1e73; # small u with diaeresis below
1087         # 5/8 underline
1088 $chars{0xd820}=0x005f; # underline
1089         # 5/9 double underline
1090 $chars{0xd920}=0x2017; # double underline
1091         # 5/10 small low vertical bar
1092 $chars{0xda20}=0x02cc; #
1093         # 5/11 circumflex below
1094         # 5/12 (this position shall not be used)
1095         # 5/13 left half of ligature sign and of double tilde
1096         # 5/14 right half of ligature sign
1097         # 5/15 right half of double tilde
1098 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1099
1100 sub char_decode5426 {
1101     my ( $string) = @_;
1102     my $result;
1103
1104     my @data = unpack("C*", $string);
1105     my @characters;
1106     my $length=scalar(@data);
1107     for (my $i = 0; $i < scalar(@data); $i++) {
1108       my $char= $data[$i];
1109       if ($char >= 0x00 && $char <= 0x7F){
1110         #IsAscii
1111
1112           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1113       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1114         #Combined Char
1115         my $convchar ;
1116         if ($chars{$char*256+$data[$i+1]}) {
1117           $convchar= $chars{$char * 256 + $data[$i+1]};
1118           $i++;
1119 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1120         } elsif ($chars{$char})  {
1121           $convchar= $chars{$char};
1122 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1123         }else {
1124           $convchar=$char;
1125         }
1126         push @characters,$convchar;
1127       } else {
1128         my $convchar;
1129         if ($chars{$char})  {
1130           $convchar= $chars{$char};
1131 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1132         }else {
1133 #            printf "char %x $char\n",$char;
1134           $convchar=$char;
1135         }
1136         push @characters,$convchar;
1137       }
1138     }
1139     $result=pack "U*",@characters;
1140 #     $result=~s/\x01//;
1141 #     $result=~s/\x00//;
1142      $result=~s/\x0f//;
1143      $result=~s/\x1b.//;
1144      $result=~s/\x0e//;
1145      $result=~s/\x1b\x5b//;
1146 #   map{printf "%x",$_} @characters;
1147 #   printf "\n";
1148   return $result;
1149 }
1150
1151 1;
1152
1153
1154 =head1 AUTHOR
1155
1156 Koha Development Team <info@koha.org>
1157
1158 Galen Charlton <galen.charlton@liblime.com>
1159
1160 =cut