C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along with
  17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  18 # Suite 330, Boston, MA  02111-1307 USA
  19
  20 use strict;
  21 use MARC::Charset qw/marc8_to_utf8/;
  22 use Text::Iconv;
  23
  24 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  25
  26 BEGIN {
  27     # set the version for version checking
  28     $VERSION = 3.01;
  29     require Exporter;
  30     @ISA    = qw(Exporter);
  31     @EXPORT = qw(
  32         IsStringUTF8ish
  33         MarcToUTF8Record
  34         SetMarcUnicodeFlag
  35         StripNonXmlChars
  36     );
  37 }
  38
  39 =head1 NAME
  40
  41 C4::Charset - utilities for handling character set conversions.
  42
  43 =head1 SYNOPSIS
  44
  45 use C4::Charset;
  46
  47 =head1 DESCRIPTION
  48
  49 This module contains routines for dealing with character set
  50 conversions, particularly for MARC records.
  51
  52 A variety of character encodings are in use by various MARC
  53 standards, and even more character encodings are used by
  54 non-standard MARC records.  The various MARC formats generally
  55 do not do a good job of advertising a given record's character
  56 encoding, and even when a record does advertise its encoding,
  57 e.g., via the Leader/09, experience has shown that one cannot
  58 trust it.
  59
  60 Ultimately, all MARC records are stored in Koha in UTF-8 and
  61 must be converted from whatever the source character encoding is.
  62 The goal of this module is to ensure that these conversions
  63 take place accurately.  When a character conversion cannot take
  64 place, or at least not accurately, the module was provide
  65 enough information to allow user-facing code to inform the user
  66 on how to deal with the situation.
  67
  68 =cut
  69
  70 =head1 FUNCTIONS
  71
  72 =head2 IsStringUTF8ish
  73
  74 =over 4
  75
  76 my $is_utf8 = IsStringUTF8ish($str);
  77
  78 =back
  79
  80 Determines if C<$str> is valid UTF-8.  This can mean
  81 one of two things:
  82
  83 =over 2
  84
  85 =item *
  86
  87 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  88
  89 =item *
  90
  91 The Perl UTF-8 flag is B<not> set, but the octets contain
  92 valid UTF-8.
  93
  94 =back
  95
  96 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
  97 because in one could be presented with a MARC blob that is
  98 not actually in UTF-8 but whose sequence of octets appears to be
  99 valid UTF-8.  The rest of the MARC character conversion functions
 100 will assume that this situation occur does not very often.
 101
 102 =cut
 103
 104 sub IsStringUTF8ish {
 105     my $str = shift;
 106
 107     return 1 if utf8::is_utf8($str);
 108     return utf8::decode($str);
 109 }
 110
 111 =head2 MarcToUTF8Record
 112
 113 =over 4
 114
 115 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
 116
 117 =back
 118
 119 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 120 optional source encoding, return a C<MARC::Record> that is
 121 converted to UTF-8.
 122
 123 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 124 is not guaranteed to have been converted correctly.  Specifically,
 125 if C<$converted_from> is 'failed', the MARC record returned failed
 126 character conversion and had each of its non-ASCII octets changed
 127 to the Unicode replacement character.
 128
 129 If the source encoding was not specified, this routine will
 130 try to guess it; the character encoding used for a successful
 131 conversion is returned in C<$converted_from>.
 132
 133 =cut
 134
 135 sub MarcToUTF8Record {
 136     my $marc = shift;
 137     my $marc_flavour = shift;
 138     my $source_encoding = shift;
 139
 140     my $marc_record;
 141     my $marc_blob_is_utf8 = 0;
 142     if (ref($marc) eq 'MARC::Record') {
 143         my $marc_blob = $marc->as_usmarc();
 144         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 145         $marc_record = $marc;
 146     } else {
 147         # dealing with a MARC blob
 148
 149         # remove any ersatz whitespace from the beginning and
 150         # end of the MARC blob -- these can creep into MARC
 151         # files produced by several sources -- caller really
 152         # should be doing this, however
 153         $marc =~ s/^\s+//;
 154         $marc =~ s/\s+$//;
 155         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 156         $marc_record = MARC::Record->new_from_usmarc($marc);
 157     }
 158
 159     # If we do not know the source encoding, try some guesses
 160     # as follows:
 161     #   1. Record is UTF-8 already.
 162     #   2. If MARC flavor is MARC21, then
 163     #      a. record is MARC-8
 164     #      b. record is ISO-8859-1
 165     #   3. If MARC flavor is UNIMARC, then
 166     if (not defined $source_encoding) {
 167         if ($marc_blob_is_utf8) {
 168             # note that for MARC21 we are not bothering to check
 169             # if the Leader/09 is set to 'a' or not -- because
 170             # of problems with various ILSs (including Koha in the
 171             # past, alas), this just is not trustworthy.
 172             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 173             return $marc_record, 'UTF-8', [];
 174         } else {
 175             if ($marc_flavour eq 'MARC21') {
 176                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 177             } elsif ($marc_flavour eq 'UNIMARC') {
 178                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 179             } else {
 180                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 181             }
 182         }
 183     } else {
 184         # caller knows the character encoding
 185         my $original_marc_record = $marc_record->clone();
 186         my @errors;
 187         if ($source_encoding =~ /utf-?8/i) {
 188             if ($marc_blob_is_utf8) {
 189                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 190                 return $marc_record, 'UTF-8', [];
 191             } else {
 192                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 193             }
 194         } elsif ($source_encoding =~ /marc-?8/i) {
 195             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 196         } elsif ($source_encoding =~ /5426/) {
 197             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 198         } else {
 199             # assume any other character encoding is for Text::Iconv
 200             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
 201         }
 202
 203         if (@errors) {
 204             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 205             return $original_marc_record, 'failed', \@errors;
 206         } else {
 207             return $marc_record, $source_encoding, [];
 208         }
 209     }
 210
 211 }
 212
 213 =head2 SetMarcUnicodeFlag
 214
 215 =over 4
 216
 217 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 218
 219 =back
 220
 221 Set both the internal MARC::Record encoding flag
 222 and the appropriate Leader/09 (MARC21) or
 223 100/26-29 (UNIMARC) to indicate that the record
 224 is in UTF-8.  Note that this does B<not> do
 225 any actual character conversion.
 226
 227 =cut
 228
 229 sub SetMarcUnicodeFlag {
 230     my $marc_record = shift;
 231     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 232
 233     $marc_record->encoding('UTF-8');
 234     if ($marc_flavour eq 'MARC21') {
 235         my $leader = $marc_record->leader();
 236         substr($leader, 9, 1) = 'a';
 237         $marc_record->leader($leader);
 238     } elsif ($marc_flavour eq "UNIMARC") {
 239         if (my $field = $marc_record->field('100')) {
 240             my $sfa = $field->subfield('a');
 241
 242             my $subflength = 36;
 243             # fix the length of the field
 244             $sfa = substr $sfa, 0, $subflength if (length($sfa) > $subflength);
 245             $sfa = sprintf( "%-*s", 35, $sfa ) if (length($sfa) < $subflength);
 246
 247             substr($sfa, 26, 4) = '50  ';
 248             $field->update('a' => $sfa);
 249         }
 250     } else {
 251         warn "Unrecognized marcflavour: $marc_flavour";
 252     }
 253 }
 254
 255 =head2 StripNonXmlChars
 256
 257 =over 4
 258
 259 my $new_str = StripNonXmlChars($old_str);
 260
 261 =back
 262
 263 Given a string, return a copy with the
 264 characters that are illegal in XML
 265 removed.
 266
 267 This function exists to work around a problem
 268 that can occur with badly-encoded MARC records.
 269 Specifically, if a UTF-8 MARC record also
 270 has excape (\x1b) characters, MARC::File::XML
 271 will let the escape characters pass through
 272 when as_xml() or as_xml_record() is called.  The
 273 problem is that the escape character is not
 274 legal in well-formed XML documents, so when
 275 MARC::File::XML attempts to parse such a record,
 276 the XML parser will fail.
 277
 278 Stripping such characters will allow a
 279 MARC::Record->new_from_xml()
 280 to work, at the possible risk of some data loss.
 281
 282 =cut
 283
 284 sub StripNonXmlChars {
 285     my $str = shift;
 286     if (!defined($str) || $str eq ""){
 287         return "";
 288     }
 289     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 290     return $str;
 291 }
 292
 293 =head1 INTERNAL FUNCTIONS
 294
 295 =head2 _default_marc21_charconv_to_utf8
 296
 297 =over 4
 298
 299 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 300
 301 =back
 302
 303 Converts a C<MARC::Record> of unknown character set to UTF-8,
 304 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 305 to UTF-8, then a default conversion that replaces each non-ASCII
 306 character with the replacement character.
 307
 308 The C<$guessed_charset> return value contains the character set
 309 that resulted in a conversion to valid UTF-8; note that
 310 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 311 this is 'failed'.
 312
 313 =cut
 314
 315 sub _default_marc21_charconv_to_utf8 {
 316     my $marc_record = shift;
 317     my $marc_flavour = shift;
 318
 319     my $trial_marc8 = $marc_record->clone();
 320     my @all_errors = ();
 321     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 322     unless (@errors) {
 323         return $trial_marc8, 'MARC-8', [];
 324     }
 325     push @all_errors, @errors;
 326
 327     my $trial_8859_1 = $marc_record->clone();
 328     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 329     unless (@errors) {
 330         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 331                                                 # instead if we wanted to report details
 332                                                 # of the failed attempt at MARC-8 => UTF-8
 333     }
 334     push @all_errors, @errors;
 335
 336     my $default_converted = $marc_record->clone();
 337     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 338     return $default_converted, 'failed', \@all_errors;
 339 }
 340
 341 =head2 _default_unimarc_charconv_to_utf8
 342
 343 =over 4
 344
 345 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 346
 347 =back
 348
 349 Converts a C<MARC::Record> of unknown character set to UTF-8,
 350 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 351 to UTF-8, then a default conversion that replaces each non-ASCII
 352 character with the replacement character.
 353
 354 The C<$guessed_charset> return value contains the character set
 355 that resulted in a conversion to valid UTF-8; note that
 356 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 357 this is 'failed'.
 358
 359 =cut
 360
 361 sub _default_unimarc_charconv_to_utf8 {
 362     my $marc_record = shift;
 363     my $marc_flavour = shift;
 364
 365     my $trial_marc8 = $marc_record->clone();
 366     my @all_errors = ();
 367     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 368     unless (@errors) {
 369         return $trial_marc8, 'iso-5426';
 370     }
 371     push @all_errors, @errors;
 372
 373     my $trial_8859_1 = $marc_record->clone();
 374     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 375     unless (@errors) {
 376         return $trial_8859_1, 'iso-8859-1';
 377     }
 378     push @all_errors, @errors;
 379
 380     my $default_converted = $marc_record->clone();
 381     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 382     return $default_converted, 'failed', \@all_errors;
 383 }
 384
 385 =head2 _marc_marc8_to_utf8
 386
 387 =over 4
 388
 389 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 390
 391 =back
 392
 393 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 394 If the conversion fails for some reason, an
 395 appropriate messages will be placed in the returned
 396 C<@errors> array.
 397
 398 =cut
 399
 400 sub _marc_marc8_to_utf8 {
 401     my $marc_record = shift;
 402     my $marc_flavour = shift;
 403
 404     my $prev_ignore = MARC::Charset->ignore_errors();
 405     MARC::Charset->ignore_errors(1);
 406
 407     # trap warnings raised by MARC::Charset
 408     my @errors = ();
 409     local $SIG{__WARN__} = sub {
 410         my $msg = $_[0];
 411         if ($msg =~ /MARC.Charset/) {
 412             # FIXME - purpose of this regexp is to strip out the
 413             # line reference to MARC/Charset.pm, but as it
 414             # exists probably won't work quite on Windows --
 415             # some sort of minimal-bunch back-tracking RE
 416             # would be helpful here
 417             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 418             push @errors, $msg;
 419         } else {
 420             # if warning doesn't come from MARC::Charset, just
 421             # pass it on
 422             warn $msg;
 423         }
 424     };
 425
 426     foreach my $field ($marc_record->fields()) {
 427         if ($field->is_control_field()) {
 428             ; # do nothing -- control fields should not contain non-ASCII characters
 429         } else {
 430             my @converted_subfields;
 431             foreach my $subfield ($field->subfields()) {
 432                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 433                 unless (IsStringUTF8ish($utf8sf)) {
 434                     # Because of a bug in MARC::Charset 0.98, if the string
 435                     # has (a) one or more diacritics that (b) are only in character positions
 436                     # 128 to 255 inclusive, the resulting converted string is not in
 437                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 438                     # occurs, upgrade the string in place.  Moral of the story seems to be
 439                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 440                     # that the resulting string is UTF-8.
 441                     utf8::upgrade($utf8sf);
 442                 }
 443                 push @converted_subfields, $subfield->[0], $utf8sf;
 444             }
 445
 446             $field->replace_with(MARC::Field->new(
 447                 $field->tag(), $field->indicator(1), $field->indicator(2),
 448                 @converted_subfields)
 449             );
 450         }
 451     }
 452
 453     MARC::Charset->ignore_errors($prev_ignore);
 454
 455     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 456
 457     return @errors;
 458 }
 459
 460 =head2 _marc_iso5426_to_utf8
 461
 462 =over 4
 463
 464 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 465
 466 =back
 467
 468 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 469 If the conversion fails for some reason, an
 470 appropriate messages will be placed in the returned
 471 C<@errors> array.
 472
 473 FIXME - is ISO-5426 equivalent enough to MARC-8
 474 that C<MARC::Charset> can be used instead?
 475
 476 =cut
 477
 478 sub _marc_iso5426_to_utf8 {
 479     my $marc_record = shift;
 480     my $marc_flavour = shift;
 481
 482     my @errors = ();
 483
 484     foreach my $field ($marc_record->fields()) {
 485         if ($field->is_control_field()) {
 486             ; # do nothing -- control fields should not contain non-ASCII characters
 487         } else {
 488             my @converted_subfields;
 489             foreach my $subfield ($field->subfields()) {
 490                 my $utf8sf = char_decode5426($subfield->[1]);
 491                 push @converted_subfields, $subfield->[0], $utf8sf;
 492             }
 493
 494             $field->replace_with(MARC::Field->new(
 495                 $field->tag(), $field->indicator(1), $field->indicator(2),
 496                 @converted_subfields)
 497             );
 498         }
 499     }
 500
 501     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 502
 503     return @errors;
 504 }
 505
 506 =head2 _marc_to_utf8_via_text_iconv
 507
 508 =over 4
 509
 510 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 511
 512 =back
 513
 514 Convert a C<MARC::Record> to UTF-8 in-place using the
 515 C<Text::Iconv> CPAN module.  Any source encoding accepted
 516 by the user's iconv installation should work.  If
 517 the source encoding is not recognized on the user's
 518 server or the conversion fails for some reason,
 519 appropriate messages will be placed in the returned
 520 C<@errors> array.
 521
 522 =cut
 523
 524 sub _marc_to_utf8_via_text_iconv {
 525     my $marc_record = shift;
 526     my $marc_flavour = shift;
 527     my $source_encoding = shift;
 528
 529     my @errors = ();
 530     my $decoder;
 531     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 532     if ($@) {
 533         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 534         return @errors;
 535     }
 536
 537     my $prev_raise_error = Text::Iconv->raise_error();
 538     Text::Iconv->raise_error(1);
 539
 540     foreach my $field ($marc_record->fields()) {
 541         if ($field->is_control_field()) {
 542             ; # do nothing -- control fields should not contain non-ASCII characters
 543         } else {
 544             my @converted_subfields;
 545             foreach my $subfield ($field->subfields()) {
 546                 my $converted_value;
 547                 my $conversion_ok = 1;
 548                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 549                 if ($@) {
 550                     $conversion_ok = 0;
 551                     push @errors, $@;
 552                 } elsif (not defined $converted_value) {
 553                     $conversion_ok = 0;
 554                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 555                 }
 556
 557                 if ($conversion_ok) {
 558                     push @converted_subfields, $subfield->[0], $converted_value;
 559                 } else {
 560                     $converted_value = $subfield->[1];
 561                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 562                     push @converted_subfields, $subfield->[0], $converted_value;
 563                 }
 564             }
 565
 566             $field->replace_with(MARC::Field->new(
 567                 $field->tag(), $field->indicator(1), $field->indicator(2),
 568                 @converted_subfields)
 569             );
 570         }
 571     }
 572
 573     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 574     Text::Iconv->raise_error($prev_raise_error);
 575
 576     return @errors;
 577 }
 578
 579 =head2 _marc_to_utf8_replacement_char
 580
 581 =over 4
 582
 583 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 584
 585 =back
 586
 587 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 588 unsatisfactory method of replacing all non-ASCII (e.g.,
 589 where the eight bit is set) octet with the Unicode
 590 replacement character.  This is meant as a last-ditch
 591 method, and would be best used as part of a UI that
 592 lets a cataloguer pick various character conversions
 593 until he or she finds the right one.
 594
 595 =cut
 596
 597 sub _marc_to_utf8_replacement_char {
 598     my $marc_record = shift;
 599     my $marc_flavour = shift;
 600
 601     foreach my $field ($marc_record->fields()) {
 602         if ($field->is_control_field()) {
 603             ; # do nothing -- control fields should not contain non-ASCII characters
 604         } else {
 605             my @converted_subfields;
 606             foreach my $subfield ($field->subfields()) {
 607                 my $value = $subfield->[1];
 608                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 609                 push @converted_subfields, $subfield->[0], $value;
 610             }
 611
 612             $field->replace_with(MARC::Field->new(
 613                 $field->tag(), $field->indicator(1), $field->indicator(2),
 614                 @converted_subfields)
 615             );
 616         }
 617     }
 618
 619     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 620 }
 621
 622 =head2 char_decode5426
 623
 624 =over 4
 625
 626 my $utf8string = char_decode5426($iso_5426_string);
 627
 628 =back
 629
 630 Converts a string from ISO-5426 to UTF-8.
 631
 632 =cut
 633
 634
 635 my %chars;
 636 $chars{0xb0}=0x0101;#3/0ayn[ain]
 637 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 638 #$chars{0xb2}=0x00e0;#'à';
 639 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 640 #$chars{0xb3}=0x00e7;#'ç';
 641 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 642 # $chars{0xb4}='è';
 643 $chars{0xb4}=0x00e8;
 644 # $chars{0xb5}='é';
 645 $chars{0xb5}=0x00e9;
 646 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 647 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 648 $chars{0xfa}=0x0153;#oe
 649 $chars{0x81d1}=0x00b0;
 650
 651 ####
 652 ## combined characters iso5426
 653
 654 $chars{0xc041}=0x1ea2; # capital a with hook above
 655 $chars{0xc045}=0x1eba; # capital e with hook above
 656 $chars{0xc049}=0x1ec8; # capital i with hook above
 657 $chars{0xc04f}=0x1ece; # capital o with hook above
 658 $chars{0xc055}=0x1ee6; # capital u with hook above
 659 $chars{0xc059}=0x1ef6; # capital y with hook above
 660 $chars{0xc061}=0x1ea3; # small a with hook above
 661 $chars{0xc065}=0x1ebb; # small e with hook above
 662 $chars{0xc069}=0x1ec9; # small i with hook above
 663 $chars{0xc06f}=0x1ecf; # small o with hook above
 664 $chars{0xc075}=0x1ee7; # small u with hook above
 665 $chars{0xc079}=0x1ef7; # small y with hook above
 666
 667         # 4/1 grave accent
 668 $chars{0xc141}=0x00c0; # capital a with grave accent
 669 $chars{0xc145}=0x00c8; # capital e with grave accent
 670 $chars{0xc149}=0x00cc; # capital i with grave accent
 671 $chars{0xc14f}=0x00d2; # capital o with grave accent
 672 $chars{0xc155}=0x00d9; # capital u with grave accent
 673 $chars{0xc157}=0x1e80; # capital w with grave
 674 $chars{0xc159}=0x1ef2; # capital y with grave
 675 $chars{0xc161}=0x00e0; # small a with grave accent
 676 $chars{0xc165}=0x00e8; # small e with grave accent
 677 $chars{0xc169}=0x00ec; # small i with grave accent
 678 $chars{0xc16f}=0x00f2; # small o with grave accent
 679 $chars{0xc175}=0x00f9; # small u with grave accent
 680 $chars{0xc177}=0x1e81; # small w with grave
 681 $chars{0xc179}=0x1ef3; # small y with grave
 682         # 4/2 acute accent
 683 $chars{0xc241}=0x00c1; # capital a with acute accent
 684 $chars{0xc243}=0x0106; # capital c with acute accent
 685 $chars{0xc245}=0x00c9; # capital e with acute accent
 686 $chars{0xc247}=0x01f4; # capital g with acute
 687 $chars{0xc249}=0x00cd; # capital i with acute accent
 688 $chars{0xc24b}=0x1e30; # capital k with acute
 689 $chars{0xc24c}=0x0139; # capital l with acute accent
 690 $chars{0xc24d}=0x1e3e; # capital m with acute
 691 $chars{0xc24e}=0x0143; # capital n with acute accent
 692 $chars{0xc24f}=0x00d3; # capital o with acute accent
 693 $chars{0xc250}=0x1e54; # capital p with acute
 694 $chars{0xc252}=0x0154; # capital r with acute accent
 695 $chars{0xc253}=0x015a; # capital s with acute accent
 696 $chars{0xc255}=0x00da; # capital u with acute accent
 697 $chars{0xc257}=0x1e82; # capital w with acute
 698 $chars{0xc259}=0x00dd; # capital y with acute accent
 699 $chars{0xc25a}=0x0179; # capital z with acute accent
 700 $chars{0xc261}=0x00e1; # small a with acute accent
 701 $chars{0xc263}=0x0107; # small c with acute accent
 702 $chars{0xc265}=0x00e9; # small e with acute accent
 703 $chars{0xc267}=0x01f5; # small g with acute
 704 $chars{0xc269}=0x00ed; # small i with acute accent
 705 $chars{0xc26b}=0x1e31; # small k with acute
 706 $chars{0xc26c}=0x013a; # small l with acute accent
 707 $chars{0xc26d}=0x1e3f; # small m with acute
 708 $chars{0xc26e}=0x0144; # small n with acute accent
 709 $chars{0xc26f}=0x00f3; # small o with acute accent
 710 $chars{0xc270}=0x1e55; # small p with acute
 711 $chars{0xc272}=0x0155; # small r with acute accent
 712 $chars{0xc273}=0x015b; # small s with acute accent
 713 $chars{0xc275}=0x00fa; # small u with acute accent
 714 $chars{0xc277}=0x1e83; # small w with acute
 715 $chars{0xc279}=0x00fd; # small y with acute accent
 716 $chars{0xc27a}=0x017a; # small z with acute accent
 717 $chars{0xc2e1}=0x01fc; # capital ae with acute
 718 $chars{0xc2f1}=0x01fd; # small ae with acute
 719        # 4/3 circumflex accent
 720 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 721 $chars{0xc343}=0x0108; # capital c with circumflex
 722 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 723 $chars{0xc347}=0x011c; # capital g with circumflex
 724 $chars{0xc348}=0x0124; # capital h with circumflex
 725 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 726 $chars{0xc34a}=0x0134; # capital j with circumflex
 727 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 728 $chars{0xc353}=0x015c; # capital s with circumflex
 729 $chars{0xc355}=0x00db; # capital u with circumflex
 730 $chars{0xc357}=0x0174; # capital w with circumflex
 731 $chars{0xc359}=0x0176; # capital y with circumflex
 732 $chars{0xc35a}=0x1e90; # capital z with circumflex
 733 $chars{0xc361}=0x00e2; # small a with circumflex accent
 734 $chars{0xc363}=0x0109; # small c with circumflex
 735 $chars{0xc365}=0x00ea; # small e with circumflex accent
 736 $chars{0xc367}=0x011d; # small g with circumflex
 737 $chars{0xc368}=0x0125; # small h with circumflex
 738 $chars{0xc369}=0x00ee; # small i with circumflex accent
 739 $chars{0xc36a}=0x0135; # small j with circumflex
 740 $chars{0xc36e}=0x00f1; # small n with tilde
 741 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 742 $chars{0xc373}=0x015d; # small s with circumflex
 743 $chars{0xc375}=0x00fb; # small u with circumflex
 744 $chars{0xc377}=0x0175; # small w with circumflex
 745 $chars{0xc379}=0x0177; # small y with circumflex
 746 $chars{0xc37a}=0x1e91; # small z with circumflex
 747         # 4/4 tilde
 748 $chars{0xc441}=0x00c3; # capital a with tilde
 749 $chars{0xc445}=0x1ebc; # capital e with tilde
 750 $chars{0xc449}=0x0128; # capital i with tilde
 751 $chars{0xc44e}=0x00d1; # capital n with tilde
 752 $chars{0xc44f}=0x00d5; # capital o with tilde
 753 $chars{0xc455}=0x0168; # capital u with tilde
 754 $chars{0xc456}=0x1e7c; # capital v with tilde
 755 $chars{0xc459}=0x1ef8; # capital y with tilde
 756 $chars{0xc461}=0x00e3; # small a with tilde
 757 $chars{0xc465}=0x1ebd; # small e with tilde
 758 $chars{0xc469}=0x0129; # small i with tilde
 759 $chars{0xc46e}=0x00f1; # small n with tilde
 760 $chars{0xc46f}=0x00f5; # small o with tilde
 761 $chars{0xc475}=0x0169; # small u with tilde
 762 $chars{0xc476}=0x1e7d; # small v with tilde
 763 $chars{0xc479}=0x1ef9; # small y with tilde
 764     # 4/5 macron
 765 $chars{0xc541}=0x0100; # capital a with macron
 766 $chars{0xc545}=0x0112; # capital e with macron
 767 $chars{0xc547}=0x1e20; # capital g with macron
 768 $chars{0xc549}=0x012a; # capital i with macron
 769 $chars{0xc54f}=0x014c; # capital o with macron
 770 $chars{0xc555}=0x016a; # capital u with macron
 771 $chars{0xc561}=0x0101; # small a with macron
 772 $chars{0xc565}=0x0113; # small e with macron
 773 $chars{0xc567}=0x1e21; # small g with macron
 774 $chars{0xc569}=0x012b; # small i with macron
 775 $chars{0xc56f}=0x014d; # small o with macron
 776 $chars{0xc575}=0x016b; # small u with macron
 777 $chars{0xc572}=0x0159; # small r with macron
 778 $chars{0xc5e1}=0x01e2; # capital ae with macron
 779 $chars{0xc5f1}=0x01e3; # small ae with macron
 780         # 4/6 breve
 781 $chars{0xc641}=0x0102; # capital a with breve
 782 $chars{0xc645}=0x0114; # capital e with breve
 783 $chars{0xc647}=0x011e; # capital g with breve
 784 $chars{0xc649}=0x012c; # capital i with breve
 785 $chars{0xc64f}=0x014e; # capital o with breve
 786 $chars{0xc655}=0x016c; # capital u with breve
 787 $chars{0xc661}=0x0103; # small a with breve
 788 $chars{0xc665}=0x0115; # small e with breve
 789 $chars{0xc667}=0x011f; # small g with breve
 790 $chars{0xc669}=0x012d; # small i with breve
 791 $chars{0xc66f}=0x014f; # small o with breve
 792 $chars{0xc675}=0x016d; # small u with breve
 793         # 4/7 dot above
 794 $chars{0xc7b0}=0x01e1; # Ain with dot above
 795 $chars{0xc742}=0x1e02; # capital b with dot above
 796 $chars{0xc743}=0x010a; # capital c with dot above
 797 $chars{0xc744}=0x1e0a; # capital d with dot above
 798 $chars{0xc745}=0x0116; # capital e with dot above
 799 $chars{0xc746}=0x1e1e; # capital f with dot above
 800 $chars{0xc747}=0x0120; # capital g with dot above
 801 $chars{0xc748}=0x1e22; # capital h with dot above
 802 $chars{0xc749}=0x0130; # capital i with dot above
 803 $chars{0xc74d}=0x1e40; # capital m with dot above
 804 $chars{0xc74e}=0x1e44; # capital n with dot above
 805 $chars{0xc750}=0x1e56; # capital p with dot above
 806 $chars{0xc752}=0x1e58; # capital r with dot above
 807 $chars{0xc753}=0x1e60; # capital s with dot above
 808 $chars{0xc754}=0x1e6a; # capital t with dot above
 809 $chars{0xc757}=0x1e86; # capital w with dot above
 810 $chars{0xc758}=0x1e8a; # capital x with dot above
 811 $chars{0xc759}=0x1e8e; # capital y with dot above
 812 $chars{0xc75a}=0x017b; # capital z with dot above
 813 $chars{0xc761}=0x0227; # small b with dot above
 814 $chars{0xc762}=0x1e03; # small b with dot above
 815 $chars{0xc763}=0x010b; # small c with dot above
 816 $chars{0xc764}=0x1e0b; # small d with dot above
 817 $chars{0xc765}=0x0117; # small e with dot above
 818 $chars{0xc766}=0x1e1f; # small f with dot above
 819 $chars{0xc767}=0x0121; # small g with dot above
 820 $chars{0xc768}=0x1e23; # small h with dot above
 821 $chars{0xc76d}=0x1e41; # small m with dot above
 822 $chars{0xc76e}=0x1e45; # small n with dot above
 823 $chars{0xc770}=0x1e57; # small p with dot above
 824 $chars{0xc772}=0x1e59; # small r with dot above
 825 $chars{0xc773}=0x1e61; # small s with dot above
 826 $chars{0xc774}=0x1e6b; # small t with dot above
 827 $chars{0xc777}=0x1e87; # small w with dot above
 828 $chars{0xc778}=0x1e8b; # small x with dot above
 829 $chars{0xc779}=0x1e8f; # small y with dot above
 830 $chars{0xc77a}=0x017c; # small z with dot above
 831         # 4/8 trema, diaresis
 832 $chars{0xc820}=0x00a8; # diaeresis
 833 $chars{0xc841}=0x00c4; # capital a with diaeresis
 834 $chars{0xc845}=0x00cb; # capital e with diaeresis
 835 $chars{0xc848}=0x1e26; # capital h with diaeresis
 836 $chars{0xc849}=0x00cf; # capital i with diaeresis
 837 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 838 $chars{0xc855}=0x00dc; # capital u with diaeresis
 839 $chars{0xc857}=0x1e84; # capital w with diaeresis
 840 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 841 $chars{0xc859}=0x0178; # capital y with diaeresis
 842 $chars{0xc861}=0x00e4; # small a with diaeresis
 843 $chars{0xc865}=0x00eb; # small e with diaeresis
 844 $chars{0xc868}=0x1e27; # small h with diaeresis
 845 $chars{0xc869}=0x00ef; # small i with diaeresis
 846 $chars{0xc86f}=0x00f6; # small o with diaeresis
 847 $chars{0xc874}=0x1e97; # small t with diaeresis
 848 $chars{0xc875}=0x00fc; # small u with diaeresis
 849 $chars{0xc877}=0x1e85; # small w with diaeresis
 850 $chars{0xc878}=0x1e8d; # small x with diaeresis
 851 $chars{0xc879}=0x00ff; # small y with diaeresis
 852         # 4/9 umlaut
 853 $chars{0xc920}=0x00a8; # [diaeresis]
 854 $chars{0xc961}=0x00e4; # a with umlaut
 855 $chars{0xc965}=0x00eb; # e with umlaut
 856 $chars{0xc969}=0x00ef; # i with umlaut
 857 $chars{0xc96f}=0x00f6; # o with umlaut
 858 $chars{0xc975}=0x00fc; # u with umlaut
 859         # 4/10 circle above
 860 $chars{0xca41}=0x00c5; # capital a with ring above
 861 $chars{0xcaad}=0x016e; # capital u with ring above
 862 $chars{0xca61}=0x00e5; # small a with ring above
 863 $chars{0xca75}=0x016f; # small u with ring above
 864 $chars{0xca77}=0x1e98; # small w with ring above
 865 $chars{0xca79}=0x1e99; # small y with ring above
 866         # 4/11 high comma off centre
 867         # 4/12 inverted high comma centred
 868         # 4/13 double acute accent
 869 $chars{0xcd4f}=0x0150; # capital o with double acute
 870 $chars{0xcd55}=0x0170; # capital u with double acute
 871 $chars{0xcd6f}=0x0151; # small o with double acute
 872 $chars{0xcd75}=0x0171; # small u with double acute
 873         # 4/14 horn
 874 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 875 $chars{0xce55}=0x01af; # latin capital letter u with horn
 876 $chars{0xce74}=0x01a1; # latin small letter o with horn
 877 $chars{0xce75}=0x01b0; # latin small letter u with horn
 878         # 4/15 caron (hacek
 879 $chars{0xcf41}=0x01cd; # capital a with caron
 880 $chars{0xcf43}=0x010c; # capital c with caron
 881 $chars{0xcf44}=0x010e; # capital d with caron
 882 $chars{0xcf45}=0x011a; # capital e with caron
 883 $chars{0xcf47}=0x01e6; # capital g with caron
 884 $chars{0xcf49}=0x01cf; # capital i with caron
 885 $chars{0xcf4b}=0x01e8; # capital k with caron
 886 $chars{0xcf4c}=0x013d; # capital l with caron
 887 $chars{0xcf4e}=0x0147; # capital n with caron
 888 $chars{0xcf4f}=0x01d1; # capital o with caron
 889 $chars{0xcf52}=0x0158; # capital r with caron
 890 $chars{0xcf53}=0x0160; # capital s with caron
 891 $chars{0xcf54}=0x0164; # capital t with caron
 892 $chars{0xcf55}=0x01d3; # capital u with caron
 893 $chars{0xcf5a}=0x017d; # capital z with caron
 894 $chars{0xcf61}=0x01ce; # small a with caron
 895 $chars{0xcf63}=0x010d; # small c with caron
 896 $chars{0xcf64}=0x010f; # small d with caron
 897 $chars{0xcf65}=0x011b; # small e with caron
 898 $chars{0xcf67}=0x01e7; # small g with caron
 899 $chars{0xcf69}=0x01d0; # small i with caron
 900 $chars{0xcf6a}=0x01f0; # small j with caron
 901 $chars{0xcf6b}=0x01e9; # small k with caron
 902 $chars{0xcf6c}=0x013e; # small l with caron
 903 $chars{0xcf6e}=0x0148; # small n with caron
 904 $chars{0xcf6f}=0x01d2; # small o with caron
 905 $chars{0xcf72}=0x0159; # small r with caron
 906 $chars{0xcf73}=0x0161; # small s with caron
 907 $chars{0xcf74}=0x0165; # small t with caron
 908 $chars{0xcf75}=0x01d4; # small u with caron
 909 $chars{0xcf7a}=0x017e; # small z with caron
 910         # 5/0 cedilla
 911 $chars{0xd020}=0x00b8; # cedilla
 912 $chars{0xd043}=0x00c7; # capital c with cedilla
 913 $chars{0xd044}=0x1e10; # capital d with cedilla
 914 $chars{0xd047}=0x0122; # capital g with cedilla
 915 $chars{0xd048}=0x1e28; # capital h with cedilla
 916 $chars{0xd04b}=0x0136; # capital k with cedilla
 917 $chars{0xd04c}=0x013b; # capital l with cedilla
 918 $chars{0xd04e}=0x0145; # capital n with cedilla
 919 $chars{0xd052}=0x0156; # capital r with cedilla
 920 $chars{0xd053}=0x015e; # capital s with cedilla
 921 $chars{0xd054}=0x0162; # capital t with cedilla
 922 $chars{0xd063}=0x00e7; # small c with cedilla
 923 $chars{0xd064}=0x1e11; # small d with cedilla
 924 $chars{0xd065}=0x0119; # small e with cedilla
 925 $chars{0xd067}=0x0123; # small g with cedilla
 926 $chars{0xd068}=0x1e29; # small h with cedilla
 927 $chars{0xd06b}=0x0137; # small k with cedilla
 928 $chars{0xd06c}=0x013c; # small l with cedilla
 929 $chars{0xd06e}=0x0146; # small n with cedilla
 930 $chars{0xd072}=0x0157; # small r with cedilla
 931 $chars{0xd073}=0x015f; # small s with cedilla
 932 $chars{0xd074}=0x0163; # small t with cedilla
 933         # 5/1 rude
 934         # 5/2 hook to left
 935         # 5/3 ogonek (hook to right
 936 $chars{0xd320}=0x02db; # ogonek
 937 $chars{0xd341}=0x0104; # capital a with ogonek
 938 $chars{0xd345}=0x0118; # capital e with ogonek
 939 $chars{0xd349}=0x012e; # capital i with ogonek
 940 $chars{0xd34f}=0x01ea; # capital o with ogonek
 941 $chars{0xd355}=0x0172; # capital u with ogonek
 942 $chars{0xd361}=0x0105; # small a with ogonek
 943 $chars{0xd365}=0x0119; # small e with ogonek
 944 $chars{0xd369}=0x012f; # small i with ogonek
 945 $chars{0xd36f}=0x01eb; # small o with ogonek
 946 $chars{0xd375}=0x0173; # small u with ogonek
 947         # 5/4 circle below
 948 $chars{0xd441}=0x1e00; # capital a with ring below
 949 $chars{0xd461}=0x1e01; # small a with ring below
 950         # 5/5 half circle below
 951 $chars{0xf948}=0x1e2a; # capital h with breve below
 952 $chars{0xf968}=0x1e2b; # small h with breve below
 953         # 5/6 dot below
 954 $chars{0xd641}=0x1ea0; # capital a with dot below
 955 $chars{0xd642}=0x1e04; # capital b with dot below
 956 $chars{0xd644}=0x1e0c; # capital d with dot below
 957 $chars{0xd645}=0x1eb8; # capital e with dot below
 958 $chars{0xd648}=0x1e24; # capital h with dot below
 959 $chars{0xd649}=0x1eca; # capital i with dot below
 960 $chars{0xd64b}=0x1e32; # capital k with dot below
 961 $chars{0xd64c}=0x1e36; # capital l with dot below
 962 $chars{0xd64d}=0x1e42; # capital m with dot below
 963 $chars{0xd64e}=0x1e46; # capital n with dot below
 964 $chars{0xd64f}=0x1ecc; # capital o with dot below
 965 $chars{0xd652}=0x1e5a; # capital r with dot below
 966 $chars{0xd653}=0x1e62; # capital s with dot below
 967 $chars{0xd654}=0x1e6c; # capital t with dot below
 968 $chars{0xd655}=0x1ee4; # capital u with dot below
 969 $chars{0xd656}=0x1e7e; # capital v with dot below
 970 $chars{0xd657}=0x1e88; # capital w with dot below
 971 $chars{0xd659}=0x1ef4; # capital y with dot below
 972 $chars{0xd65a}=0x1e92; # capital z with dot below
 973 $chars{0xd661}=0x1ea1; # small a with dot below
 974 $chars{0xd662}=0x1e05; # small b with dot below
 975 $chars{0xd664}=0x1e0d; # small d with dot below
 976 $chars{0xd665}=0x1eb9; # small e with dot below
 977 $chars{0xd668}=0x1e25; # small h with dot below
 978 $chars{0xd669}=0x1ecb; # small i with dot below
 979 $chars{0xd66b}=0x1e33; # small k with dot below
 980 $chars{0xd66c}=0x1e37; # small l with dot below
 981 $chars{0xd66d}=0x1e43; # small m with dot below
 982 $chars{0xd66e}=0x1e47; # small n with dot below
 983 $chars{0xd66f}=0x1ecd; # small o with dot below
 984 $chars{0xd672}=0x1e5b; # small r with dot below
 985 $chars{0xd673}=0x1e63; # small s with dot below
 986 $chars{0xd674}=0x1e6d; # small t with dot below
 987 $chars{0xd675}=0x1ee5; # small u with dot below
 988 $chars{0xd676}=0x1e7f; # small v with dot below
 989 $chars{0xd677}=0x1e89; # small w with dot below
 990 $chars{0xd679}=0x1ef5; # small y with dot below
 991 $chars{0xd67a}=0x1e93; # small z with dot below
 992         # 5/7 double dot below
 993 $chars{0xd755}=0x1e72; # capital u with diaeresis below
 994 $chars{0xd775}=0x1e73; # small u with diaeresis below
 995         # 5/8 underline
 996 $chars{0xd820}=0x005f; # underline
 997         # 5/9 double underline
 998 $chars{0xd920}=0x2017; # double underline
 999         # 5/10 small low vertical bar
1000 $chars{0xda20}=0x02cc; #
1001         # 5/11 circumflex below
1002         # 5/12 (this position shall not be used)
1003         # 5/13 left half of ligature sign and of double tilde
1004         # 5/14 right half of ligature sign
1005         # 5/15 right half of double tilde
1006 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1007
1008 sub char_decode5426 {
1009     my ( $string) = @_;
1010     my $result;
1011
1012     my @data = unpack("C*", $string);
1013     my @characters;
1014     my $length=scalar(@data);
1015     for (my $i = 0; $i < scalar(@data); $i++) {
1016       my $char= $data[$i];
1017       if ($char >= 0x00 && $char <= 0x7F){
1018         #IsAscii
1019
1020           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1021       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1022         #Combined Char
1023         my $convchar ;
1024         if ($chars{$char*256+$data[$i+1]}) {
1025           $convchar= $chars{$char * 256 + $data[$i+1]};
1026           $i++;
1027 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1028         } elsif ($chars{$char})  {
1029           $convchar= $chars{$char};
1030 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1031         }else {
1032           $convchar=$char;
1033         }
1034         push @characters,$convchar;
1035       } else {
1036         my $convchar;
1037         if ($chars{$char})  {
1038           $convchar= $chars{$char};
1039 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1040         }else {
1041 #            printf "char %x $char\n",$char;
1042           $convchar=$char;
1043         }
1044         push @characters,$convchar;
1045       }
1046     }
1047     $result=pack "U*",@characters;
1048 #     $result=~s/\x01//;
1049 #     $result=~s/\x00//;
1050      $result=~s/\x0f//;
1051      $result=~s/\x1b.//;
1052      $result=~s/\x0e//;
1053      $result=~s/\x1b\x5b//;
1054 #   map{printf "%x",$_} @characters;
1055 #   printf "\n";
1056   return $result;
1057 }
1058
1059 1;
1060
1061
1062 =head1 AUTHOR
1063
1064 Koha Development Team <info@koha.org>
1065
1066 Galen Charlton <galen.charlton@liblime.com>
1067
1068 =cut