C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along with
  17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  18 # Suite 330, Boston, MA  02111-1307 USA
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use Unicode::Normalize;
  26
  27 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  28
  29 BEGIN {
  30     # set the version for version checking
  31     $VERSION = 3.01;
  32     require Exporter;
  33     @ISA    = qw(Exporter);
  34     @EXPORT = qw(
  35         IsStringUTF8ish
  36         MarcToUTF8Record
  37         SetMarcUnicodeFlag
  38         StripNonXmlChars
  39                 Normalize_String
  40     );
  41 }
  42
  43 =head1 NAME
  44
  45 C4::Charset - utilities for handling character set conversions.
  46
  47 =head1 SYNOPSIS
  48
  49 use C4::Charset;
  50
  51 =head1 DESCRIPTION
  52
  53 This module contains routines for dealing with character set
  54 conversions, particularly for MARC records.
  55
  56 A variety of character encodings are in use by various MARC
  57 standards, and even more character encodings are used by
  58 non-standard MARC records.  The various MARC formats generally
  59 do not do a good job of advertising a given record's character
  60 encoding, and even when a record does advertise its encoding,
  61 e.g., via the Leader/09, experience has shown that one cannot
  62 trust it.
  63
  64 Ultimately, all MARC records are stored in Koha in UTF-8 and
  65 must be converted from whatever the source character encoding is.
  66 The goal of this module is to ensure that these conversions
  67 take place accurately.  When a character conversion cannot take
  68 place, or at least not accurately, the module was provide
  69 enough information to allow user-facing code to inform the user
  70 on how to deal with the situation.
  71
  72 =cut
  73
  74 =head1 FUNCTIONS
  75
  76 =head2 IsStringUTF8ish
  77
  78 =over 4
  79
  80 my $is_utf8 = IsStringUTF8ish($str);
  81
  82 =back
  83
  84 Determines if C<$str> is valid UTF-8.  This can mean
  85 one of two things:
  86
  87 =over 2
  88
  89 =item *
  90
  91 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  92
  93 =item *
  94
  95 The Perl UTF-8 flag is B<not> set, but the octets contain
  96 valid UTF-8.
  97
  98 =back
  99
 100 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 101 because in one could be presented with a MARC blob that is
 102 not actually in UTF-8 but whose sequence of octets appears to be
 103 valid UTF-8.  The rest of the MARC character conversion functions
 104 will assume that this situation occur does not very often.
 105
 106 =cut
 107
 108 sub IsStringUTF8ish {
 109     my $str = shift;
 110
 111     return 1 if utf8::is_utf8($str);
 112     return utf8::decode($str);
 113 }
 114
 115
 116 =head2 Normalize_String
 117
 118 =over 4
 119
 120 my $$string_normalized = Normalize_String($string);
 121
 122 =back
 123
 124 Returns normalized string C<$string> in C Form
 125
 126
 127 =cut
 128
 129 sub Normalize_String {
 130     my $string = shift;
 131         if (IsStringUTF8ish($string)){
 132                 return NFC($string);
 133         }
 134         else {
 135                 return $string;
 136         }
 137 }
 138
 139 =head2 MarcToUTF8Record
 140
 141 =over 4
 142
 143 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
 144
 145 =back
 146
 147 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 148 optional source encoding, return a C<MARC::Record> that is
 149 converted to UTF-8.
 150
 151 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 152 is not guaranteed to have been converted correctly.  Specifically,
 153 if C<$converted_from> is 'failed', the MARC record returned failed
 154 character conversion and had each of its non-ASCII octets changed
 155 to the Unicode replacement character.
 156
 157 If the source encoding was not specified, this routine will
 158 try to guess it; the character encoding used for a successful
 159 conversion is returned in C<$converted_from>.
 160
 161 =cut
 162
 163 sub MarcToUTF8Record {
 164     my $marc = shift;
 165     my $marc_flavour = shift;
 166     my $source_encoding = shift;
 167
 168     my $marc_record;
 169     my $marc_blob_is_utf8 = 0;
 170     if (ref($marc) eq 'MARC::Record') {
 171         my $marc_blob = $marc->as_usmarc();
 172         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 173         $marc_record = $marc;
 174     } else {
 175         # dealing with a MARC blob
 176
 177         # remove any ersatz whitespace from the beginning and
 178         # end of the MARC blob -- these can creep into MARC
 179         # files produced by several sources -- caller really
 180         # should be doing this, however
 181         $marc =~ s/^\s+//;
 182         $marc =~ s/\s+$//;
 183         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 184         eval {
 185             $marc_record = MARC::Record->new_from_usmarc($marc);
 186         };
 187         if ($@) {
 188             # if we fail the first time, one likely problem
 189             # is that we have a MARC21 record that says that it's
 190             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 191             # We'll try parsing it again.
 192             substr($marc, 9, 1) = ' ';
 193             eval {
 194                 $marc_record = MARC::Record->new_from_usmarc($marc);
 195             };
 196             if ($@) {
 197                 # it's hopeless; return an empty MARC::Record
 198                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 199             }
 200         }
 201     }
 202
 203     # If we do not know the source encoding, try some guesses
 204     # as follows:
 205     #   1. Record is UTF-8 already.
 206     #   2. If MARC flavor is MARC21, then
 207     #      a. record is MARC-8
 208     #      b. record is ISO-8859-1
 209     #   3. If MARC flavor is UNIMARC, then
 210     if (not defined $source_encoding) {
 211         if ($marc_blob_is_utf8) {
 212             # note that for MARC21 we are not bothering to check
 213             # if the Leader/09 is set to 'a' or not -- because
 214             # of problems with various ILSs (including Koha in the
 215             # past, alas), this just is not trustworthy.
 216             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 217             return $marc_record, 'UTF-8', [];
 218         } else {
 219             if ($marc_flavour eq 'MARC21') {
 220                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 221             } elsif ($marc_flavour eq 'UNIMARC') {
 222                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 223             } else {
 224                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 225             }
 226         }
 227     } else {
 228         # caller knows the character encoding
 229         my $original_marc_record = $marc_record->clone();
 230         my @errors;
 231         if ($source_encoding =~ /utf-?8/i) {
 232             if ($marc_blob_is_utf8) {
 233                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 234                 return $marc_record, 'UTF-8', [];
 235             } else {
 236                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 237             }
 238         } elsif ($source_encoding =~ /marc-?8/i) {
 239             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 240         } elsif ($source_encoding =~ /5426/) {
 241             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 242         } else {
 243             # assume any other character encoding is for Text::Iconv
 244             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 245         }
 246
 247         if (@errors) {
 248             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 249             return $original_marc_record, 'failed', \@errors;
 250         } else {
 251             return $marc_record, $source_encoding, [];
 252         }
 253     }
 254
 255 }
 256
 257 =head2 SetMarcUnicodeFlag
 258
 259 =over 4
 260
 261 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 262
 263 =back
 264
 265 Set both the internal MARC::Record encoding flag
 266 and the appropriate Leader/09 (MARC21) or
 267 100/26-29 (UNIMARC) to indicate that the record
 268 is in UTF-8.  Note that this does B<not> do
 269 any actual character conversion.
 270
 271 =cut
 272
 273 sub SetMarcUnicodeFlag {
 274     my $marc_record = shift;
 275     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 276
 277     $marc_record->encoding('UTF-8');
 278     if ($marc_flavour eq 'MARC21') {
 279         my $leader = $marc_record->leader();
 280         substr($leader, 9, 1) = 'a';
 281         $marc_record->leader($leader);
 282     } elsif ($marc_flavour eq "UNIMARC") {
 283         if (my $field = $marc_record->field('100')) {
 284             my $sfa = $field->subfield('a');
 285
 286             my $subflength = 36;
 287             # fix the length of the field
 288             $sfa = substr $sfa, 0, $subflength if (length($sfa) > $subflength);
 289             $sfa = sprintf( "%-*s", 35, $sfa ) if (length($sfa) < $subflength);
 290
 291             substr($sfa, 26, 4) = '50  ';
 292             $field->update('a' => $sfa);
 293         }
 294     } else {
 295         warn "Unrecognized marcflavour: $marc_flavour";
 296     }
 297 }
 298
 299 =head2 StripNonXmlChars
 300
 301 =over 4
 302
 303 my $new_str = StripNonXmlChars($old_str);
 304
 305 =back
 306
 307 Given a string, return a copy with the
 308 characters that are illegal in XML
 309 removed.
 310
 311 This function exists to work around a problem
 312 that can occur with badly-encoded MARC records.
 313 Specifically, if a UTF-8 MARC record also
 314 has excape (\x1b) characters, MARC::File::XML
 315 will let the escape characters pass through
 316 when as_xml() or as_xml_record() is called.  The
 317 problem is that the escape character is not
 318 legal in well-formed XML documents, so when
 319 MARC::File::XML attempts to parse such a record,
 320 the XML parser will fail.
 321
 322 Stripping such characters will allow a
 323 MARC::Record->new_from_xml()
 324 to work, at the possible risk of some data loss.
 325
 326 =cut
 327
 328 sub StripNonXmlChars {
 329     my $str = shift;
 330     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 331     return $str;
 332 }
 333
 334 =head1 INTERNAL FUNCTIONS
 335
 336 =head2 _default_marc21_charconv_to_utf8
 337
 338 =over 4
 339
 340 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 341
 342 =back
 343
 344 Converts a C<MARC::Record> of unknown character set to UTF-8,
 345 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 346 to UTF-8, then a default conversion that replaces each non-ASCII
 347 character with the replacement character.
 348
 349 The C<$guessed_charset> return value contains the character set
 350 that resulted in a conversion to valid UTF-8; note that
 351 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 352 this is 'failed'.
 353
 354 =cut
 355
 356 sub _default_marc21_charconv_to_utf8 {
 357     my $marc_record = shift;
 358     my $marc_flavour = shift;
 359
 360     my $trial_marc8 = $marc_record->clone();
 361     my @all_errors = ();
 362     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 363     unless (@errors) {
 364         return $trial_marc8, 'MARC-8', [];
 365     }
 366     push @all_errors, @errors;
 367
 368     my $trial_8859_1 = $marc_record->clone();
 369     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 370     unless (@errors) {
 371         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 372                                                 # instead if we wanted to report details
 373                                                 # of the failed attempt at MARC-8 => UTF-8
 374     }
 375     push @all_errors, @errors;
 376
 377     my $default_converted = $marc_record->clone();
 378     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 379     return $default_converted, 'failed', \@all_errors;
 380 }
 381
 382 =head2 _default_unimarc_charconv_to_utf8
 383
 384 =over 4
 385
 386 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 387
 388 =back
 389
 390 Converts a C<MARC::Record> of unknown character set to UTF-8,
 391 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 392 to UTF-8, then a default conversion that replaces each non-ASCII
 393 character with the replacement character.
 394
 395 The C<$guessed_charset> return value contains the character set
 396 that resulted in a conversion to valid UTF-8; note that
 397 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 398 this is 'failed'.
 399
 400 =cut
 401
 402 sub _default_unimarc_charconv_to_utf8 {
 403     my $marc_record = shift;
 404     my $marc_flavour = shift;
 405
 406     my $trial_marc8 = $marc_record->clone();
 407     my @all_errors = ();
 408     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 409     unless (@errors) {
 410         return $trial_marc8, 'iso-5426';
 411     }
 412     push @all_errors, @errors;
 413
 414     my $trial_8859_1 = $marc_record->clone();
 415     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 416     unless (@errors) {
 417         return $trial_8859_1, 'iso-8859-1';
 418     }
 419     push @all_errors, @errors;
 420
 421     my $default_converted = $marc_record->clone();
 422     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 423     return $default_converted, 'failed', \@all_errors;
 424 }
 425
 426 =head2 _marc_marc8_to_utf8
 427
 428 =over 4
 429
 430 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 431
 432 =back
 433
 434 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 435 If the conversion fails for some reason, an
 436 appropriate messages will be placed in the returned
 437 C<@errors> array.
 438
 439 =cut
 440
 441 sub _marc_marc8_to_utf8 {
 442     my $marc_record = shift;
 443     my $marc_flavour = shift;
 444
 445     my $prev_ignore = MARC::Charset->ignore_errors();
 446     MARC::Charset->ignore_errors(1);
 447
 448     # trap warnings raised by MARC::Charset
 449     my @errors = ();
 450     local $SIG{__WARN__} = sub {
 451         my $msg = $_[0];
 452         if ($msg =~ /MARC.Charset/) {
 453             # FIXME - purpose of this regexp is to strip out the
 454             # line reference to MARC/Charset.pm, but as it
 455             # exists probably won't work quite on Windows --
 456             # some sort of minimal-bunch back-tracking RE
 457             # would be helpful here
 458             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 459             push @errors, $msg;
 460         } else {
 461             # if warning doesn't come from MARC::Charset, just
 462             # pass it on
 463             warn $msg;
 464         }
 465     };
 466
 467     foreach my $field ($marc_record->fields()) {
 468         if ($field->is_control_field()) {
 469             ; # do nothing -- control fields should not contain non-ASCII characters
 470         } else {
 471             my @converted_subfields;
 472             foreach my $subfield ($field->subfields()) {
 473                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 474                 unless (IsStringUTF8ish($utf8sf)) {
 475                     # Because of a bug in MARC::Charset 0.98, if the string
 476                     # has (a) one or more diacritics that (b) are only in character positions
 477                     # 128 to 255 inclusive, the resulting converted string is not in
 478                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 479                     # occurs, upgrade the string in place.  Moral of the story seems to be
 480                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 481                     # that the resulting string is UTF-8.
 482                     utf8::upgrade($utf8sf);
 483                 }
 484                             $utf8sf=NFC($utf8sf);
 485                 push @converted_subfields, $subfield->[0], $utf8sf;
 486             }
 487
 488             $field->replace_with(MARC::Field->new(
 489                 $field->tag(), $field->indicator(1), $field->indicator(2),
 490                 @converted_subfields)
 491             );
 492         }
 493     }
 494
 495     MARC::Charset->ignore_errors($prev_ignore);
 496
 497     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 498
 499     return @errors;
 500 }
 501
 502 =head2 _marc_iso5426_to_utf8
 503
 504 =over 4
 505
 506 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 507
 508 =back
 509
 510 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 511 If the conversion fails for some reason, an
 512 appropriate messages will be placed in the returned
 513 C<@errors> array.
 514
 515 FIXME - is ISO-5426 equivalent enough to MARC-8
 516 that C<MARC::Charset> can be used instead?
 517
 518 =cut
 519
 520 sub _marc_iso5426_to_utf8 {
 521     my $marc_record = shift;
 522     my $marc_flavour = shift;
 523
 524     my @errors = ();
 525
 526     foreach my $field ($marc_record->fields()) {
 527         if ($field->is_control_field()) {
 528             ; # do nothing -- control fields should not contain non-ASCII characters
 529         } else {
 530             my @converted_subfields;
 531             foreach my $subfield ($field->subfields()) {
 532                 my $utf8sf = char_decode5426($subfield->[1]);
 533                             $utf8sf=NFC($utf8sf);
 534                 push @converted_subfields, $subfield->[0], $utf8sf;
 535             }
 536
 537             $field->replace_with(MARC::Field->new(
 538                 $field->tag(), $field->indicator(1), $field->indicator(2),
 539                 @converted_subfields)
 540             );
 541         }
 542     }
 543
 544     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 545
 546     return @errors;
 547 }
 548
 549 =head2 _marc_to_utf8_via_text_iconv
 550
 551 =over 4
 552
 553 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 554
 555 =back
 556
 557 Convert a C<MARC::Record> to UTF-8 in-place using the
 558 C<Text::Iconv> CPAN module.  Any source encoding accepted
 559 by the user's iconv installation should work.  If
 560 the source encoding is not recognized on the user's
 561 server or the conversion fails for some reason,
 562 appropriate messages will be placed in the returned
 563 C<@errors> array.
 564
 565 =cut
 566
 567 sub _marc_to_utf8_via_text_iconv {
 568     my $marc_record = shift;
 569     my $marc_flavour = shift;
 570     my $source_encoding = shift;
 571
 572     my @errors = ();
 573     my $decoder;
 574     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 575     if ($@) {
 576         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 577         return @errors;
 578     }
 579
 580     my $prev_raise_error = Text::Iconv->raise_error();
 581     Text::Iconv->raise_error(1);
 582
 583     foreach my $field ($marc_record->fields()) {
 584         if ($field->is_control_field()) {
 585             ; # do nothing -- control fields should not contain non-ASCII characters
 586         } else {
 587             my @converted_subfields;
 588             foreach my $subfield ($field->subfields()) {
 589                 my $converted_value;
 590                 my $conversion_ok = 1;
 591                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 592                 if ($@) {
 593                     $conversion_ok = 0;
 594                     push @errors, $@;
 595                 } elsif (not defined $converted_value) {
 596                     $conversion_ok = 0;
 597                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 598                 }
 599
 600                 if ($conversion_ok) {
 601                     push @converted_subfields, $subfield->[0], $converted_value;
 602                 } else {
 603                     $converted_value = $subfield->[1];
 604                                 $converted_value=NFC($converted_value);
 605                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 606                     push @converted_subfields, $subfield->[0], $converted_value;
 607                 }
 608             }
 609
 610             $field->replace_with(MARC::Field->new(
 611                 $field->tag(), $field->indicator(1), $field->indicator(2),
 612                 @converted_subfields)
 613             );
 614         }
 615     }
 616
 617     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 618     Text::Iconv->raise_error($prev_raise_error);
 619
 620     return @errors;
 621 }
 622
 623 =head2 _marc_to_utf8_replacement_char
 624
 625 =over 4
 626
 627 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 628
 629 =back
 630
 631 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 632 unsatisfactory method of replacing all non-ASCII (e.g.,
 633 where the eight bit is set) octet with the Unicode
 634 replacement character.  This is meant as a last-ditch
 635 method, and would be best used as part of a UI that
 636 lets a cataloguer pick various character conversions
 637 until he or she finds the right one.
 638
 639 =cut
 640
 641 sub _marc_to_utf8_replacement_char {
 642     my $marc_record = shift;
 643     my $marc_flavour = shift;
 644
 645     foreach my $field ($marc_record->fields()) {
 646         if ($field->is_control_field()) {
 647             ; # do nothing -- control fields should not contain non-ASCII characters
 648         } else {
 649             my @converted_subfields;
 650             foreach my $subfield ($field->subfields()) {
 651                 my $value = $subfield->[1];
 652                                 $value=NFC($value);
 653                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 654                 push @converted_subfields, $subfield->[0], $value;
 655             }
 656
 657             $field->replace_with(MARC::Field->new(
 658                 $field->tag(), $field->indicator(1), $field->indicator(2),
 659                 @converted_subfields)
 660             );
 661         }
 662     }
 663
 664     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 665 }
 666
 667 =head2 char_decode5426
 668
 669 =over 4
 670
 671 my $utf8string = char_decode5426($iso_5426_string);
 672
 673 =back
 674
 675 Converts a string from ISO-5426 to UTF-8.
 676
 677 =cut
 678
 679
 680 my %chars;
 681 $chars{0xb0}=0x0101;#3/0ayn[ain]
 682 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 683 #$chars{0xb2}=0x00e0;#'à';
 684 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 685 #$chars{0xb3}=0x00e7;#'ç';
 686 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 687 # $chars{0xb4}='è';
 688 $chars{0xb4}=0x00e8;
 689 # $chars{0xb5}='é';
 690 $chars{0xb5}=0x00e9;
 691 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 692 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 693 $chars{0xfa}=0x0153;#oe
 694 $chars{0x81d1}=0x00b0;
 695
 696 ####
 697 ## combined characters iso5426
 698
 699 $chars{0xc041}=0x1ea2; # capital a with hook above
 700 $chars{0xc045}=0x1eba; # capital e with hook above
 701 $chars{0xc049}=0x1ec8; # capital i with hook above
 702 $chars{0xc04f}=0x1ece; # capital o with hook above
 703 $chars{0xc055}=0x1ee6; # capital u with hook above
 704 $chars{0xc059}=0x1ef6; # capital y with hook above
 705 $chars{0xc061}=0x1ea3; # small a with hook above
 706 $chars{0xc065}=0x1ebb; # small e with hook above
 707 $chars{0xc069}=0x1ec9; # small i with hook above
 708 $chars{0xc06f}=0x1ecf; # small o with hook above
 709 $chars{0xc075}=0x1ee7; # small u with hook above
 710 $chars{0xc079}=0x1ef7; # small y with hook above
 711
 712         # 4/1 grave accent
 713 $chars{0xc141}=0x00c0; # capital a with grave accent
 714 $chars{0xc145}=0x00c8; # capital e with grave accent
 715 $chars{0xc149}=0x00cc; # capital i with grave accent
 716 $chars{0xc14f}=0x00d2; # capital o with grave accent
 717 $chars{0xc155}=0x00d9; # capital u with grave accent
 718 $chars{0xc157}=0x1e80; # capital w with grave
 719 $chars{0xc159}=0x1ef2; # capital y with grave
 720 $chars{0xc161}=0x00e0; # small a with grave accent
 721 $chars{0xc165}=0x00e8; # small e with grave accent
 722 $chars{0xc169}=0x00ec; # small i with grave accent
 723 $chars{0xc16f}=0x00f2; # small o with grave accent
 724 $chars{0xc175}=0x00f9; # small u with grave accent
 725 $chars{0xc177}=0x1e81; # small w with grave
 726 $chars{0xc179}=0x1ef3; # small y with grave
 727         # 4/2 acute accent
 728 $chars{0xc241}=0x00c1; # capital a with acute accent
 729 $chars{0xc243}=0x0106; # capital c with acute accent
 730 $chars{0xc245}=0x00c9; # capital e with acute accent
 731 $chars{0xc247}=0x01f4; # capital g with acute
 732 $chars{0xc249}=0x00cd; # capital i with acute accent
 733 $chars{0xc24b}=0x1e30; # capital k with acute
 734 $chars{0xc24c}=0x0139; # capital l with acute accent
 735 $chars{0xc24d}=0x1e3e; # capital m with acute
 736 $chars{0xc24e}=0x0143; # capital n with acute accent
 737 $chars{0xc24f}=0x00d3; # capital o with acute accent
 738 $chars{0xc250}=0x1e54; # capital p with acute
 739 $chars{0xc252}=0x0154; # capital r with acute accent
 740 $chars{0xc253}=0x015a; # capital s with acute accent
 741 $chars{0xc255}=0x00da; # capital u with acute accent
 742 $chars{0xc257}=0x1e82; # capital w with acute
 743 $chars{0xc259}=0x00dd; # capital y with acute accent
 744 $chars{0xc25a}=0x0179; # capital z with acute accent
 745 $chars{0xc261}=0x00e1; # small a with acute accent
 746 $chars{0xc263}=0x0107; # small c with acute accent
 747 $chars{0xc265}=0x00e9; # small e with acute accent
 748 $chars{0xc267}=0x01f5; # small g with acute
 749 $chars{0xc269}=0x00ed; # small i with acute accent
 750 $chars{0xc26b}=0x1e31; # small k with acute
 751 $chars{0xc26c}=0x013a; # small l with acute accent
 752 $chars{0xc26d}=0x1e3f; # small m with acute
 753 $chars{0xc26e}=0x0144; # small n with acute accent
 754 $chars{0xc26f}=0x00f3; # small o with acute accent
 755 $chars{0xc270}=0x1e55; # small p with acute
 756 $chars{0xc272}=0x0155; # small r with acute accent
 757 $chars{0xc273}=0x015b; # small s with acute accent
 758 $chars{0xc275}=0x00fa; # small u with acute accent
 759 $chars{0xc277}=0x1e83; # small w with acute
 760 $chars{0xc279}=0x00fd; # small y with acute accent
 761 $chars{0xc27a}=0x017a; # small z with acute accent
 762 $chars{0xc2e1}=0x01fc; # capital ae with acute
 763 $chars{0xc2f1}=0x01fd; # small ae with acute
 764        # 4/3 circumflex accent
 765 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 766 $chars{0xc343}=0x0108; # capital c with circumflex
 767 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 768 $chars{0xc347}=0x011c; # capital g with circumflex
 769 $chars{0xc348}=0x0124; # capital h with circumflex
 770 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 771 $chars{0xc34a}=0x0134; # capital j with circumflex
 772 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 773 $chars{0xc353}=0x015c; # capital s with circumflex
 774 $chars{0xc355}=0x00db; # capital u with circumflex
 775 $chars{0xc357}=0x0174; # capital w with circumflex
 776 $chars{0xc359}=0x0176; # capital y with circumflex
 777 $chars{0xc35a}=0x1e90; # capital z with circumflex
 778 $chars{0xc361}=0x00e2; # small a with circumflex accent
 779 $chars{0xc363}=0x0109; # small c with circumflex
 780 $chars{0xc365}=0x00ea; # small e with circumflex accent
 781 $chars{0xc367}=0x011d; # small g with circumflex
 782 $chars{0xc368}=0x0125; # small h with circumflex
 783 $chars{0xc369}=0x00ee; # small i with circumflex accent
 784 $chars{0xc36a}=0x0135; # small j with circumflex
 785 $chars{0xc36e}=0x00f1; # small n with tilde
 786 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 787 $chars{0xc373}=0x015d; # small s with circumflex
 788 $chars{0xc375}=0x00fb; # small u with circumflex
 789 $chars{0xc377}=0x0175; # small w with circumflex
 790 $chars{0xc379}=0x0177; # small y with circumflex
 791 $chars{0xc37a}=0x1e91; # small z with circumflex
 792         # 4/4 tilde
 793 $chars{0xc441}=0x00c3; # capital a with tilde
 794 $chars{0xc445}=0x1ebc; # capital e with tilde
 795 $chars{0xc449}=0x0128; # capital i with tilde
 796 $chars{0xc44e}=0x00d1; # capital n with tilde
 797 $chars{0xc44f}=0x00d5; # capital o with tilde
 798 $chars{0xc455}=0x0168; # capital u with tilde
 799 $chars{0xc456}=0x1e7c; # capital v with tilde
 800 $chars{0xc459}=0x1ef8; # capital y with tilde
 801 $chars{0xc461}=0x00e3; # small a with tilde
 802 $chars{0xc465}=0x1ebd; # small e with tilde
 803 $chars{0xc469}=0x0129; # small i with tilde
 804 $chars{0xc46e}=0x00f1; # small n with tilde
 805 $chars{0xc46f}=0x00f5; # small o with tilde
 806 $chars{0xc475}=0x0169; # small u with tilde
 807 $chars{0xc476}=0x1e7d; # small v with tilde
 808 $chars{0xc479}=0x1ef9; # small y with tilde
 809     # 4/5 macron
 810 $chars{0xc541}=0x0100; # capital a with macron
 811 $chars{0xc545}=0x0112; # capital e with macron
 812 $chars{0xc547}=0x1e20; # capital g with macron
 813 $chars{0xc549}=0x012a; # capital i with macron
 814 $chars{0xc54f}=0x014c; # capital o with macron
 815 $chars{0xc555}=0x016a; # capital u with macron
 816 $chars{0xc561}=0x0101; # small a with macron
 817 $chars{0xc565}=0x0113; # small e with macron
 818 $chars{0xc567}=0x1e21; # small g with macron
 819 $chars{0xc569}=0x012b; # small i with macron
 820 $chars{0xc56f}=0x014d; # small o with macron
 821 $chars{0xc575}=0x016b; # small u with macron
 822 $chars{0xc572}=0x0159; # small r with macron
 823 $chars{0xc5e1}=0x01e2; # capital ae with macron
 824 $chars{0xc5f1}=0x01e3; # small ae with macron
 825         # 4/6 breve
 826 $chars{0xc641}=0x0102; # capital a with breve
 827 $chars{0xc645}=0x0114; # capital e with breve
 828 $chars{0xc647}=0x011e; # capital g with breve
 829 $chars{0xc649}=0x012c; # capital i with breve
 830 $chars{0xc64f}=0x014e; # capital o with breve
 831 $chars{0xc655}=0x016c; # capital u with breve
 832 $chars{0xc661}=0x0103; # small a with breve
 833 $chars{0xc665}=0x0115; # small e with breve
 834 $chars{0xc667}=0x011f; # small g with breve
 835 $chars{0xc669}=0x012d; # small i with breve
 836 $chars{0xc66f}=0x014f; # small o with breve
 837 $chars{0xc675}=0x016d; # small u with breve
 838         # 4/7 dot above
 839 $chars{0xc7b0}=0x01e1; # Ain with dot above
 840 $chars{0xc742}=0x1e02; # capital b with dot above
 841 $chars{0xc743}=0x010a; # capital c with dot above
 842 $chars{0xc744}=0x1e0a; # capital d with dot above
 843 $chars{0xc745}=0x0116; # capital e with dot above
 844 $chars{0xc746}=0x1e1e; # capital f with dot above
 845 $chars{0xc747}=0x0120; # capital g with dot above
 846 $chars{0xc748}=0x1e22; # capital h with dot above
 847 $chars{0xc749}=0x0130; # capital i with dot above
 848 $chars{0xc74d}=0x1e40; # capital m with dot above
 849 $chars{0xc74e}=0x1e44; # capital n with dot above
 850 $chars{0xc750}=0x1e56; # capital p with dot above
 851 $chars{0xc752}=0x1e58; # capital r with dot above
 852 $chars{0xc753}=0x1e60; # capital s with dot above
 853 $chars{0xc754}=0x1e6a; # capital t with dot above
 854 $chars{0xc757}=0x1e86; # capital w with dot above
 855 $chars{0xc758}=0x1e8a; # capital x with dot above
 856 $chars{0xc759}=0x1e8e; # capital y with dot above
 857 $chars{0xc75a}=0x017b; # capital z with dot above
 858 $chars{0xc761}=0x0227; # small b with dot above
 859 $chars{0xc762}=0x1e03; # small b with dot above
 860 $chars{0xc763}=0x010b; # small c with dot above
 861 $chars{0xc764}=0x1e0b; # small d with dot above
 862 $chars{0xc765}=0x0117; # small e with dot above
 863 $chars{0xc766}=0x1e1f; # small f with dot above
 864 $chars{0xc767}=0x0121; # small g with dot above
 865 $chars{0xc768}=0x1e23; # small h with dot above
 866 $chars{0xc76d}=0x1e41; # small m with dot above
 867 $chars{0xc76e}=0x1e45; # small n with dot above
 868 $chars{0xc770}=0x1e57; # small p with dot above
 869 $chars{0xc772}=0x1e59; # small r with dot above
 870 $chars{0xc773}=0x1e61; # small s with dot above
 871 $chars{0xc774}=0x1e6b; # small t with dot above
 872 $chars{0xc777}=0x1e87; # small w with dot above
 873 $chars{0xc778}=0x1e8b; # small x with dot above
 874 $chars{0xc779}=0x1e8f; # small y with dot above
 875 $chars{0xc77a}=0x017c; # small z with dot above
 876         # 4/8 trema, diaresis
 877 $chars{0xc820}=0x00a8; # diaeresis
 878 $chars{0xc841}=0x00c4; # capital a with diaeresis
 879 $chars{0xc845}=0x00cb; # capital e with diaeresis
 880 $chars{0xc848}=0x1e26; # capital h with diaeresis
 881 $chars{0xc849}=0x00cf; # capital i with diaeresis
 882 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 883 $chars{0xc855}=0x00dc; # capital u with diaeresis
 884 $chars{0xc857}=0x1e84; # capital w with diaeresis
 885 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 886 $chars{0xc859}=0x0178; # capital y with diaeresis
 887 $chars{0xc861}=0x00e4; # small a with diaeresis
 888 $chars{0xc865}=0x00eb; # small e with diaeresis
 889 $chars{0xc868}=0x1e27; # small h with diaeresis
 890 $chars{0xc869}=0x00ef; # small i with diaeresis
 891 $chars{0xc86f}=0x00f6; # small o with diaeresis
 892 $chars{0xc874}=0x1e97; # small t with diaeresis
 893 $chars{0xc875}=0x00fc; # small u with diaeresis
 894 $chars{0xc877}=0x1e85; # small w with diaeresis
 895 $chars{0xc878}=0x1e8d; # small x with diaeresis
 896 $chars{0xc879}=0x00ff; # small y with diaeresis
 897         # 4/9 umlaut
 898 $chars{0xc920}=0x00a8; # [diaeresis]
 899 $chars{0xc961}=0x00e4; # a with umlaut
 900 $chars{0xc965}=0x00eb; # e with umlaut
 901 $chars{0xc969}=0x00ef; # i with umlaut
 902 $chars{0xc96f}=0x00f6; # o with umlaut
 903 $chars{0xc975}=0x00fc; # u with umlaut
 904         # 4/10 circle above
 905 $chars{0xca41}=0x00c5; # capital a with ring above
 906 $chars{0xcaad}=0x016e; # capital u with ring above
 907 $chars{0xca61}=0x00e5; # small a with ring above
 908 $chars{0xca75}=0x016f; # small u with ring above
 909 $chars{0xca77}=0x1e98; # small w with ring above
 910 $chars{0xca79}=0x1e99; # small y with ring above
 911         # 4/11 high comma off centre
 912         # 4/12 inverted high comma centred
 913         # 4/13 double acute accent
 914 $chars{0xcd4f}=0x0150; # capital o with double acute
 915 $chars{0xcd55}=0x0170; # capital u with double acute
 916 $chars{0xcd6f}=0x0151; # small o with double acute
 917 $chars{0xcd75}=0x0171; # small u with double acute
 918         # 4/14 horn
 919 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 920 $chars{0xce55}=0x01af; # latin capital letter u with horn
 921 $chars{0xce74}=0x01a1; # latin small letter o with horn
 922 $chars{0xce75}=0x01b0; # latin small letter u with horn
 923         # 4/15 caron (hacek
 924 $chars{0xcf41}=0x01cd; # capital a with caron
 925 $chars{0xcf43}=0x010c; # capital c with caron
 926 $chars{0xcf44}=0x010e; # capital d with caron
 927 $chars{0xcf45}=0x011a; # capital e with caron
 928 $chars{0xcf47}=0x01e6; # capital g with caron
 929 $chars{0xcf49}=0x01cf; # capital i with caron
 930 $chars{0xcf4b}=0x01e8; # capital k with caron
 931 $chars{0xcf4c}=0x013d; # capital l with caron
 932 $chars{0xcf4e}=0x0147; # capital n with caron
 933 $chars{0xcf4f}=0x01d1; # capital o with caron
 934 $chars{0xcf52}=0x0158; # capital r with caron
 935 $chars{0xcf53}=0x0160; # capital s with caron
 936 $chars{0xcf54}=0x0164; # capital t with caron
 937 $chars{0xcf55}=0x01d3; # capital u with caron
 938 $chars{0xcf5a}=0x017d; # capital z with caron
 939 $chars{0xcf61}=0x01ce; # small a with caron
 940 $chars{0xcf63}=0x010d; # small c with caron
 941 $chars{0xcf64}=0x010f; # small d with caron
 942 $chars{0xcf65}=0x011b; # small e with caron
 943 $chars{0xcf67}=0x01e7; # small g with caron
 944 $chars{0xcf69}=0x01d0; # small i with caron
 945 $chars{0xcf6a}=0x01f0; # small j with caron
 946 $chars{0xcf6b}=0x01e9; # small k with caron
 947 $chars{0xcf6c}=0x013e; # small l with caron
 948 $chars{0xcf6e}=0x0148; # small n with caron
 949 $chars{0xcf6f}=0x01d2; # small o with caron
 950 $chars{0xcf72}=0x0159; # small r with caron
 951 $chars{0xcf73}=0x0161; # small s with caron
 952 $chars{0xcf74}=0x0165; # small t with caron
 953 $chars{0xcf75}=0x01d4; # small u with caron
 954 $chars{0xcf7a}=0x017e; # small z with caron
 955         # 5/0 cedilla
 956 $chars{0xd020}=0x00b8; # cedilla
 957 $chars{0xd043}=0x00c7; # capital c with cedilla
 958 $chars{0xd044}=0x1e10; # capital d with cedilla
 959 $chars{0xd047}=0x0122; # capital g with cedilla
 960 $chars{0xd048}=0x1e28; # capital h with cedilla
 961 $chars{0xd04b}=0x0136; # capital k with cedilla
 962 $chars{0xd04c}=0x013b; # capital l with cedilla
 963 $chars{0xd04e}=0x0145; # capital n with cedilla
 964 $chars{0xd052}=0x0156; # capital r with cedilla
 965 $chars{0xd053}=0x015e; # capital s with cedilla
 966 $chars{0xd054}=0x0162; # capital t with cedilla
 967 $chars{0xd063}=0x00e7; # small c with cedilla
 968 $chars{0xd064}=0x1e11; # small d with cedilla
 969 $chars{0xd065}=0x0119; # small e with cedilla
 970 $chars{0xd067}=0x0123; # small g with cedilla
 971 $chars{0xd068}=0x1e29; # small h with cedilla
 972 $chars{0xd06b}=0x0137; # small k with cedilla
 973 $chars{0xd06c}=0x013c; # small l with cedilla
 974 $chars{0xd06e}=0x0146; # small n with cedilla
 975 $chars{0xd072}=0x0157; # small r with cedilla
 976 $chars{0xd073}=0x015f; # small s with cedilla
 977 $chars{0xd074}=0x0163; # small t with cedilla
 978         # 5/1 rude
 979         # 5/2 hook to left
 980         # 5/3 ogonek (hook to right
 981 $chars{0xd320}=0x02db; # ogonek
 982 $chars{0xd341}=0x0104; # capital a with ogonek
 983 $chars{0xd345}=0x0118; # capital e with ogonek
 984 $chars{0xd349}=0x012e; # capital i with ogonek
 985 $chars{0xd34f}=0x01ea; # capital o with ogonek
 986 $chars{0xd355}=0x0172; # capital u with ogonek
 987 $chars{0xd361}=0x0105; # small a with ogonek
 988 $chars{0xd365}=0x0119; # small e with ogonek
 989 $chars{0xd369}=0x012f; # small i with ogonek
 990 $chars{0xd36f}=0x01eb; # small o with ogonek
 991 $chars{0xd375}=0x0173; # small u with ogonek
 992         # 5/4 circle below
 993 $chars{0xd441}=0x1e00; # capital a with ring below
 994 $chars{0xd461}=0x1e01; # small a with ring below
 995         # 5/5 half circle below
 996 $chars{0xf948}=0x1e2a; # capital h with breve below
 997 $chars{0xf968}=0x1e2b; # small h with breve below
 998         # 5/6 dot below
 999 $chars{0xd641}=0x1ea0; # capital a with dot below
1000 $chars{0xd642}=0x1e04; # capital b with dot below
1001 $chars{0xd644}=0x1e0c; # capital d with dot below
1002 $chars{0xd645}=0x1eb8; # capital e with dot below
1003 $chars{0xd648}=0x1e24; # capital h with dot below
1004 $chars{0xd649}=0x1eca; # capital i with dot below
1005 $chars{0xd64b}=0x1e32; # capital k with dot below
1006 $chars{0xd64c}=0x1e36; # capital l with dot below
1007 $chars{0xd64d}=0x1e42; # capital m with dot below
1008 $chars{0xd64e}=0x1e46; # capital n with dot below
1009 $chars{0xd64f}=0x1ecc; # capital o with dot below
1010 $chars{0xd652}=0x1e5a; # capital r with dot below
1011 $chars{0xd653}=0x1e62; # capital s with dot below
1012 $chars{0xd654}=0x1e6c; # capital t with dot below
1013 $chars{0xd655}=0x1ee4; # capital u with dot below
1014 $chars{0xd656}=0x1e7e; # capital v with dot below
1015 $chars{0xd657}=0x1e88; # capital w with dot below
1016 $chars{0xd659}=0x1ef4; # capital y with dot below
1017 $chars{0xd65a}=0x1e92; # capital z with dot below
1018 $chars{0xd661}=0x1ea1; # small a with dot below
1019 $chars{0xd662}=0x1e05; # small b with dot below
1020 $chars{0xd664}=0x1e0d; # small d with dot below
1021 $chars{0xd665}=0x1eb9; # small e with dot below
1022 $chars{0xd668}=0x1e25; # small h with dot below
1023 $chars{0xd669}=0x1ecb; # small i with dot below
1024 $chars{0xd66b}=0x1e33; # small k with dot below
1025 $chars{0xd66c}=0x1e37; # small l with dot below
1026 $chars{0xd66d}=0x1e43; # small m with dot below
1027 $chars{0xd66e}=0x1e47; # small n with dot below
1028 $chars{0xd66f}=0x1ecd; # small o with dot below
1029 $chars{0xd672}=0x1e5b; # small r with dot below
1030 $chars{0xd673}=0x1e63; # small s with dot below
1031 $chars{0xd674}=0x1e6d; # small t with dot below
1032 $chars{0xd675}=0x1ee5; # small u with dot below
1033 $chars{0xd676}=0x1e7f; # small v with dot below
1034 $chars{0xd677}=0x1e89; # small w with dot below
1035 $chars{0xd679}=0x1ef5; # small y with dot below
1036 $chars{0xd67a}=0x1e93; # small z with dot below
1037         # 5/7 double dot below
1038 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1039 $chars{0xd775}=0x1e73; # small u with diaeresis below
1040         # 5/8 underline
1041 $chars{0xd820}=0x005f; # underline
1042         # 5/9 double underline
1043 $chars{0xd920}=0x2017; # double underline
1044         # 5/10 small low vertical bar
1045 $chars{0xda20}=0x02cc; #
1046         # 5/11 circumflex below
1047         # 5/12 (this position shall not be used)
1048         # 5/13 left half of ligature sign and of double tilde
1049         # 5/14 right half of ligature sign
1050         # 5/15 right half of double tilde
1051 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1052
1053 sub char_decode5426 {
1054     my ( $string) = @_;
1055     my $result;
1056
1057     my @data = unpack("C*", $string);
1058     my @characters;
1059     my $length=scalar(@data);
1060     for (my $i = 0; $i < scalar(@data); $i++) {
1061       my $char= $data[$i];
1062       if ($char >= 0x00 && $char <= 0x7F){
1063         #IsAscii
1064
1065           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1066       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1067         #Combined Char
1068         my $convchar ;
1069         if ($chars{$char*256+$data[$i+1]}) {
1070           $convchar= $chars{$char * 256 + $data[$i+1]};
1071           $i++;
1072 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1073         } elsif ($chars{$char})  {
1074           $convchar= $chars{$char};
1075 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1076         }else {
1077           $convchar=$char;
1078         }
1079         push @characters,$convchar;
1080       } else {
1081         my $convchar;
1082         if ($chars{$char})  {
1083           $convchar= $chars{$char};
1084 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1085         }else {
1086 #            printf "char %x $char\n",$char;
1087           $convchar=$char;
1088         }
1089         push @characters,$convchar;
1090       }
1091     }
1092     $result=pack "U*",@characters;
1093 #     $result=~s/\x01//;
1094 #     $result=~s/\x00//;
1095      $result=~s/\x0f//;
1096      $result=~s/\x1b.//;
1097      $result=~s/\x0e//;
1098      $result=~s/\x1b\x5b//;
1099 #   map{printf "%x",$_} @characters;
1100 #   printf "\n";
1101   return $result;
1102 }
1103
1104 1;
1105
1106
1107 =head1 AUTHOR
1108
1109 Koha Development Team <info@koha.org>
1110
1111 Galen Charlton <galen.charlton@liblime.com>
1112
1113 =cut