3 # Copyright (C) 2008 LibLime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it under the
8 # terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along with
17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
18 # Suite 330, Boston, MA 02111-1307 USA
23 use MARC::Charset qw/marc8_to_utf8/;
25 use Unicode::Normalize;
27 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
30 # set the version for version checking
45 C4::Charset - utilities for handling character set conversions.
53 This module contains routines for dealing with character set
54 conversions, particularly for MARC records.
56 A variety of character encodings are in use by various MARC
57 standards, and even more character encodings are used by
58 non-standard MARC records. The various MARC formats generally
59 do not do a good job of advertising a given record's character
60 encoding, and even when a record does advertise its encoding,
61 e.g., via the Leader/09, experience has shown that one cannot
64 Ultimately, all MARC records are stored in Koha in UTF-8 and
65 must be converted from whatever the source character encoding is.
66 The goal of this module is to ensure that these conversions
67 take place accurately. When a character conversion cannot take
68 place, or at least not accurately, the module was provide
69 enough information to allow user-facing code to inform the user
70 on how to deal with the situation.
76 =head2 IsStringUTF8ish
80 my $is_utf8 = IsStringUTF8ish($str);
84 Determines if C<$str> is valid UTF-8. This can mean
91 The Perl UTF-8 flag is set and the string contains valid UTF-8.
95 The Perl UTF-8 flag is B<not> set, but the octets contain
100 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
101 because in one could be presented with a MARC blob that is
102 not actually in UTF-8 but whose sequence of octets appears to be
103 valid UTF-8. The rest of the MARC character conversion functions
104 will assume that this situation occur does not very often.
108 sub IsStringUTF8ish {
111 return 1 if utf8::is_utf8($str);
112 return utf8::decode($str);
116 =head2 Normalize_String
120 my $$string_normalized = Normalize_String($string);
124 Returns normalized string C<$string> in C Form
129 sub Normalize_String {
131 if (IsStringUTF8ish($string)){
139 =head2 MarcToUTF8Record
143 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
147 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
148 optional source encoding, return a C<MARC::Record> that is
151 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
152 is not guaranteed to have been converted correctly. Specifically,
153 if C<$converted_from> is 'failed', the MARC record returned failed
154 character conversion and had each of its non-ASCII octets changed
155 to the Unicode replacement character.
157 If the source encoding was not specified, this routine will
158 try to guess it; the character encoding used for a successful
159 conversion is returned in C<$converted_from>.
163 sub MarcToUTF8Record {
165 my $marc_flavour = shift;
166 my $source_encoding = shift;
169 my $marc_blob_is_utf8 = 0;
170 if (ref($marc) eq 'MARC::Record') {
171 my $marc_blob = $marc->as_usmarc();
172 $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
173 $marc_record = $marc;
175 # dealing with a MARC blob
177 # remove any ersatz whitespace from the beginning and
178 # end of the MARC blob -- these can creep into MARC
179 # files produced by several sources -- caller really
180 # should be doing this, however
183 $marc_blob_is_utf8 = IsStringUTF8ish($marc);
185 $marc_record = MARC::Record->new_from_usmarc($marc);
188 # if we fail the first time, one likely problem
189 # is that we have a MARC21 record that says that it's
190 # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
191 # We'll try parsing it again.
192 substr($marc, 9, 1) = ' ';
194 $marc_record = MARC::Record->new_from_usmarc($marc);
197 # it's hopeless; return an empty MARC::Record
198 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
203 # If we do not know the source encoding, try some guesses
205 # 1. Record is UTF-8 already.
206 # 2. If MARC flavor is MARC21, then
207 # a. record is MARC-8
208 # b. record is ISO-8859-1
209 # 3. If MARC flavor is UNIMARC, then
210 if (not defined $source_encoding) {
211 if ($marc_blob_is_utf8) {
212 # note that for MARC21 we are not bothering to check
213 # if the Leader/09 is set to 'a' or not -- because
214 # of problems with various ILSs (including Koha in the
215 # past, alas), this just is not trustworthy.
216 SetMarcUnicodeFlag($marc_record, $marc_flavour);
217 return $marc_record, 'UTF-8', [];
219 if ($marc_flavour eq 'MARC21') {
220 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
221 } elsif ($marc_flavour eq 'UNIMARC') {
222 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
224 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
228 # caller knows the character encoding
229 my $original_marc_record = $marc_record->clone();
231 if ($source_encoding =~ /utf-?8/i) {
232 if ($marc_blob_is_utf8) {
233 SetMarcUnicodeFlag($marc_record, $marc_flavour);
234 return $marc_record, 'UTF-8', [];
236 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
238 } elsif ($source_encoding =~ /marc-?8/i) {
239 @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
240 } elsif ($source_encoding =~ /5426/) {
241 @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
243 # assume any other character encoding is for Text::Iconv
244 @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
248 _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
249 return $original_marc_record, 'failed', \@errors;
251 return $marc_record, $source_encoding, [];
257 =head2 SetMarcUnicodeFlag
261 SetMarcUnicodeFlag($marc_record, $marc_flavour);
265 Set both the internal MARC::Record encoding flag
266 and the appropriate Leader/09 (MARC21) or
267 100/26-29 (UNIMARC) to indicate that the record
268 is in UTF-8. Note that this does B<not> do
269 any actual character conversion.
273 sub SetMarcUnicodeFlag {
274 my $marc_record = shift;
275 my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
277 $marc_record->encoding('UTF-8');
278 if ($marc_flavour eq 'MARC21') {
279 my $leader = $marc_record->leader();
280 substr($leader, 9, 1) = 'a';
281 $marc_record->leader($leader);
282 } elsif ($marc_flavour eq "UNIMARC") {
283 if (my $field = $marc_record->field('100')) {
284 my $sfa = $field->subfield('a');
287 # fix the length of the field
288 $sfa = substr $sfa, 0, $subflength if (length($sfa) > $subflength);
289 $sfa = sprintf( "%-*s", 35, $sfa ) if (length($sfa) < $subflength);
291 substr($sfa, 26, 4) = '50 ';
292 $field->update('a' => $sfa);
295 warn "Unrecognized marcflavour: $marc_flavour";
299 =head2 StripNonXmlChars
303 my $new_str = StripNonXmlChars($old_str);
307 Given a string, return a copy with the
308 characters that are illegal in XML
311 This function exists to work around a problem
312 that can occur with badly-encoded MARC records.
313 Specifically, if a UTF-8 MARC record also
314 has excape (\x1b) characters, MARC::File::XML
315 will let the escape characters pass through
316 when as_xml() or as_xml_record() is called. The
317 problem is that the escape character is not
318 legal in well-formed XML documents, so when
319 MARC::File::XML attempts to parse such a record,
320 the XML parser will fail.
322 Stripping such characters will allow a
323 MARC::Record->new_from_xml()
324 to work, at the possible risk of some data loss.
328 sub StripNonXmlChars {
330 $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
334 =head1 INTERNAL FUNCTIONS
336 =head2 _default_marc21_charconv_to_utf8
340 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
344 Converts a C<MARC::Record> of unknown character set to UTF-8,
345 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
346 to UTF-8, then a default conversion that replaces each non-ASCII
347 character with the replacement character.
349 The C<$guessed_charset> return value contains the character set
350 that resulted in a conversion to valid UTF-8; note that
351 if the MARC-8 and ISO-8859-1 conversions failed, the value of
356 sub _default_marc21_charconv_to_utf8 {
357 my $marc_record = shift;
358 my $marc_flavour = shift;
360 my $trial_marc8 = $marc_record->clone();
362 my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
364 return $trial_marc8, 'MARC-8', [];
366 push @all_errors, @errors;
368 my $trial_8859_1 = $marc_record->clone();
369 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
371 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
372 # instead if we wanted to report details
373 # of the failed attempt at MARC-8 => UTF-8
375 push @all_errors, @errors;
377 my $default_converted = $marc_record->clone();
378 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
379 return $default_converted, 'failed', \@all_errors;
382 =head2 _default_unimarc_charconv_to_utf8
386 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
390 Converts a C<MARC::Record> of unknown character set to UTF-8,
391 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
392 to UTF-8, then a default conversion that replaces each non-ASCII
393 character with the replacement character.
395 The C<$guessed_charset> return value contains the character set
396 that resulted in a conversion to valid UTF-8; note that
397 if the MARC-8 and ISO-8859-1 conversions failed, the value of
402 sub _default_unimarc_charconv_to_utf8 {
403 my $marc_record = shift;
404 my $marc_flavour = shift;
406 my $trial_marc8 = $marc_record->clone();
408 my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
410 return $trial_marc8, 'iso-5426';
412 push @all_errors, @errors;
414 my $trial_8859_1 = $marc_record->clone();
415 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
417 return $trial_8859_1, 'iso-8859-1';
419 push @all_errors, @errors;
421 my $default_converted = $marc_record->clone();
422 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
423 return $default_converted, 'failed', \@all_errors;
426 =head2 _marc_marc8_to_utf8
430 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
434 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
435 If the conversion fails for some reason, an
436 appropriate messages will be placed in the returned
441 sub _marc_marc8_to_utf8 {
442 my $marc_record = shift;
443 my $marc_flavour = shift;
445 my $prev_ignore = MARC::Charset->ignore_errors();
446 MARC::Charset->ignore_errors(1);
448 # trap warnings raised by MARC::Charset
450 local $SIG{__WARN__} = sub {
452 if ($msg =~ /MARC.Charset/) {
453 # FIXME - purpose of this regexp is to strip out the
454 # line reference to MARC/Charset.pm, but as it
455 # exists probably won't work quite on Windows --
456 # some sort of minimal-bunch back-tracking RE
457 # would be helpful here
458 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
461 # if warning doesn't come from MARC::Charset, just
467 foreach my $field ($marc_record->fields()) {
468 if ($field->is_control_field()) {
469 ; # do nothing -- control fields should not contain non-ASCII characters
471 my @converted_subfields;
472 foreach my $subfield ($field->subfields()) {
473 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
474 unless (IsStringUTF8ish($utf8sf)) {
475 # Because of a bug in MARC::Charset 0.98, if the string
476 # has (a) one or more diacritics that (b) are only in character positions
477 # 128 to 255 inclusive, the resulting converted string is not in
478 # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
479 # occurs, upgrade the string in place. Moral of the story seems to be
480 # that pack("U", ...) is better than chr(...) if you need to guarantee
481 # that the resulting string is UTF-8.
482 utf8::upgrade($utf8sf);
484 $utf8sf=NFC($utf8sf);
485 push @converted_subfields, $subfield->[0], $utf8sf;
488 $field->replace_with(MARC::Field->new(
489 $field->tag(), $field->indicator(1), $field->indicator(2),
490 @converted_subfields)
495 MARC::Charset->ignore_errors($prev_ignore);
497 SetMarcUnicodeFlag($marc_record, $marc_flavour);
502 =head2 _marc_iso5426_to_utf8
506 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
510 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
511 If the conversion fails for some reason, an
512 appropriate messages will be placed in the returned
515 FIXME - is ISO-5426 equivalent enough to MARC-8
516 that C<MARC::Charset> can be used instead?
520 sub _marc_iso5426_to_utf8 {
521 my $marc_record = shift;
522 my $marc_flavour = shift;
526 foreach my $field ($marc_record->fields()) {
527 if ($field->is_control_field()) {
528 ; # do nothing -- control fields should not contain non-ASCII characters
530 my @converted_subfields;
531 foreach my $subfield ($field->subfields()) {
532 my $utf8sf = char_decode5426($subfield->[1]);
533 $utf8sf=NFC($utf8sf);
534 push @converted_subfields, $subfield->[0], $utf8sf;
537 $field->replace_with(MARC::Field->new(
538 $field->tag(), $field->indicator(1), $field->indicator(2),
539 @converted_subfields)
544 SetMarcUnicodeFlag($marc_record, $marc_flavour);
549 =head2 _marc_to_utf8_via_text_iconv
553 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
557 Convert a C<MARC::Record> to UTF-8 in-place using the
558 C<Text::Iconv> CPAN module. Any source encoding accepted
559 by the user's iconv installation should work. If
560 the source encoding is not recognized on the user's
561 server or the conversion fails for some reason,
562 appropriate messages will be placed in the returned
567 sub _marc_to_utf8_via_text_iconv {
568 my $marc_record = shift;
569 my $marc_flavour = shift;
570 my $source_encoding = shift;
574 eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
576 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
580 my $prev_raise_error = Text::Iconv->raise_error();
581 Text::Iconv->raise_error(1);
583 foreach my $field ($marc_record->fields()) {
584 if ($field->is_control_field()) {
585 ; # do nothing -- control fields should not contain non-ASCII characters
587 my @converted_subfields;
588 foreach my $subfield ($field->subfields()) {
590 my $conversion_ok = 1;
591 eval { $converted_value = $decoder->convert($subfield->[1]); };
595 } elsif (not defined $converted_value) {
597 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
600 if ($conversion_ok) {
601 push @converted_subfields, $subfield->[0], $converted_value;
603 $converted_value = $subfield->[1];
604 $converted_value=NFC($converted_value);
605 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
606 push @converted_subfields, $subfield->[0], $converted_value;
610 $field->replace_with(MARC::Field->new(
611 $field->tag(), $field->indicator(1), $field->indicator(2),
612 @converted_subfields)
617 SetMarcUnicodeFlag($marc_record, $marc_flavour);
618 Text::Iconv->raise_error($prev_raise_error);
623 =head2 _marc_to_utf8_replacement_char
627 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
631 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
632 unsatisfactory method of replacing all non-ASCII (e.g.,
633 where the eight bit is set) octet with the Unicode
634 replacement character. This is meant as a last-ditch
635 method, and would be best used as part of a UI that
636 lets a cataloguer pick various character conversions
637 until he or she finds the right one.
641 sub _marc_to_utf8_replacement_char {
642 my $marc_record = shift;
643 my $marc_flavour = shift;
645 foreach my $field ($marc_record->fields()) {
646 if ($field->is_control_field()) {
647 ; # do nothing -- control fields should not contain non-ASCII characters
649 my @converted_subfields;
650 foreach my $subfield ($field->subfields()) {
651 my $value = $subfield->[1];
653 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
654 push @converted_subfields, $subfield->[0], $value;
657 $field->replace_with(MARC::Field->new(
658 $field->tag(), $field->indicator(1), $field->indicator(2),
659 @converted_subfields)
664 SetMarcUnicodeFlag($marc_record, $marc_flavour);
667 =head2 char_decode5426
671 my $utf8string = char_decode5426($iso_5426_string);
675 Converts a string from ISO-5426 to UTF-8.
681 $chars{0xb0}=0x0101;#3/0ayn[ain]
682 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
683 #$chars{0xb2}=0x00e0;#'Ã ';
684 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
685 #$chars{0xb3}=0x00e7;#'ç';
686 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
691 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
692 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
693 $chars{0xfa}=0x0153;#oe
694 $chars{0x81d1}=0x00b0;
697 ## combined characters iso5426
699 $chars{0xc041}=0x1ea2; # capital a with hook above
700 $chars{0xc045}=0x1eba; # capital e with hook above
701 $chars{0xc049}=0x1ec8; # capital i with hook above
702 $chars{0xc04f}=0x1ece; # capital o with hook above
703 $chars{0xc055}=0x1ee6; # capital u with hook above
704 $chars{0xc059}=0x1ef6; # capital y with hook above
705 $chars{0xc061}=0x1ea3; # small a with hook above
706 $chars{0xc065}=0x1ebb; # small e with hook above
707 $chars{0xc069}=0x1ec9; # small i with hook above
708 $chars{0xc06f}=0x1ecf; # small o with hook above
709 $chars{0xc075}=0x1ee7; # small u with hook above
710 $chars{0xc079}=0x1ef7; # small y with hook above
713 $chars{0xc141}=0x00c0; # capital a with grave accent
714 $chars{0xc145}=0x00c8; # capital e with grave accent
715 $chars{0xc149}=0x00cc; # capital i with grave accent
716 $chars{0xc14f}=0x00d2; # capital o with grave accent
717 $chars{0xc155}=0x00d9; # capital u with grave accent
718 $chars{0xc157}=0x1e80; # capital w with grave
719 $chars{0xc159}=0x1ef2; # capital y with grave
720 $chars{0xc161}=0x00e0; # small a with grave accent
721 $chars{0xc165}=0x00e8; # small e with grave accent
722 $chars{0xc169}=0x00ec; # small i with grave accent
723 $chars{0xc16f}=0x00f2; # small o with grave accent
724 $chars{0xc175}=0x00f9; # small u with grave accent
725 $chars{0xc177}=0x1e81; # small w with grave
726 $chars{0xc179}=0x1ef3; # small y with grave
728 $chars{0xc241}=0x00c1; # capital a with acute accent
729 $chars{0xc243}=0x0106; # capital c with acute accent
730 $chars{0xc245}=0x00c9; # capital e with acute accent
731 $chars{0xc247}=0x01f4; # capital g with acute
732 $chars{0xc249}=0x00cd; # capital i with acute accent
733 $chars{0xc24b}=0x1e30; # capital k with acute
734 $chars{0xc24c}=0x0139; # capital l with acute accent
735 $chars{0xc24d}=0x1e3e; # capital m with acute
736 $chars{0xc24e}=0x0143; # capital n with acute accent
737 $chars{0xc24f}=0x00d3; # capital o with acute accent
738 $chars{0xc250}=0x1e54; # capital p with acute
739 $chars{0xc252}=0x0154; # capital r with acute accent
740 $chars{0xc253}=0x015a; # capital s with acute accent
741 $chars{0xc255}=0x00da; # capital u with acute accent
742 $chars{0xc257}=0x1e82; # capital w with acute
743 $chars{0xc259}=0x00dd; # capital y with acute accent
744 $chars{0xc25a}=0x0179; # capital z with acute accent
745 $chars{0xc261}=0x00e1; # small a with acute accent
746 $chars{0xc263}=0x0107; # small c with acute accent
747 $chars{0xc265}=0x00e9; # small e with acute accent
748 $chars{0xc267}=0x01f5; # small g with acute
749 $chars{0xc269}=0x00ed; # small i with acute accent
750 $chars{0xc26b}=0x1e31; # small k with acute
751 $chars{0xc26c}=0x013a; # small l with acute accent
752 $chars{0xc26d}=0x1e3f; # small m with acute
753 $chars{0xc26e}=0x0144; # small n with acute accent
754 $chars{0xc26f}=0x00f3; # small o with acute accent
755 $chars{0xc270}=0x1e55; # small p with acute
756 $chars{0xc272}=0x0155; # small r with acute accent
757 $chars{0xc273}=0x015b; # small s with acute accent
758 $chars{0xc275}=0x00fa; # small u with acute accent
759 $chars{0xc277}=0x1e83; # small w with acute
760 $chars{0xc279}=0x00fd; # small y with acute accent
761 $chars{0xc27a}=0x017a; # small z with acute accent
762 $chars{0xc2e1}=0x01fc; # capital ae with acute
763 $chars{0xc2f1}=0x01fd; # small ae with acute
764 # 4/3 circumflex accent
765 $chars{0xc341}=0x00c2; # capital a with circumflex accent
766 $chars{0xc343}=0x0108; # capital c with circumflex
767 $chars{0xc345}=0x00ca; # capital e with circumflex accent
768 $chars{0xc347}=0x011c; # capital g with circumflex
769 $chars{0xc348}=0x0124; # capital h with circumflex
770 $chars{0xc349}=0x00ce; # capital i with circumflex accent
771 $chars{0xc34a}=0x0134; # capital j with circumflex
772 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
773 $chars{0xc353}=0x015c; # capital s with circumflex
774 $chars{0xc355}=0x00db; # capital u with circumflex
775 $chars{0xc357}=0x0174; # capital w with circumflex
776 $chars{0xc359}=0x0176; # capital y with circumflex
777 $chars{0xc35a}=0x1e90; # capital z with circumflex
778 $chars{0xc361}=0x00e2; # small a with circumflex accent
779 $chars{0xc363}=0x0109; # small c with circumflex
780 $chars{0xc365}=0x00ea; # small e with circumflex accent
781 $chars{0xc367}=0x011d; # small g with circumflex
782 $chars{0xc368}=0x0125; # small h with circumflex
783 $chars{0xc369}=0x00ee; # small i with circumflex accent
784 $chars{0xc36a}=0x0135; # small j with circumflex
785 $chars{0xc36e}=0x00f1; # small n with tilde
786 $chars{0xc36f}=0x00f4; # small o with circumflex accent
787 $chars{0xc373}=0x015d; # small s with circumflex
788 $chars{0xc375}=0x00fb; # small u with circumflex
789 $chars{0xc377}=0x0175; # small w with circumflex
790 $chars{0xc379}=0x0177; # small y with circumflex
791 $chars{0xc37a}=0x1e91; # small z with circumflex
793 $chars{0xc441}=0x00c3; # capital a with tilde
794 $chars{0xc445}=0x1ebc; # capital e with tilde
795 $chars{0xc449}=0x0128; # capital i with tilde
796 $chars{0xc44e}=0x00d1; # capital n with tilde
797 $chars{0xc44f}=0x00d5; # capital o with tilde
798 $chars{0xc455}=0x0168; # capital u with tilde
799 $chars{0xc456}=0x1e7c; # capital v with tilde
800 $chars{0xc459}=0x1ef8; # capital y with tilde
801 $chars{0xc461}=0x00e3; # small a with tilde
802 $chars{0xc465}=0x1ebd; # small e with tilde
803 $chars{0xc469}=0x0129; # small i with tilde
804 $chars{0xc46e}=0x00f1; # small n with tilde
805 $chars{0xc46f}=0x00f5; # small o with tilde
806 $chars{0xc475}=0x0169; # small u with tilde
807 $chars{0xc476}=0x1e7d; # small v with tilde
808 $chars{0xc479}=0x1ef9; # small y with tilde
810 $chars{0xc541}=0x0100; # capital a with macron
811 $chars{0xc545}=0x0112; # capital e with macron
812 $chars{0xc547}=0x1e20; # capital g with macron
813 $chars{0xc549}=0x012a; # capital i with macron
814 $chars{0xc54f}=0x014c; # capital o with macron
815 $chars{0xc555}=0x016a; # capital u with macron
816 $chars{0xc561}=0x0101; # small a with macron
817 $chars{0xc565}=0x0113; # small e with macron
818 $chars{0xc567}=0x1e21; # small g with macron
819 $chars{0xc569}=0x012b; # small i with macron
820 $chars{0xc56f}=0x014d; # small o with macron
821 $chars{0xc575}=0x016b; # small u with macron
822 $chars{0xc572}=0x0159; # small r with macron
823 $chars{0xc5e1}=0x01e2; # capital ae with macron
824 $chars{0xc5f1}=0x01e3; # small ae with macron
826 $chars{0xc641}=0x0102; # capital a with breve
827 $chars{0xc645}=0x0114; # capital e with breve
828 $chars{0xc647}=0x011e; # capital g with breve
829 $chars{0xc649}=0x012c; # capital i with breve
830 $chars{0xc64f}=0x014e; # capital o with breve
831 $chars{0xc655}=0x016c; # capital u with breve
832 $chars{0xc661}=0x0103; # small a with breve
833 $chars{0xc665}=0x0115; # small e with breve
834 $chars{0xc667}=0x011f; # small g with breve
835 $chars{0xc669}=0x012d; # small i with breve
836 $chars{0xc66f}=0x014f; # small o with breve
837 $chars{0xc675}=0x016d; # small u with breve
839 $chars{0xc7b0}=0x01e1; # Ain with dot above
840 $chars{0xc742}=0x1e02; # capital b with dot above
841 $chars{0xc743}=0x010a; # capital c with dot above
842 $chars{0xc744}=0x1e0a; # capital d with dot above
843 $chars{0xc745}=0x0116; # capital e with dot above
844 $chars{0xc746}=0x1e1e; # capital f with dot above
845 $chars{0xc747}=0x0120; # capital g with dot above
846 $chars{0xc748}=0x1e22; # capital h with dot above
847 $chars{0xc749}=0x0130; # capital i with dot above
848 $chars{0xc74d}=0x1e40; # capital m with dot above
849 $chars{0xc74e}=0x1e44; # capital n with dot above
850 $chars{0xc750}=0x1e56; # capital p with dot above
851 $chars{0xc752}=0x1e58; # capital r with dot above
852 $chars{0xc753}=0x1e60; # capital s with dot above
853 $chars{0xc754}=0x1e6a; # capital t with dot above
854 $chars{0xc757}=0x1e86; # capital w with dot above
855 $chars{0xc758}=0x1e8a; # capital x with dot above
856 $chars{0xc759}=0x1e8e; # capital y with dot above
857 $chars{0xc75a}=0x017b; # capital z with dot above
858 $chars{0xc761}=0x0227; # small b with dot above
859 $chars{0xc762}=0x1e03; # small b with dot above
860 $chars{0xc763}=0x010b; # small c with dot above
861 $chars{0xc764}=0x1e0b; # small d with dot above
862 $chars{0xc765}=0x0117; # small e with dot above
863 $chars{0xc766}=0x1e1f; # small f with dot above
864 $chars{0xc767}=0x0121; # small g with dot above
865 $chars{0xc768}=0x1e23; # small h with dot above
866 $chars{0xc76d}=0x1e41; # small m with dot above
867 $chars{0xc76e}=0x1e45; # small n with dot above
868 $chars{0xc770}=0x1e57; # small p with dot above
869 $chars{0xc772}=0x1e59; # small r with dot above
870 $chars{0xc773}=0x1e61; # small s with dot above
871 $chars{0xc774}=0x1e6b; # small t with dot above
872 $chars{0xc777}=0x1e87; # small w with dot above
873 $chars{0xc778}=0x1e8b; # small x with dot above
874 $chars{0xc779}=0x1e8f; # small y with dot above
875 $chars{0xc77a}=0x017c; # small z with dot above
876 # 4/8 trema, diaresis
877 $chars{0xc820}=0x00a8; # diaeresis
878 $chars{0xc841}=0x00c4; # capital a with diaeresis
879 $chars{0xc845}=0x00cb; # capital e with diaeresis
880 $chars{0xc848}=0x1e26; # capital h with diaeresis
881 $chars{0xc849}=0x00cf; # capital i with diaeresis
882 $chars{0xc84f}=0x00d6; # capital o with diaeresis
883 $chars{0xc855}=0x00dc; # capital u with diaeresis
884 $chars{0xc857}=0x1e84; # capital w with diaeresis
885 $chars{0xc858}=0x1e8c; # capital x with diaeresis
886 $chars{0xc859}=0x0178; # capital y with diaeresis
887 $chars{0xc861}=0x00e4; # small a with diaeresis
888 $chars{0xc865}=0x00eb; # small e with diaeresis
889 $chars{0xc868}=0x1e27; # small h with diaeresis
890 $chars{0xc869}=0x00ef; # small i with diaeresis
891 $chars{0xc86f}=0x00f6; # small o with diaeresis
892 $chars{0xc874}=0x1e97; # small t with diaeresis
893 $chars{0xc875}=0x00fc; # small u with diaeresis
894 $chars{0xc877}=0x1e85; # small w with diaeresis
895 $chars{0xc878}=0x1e8d; # small x with diaeresis
896 $chars{0xc879}=0x00ff; # small y with diaeresis
898 $chars{0xc920}=0x00a8; # [diaeresis]
899 $chars{0xc961}=0x00e4; # a with umlaut
900 $chars{0xc965}=0x00eb; # e with umlaut
901 $chars{0xc969}=0x00ef; # i with umlaut
902 $chars{0xc96f}=0x00f6; # o with umlaut
903 $chars{0xc975}=0x00fc; # u with umlaut
905 $chars{0xca41}=0x00c5; # capital a with ring above
906 $chars{0xcaad}=0x016e; # capital u with ring above
907 $chars{0xca61}=0x00e5; # small a with ring above
908 $chars{0xca75}=0x016f; # small u with ring above
909 $chars{0xca77}=0x1e98; # small w with ring above
910 $chars{0xca79}=0x1e99; # small y with ring above
911 # 4/11 high comma off centre
912 # 4/12 inverted high comma centred
913 # 4/13 double acute accent
914 $chars{0xcd4f}=0x0150; # capital o with double acute
915 $chars{0xcd55}=0x0170; # capital u with double acute
916 $chars{0xcd6f}=0x0151; # small o with double acute
917 $chars{0xcd75}=0x0171; # small u with double acute
919 $chars{0xce54}=0x01a0; # latin capital letter o with horn
920 $chars{0xce55}=0x01af; # latin capital letter u with horn
921 $chars{0xce74}=0x01a1; # latin small letter o with horn
922 $chars{0xce75}=0x01b0; # latin small letter u with horn
924 $chars{0xcf41}=0x01cd; # capital a with caron
925 $chars{0xcf43}=0x010c; # capital c with caron
926 $chars{0xcf44}=0x010e; # capital d with caron
927 $chars{0xcf45}=0x011a; # capital e with caron
928 $chars{0xcf47}=0x01e6; # capital g with caron
929 $chars{0xcf49}=0x01cf; # capital i with caron
930 $chars{0xcf4b}=0x01e8; # capital k with caron
931 $chars{0xcf4c}=0x013d; # capital l with caron
932 $chars{0xcf4e}=0x0147; # capital n with caron
933 $chars{0xcf4f}=0x01d1; # capital o with caron
934 $chars{0xcf52}=0x0158; # capital r with caron
935 $chars{0xcf53}=0x0160; # capital s with caron
936 $chars{0xcf54}=0x0164; # capital t with caron
937 $chars{0xcf55}=0x01d3; # capital u with caron
938 $chars{0xcf5a}=0x017d; # capital z with caron
939 $chars{0xcf61}=0x01ce; # small a with caron
940 $chars{0xcf63}=0x010d; # small c with caron
941 $chars{0xcf64}=0x010f; # small d with caron
942 $chars{0xcf65}=0x011b; # small e with caron
943 $chars{0xcf67}=0x01e7; # small g with caron
944 $chars{0xcf69}=0x01d0; # small i with caron
945 $chars{0xcf6a}=0x01f0; # small j with caron
946 $chars{0xcf6b}=0x01e9; # small k with caron
947 $chars{0xcf6c}=0x013e; # small l with caron
948 $chars{0xcf6e}=0x0148; # small n with caron
949 $chars{0xcf6f}=0x01d2; # small o with caron
950 $chars{0xcf72}=0x0159; # small r with caron
951 $chars{0xcf73}=0x0161; # small s with caron
952 $chars{0xcf74}=0x0165; # small t with caron
953 $chars{0xcf75}=0x01d4; # small u with caron
954 $chars{0xcf7a}=0x017e; # small z with caron
956 $chars{0xd020}=0x00b8; # cedilla
957 $chars{0xd043}=0x00c7; # capital c with cedilla
958 $chars{0xd044}=0x1e10; # capital d with cedilla
959 $chars{0xd047}=0x0122; # capital g with cedilla
960 $chars{0xd048}=0x1e28; # capital h with cedilla
961 $chars{0xd04b}=0x0136; # capital k with cedilla
962 $chars{0xd04c}=0x013b; # capital l with cedilla
963 $chars{0xd04e}=0x0145; # capital n with cedilla
964 $chars{0xd052}=0x0156; # capital r with cedilla
965 $chars{0xd053}=0x015e; # capital s with cedilla
966 $chars{0xd054}=0x0162; # capital t with cedilla
967 $chars{0xd063}=0x00e7; # small c with cedilla
968 $chars{0xd064}=0x1e11; # small d with cedilla
969 $chars{0xd065}=0x0119; # small e with cedilla
970 $chars{0xd067}=0x0123; # small g with cedilla
971 $chars{0xd068}=0x1e29; # small h with cedilla
972 $chars{0xd06b}=0x0137; # small k with cedilla
973 $chars{0xd06c}=0x013c; # small l with cedilla
974 $chars{0xd06e}=0x0146; # small n with cedilla
975 $chars{0xd072}=0x0157; # small r with cedilla
976 $chars{0xd073}=0x015f; # small s with cedilla
977 $chars{0xd074}=0x0163; # small t with cedilla
980 # 5/3 ogonek (hook to right
981 $chars{0xd320}=0x02db; # ogonek
982 $chars{0xd341}=0x0104; # capital a with ogonek
983 $chars{0xd345}=0x0118; # capital e with ogonek
984 $chars{0xd349}=0x012e; # capital i with ogonek
985 $chars{0xd34f}=0x01ea; # capital o with ogonek
986 $chars{0xd355}=0x0172; # capital u with ogonek
987 $chars{0xd361}=0x0105; # small a with ogonek
988 $chars{0xd365}=0x0119; # small e with ogonek
989 $chars{0xd369}=0x012f; # small i with ogonek
990 $chars{0xd36f}=0x01eb; # small o with ogonek
991 $chars{0xd375}=0x0173; # small u with ogonek
993 $chars{0xd441}=0x1e00; # capital a with ring below
994 $chars{0xd461}=0x1e01; # small a with ring below
995 # 5/5 half circle below
996 $chars{0xf948}=0x1e2a; # capital h with breve below
997 $chars{0xf968}=0x1e2b; # small h with breve below
999 $chars{0xd641}=0x1ea0; # capital a with dot below
1000 $chars{0xd642}=0x1e04; # capital b with dot below
1001 $chars{0xd644}=0x1e0c; # capital d with dot below
1002 $chars{0xd645}=0x1eb8; # capital e with dot below
1003 $chars{0xd648}=0x1e24; # capital h with dot below
1004 $chars{0xd649}=0x1eca; # capital i with dot below
1005 $chars{0xd64b}=0x1e32; # capital k with dot below
1006 $chars{0xd64c}=0x1e36; # capital l with dot below
1007 $chars{0xd64d}=0x1e42; # capital m with dot below
1008 $chars{0xd64e}=0x1e46; # capital n with dot below
1009 $chars{0xd64f}=0x1ecc; # capital o with dot below
1010 $chars{0xd652}=0x1e5a; # capital r with dot below
1011 $chars{0xd653}=0x1e62; # capital s with dot below
1012 $chars{0xd654}=0x1e6c; # capital t with dot below
1013 $chars{0xd655}=0x1ee4; # capital u with dot below
1014 $chars{0xd656}=0x1e7e; # capital v with dot below
1015 $chars{0xd657}=0x1e88; # capital w with dot below
1016 $chars{0xd659}=0x1ef4; # capital y with dot below
1017 $chars{0xd65a}=0x1e92; # capital z with dot below
1018 $chars{0xd661}=0x1ea1; # small a with dot below
1019 $chars{0xd662}=0x1e05; # small b with dot below
1020 $chars{0xd664}=0x1e0d; # small d with dot below
1021 $chars{0xd665}=0x1eb9; # small e with dot below
1022 $chars{0xd668}=0x1e25; # small h with dot below
1023 $chars{0xd669}=0x1ecb; # small i with dot below
1024 $chars{0xd66b}=0x1e33; # small k with dot below
1025 $chars{0xd66c}=0x1e37; # small l with dot below
1026 $chars{0xd66d}=0x1e43; # small m with dot below
1027 $chars{0xd66e}=0x1e47; # small n with dot below
1028 $chars{0xd66f}=0x1ecd; # small o with dot below
1029 $chars{0xd672}=0x1e5b; # small r with dot below
1030 $chars{0xd673}=0x1e63; # small s with dot below
1031 $chars{0xd674}=0x1e6d; # small t with dot below
1032 $chars{0xd675}=0x1ee5; # small u with dot below
1033 $chars{0xd676}=0x1e7f; # small v with dot below
1034 $chars{0xd677}=0x1e89; # small w with dot below
1035 $chars{0xd679}=0x1ef5; # small y with dot below
1036 $chars{0xd67a}=0x1e93; # small z with dot below
1037 # 5/7 double dot below
1038 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1039 $chars{0xd775}=0x1e73; # small u with diaeresis below
1041 $chars{0xd820}=0x005f; # underline
1042 # 5/9 double underline
1043 $chars{0xd920}=0x2017; # double underline
1044 # 5/10 small low vertical bar
1045 $chars{0xda20}=0x02cc; #
1046 # 5/11 circumflex below
1047 # 5/12 (this position shall not be used)
1048 # 5/13 left half of ligature sign and of double tilde
1049 # 5/14 right half of ligature sign
1050 # 5/15 right half of double tilde
1051 # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1053 sub char_decode5426 {
1057 my @data = unpack("C*", $string);
1059 my $length=scalar(@data);
1060 for (my $i = 0; $i < scalar(@data); $i++) {
1061 my $char= $data[$i];
1062 if ($char >= 0x00 && $char <= 0x7F){
1065 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1066 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1069 if ($chars{$char*256+$data[$i+1]}) {
1070 $convchar= $chars{$char * 256 + $data[$i+1]};
1072 # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1073 } elsif ($chars{$char}) {
1074 $convchar= $chars{$char};
1075 # printf "0xC char %x, converted %x\n",$char,$chars{$char};
1079 push @characters,$convchar;
1082 if ($chars{$char}) {
1083 $convchar= $chars{$char};
1084 # printf "char %x, converted %x\n",$char,$chars{$char};
1086 # printf "char %x $char\n",$char;
1089 push @characters,$convchar;
1092 $result=pack "U*",@characters;
1093 # $result=~s/\x01//;
1094 # $result=~s/\x00//;
1098 $result=~s/\x1b\x5b//;
1099 # map{printf "%x",$_} @characters;
1109 Koha Development Team <info@koha.org>
1111 Galen Charlton <galen.charlton@liblime.com>