# Suite 330, Boston, MA 02111-1307 USA
use strict;
+use warnings;
+
use MARC::Charset qw/marc8_to_utf8/;
use Text::Iconv;
+use Unicode::Normalize;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
BEGIN {
# set the version for version checking
- $VERSION = 3.00;
+ $VERSION = 3.01;
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(
IsStringUTF8ish
MarcToUTF8Record
SetMarcUnicodeFlag
+ StripNonXmlChars
+ Normalize_String
);
}
return utf8::decode($str);
}
+
+=head2 Normalize_String
+
+=over 4
+
+my $$string_normalized = Normalize_String($string);
+
+=back
+
+Returns normalized string C<$string> in C Form
+
+
+=cut
+
+sub Normalize_String {
+ my $string = shift;
+ if (IsStringUTF8ish($string)){
+ return NFC($string);
+ }
+ else {
+ return $string;
+ }
+}
+
=head2 MarcToUTF8Record
=over 4
$marc =~ s/^\s+//;
$marc =~ s/\s+$//;
$marc_blob_is_utf8 = IsStringUTF8ish($marc);
- $marc_record = MARC::Record->new_from_usmarc($marc);
+ eval {
+ $marc_record = MARC::Record->new_from_usmarc($marc);
+ };
+ if ($@) {
+ # if we fail the first time, one likely problem
+ # is that we have a MARC21 record that says that it's
+ # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
+ # We'll try parsing it again.
+ substr($marc, 9, 1) = ' ';
+ eval {
+ $marc_record = MARC::Record->new_from_usmarc($marc);
+ };
+ if ($@) {
+ # it's hopeless; return an empty MARC::Record
+ return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
+ }
+ }
}
# If we do not know the source encoding, try some guesses
@errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
} else {
# assume any other character encoding is for Text::Iconv
- @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
+ @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
}
if (@errors) {
sub SetMarcUnicodeFlag {
my $marc_record = shift;
- my $marc_flavour = shift;
+ my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
$marc_record->encoding('UTF-8');
if ($marc_flavour eq 'MARC21') {
} elsif ($marc_flavour eq "UNIMARC") {
if (my $field = $marc_record->field('100')) {
my $sfa = $field->subfield('a');
- substr($sfa, 26, 4) = '5050';
+
+ my $subflength = 36;
+ # fix the length of the field
+ $sfa = substr $sfa, 0, $subflength if (length($sfa) > $subflength);
+ $sfa = sprintf( "%-*s", 35, $sfa ) if (length($sfa) < $subflength);
+
+ substr($sfa, 26, 4) = '50 ';
$field->update('a' => $sfa);
}
+ } else {
+ warn "Unrecognized marcflavour: $marc_flavour";
}
}
+=head2 StripNonXmlChars
+
+=over 4
+
+my $new_str = StripNonXmlChars($old_str);
+
+=back
+
+Given a string, return a copy with the
+characters that are illegal in XML
+removed.
+
+This function exists to work around a problem
+that can occur with badly-encoded MARC records.
+Specifically, if a UTF-8 MARC record also
+has excape (\x1b) characters, MARC::File::XML
+will let the escape characters pass through
+when as_xml() or as_xml_record() is called. The
+problem is that the escape character is not
+legal in well-formed XML documents, so when
+MARC::File::XML attempts to parse such a record,
+the XML parser will fail.
+
+Stripping such characters will allow a
+MARC::Record->new_from_xml()
+to work, at the possible risk of some data loss.
+=cut
+
+sub StripNonXmlChars {
+ my $str = shift;
+ $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
+ return $str;
+}
=head1 INTERNAL FUNCTIONS
my @converted_subfields;
foreach my $subfield ($field->subfields()) {
my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
+ unless (IsStringUTF8ish($utf8sf)) {
+ # Because of a bug in MARC::Charset 0.98, if the string
+ # has (a) one or more diacritics that (b) are only in character positions
+ # 128 to 255 inclusive, the resulting converted string is not in
+ # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
+ # occurs, upgrade the string in place. Moral of the story seems to be
+ # that pack("U", ...) is better than chr(...) if you need to guarantee
+ # that the resulting string is UTF-8.
+ utf8::upgrade($utf8sf);
+ }
+ $utf8sf=NFC($utf8sf);
push @converted_subfields, $subfield->[0], $utf8sf;
}
my @converted_subfields;
foreach my $subfield ($field->subfields()) {
my $utf8sf = char_decode5426($subfield->[1]);
+ $utf8sf=NFC($utf8sf);
push @converted_subfields, $subfield->[0], $utf8sf;
}
push @converted_subfields, $subfield->[0], $converted_value;
} else {
$converted_value = $subfield->[1];
+ $converted_value=NFC($converted_value);
$converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
push @converted_subfields, $subfield->[0], $converted_value;
}
my @converted_subfields;
foreach my $subfield ($field->subfields()) {
my $value = $subfield->[1];
+ $value=NFC($value);
$value =~ s/[\200-\377]/\xef\xbf\xbd/g;
push @converted_subfields, $subfield->[0], $value;
}
=cut
-sub char_decode5426 {
- my ( $string) = @_;
- my $result;
+
my %chars;
$chars{0xb0}=0x0101;#3/0ayn[ain]
$chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
# 5/14 right half of ligature sign
# 5/15 right half of double tilde
# map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
+
+sub char_decode5426 {
+ my ( $string) = @_;
+ my $result;
+
my @data = unpack("C*", $string);
my @characters;
my $length=scalar(@data);