use strict;
use MARC::Charset qw/marc8_to_utf8/;
use Text::Iconv;
+use C4::Debug;
+use Unicode::Normalize;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
BEGIN {
# set the version for version checking
- $VERSION = 3.00;
+ $VERSION = 3.01;
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(
IsStringUTF8ish
MarcToUTF8Record
+ SetUTF8Flag
SetMarcUnicodeFlag
StripNonXmlChars
);
return utf8::decode($str);
}
+=head2 SetUTF8Flag
+
+=over 4
+
+my $marc_record = SetUTF8Flag($marc_record);
+
+=back
+
+This function sets the PERL UTF8 flag for data.
+It is required when using new_from_usmarc
+since MARC::File::USMARC does not handle PERL UTF8 setting.
+When editing unicode marc records fields and subfields, you
+would end up in double encoding without using this function.
+
+FIXME
+In my opinion, this function belongs to MARC::Record and not
+to this package.
+But since it handles charset, and MARC::Record, it finds its way in that package
+
+=cut
+
+sub SetUTF8Flag{
+ my ($record)=@_;
+ return unless ($record && $record->fields());
+ foreach my $field ($record->fields()){
+ if ($field->tag()>=10){
+ my @subfields;
+ foreach my $subfield ($field->subfields()){
+ push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
+ }
+ my $newfield=MARC::Field->new(
+ $field->tag(),
+ $field->indicator(1),
+ $field->indicator(2),
+ @subfields
+ );
+ $field->replace_with($newfield);
+ }
+ }
+}
+
+=head2 NormalizeString
+
+=over 4
+
+ my $normalized_string=NormalizeString($string);
+
+=back
+ Given
+ a string
+ nfc : If you want to set NFC and not NFD
+ transform : If you expect all the signs to be removed
+ Sets the PERL UTF8 Flag on your initial data if need be
+ and applies cleaning if required
+
+ Returns a utf8 NFD normalized string
+
+ Sample code :
+ my $string=NormalizeString ("l'ornithoptère");
+ #results into ornithoptère in NFD form and sets UTF8 Flag
+=cut
+
+sub NormalizeString{
+ my ($string,$nfc,$transform)=@_;
+ utf8::decode($string) unless (utf8::is_utf8($string));
+ if ($nfc){
+ $string= NFD($string);
+ }
+ else {
+ $string=NFC($string);
+ }
+ if ($transform){
+ $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
+ #removing one letter words "d'" "l'" was changed into "d " "l "
+ $string=~s/\b\S\b//g;
+ $string=~s/\s+$//g;
+ }
+ return $string;
+}
+
=head2 MarcToUTF8Record
=over 4
} else {
if ($marc_flavour eq 'MARC21') {
return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
- } elsif ($marc_flavour eq 'UNIMARC') {
+ } elsif ($marc_flavour =~/UNIMARC/) {
return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
} else {
return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
sub SetMarcUnicodeFlag {
my $marc_record = shift;
- my $marc_flavour = shift;
+ my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
$marc_record->encoding('UTF-8');
if ($marc_flavour eq 'MARC21') {
my $leader = $marc_record->leader();
substr($leader, 9, 1) = 'a';
$marc_record->leader($leader);
- } elsif ($marc_flavour eq "UNIMARC") {
- if (my $field = $marc_record->field('100')) {
- my $sfa = $field->subfield('a');
- substr($sfa, 26, 4) = '5050';
- $field->update('a' => $sfa);
+ } elsif ($marc_flavour =~/UNIMARC/) {
+ my $string;
+ my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
+ $string=$marc_record->subfield( 100, "a" );
+ if (length($string)==$subflength) {
+ $string = substr $string, 0,$subflength if (length($string)>$subflength);
+ }
+ else {
+ $string = POSIX::strftime( "%Y%m%d", localtime );
+ $string =~ s/\-//g;
+ $string = sprintf( "%-*s", $subflength, $string );
+ }
+ substr( $string, $encodingposition, 8, "frey50 " );
+ if ( $marc_record->subfield( 100, "a" ) ) {
+ $marc_record->field('100')->update(a=>$string);
+ }
+ else {
+ $marc_record->insert_grouped_field(
+ MARC::Field->new( 100, '', '', "a" => $string ) );
}
+ $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
+ } else {
+ warn "Unrecognized marcflavour: $marc_flavour";
}
}
sub StripNonXmlChars {
my $str = shift;
+ if (!defined($str) || $str eq ""){
+ return "";
+ }
$str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
return $str;
}
my @converted_subfields;
foreach my $subfield ($field->subfields()) {
my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
+ unless (IsStringUTF8ish($utf8sf)) {
+ # Because of a bug in MARC::Charset 0.98, if the string
+ # has (a) one or more diacritics that (b) are only in character positions
+ # 128 to 255 inclusive, the resulting converted string is not in
+ # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
+ # occurs, upgrade the string in place. Moral of the story seems to be
+ # that pack("U", ...) is better than chr(...) if you need to guarantee
+ # that the resulting string is UTF-8.
+ utf8::upgrade($utf8sf);
+ }
push @converted_subfields, $subfield->[0], $utf8sf;
}
=cut
-sub char_decode5426 {
- my ( $string) = @_;
- my $result;
+
my %chars;
-$chars{0xb0}=0x0101;#3/0ayn[ain]
+# $chars{0xb0}=0x0101;#3/0ayn[ain] # commented because it seems to be wrong at use
$chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
#$chars{0xb2}=0x00e0;#'à';
$chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
# 5/14 right half of ligature sign
# 5/15 right half of double tilde
# map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
+
+sub char_decode5426 {
+ my ( $string) = @_;
+ my $result;
+
my @data = unpack("C*", $string);
my @characters;
my $length=scalar(@data);