MT2116 : Addons to the CSV Export

[koha.git] / C4 / Charset.pm
diff --git a/C4/Charset.pm b/C4/Charset.pm

index 655d6c7..e39637a 100644 (file)
--- a/C4/Charset.pm
+++ b/C4/Charset.pm
@@ -23,6 +23,7 @@ use warnings;
  use MARC::Charset qw/marc8_to_utf8/;
  use Text::Iconv;
  use C4::Debug;
+use Unicode::Normalize;
  
  use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  
@@ -34,6 +35,7 @@ BEGIN {
      @EXPORT = qw(
          IsStringUTF8ish
          MarcToUTF8Record
+        SetUTF8Flag
          SetMarcUnicodeFlag
          StripNonXmlChars
      );
@@ -111,6 +113,86 @@ sub IsStringUTF8ish {
      return utf8::decode($str);
  }
  
+=head2 SetUTF8Flag
+
+=over 4
+
+my $marc_record = SetUTF8Flag($marc_record);
+
+=back
+
+This function sets the PERL UTF8 flag for data.
+It is required when using new_from_usmarc 
+since MARC::File::USMARC does not handle PERL UTF8 setting.
+When editing unicode marc records fields and subfields, you
+would end up in double encoding without using this function. 
+
+FIXME
+In my opinion, this function belongs to MARC::Record and not
+to this package.
+But since it handles charset, and MARC::Record, it finds its way in that package
+
+=cut
+
+sub SetUTF8Flag{
+       my ($record)=@_;
+       return unless ($record && $record->fields());
+       foreach my $field ($record->fields()){
+               if ($field->tag()>=10){
+                       my @subfields;
+                       foreach my $subfield ($field->subfields()){
+                               push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
+                       }
+                       my $newfield=MARC::Field->new(
+                                                       $field->tag(),
+                                                       $field->indicator(1),
+                                                       $field->indicator(2),
+                                                       @subfields
+                                               );
+                       $field->replace_with($newfield);
+               }
+       }
+}
+
+=head2 NormalizeString
+
+=over 4
+
+    my $normalized_string=NormalizeString($string);
+
+=back
+       Given 
+           a string
+        nfc : If you want to set NFC and not NFD
+        transform : If you expect all the signs to be removed
+    Sets the PERL UTF8 Flag on your initial data if need be
+    and applies cleaning if required 
+    
+       Returns a utf8 NFD normalized string
+       
+       Sample code :
+       my $string=NormalizeString ("l'ornithoptère");
+    #results into ornithoptère in NFD form and sets UTF8 Flag
+=cut
+
+sub NormalizeString{
+       my ($string,$nfc,$transform)=@_;
+       utf8::decode($string) unless (utf8::is_utf8($string));
+       if ($nfc){
+               $string= NFD($string);
+       }
+       else {
+               $string=NFC($string);
+       }
+       if ($transform){
+    $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
+       #removing one letter words "d'" "l'"  was changed into "d " "l " 
+    $string=~s/\b\S\b//g;
+    $string=~s/\s+$//g;
+       }
+    return $string; 
+}
+
  =head2 MarcToUTF8Record
  
  =over 4
@@ -255,9 +337,9 @@ sub SetMarcUnicodeFlag {
          $marc_record->leader($leader); 
      } elsif ($marc_flavour =~/UNIMARC/) {
          my $string; 
-               my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,8):(36,22));
+               my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
                 $string=$marc_record->subfield( 100, "a" );
-        if (length($string)==$subflength) { 
+        if (defined $string && length($string)==$subflength) { 
                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
          } 
          else {