Bug 10403: (follow-up) fix test to use vendor created earlier during test

[koha.git] / C4 / Charset.pm
diff --git a/C4/Charset.pm b/C4/Charset.pm

index 712bd85..2968b5e 100644 (file)
--- a/C4/Charset.pm
+++ b/C4/Charset.pm
@@ -29,7 +29,7 @@ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  
  BEGIN {
      # set the version for version checking
-    $VERSION = 3.01;
+    $VERSION = 3.07.00.049;
      require Exporter;
      @ISA    = qw(Exporter);
      @EXPORT = qw(
@@ -39,6 +39,7 @@ BEGIN {
          SetUTF8Flag
          SetMarcUnicodeFlag
          StripNonXmlChars
+        nsb_clean
      );
  }
  
@@ -112,7 +113,7 @@ sub IsStringUTF8ish {
  
  =head2 SetUTF8Flag
  
-  my $marc_record = SetUTF8Flag($marc_record);
+  my $marc_record = SetUTF8Flag($marc_record, $nfd);
  
  This function sets the PERL UTF8 flag for data.
  It is required when using new_from_usmarc 
@@ -120,6 +121,8 @@ since MARC::File::USMARC does not handle PERL UTF8 setting.
  When editing unicode marc records fields and subfields, you
  would end up in double encoding without using this function. 
  
+If $nfd is set, string normalization will use NFD instead of NFC
+
  FIXME
  In my opinion, this function belongs to MARC::Record and not
  to this package.
@@ -128,23 +131,26 @@ But since it handles charset, and MARC::Record, it finds its way in that package
  =cut
  
  sub SetUTF8Flag{
-       my ($record)=@_;
-       return unless ($record && $record->fields());
-       foreach my $field ($record->fields()){
-               if ($field->tag()>=10){
-                       my @subfields;
-                       foreach my $subfield ($field->subfields()){
-                               push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
-                       }
-                       my $newfield=MARC::Field->new(
-                                                       $field->tag(),
-                                                       $field->indicator(1),
-                                                       $field->indicator(2),
-                                                       @subfields
-                                               );
-                       $field->replace_with($newfield);
-               }
-       }
+    my ($record, $nfd)=@_;
+    return unless ($record && $record->fields());
+    foreach my $field ($record->fields()){
+        if ($field->tag()>=10){
+            my @subfields;
+            foreach my $subfield ($field->subfields()){
+                push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
+            }
+            eval {
+                my $newfield=MARC::Field->new(
+                            $field->tag(),
+                            $field->indicator(1),
+                            $field->indicator(2),
+                            @subfields
+                        );
+                $field->replace_with($newfield);
+            };
+            warn "ERROR occurred in SetUTF8Flag $@" if $@;
+        }
+    }
  }
  
  =head2 NormalizeString
@@ -248,20 +254,20 @@ sub MarcToUTF8Record {
      # If we do not know the source encoding, try some guesses
      # as follows:
      #   1. Record is UTF-8 already.
-    #   2. If MARC flavor is MARC21, then
+    #   2. If MARC flavor is MARC21 or NORMARC, then
      #      a. record is MARC-8
      #      b. record is ISO-8859-1
      #   3. If MARC flavor is UNIMARC, then
      if (not defined $source_encoding) {
          if ($marc_blob_is_utf8) {
-            # note that for MARC21 we are not bothering to check
+            # note that for MARC21/NORMARC we are not bothering to check
              # if the Leader/09 is set to 'a' or not -- because
              # of problems with various ILSs (including Koha in the
              # past, alas), this just is not trustworthy.
              SetMarcUnicodeFlag($marc_record, $marc_flavour);
              return $marc_record, 'UTF-8', [];
          } else {
-            if ($marc_flavour eq 'MARC21') {
+            if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
                  return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
              } elsif ($marc_flavour =~/UNIMARC/) {
                  return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
@@ -316,13 +322,15 @@ sub SetMarcUnicodeFlag {
      my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
  
      $marc_record->encoding('UTF-8');
-    if ($marc_flavour eq 'MARC21') {
+    if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
          my $leader = $marc_record->leader();
          substr($leader, 9, 1) = 'a';
          $marc_record->leader($leader); 
      } elsif ($marc_flavour =~/UNIMARC/) {
+       my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
+        $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
          my $string; 
-               my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
+               my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
                 $string=$marc_record->subfield( 100, "a" );
          if (defined $string && length($string)==$subflength) { 
                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
@@ -330,9 +338,10 @@ sub SetMarcUnicodeFlag {
          else { 
              $string = POSIX::strftime( "%Y%m%d", localtime ); 
              $string =~ s/\-//g; 
-            $string = sprintf( "%-*s", $subflength, $string ); 
+            $string = sprintf( "%-*s", $subflength, $string );
+           substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
          } 
-        substr( $string, $encodingposition, 8, "frey50  " ); 
+        substr( $string, $encodingposition, 3, "y50" );
          if ( $marc_record->subfield( 100, "a" ) ) { 
                         $marc_record->field('100')->update(a=>$string);
                 }
@@ -340,7 +349,7 @@ sub SetMarcUnicodeFlag {
              $marc_record->insert_grouped_field( 
                  MARC::Field->new( 100, '', '', "a" => $string ) ); 
          }
-               $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
+               $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
      } else {
          warn "Unrecognized marcflavour: $marc_flavour";
      }
@@ -380,6 +389,40 @@ sub StripNonXmlChars {
      return $str;
  }
  
+
+
+=head2 nsb_clean
+
+=over 4
+
+nsb_clean($string);
+
+=back
+
+Returns a copy of $string with the Non Sorting Block begin/end marker characters (and any stray \xC2 character) removed.
+
+=cut
+sub nsb_clean {
+    # Returns a copy of $string with these characters deleted:
+    #   \x88 / \x89 : NSB / NSE, begin / end of a Non Sorting Block
+    #   \x98 / \x9C : second NSB / NSE pair
+    #   \xC2        : sometimes left behind once NSB / NSE are removed
+    # NOTE(review): 0xC2 is the UTF-8 lead byte for U+0080..U+00BF, so on
+    # undecoded input it is presumably the leftover first byte of an encoded
+    # NSB / NSE; on decoded text it is also U+00C2 -- confirm that deleting
+    # every occurrence is intended.
+    my ($string) = @_ ;
+    return $string unless defined $string ;    # nothing to clean
+
+    # Work on the lexical copy only. Assigning to the global $_ here would
+    # overwrite the caller's $_ -- and inside map/grep/foreach that is an
+    # alias to the caller's own data.
+    $string =~ tr/\x88\x89\x98\x9C\xC2//d ;
+
+    return($string) ;
+}
+
+
  =head1 INTERNAL FUNCTIONS
  
  =head2 _default_marc21_charconv_to_utf8