NoZebra fixes : removing \r and \n when indexing

[koha.git] / C4 / Biblio.pm
diff --git a/C4/Biblio.pm b/C4/Biblio.pm

index bb81ed4..b07073e 100755 (executable)
--- a/C4/Biblio.pm
+++ b/C4/Biblio.pm
@@ -30,6 +30,7 @@ use C4::Branch;
  use C4::Dates qw/format_date/;
  use C4::Log; # logaction
  use C4::ClassSource;
+use C4::Charset;
  
  use vars qw($VERSION @ISA @EXPORT);
  
@@ -40,6 +41,7 @@ BEGIN {
         @ISA = qw( Exporter );
  
         # to add biblios
+# EXPORTED FUNCTIONS.
         push @EXPORT, qw( 
                 &AddBiblio
         );
@@ -60,7 +62,6 @@ BEGIN {
                 &GetMarcSeries
                 GetMarcUrls
                 &GetUsedMarcStructure
-
                 &GetXmlBiblio
  
                 &GetAuthorisedValueDesc
@@ -77,12 +78,17 @@ BEGIN {
                 &ModBiblioframework
                 &ModZebra
         );
-
         # To delete something
         push @EXPORT, qw(
                 &DelBiblio
         );
  
+    # To link headings in a bib record
+    # to authority records.
+    push @EXPORT, qw(
+        &LinkBibHeadingsToAuthorities
+    );
+
         # Internal functions
         # those functions are exported but should not be used
         # they are usefull is few circumstances, so are exported.
@@ -90,7 +96,6 @@ BEGIN {
         push @EXPORT, qw(
                 &ModBiblioMarc
         );
-
         # Others functions
         push @EXPORT, qw(
                 &TransformMarcToKoha
@@ -98,11 +103,16 @@ BEGIN {
                 &TransformHtmlToMarc
                 &TransformHtmlToXml
                 &PrepareItemrecordDisplay
-               &char_decode
                 &GetNoZebraIndexes
         );
  }
  
+# because of interdependencies between
+# C4::Search, C4::Heading, and C4::Biblio,
+# 'use C4::Heading' must occur after
+# the exports have been defined.
+use C4::Heading;
+
  =head1 NAME
  
  C4::Biblio - cataloging management functions
@@ -223,6 +233,9 @@ sub AddBiblio {
  
      _koha_marc_update_bib_ids($record, $frameworkcode, $biblionumber, $biblioitemnumber);
  
+    # update MARC subfield that stores biblioitems.cn_sort
+    _koha_marc_update_biblioitem_cn_sort($record, $olddata, $frameworkcode);
+    
      # now add the record
      $biblionumber = ModBiblioMarc( $record, $biblionumber, $frameworkcode ) unless $defer_marc_save;
        
@@ -281,12 +294,15 @@ sub ModBiblio {
      $sth->finish();
      _koha_marc_update_bib_ids($record, $frameworkcode, $biblionumber, $biblioitemnumber);
  
-    # update the MARC record (that now contains biblio and items) with the new record data
-    &ModBiblioMarc( $record, $biblionumber, $frameworkcode );
-    
      # load the koha-table data object
      my $oldbiblio = TransformMarcToKoha( $dbh, $record, $frameworkcode );
  
+    # update MARC subfield that stores biblioitems.cn_sort
+    _koha_marc_update_biblioitem_cn_sort($record, $oldbiblio, $frameworkcode);
+
+    # update the MARC record (that now contains biblio and items) with the new record data
+    &ModBiblioMarc( $record, $biblionumber, $frameworkcode );
+    
      # modify the other koha tables
      _koha_modify_biblio( $dbh, $oldbiblio, $frameworkcode );
      _koha_modify_biblioitem_nonmarc( $dbh, $oldbiblio );
@@ -371,6 +387,63 @@ sub DelBiblio {
      return;
  }
  
+=head2 LinkBibHeadingsToAuthorities
+
+=over 4
+
+my $headings_linked = LinkBibHeadingsToAuthorities($marc);
+
+=back
+
+Links bib headings to authority records by checking
+each authority-controlled field in the C<MARC::Record>
+object C<$marc>, looking for a matching authority record,
+and setting the linking subfield $9 to the ID of that
+authority record.  
+
+If no matching authority exists, or if multiple
+authorities match, no $9 will be added, and any 
+existing one inthe field will be deleted.
+
+Returns the number of heading links changed in the
+MARC record.
+
+=cut
+
+sub LinkBibHeadingsToAuthorities {
+    my $bib = shift;
+
+    my $num_headings_changed = 0;
+    foreach my $field ($bib->fields()) {
+        my $heading = C4::Heading->new_from_bib_field($field);    
+        next unless defined $heading;
+
+        # check existing $9
+        my $current_link = $field->subfield('9');
+
+        # look for matching authorities
+        my $authorities = $heading->authorities();
+
+        # want only one exact match
+        if ($#{ $authorities } == 0) {
+            my $authority = MARC::Record->new_from_usmarc($authorities->[0]);
+            my $authid = $authority->field('001')->data();
+            next if defined $current_link and $current_link eq $authid;
+
+            $field->delete_subfield(code => '9') if defined $current_link;
+            $field->add_subfields('9', $authid);
+            $num_headings_changed++;
+        } else {
+            if (defined $current_link) {
+                $field->delete_subfield(code => '9');
+                $num_headings_changed++;
+            }
+        }
+
+    }
+    return $num_headings_changed;
+}
+
  =head2 GetBiblioData
  
  =over 4
@@ -580,10 +653,19 @@ $frameworkcode : the framework code to read
  
  =cut
  
+# cache for results of GetMarcStructure -- needed
+# for batch jobs
+our $marc_structure_cache;
+
  sub GetMarcStructure {
      my ( $forlibrarian, $frameworkcode ) = @_;
      my $dbh=C4::Context->dbh;
      $frameworkcode = "" unless $frameworkcode;
+
+    if (defined $marc_structure_cache and exists $marc_structure_cache->{$forlibrarian}->{$frameworkcode}) {
+        return $marc_structure_cache->{$forlibrarian}->{$frameworkcode};
+    }
+
      my $sth;
      my $libfield = ( $forlibrarian eq 1 ) ? 'liblibrarian' : 'libopac';
  
@@ -663,6 +745,9 @@ sub GetMarcStructure {
          $res->{$tag}->{$subfield}->{'link'}           = $link;
          $res->{$tag}->{$subfield}->{defaultvalue}     = $defaultvalue;
      }
+
+    $marc_structure_cache->{$forlibrarian}->{$frameworkcode} = $res;
+
      return $res;
  }
  
@@ -737,18 +822,13 @@ sub GetMarcBiblio {
      my $sth          =
        $dbh->prepare("SELECT marcxml FROM biblioitems WHERE biblionumber=? ");
      $sth->execute($biblionumber);
-     my ($marcxml) = $sth->fetchrow;
+    my $row = $sth->fetchrow_hashref;
+    my $marcxml = StripNonXmlChars($row->{'marcxml'});
       MARC::File::XML->default_record_format(C4::Context->preference('marcflavour'));
-     $marcxml =~ s/\x1e//g;
-     $marcxml =~ s/\x1f//g;
-     $marcxml =~ s/\x1d//g;
-     $marcxml =~ s/\x0f//g;
-     $marcxml =~ s/\x0c//g;  
-#   warn $marcxml;
      my $record = MARC::Record->new();
      if ($marcxml) {
          $record = eval {MARC::Record::new_from_xml( $marcxml, "utf8", C4::Context->preference('marcflavour'))};
-        if ($@) {warn $@;}
+        if ($@) {warn " problem with :$biblionumber : $@ \n$marcxml";}
  #      $record = MARC::Record::new_from_usmarc( $marc) if $marc;
          return $record;
      } else {
@@ -987,11 +1067,12 @@ sub GetMarcAuthors {
              }
              else {
                  # reset $linkvalue if UNIMARC author responsibility
-                if ( $marcflavour eq 'UNIMARC' and ($authors_subfield->[0] eq '4')) {
+                if ( $marcflavour eq 'UNIMARC' and ($authors_subfield->[0] eq "4")) {
                      $linkvalue = "(".GetAuthorisedValueDesc( $field->tag(), $authors_subfield->[0], $authors_subfield->[1], '', $tagslib ).")";
                  }
                  push @link_loop, {'limit' => 'au', link => $linkvalue, operator => $operator };
              }
+            $value = GetAuthorisedValueDesc( $field->tag(), $authors_subfield->[0], $authors_subfield->[1], '', $tagslib ) if ( $marcflavour eq 'UNIMARC' and ($authors_subfield->[0] =~/4/));
              my @this_link_loop = @link_loop;
              my $separator = C4::Context->preference("authoritysep") unless $count_auth==0;
              push @subfields_loop, {code => $subfieldcode, value => $value, link_loop => \@this_link_loop, separator => $separator} unless ($authors_subfield->[0] == 9 );
@@ -1724,202 +1805,6 @@ sub TransformMarcToKohaOneField {
  
  =head1  OTHER FUNCTIONS
  
-=head2 char_decode
-
-=over 4
-
-my $string = char_decode( $string, $encoding );
-
-converts ISO 5426 coded string to UTF-8
-sloppy code : should be improved in next issue
-
-=back
-
-=cut
-
-sub char_decode {
-    my ( $string, $encoding ) = @_;
-    $_ = $string;
-
-    $encoding = C4::Context->preference("marcflavour") unless $encoding;
-    if ( $encoding eq "UNIMARC" ) {
-
-        #         s/\xe1/Æ/gm;
-        s/\xe2/Ğ/gm;
-        s/\xe9/Ø/gm;
-        s/\xec/ş/gm;
-        s/\xf1/æ/gm;
-        s/\xf3/ğ/gm;
-        s/\xf9/ø/gm;
-        s/\xfb/ß/gm;
-        s/\xc1\x61/à/gm;
-        s/\xc1\x65/è/gm;
-        s/\xc1\x69/ì/gm;
-        s/\xc1\x6f/ò/gm;
-        s/\xc1\x75/ù/gm;
-        s/\xc1\x41/À/gm;
-        s/\xc1\x45/È/gm;
-        s/\xc1\x49/Ì/gm;
-        s/\xc1\x4f/Ò/gm;
-        s/\xc1\x55/Ù/gm;
-        s/\xc2\x41/Á/gm;
-        s/\xc2\x45/É/gm;
-        s/\xc2\x49/Í/gm;
-        s/\xc2\x4f/Ó/gm;
-        s/\xc2\x55/Ú/gm;
-        s/\xc2\x59/İ/gm;
-        s/\xc2\x61/á/gm;
-        s/\xc2\x65/é/gm;
-        s/\xc2\x69/í/gm;
-        s/\xc2\x6f/ó/gm;
-        s/\xc2\x75/ú/gm;
-        s/\xc2\x79/ı/gm;
-        s/\xc3\x41/Â/gm;
-        s/\xc3\x45/Ê/gm;
-        s/\xc3\x49/Î/gm;
-        s/\xc3\x4f/Ô/gm;
-        s/\xc3\x55/Û/gm;
-        s/\xc3\x61/â/gm;
-        s/\xc3\x65/ê/gm;
-        s/\xc3\x69/î/gm;
-        s/\xc3\x6f/ô/gm;
-        s/\xc3\x75/û/gm;
-        s/\xc4\x41/Ã/gm;
-        s/\xc4\x4e/Ñ/gm;
-        s/\xc4\x4f/Õ/gm;
-        s/\xc4\x61/ã/gm;
-        s/\xc4\x6e/ñ/gm;
-        s/\xc4\x6f/õ/gm;
-        s/\xc8\x41/Ä/gm;
-        s/\xc8\x45/Ë/gm;
-        s/\xc8\x49/Ï/gm;
-        s/\xc8\x61/ä/gm;
-        s/\xc8\x65/ë/gm;
-        s/\xc8\x69/ï/gm;
-        s/\xc8\x6F/ö/gm;
-        s/\xc8\x75/ü/gm;
-        s/\xc8\x76/ÿ/gm;
-        s/\xc9\x41/Ä/gm;
-        s/\xc9\x45/Ë/gm;
-        s/\xc9\x49/Ï/gm;
-        s/\xc9\x4f/Ö/gm;
-        s/\xc9\x55/Ü/gm;
-        s/\xc9\x61/ä/gm;
-        s/\xc9\x6f/ö/gm;
-        s/\xc9\x75/ü/gm;
-        s/\xca\x41/Å/gm;
-        s/\xca\x61/å/gm;
-        s/\xd0\x43/Ç/gm;
-        s/\xd0\x63/ç/gm;
-
-        # this handles non-sorting blocks (if implementation requires this)
-        $string = nsb_clean($_);
-    }
-    elsif ( $encoding eq "USMARC" || $encoding eq "MARC21" ) {
-        ##MARC-8 to UTF-8
-
-        s/\xe1\x61/à/gm;
-        s/\xe1\x65/è/gm;
-        s/\xe1\x69/ì/gm;
-        s/\xe1\x6f/ò/gm;
-        s/\xe1\x75/ù/gm;
-        s/\xe1\x41/À/gm;
-        s/\xe1\x45/È/gm;
-        s/\xe1\x49/Ì/gm;
-        s/\xe1\x4f/Ò/gm;
-        s/\xe1\x55/Ù/gm;
-        s/\xe2\x41/Á/gm;
-        s/\xe2\x45/É/gm;
-        s/\xe2\x49/Í/gm;
-        s/\xe2\x4f/Ó/gm;
-        s/\xe2\x55/Ú/gm;
-        s/\xe2\x59/İ/gm;
-        s/\xe2\x61/á/gm;
-        s/\xe2\x65/é/gm;
-        s/\xe2\x69/í/gm;
-        s/\xe2\x6f/ó/gm;
-        s/\xe2\x75/ú/gm;
-        s/\xe2\x79/ı/gm;
-        s/\xe3\x41/Â/gm;
-        s/\xe3\x45/Ê/gm;
-        s/\xe3\x49/Î/gm;
-        s/\xe3\x4f/Ô/gm;
-        s/\xe3\x55/Û/gm;
-        s/\xe3\x61/â/gm;
-        s/\xe3\x65/ê/gm;
-        s/\xe3\x69/î/gm;
-        s/\xe3\x6f/ô/gm;
-        s/\xe3\x75/û/gm;
-        s/\xe4\x41/Ã/gm;
-        s/\xe4\x4e/Ñ/gm;
-        s/\xe4\x4f/Õ/gm;
-        s/\xe4\x61/ã/gm;
-        s/\xe4\x6e/ñ/gm;
-        s/\xe4\x6f/õ/gm;
-        s/\xe6\x41/Ă/gm;
-        s/\xe6\x45/Ĕ/gm;
-        s/\xe6\x65/ĕ/gm;
-        s/\xe6\x61/ă/gm;
-        s/\xe8\x45/Ë/gm;
-        s/\xe8\x49/Ï/gm;
-        s/\xe8\x65/ë/gm;
-        s/\xe8\x69/ï/gm;
-        s/\xe8\x76/ÿ/gm;
-        s/\xe9\x41/A/gm;
-        s/\xe9\x4f/O/gm;
-        s/\xe9\x55/U/gm;
-        s/\xe9\x61/a/gm;
-        s/\xe9\x6f/o/gm;
-        s/\xe9\x75/u/gm;
-        s/\xea\x41/A/gm;
-        s/\xea\x61/a/gm;
-
-        #Additional Turkish characters
-        s/\x1b//gm;
-        s/\x1e//gm;
-        s/(\xf0)s/\xc5\x9f/gm;
-        s/(\xf0)S/\xc5\x9e/gm;
-        s/(\xf0)c/ç/gm;
-        s/(\xf0)C/Ç/gm;
-        s/\xe7\x49/\\xc4\xb0/gm;
-        s/(\xe6)G/\xc4\x9e/gm;
-        s/(\xe6)g/ğ\xc4\x9f/gm;
-        s/\xB8/ı/gm;
-        s/\xB9/£/gm;
-        s/(\xe8|\xc8)o/ö/gm;
-        s/(\xe8|\xc8)O/Ö/gm;
-        s/(\xe8|\xc8)u/ü/gm;
-        s/(\xe8|\xc8)U/Ü/gm;
-        s/\xc2\xb8/\xc4\xb1/gm;
-        s/¸/\xc4\xb1/gm;
-
-        # this handles non-sorting blocks (if implementation requires this)
-        $string = nsb_clean($_);
-    }
-    return ($string);
-}
-
-=head2 nsb_clean
-
-=over 4
-
-my $string = nsb_clean( $string, $encoding );
-
-=back
-
-=cut
-
-sub nsb_clean {
-    my $NSB      = '\x88';    # NSB : begin Non Sorting Block
-    my $NSE      = '\x89';    # NSE : Non Sorting Block end
-                              # handles non sorting blocks
-    my ($string) = @_;
-    $_ = $string;
-    s/$NSB/(/gm;
-    s/[ ]{0,1}$NSE/) /gm;
-    $string = $_;
-    return ($string);
-}
  
  =head2 PrepareItemrecordDisplay
  
@@ -2210,9 +2095,20 @@ sub ModZebra {
          #
          # we use zebra, just fill zebraqueue table
          #
-        my $sth=$dbh->prepare("INSERT INTO zebraqueue  (biblio_auth_number,server,operation) VALUES(?,?,?)");
-        $sth->execute($biblionumber,$server,$op);
-        $sth->finish;
+        my $check_sql = "SELECT COUNT(*) FROM zebraqueue 
+                         WHERE server = ?
+                         AND   biblio_auth_number = ?
+                         AND   operation = ?
+                         AND   done = 0";
+        my $check_sth = $dbh->prepare_cached($check_sql);
+        $check_sth->execute($server, $biblionumber, $op);
+        my ($count) = $check_sth->fetchrow_array;
+        $check_sth->finish();
+        if ($count == 0) {
+            my $sth=$dbh->prepare("INSERT INTO zebraqueue  (biblio_auth_number,server,operation) VALUES(?,?,?)");
+            $sth->execute($biblionumber,$server,$op);
+            $sth->finish;
+        }
      }
  }
  
@@ -2373,7 +2269,7 @@ sub _AddBiblioNoZebra {
      }
  
      # remove blancks comma (that could cause problem when decoding the string for CQL retrieval) and regexp specific values
-    $title =~ s/ |,|;|\[|\]|\(|\)|\*|-|'|=//g;
+    $title =~ s/ |\.|,|;|\[|\]|\(|\)|\*|-|'|:|=|\r|\n//g;
      # limit to 10 char, should be enough, and limit the DB size
      $title = substr($title,0,10);
      #parse each field
@@ -2385,6 +2281,7 @@ sub _AddBiblioNoZebra {
              my $tag = $field->tag();
              my $subfieldcode = $subfield->[0];
              my $indexed=0;
+            warn "INDEXING :".$subfield->[1];
              # check each index to see if the subfield is stored somewhere
              # otherwise, store it in __RAW__ index
              foreach my $key (keys %index) {
@@ -2393,15 +2290,15 @@ sub _AddBiblioNoZebra {
                      $indexed=1;
                      my $line= lc $subfield->[1];
                      # remove meaningless value in the field...
-                    $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=|:/ /g;
+                    $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=|:|\r|\n/ /g;
                      # ... and split in words
                      foreach (split / /,$line) {
                          next unless $_; # skip  empty values (multiple spaces)
                          # if the entry is already here, improve weight
  #                         warn "managing $_";
-                        if ($result{$key}->{"$_"} =~ /$biblionumber,$title\-(\d);/) {
+                        if ($result{$key}->{"$_"} =~ /$biblionumber,\Q$title\E\-(\d);/) { 
                              my $weight=$1+1;
-                            $result{$key}->{"$_"} =~ s/$biblionumber,$title\-(\d);//;
+                            $result{$key}->{"$_"} =~ s/$biblionumber,\Q$title\E\-(\d);//;
                              $result{$key}->{"$_"} .= "$biblionumber,$title-$weight;";
                          } else {
                              # get the value if it exist in the nozebra table, otherwise, create it
@@ -2411,7 +2308,7 @@ sub _AddBiblioNoZebra {
                              if ($existing_biblionumbers) {
                                  $result{$key}->{"$_"} =$existing_biblionumbers;
                                  my $weight=$1+1;
-                                $result{$key}->{"$_"} =~ s/$biblionumber,$title\-(\d);//;
+                                $result{$key}->{"$_"} =~ s/$biblionumber,\Q$title\E\-(\d);//;
                                  $result{$key}->{"$_"} .= "$biblionumber,$title-$weight;";
                              # create a new ligne for this entry
                              } else {
@@ -2426,14 +2323,14 @@ sub _AddBiblioNoZebra {
              # the subfield is not indexed, store it in __RAW__ index anyway
              unless ($indexed) {
                  my $line= lc $subfield->[1];
-                $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=|:/ /g;
+                $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=|:|\r|\n/ /g;
                  # ... and split in words
                  foreach (split / /,$line) {
                      next unless $_; # skip  empty values (multiple spaces)
                      # if the entry is already here, improve weight
-                    if ($result{'__RAW__'}->{"$_"} =~ /$biblionumber,$title\-(\d);/) {
+                    if ($result{'__RAW__'}->{"$_"} =~ /$biblionumber,\Q$title\E\-(\d);/) { 
                          my $weight=$1+1;
-                        $result{'__RAW__'}->{"$_"} =~ s/$biblionumber,$title\-(\d);//;
+                        $result{'__RAW__'}->{"$_"} =~ s/$biblionumber,\Q$title\E\-(\d);//;
                          $result{'__RAW__'}->{"$_"} .= "$biblionumber,$title-$weight;";
                      } else {
                          # get the value if it exist in the nozebra table, otherwise, create it
@@ -2443,7 +2340,7 @@ sub _AddBiblioNoZebra {
                          if ($existing_biblionumbers) {
                              $result{'__RAW__'}->{"$_"} =$existing_biblionumbers;
                              my $weight=$1+1;
-                            $result{'__RAW__'}->{"$_"} =~ s/$biblionumber,$title\-(\d);//;
+                            $result{'__RAW__'}->{"$_"} =~ s/$biblionumber,\Q$title\E\-(\d);//;
                              $result{'__RAW__'}->{"$_"} .= "$biblionumber,$title-$weight;";
                          # create a new ligne for this entry
                          } else {
@@ -2573,6 +2470,44 @@ sub _koha_marc_update_bib_ids {
      }
  }
  
+=head2 _koha_marc_update_biblioitem_cn_sort
+
+=over 4
+
+_koha_marc_update_biblioitem_cn_sort($marc, $biblioitem, $frameworkcode);
+
+=back
+
+Given a MARC bib record and the biblioitem hash, update the
+subfield that contains a copy of the value of biblioitems.cn_sort.
+
+=cut
+
+sub _koha_marc_update_biblioitem_cn_sort {
+    my $marc = shift;
+    my $biblioitem = shift;
+    my $frameworkcode= shift;
+
+    my ($biblioitem_tag, $biblioitem_subfield ) = GetMarcFromKohaField("biblioitems.cn_sort",$frameworkcode);
+    return unless $biblioitem_tag;
+
+    my ($cn_sort) = GetClassSort($biblioitem->{'biblioitems.cn_source'}, $biblioitem->{'cn_class'}, $biblioitem->{'cn_item'} );
+
+    if (my $field = $marc->field($biblioitem_tag)) {
+        $field->delete_subfield(code => $biblioitem_subfield);
+        if ($cn_sort ne '') {
+            $field->add_subfields($biblioitem_subfield => $cn_sort);
+        }
+    } else {
+        # if we get here, no biblioitem tag is present in the MARC record, so
+        # we'll create it if $cn_sort is not empty -- this would be
+        # an odd combination of events, however
+        if ($cn_sort) {
+            $marc->insert_grouped_field(MARC::Field->new($biblioitem_tag, ' ', ' ', $biblioitem_subfield => $cn_sort));
+        }
+    }
+}
+
  =head2 _koha_add_biblio
  
  =over 4