X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=C4%2FMatcher.pm;h=40e6e5f0309c471e186fe18cdc505b2f51622288;hb=c80cd97b68c76312062780783427542a5268c61f;hp=a0f1a9820727390f8f24d538ece04365d0e3c37a;hpb=9b47b82a77a18e677d2dae30b73dcae0d9abba5b;p=koha.git diff --git a/C4/Matcher.pm b/C4/Matcher.pm index a0f1a98207..40e6e5f030 100644 --- a/C4/Matcher.pm +++ b/C4/Matcher.pm @@ -1,34 +1,29 @@ package C4::Matcher; -# Copyright (C) 2007 LibLime +# Copyright (C) 2007 LibLime, 2012 C & P Bibliography Services # # This file is part of Koha. # -# Koha is free software; you can redistribute it and/or modify it under the -# terms of the GNU General Public License as published by the Free Software -# Foundation; either version 2 of the License, or (at your option) any later -# version. +# Koha is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. # -# Koha is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR -# A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# Koha is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. # -# You should have received a copy of the GNU General Public License along with -# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place, -# Suite 330, Boston, MA 02111-1307 USA +# You should have received a copy of the GNU General Public License +# along with Koha; if not, see . -use strict; -use C4::Context; -use MARC::Record; -use C4::Search; -use C4::Biblio; +use Modern::Perl; -use vars qw($VERSION); +use MARC::Record; -BEGIN { - # set the version for version checking - $VERSION = 3.01; -} +use Koha::SearchEngine; +use Koha::SearchEngine::Search; +use Koha::Util::Normalize qw/legacy_default remove_spaces upper_case lower_case/; =head1 NAME @@ -36,37 +31,33 @@ C4::Matcher - find MARC records matching another one =head1 SYNOPSIS -=over 4 + my @matchers = C4::Matcher::GetMatcherList(); -my @matchers = C4::Matcher::GetMatcherList(); + my $matcher = C4::Matcher->new($record_type); + $matcher->threshold($threshold); + $matcher->code($code); + $matcher->description($description); -my $matcher = C4::Matcher->new($record_type); -$matcher->threshold($threshold); -$matcher->code($code); -$matcher->description($description); + $matcher->add_simple_matchpoint('isbn', 1000, '020', 'a', -1, 0, ''); + $matcher->add_simple_matchpoint('Date', 1000, '008', '', 7, 4, ''); + $matcher->add_matchpoint('isbn', 1000, [ { tag => '020', subfields => 'a', norms => [] } ]); -$matcher->add_simple_matchpoint('isbn', 1000, '020', 'a', -1, 0, ''); -$matcher->add_simple_matchpoint('Date', 1000, '008', '', 7, 4, ''); -$matcher->add_matchpoint('isbn', 1000, [ { tag => '020', subfields => 'a', norms => [] } ]); + $matcher->add_simple_required_check('245', 'a', -1, 0, '', '245', 'a', -1, 0, ''); + $matcher->add_required_check([ { tag => '245', subfields => 'a', norms => [] } ], + [ { tag => '245', subfields => 'a', norms => [] } ]); -$matcher->add_simple_required_check('245', 'a', -1, 0, '', '245', 'a', -1, 0, ''); -$matcher->add_required_check([ { tag => '245', subfields => 'a', norms => [] } ], - [ { tag => '245', subfields => 'a', norms => [] } ]); + my @matches = $matcher->get_matches($marc_record, $max_matches); -my @matches = $matcher->get_matches($marc_record, $max_matches); + foreach $match (@matches) { -foreach $match (@matches) { + # matches already sorted in order of + # decreasing score + print "record ID: $match->{'record_id'}; + print "score: $match->{'score'}; - # matches already sorted in order of - # decreasing score - print "record ID: $match->{'record_id'}; - print "score: $match->{'score'}; + } -} - -my $matcher_description = $matcher->dump(); - -=back + my $matcher_description = $matcher->dump(); =head1 FUNCTIONS @@ -74,18 +65,14 @@ my $matcher_description = $matcher->dump(); =head2 GetMatcherList -=over 4 - -my @matchers = C4::Matcher::GetMatcherList(); - -=back + my @matchers = C4::Matcher::GetMatcherList(); Returns an array of hashrefs list all matchers present in the database. Each hashref includes: -matcher_id -code -description + * matcher_id + * code + * description =cut @@ -101,17 +88,27 @@ sub GetMatcherList { return @results; } -=head1 METHODS +=head2 GetMatcherId + + my $matcher_id = C4::Matcher::GetMatcherId($code); + +Returns the matcher_id of a code. =cut -=head2 new +sub GetMatcherId { + my ($code) = @_; + my $dbh = C4::Context->dbh; -=over 4 + my $matcher_id = $dbh->selectrow_array("SELECT matcher_id FROM marc_matchers WHERE code = ?", undef, $code); + return $matcher_id; +} -my $matcher = C4::Matcher->new($record_type, $threshold); +=head1 METHODS -=back +=head2 new + + my $matcher = C4::Matcher->new($record_type, $threshold); Creates a new Matcher. C<$record_type> indicates which search database to use, e.g., 'biblio' or 'authority' and defaults to @@ -150,11 +147,7 @@ sub new { =head2 fetch -=over 4 - -my $matcher = C4::Matcher->fetch($id); - -=back + my $matcher = C4::Matcher->fetch($id); Creates a matcher object from the version stored in the database. If a matcher with the given @@ -243,11 +236,7 @@ sub _fetch_matchpoint { =head2 store -=over 4 - -my $id = $matcher->store(); - -=back + my $id = $matcher->store(); Stores matcher in database. The return value is the ID of the marc_matchers row. If the matcher was @@ -372,11 +361,7 @@ sub _store_matchpoint { =head2 delete -=over 4 - -C4::Matcher->delete($id); - -=back + C4::Matcher->delete($id); Deletes the matcher of the specified ID from the database. @@ -392,14 +377,24 @@ sub delete { $sth->execute($matcher_id); # relying on cascading deletes to clean up everything } -=head2 threshold +=head2 record_type -=over 4 + $matcher->record_type('biblio'); + my $record_type = $matcher->record_type(); -$matcher->threshold(1000); -my $threshold = $matcher->threshold(); +Accessor method. -=back +=cut + +sub record_type { + my $self = shift; + @_ ? $self->{'record_type'} = shift : $self->{'record_type'}; +} + +=head2 threshold + + $matcher->threshold(1000); + my $threshold = $matcher->threshold(); Accessor method. @@ -412,12 +407,8 @@ sub threshold { =head2 _id -=over 4 - -$matcher->_id(123); -my $id = $matcher->_id(); - -=back + $matcher->_id(123); + my $id = $matcher->_id(); Accessor method. Note that using this method to set the DB ID of the matcher should not be @@ -432,12 +423,8 @@ sub _id { =head2 code -=over 4 - -$matcher->code('ISBN'); -my $code = $matcher->code(); - -=back + $matcher->code('ISBN'); + my $code = $matcher->code(); Accessor method. @@ -450,12 +437,8 @@ sub code { =head2 description -=over 4 - -$matcher->description('match on ISBN'); -my $description = $matcher->description(); - -=back + $matcher->description('match on ISBN'); + my $description = $matcher->description(); Accessor method. @@ -468,11 +451,7 @@ sub description { =head2 add_matchpoint -=over 4 - -$matcher->add_matchpoint($index, $score, $matchcomponents); - -=back + $matcher->add_matchpoint($index, $score, $matchcomponents); Adds a matchpoint that may include multiple components. The $index parameter identifies the index that will be searched, while $score @@ -510,13 +489,10 @@ sub add_matchpoint { =head2 add_simple_matchpoint -=over 4 + $matcher->add_simple_matchpoint($index, $score, $source_tag, + $source_subfields, $source_offset, + $source_length, $source_normalizer); -$matcher->add_simple_matchpoint($index, $score, $source_tag, $source_subfields, - $source_offset, $source_length, - $source_normalizer); - -=back Adds a simple matchpoint rule -- after composing a key based on the source tag and subfields, normalized per the normalization fuction, search the index. All records retrieved @@ -538,11 +514,7 @@ sub add_simple_matchpoint { =head2 add_required_check -=over 4 - -$match->add_required_check($source_matchpoint, $target_matchpoint); - -=back + $match->add_required_check($source_matchpoint, $target_matchpoint); Adds a required check definition. A required check means that in order for a match to be considered valid, the key derived from the @@ -594,16 +566,14 @@ sub add_required_check { =head2 add_simple_required_check -$matcher->add_simple_required_check($source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer, - $target_tag, $target_subfields, $target_offset, $target_length, $target_normalizer); - -=over 4 + $matcher->add_simple_required_check($source_tag, $source_subfields, + $source_offset, $source_length, $source_normalizer, + $target_tag, $target_subfields, $target_offset, + $target_length, $target_normalizer); Adds a required check, which requires that the normalized keys made from the source and targets must match for a match to be considered valid. -=back - =cut sub add_simple_required_check { @@ -619,19 +589,15 @@ sub add_simple_required_check { ); } -=head2 find_matches - -=over 4 - -my @matches = $matcher->get_matches($marc_record, $max_matches); -foreach $match (@matches) { - # matches already sorted in order of - # decreasing score - print "record ID: $match->{'record_id'}; - print "score: $match->{'score'}; -} +=head2 get_matches -=back + my @matches = $matcher->get_matches($marc_record, $max_matches); + foreach $match (@matches) { + # matches already sorted in order of + # decreasing score + print "record ID: $match->{'record_id'}; + print "score: $match->{'score'}; + } Identifies all of the records matching the given MARC record. For a record already in the database to be considered a match, it must meet the following criteria: @@ -655,17 +621,81 @@ sub get_matches { my %matches = (); - foreach my $matchpoint (@{ $self->{'matchpoints'} }) { - my @source_keys = _get_match_keys($source_record, $matchpoint); + my $QParser; + $QParser = C4::Context->queryparser if (C4::Context->preference('UseQueryParser')); + foreach my $matchpoint ( @{ $self->{'matchpoints'} } ) { + my @source_keys = _get_match_keys( $source_record, $matchpoint ); + next if scalar(@source_keys) == 0; + + # FIXME - because of a bug in QueryParser, an expression ofthe + # format 'isbn:"isbn1" || isbn:"isbn2" || isbn"isbn3"...' + # does not get parsed correctly, so we will not + # do AggressiveMatchOnISBN if UseQueryParser is on + @source_keys = C4::Koha::GetVariationsOfISBNs(@source_keys) + if ( $matchpoint->{index} =~ /^isbn$/i + && C4::Context->preference('AggressiveMatchOnISBN') ) + && !C4::Context->preference('UseQueryParser'); + + @source_keys = C4::Koha::GetVariationsOfISSNs(@source_keys) + if ( $matchpoint->{index} =~ /^issn$/i + && C4::Context->preference('AggressiveMatchOnISSN') ) + && !C4::Context->preference('UseQueryParser'); + # build query - my $query = join(" or ", map { "$matchpoint->{'index'}=$_" } @source_keys); - # FIXME only searching biblio index at the moment - my ($error, $searchresults, $total_hits) = SimpleSearch($query); + my $query; + my $error; + my $searchresults; + my $total_hits; + if ( $self->{'record_type'} eq 'biblio' ) { + + if ($QParser) { + $query = join( " || ", + map { "$matchpoint->{'index'}:$_" } @source_keys ); + } + else { + my $phr = ( C4::Context->preference('AggressiveMatchOnISBN') || C4::Context->preference('AggressiveMatchOnISSN') ) ? ',phr' : q{}; + $query = join( " or ", + map { "$matchpoint->{'index'}$phr=\"$_\"" } @source_keys ); + #NOTE: double-quote the values so you don't get a "Embedded truncation not supported" error when a term has a ? in it. + } - warn "search failed ($query) $error" if $error; - foreach my $matched (@$searchresults) { - $matches{$matched} += $matchpoint->{'score'}; + my $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::BIBLIOS_INDEX}); + ( $error, $searchresults, $total_hits ) = + $searcher->simple_search_compat( $query, 0, $max_matches ); + } + elsif ( $self->{'record_type'} eq 'authority' ) { + my $authresults; + my @marclist; + my @and_or; + my @excluding = []; + my @operator; + my @value; + foreach my $key (@source_keys) { + push @marclist, $matchpoint->{'index'}; + push @and_or, 'or'; + push @operator, 'exact'; + push @value, $key; + } + require C4::AuthoritiesMarc; + ( $authresults, $total_hits ) = + C4::AuthoritiesMarc::SearchAuthorities( + \@marclist, \@and_or, \@excluding, \@operator, + \@value, 0, 20, undef, + 'AuthidAsc', 1 + ); + foreach my $result (@$authresults) { + push @$searchresults, $result->{'authid'}; + } + } + + if ( defined $error ) { + warn "search failed ($query) $error"; + } + else { + foreach my $matched ( @{$searchresults} ) { + $matches{$matched} += $matchpoint->{'score'}; + } } } @@ -674,16 +704,23 @@ sub get_matches { # get rid of any that don't meet the required checks %matches = map { _passes_required_checks($source_record, $_, $self->{'required_checks'}) ? ($_ => $matches{$_}) : () } - keys %matches; + keys %matches unless ($self->{'record_type'} eq 'auth'); my @results = (); - foreach my $marcblob (keys %matches) { - my $target_record = MARC::Record->new_from_usmarc($marcblob); - my $result = TransformMarcToKoha(C4::Context->dbh, $target_record, ''); - # FIXME - again, bibliospecific - # also, can search engine be induced to give just the number in the first place? - my $record_number = $result->{'biblionumber'}; - push @results, { 'record_id' => $record_number, 'score' => $matches{$marcblob} }; + if ($self->{'record_type'} eq 'biblio') { + require C4::Biblio; + foreach my $marcblob (keys %matches) { + my $target_record = C4::Search::new_record_from_zebra('biblioserver',$marcblob); + my $record_number; + my $result = C4::Biblio::TransformMarcToKoha($target_record, ''); + $record_number = $result->{'biblionumber'}; + push @results, { 'record_id' => $record_number, 'score' => $matches{$marcblob} }; + } + } elsif ($self->{'record_type'} eq 'authority') { + require C4::AuthoritiesMarc; + foreach my $authid (keys %matches) { + push @results, { 'record_id' => $authid, 'score' => $matches{$authid} }; + } } @results = sort { $b->{'score'} cmp $a->{'score'} } @results; if (scalar(@results) > $max_matches) { @@ -695,11 +732,7 @@ sub get_matches { =head2 dump -=over 4 - -$description = $matcher->dump(); - -=back + $description = $matcher->dump(); Returns a reference to a structure containing all of the information in the matcher object. This is mainly a convenience method to @@ -715,6 +748,7 @@ sub dump { $result->{'matcher_id'} = $self->{'id'}; $result->{'code'} = $self->{'code'}; $result->{'description'} = $self->{'description'}; + $result->{'record_type'} = $self->{'record_type'}; $result->{'matchpoints'} = []; foreach my $matchpoint (@{ $self->{'matchpoints'} }) { @@ -744,6 +778,7 @@ sub _passes_required_checks { } sub _get_match_keys { + my $source_record = shift; my $matchpoint = shift; my $check_only_first_repeat = @_ ? shift : 0; @@ -762,7 +797,7 @@ sub _get_match_keys { # If there are two 003s and two 001s, there will be two keys: # first 003 + first 001 # second 003 + second 001 - + my @keys = (); for (my $i = 0; $i <= $#{ $matchpoint->{'components'} }; $i++) { my $component = $matchpoint->{'components'}->[$i]; @@ -771,22 +806,45 @@ sub _get_match_keys { $j++; last FIELD if $j > 0 and $check_only_first_repeat; last FIELD if $i > 0 and $j > $#keys; - my $key = ""; - if ($field->is_control_field()) { - if ($component->{'length'}) { - $key = _normalize(substr($field->data(), $component->{'offset'}, $component->{'length'})) - # FIXME normalize, substr - } else { - $key = _normalize($field->data()); - } + + my $string; + if ( $field->is_control_field() ) { + $string = $field->data(); } else { - foreach my $subfield ($field->subfields()) { - if (exists $component->{'subfields'}->{$subfield->[0]}) { - $key .= " " . $subfield->[1]; + $string = $field->as_string( + join('', keys %{ $component->{ subfields } }), ' ' # ' ' as separator + ); + } + + if ($component->{'length'}>0) { + $string= substr($string, $component->{'offset'}, $component->{'length'}); + } elsif ($component->{'offset'}) { + $string= substr($string, $component->{'offset'}); + } + + my $norms = $component->{'norms'}; + my $key = $string; + + foreach my $norm ( @{ $norms } ) { + if ( grep { $norm eq $_ } valid_normalization_routines() ) { + if ( $norm eq 'remove_spaces' ) { + $key = remove_spaces($key); + } + elsif ( $norm eq 'upper_case' ) { + $key = upper_case($key); + } + elsif ( $norm eq 'lower_case' ) { + $key = lower_case($key); } + elsif ( $norm eq 'legacy_default' ) { + $key = legacy_default($key); + } + } else { + warn "Invalid normalization routine required ($norm)" + unless $norm eq 'none'; } - $key = _normalize($key); } + if ($i == 0) { push @keys, $key if $key; } else { @@ -795,7 +853,6 @@ sub _get_match_keys { } } return @keys; - } @@ -812,14 +869,14 @@ sub _parse_match_component { return $component; } -# FIXME - default normalizer -sub _normalize { - my $value = uc shift; - $value =~ s/^\s+//; - $value =~ s/^\s+$//; - $value =~ s/\s+/ /g; - $value =~ s/[.;,\]\[\)\(\/"']//g; - return $value; +sub valid_normalization_routines { + + return ( + 'remove_spaces', + 'upper_case', + 'lower_case', + 'legacy_default' + ); } 1; @@ -827,7 +884,7 @@ __END__ =head1 AUTHOR -Koha Development Team +Koha Development Team Galen Charlton