From: Dobrica Pavlinusic Date: Tue, 14 Nov 2023 09:43:07 +0000 (+0100) Subject: similarity 0.9, merge all suggestions X-Git-Url: http://git.rot13.org/?a=commitdiff_plain;h=543260c582708142468b530844363888ee09c8cf;p=csv-join-similarity similarity 0.9, merge all suggestions --- diff --git a/upari.pl b/upari.pl index 9366a8e..72ef4aa 100755 --- a/upari.pl +++ b/upari.pl @@ -49,22 +49,23 @@ foreach my $nr ( 1 .. 4 ) { $stat->{error}->{new_exact}->{$nr}++; my @candidates; + my $limit_sim = 0.9; foreach my $key ( keys %{ $keys->{ $key_id } } ) { - my $s = similarity $num, $key, 0.4; + my $s = similarity $num, $key, $limit_sim; #warn "# $num $key $s\n"; - if ($s > 0.8 ) { + if ($s > $limit_sim ) { push @candidates, { key => $key, s => $s }; } } - if ( $#candidates == 0 ) { - $stat->{similarity}->{$nr}++; - my $new_num = $candidates[0]->{key}; - $id = join('-',uc $new_num, $row->[1], $row->[2]); - print "SIMILARITY $nr $num -> $new_num\n"; - push @{ $keys->{ $key_id }->{ $id } }, uc $id; - } elsif ( $#candidates > 1 ) { - warn "# similarity IGNORED $num = ",dump( @candidates ),$/; - $stat->{similarity_multiple}->{$nr}++; + if ( @candidates ) { + my $multi = $#candidates > 0 ? 'multi' : ''; + $stat->{ 'similarity' . $multi }->{$nr}++; + foreach my $i ( 0 .. $#candidates ) { + my $new_num = $candidates[$i]->{key}; + $id = join('-',uc $new_num, $row->[1], $row->[2]); + print "SIMILARITY$multi $nr $num -> $new_num\n"; + push @{ $keys->{ $key_id }->{ $id } }, uc $id; + } } else { $stat->{similarity_none}->{$nr}++; push @{ $keys->{ $key_id }->{ $num } }, uc $row->[0];