similarity 0.9, merge all suggestions
author Dobrica Pavlinusic <dpavlin@rot13.org>
Tue, 14 Nov 2023 09:43:07 +0000 (10:43 +0100)
committer Dobrica Pavlinusic <dpavlin@rot13.org>
Tue, 14 Nov 2023 09:43:07 +0000 (10:43 +0100)
upari.pl

index 9366a8e..72ef4aa 100755 (executable)
--- a/upari.pl
+++ b/upari.pl
@@ -49,22 +49,23 @@ foreach my $nr ( 1 .. 4 ) {
                        $stat->{error}->{new_exact}->{$nr}++;
 
                        my @candidates;
+                       my $limit_sim = 0.9;
                        foreach my $key ( keys %{ $keys->{ $key_id } } ) {
-                               my $s = similarity $num, $key, 0.4;
+                               my $s = similarity $num, $key, $limit_sim;
                                #warn "# $num $key $s\n";
-                               if ($s > 0.8 ) {
+                               if ($s > $limit_sim ) {
                                        push @candidates, { key => $key, s => $s };
                                }
                        }
-                       if ( $#candidates == 0 ) {
-                               $stat->{similarity}->{$nr}++;
-                               my $new_num = $candidates[0]->{key};
-                               $id = join('-',uc $new_num, $row->[1], $row->[2]);
-                               print "SIMILARITY $nr $num -> $new_num\n";
-                               push @{ $keys->{ $key_id }->{ $id } }, uc $id;
-                       } elsif ( $#candidates > 1 ) {
-                               warn "# similarity IGNORED $num = ",dump( @candidates ),$/;
-                               $stat->{similarity_multiple}->{$nr}++;
+                       if ( @candidates ) {
+                               my $multi =  $#candidates > 0 ? 'multi' : '';
+                               $stat->{ 'similarity' . $multi }->{$nr}++;
+                               foreach my $i ( 0 .. $#candidates ) {
+                                       my $new_num = $candidates[$i]->{key};
+                                       $id = join('-',uc $new_num, $row->[1], $row->[2]);
+                                       print "SIMILARITY$multi $nr $num -> $new_num\n";
+                                       push @{ $keys->{ $key_id }->{ $id } }, uc $id;
+                               }
                        } else {
                                $stat->{similarity_none}->{$nr}++;
                                push @{ $keys->{ $key_id }->{ $num } }, uc $row->[0];