env LIMIT overrides the similarity limit; 0.9 is the default
author: Dobrica Pavlinusic <dpavlin@rot13.org>
Wed, 15 Nov 2023 09:03:17 +0000 (10:03 +0100)
committer: Dobrica Pavlinusic <dpavlin@rot13.org>
Wed, 15 Nov 2023 20:26:18 +0000 (21:26 +0100)
upari.pl

index 444b9b1..0ff27a0 100755 (executable)
--- a/upari.pl
+++ b/upari.pl
@@ -28,6 +28,7 @@ sub duplicate {
 
 sub candidates {
        my ( $num, $key_id, $limit_sim ) = @_;
+       $limit_sim //= $ENV{LIMIT};
        $limit_sim //= 0.9;
 
        my @candidates;
@@ -45,13 +46,13 @@ sub candidates {
                                #print "XXX use $num $key ",dump($use);
                                my $duplicate = grep { $use->{$_} > 1 } keys %$use;
                                if ( $duplicate ) {
-                                       print "XXX suggest duplicate  $num $key SKIP duplicate ",dump($use), $/;
+                                       print "XXX $limit_sim suggest duplicate  $num $key SKIP duplicate ",dump($use), $/;
                                        $stat->{suggest}->{duplicate}++;
                                } else {
                                        push @candidates, { key => $key, s => $s };
                                }
                        } else {
-                               print "XXX candidates $key missing\n";
+                               print "XXX $limit_sim candidates $key missing\n";
                        }
                }
        }
@@ -129,7 +130,7 @@ foreach my $id ( sort keys %{ $stat->{_} } ) {
 
        if ( $#u_v < 3 ) { # single, double
                my ( $num, $key_id ) = split(/-/,$id,2);
-               my @candidates = candidates $num => $key_id, 0.7; # XXX 0.9 too high, 0.8 better, 0.7 too lax
+               my @candidates = candidates $num => $key_id; #, 0.7; # XXX 0.9 too high, 0.8 better, 0.7 too lax
                if ( @candidates ) {
                        print "MERGE $num $key_id ", dump( @candidates ), ' val=', dump( \@val ), $/;
                        my @keys = map { $_->{key} } @candidates;