projects
/
csv-join-similarity
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
27a2617
)
env LIMIT=0.9 is default
author
Dobrica Pavlinusic
<dpavlin@rot13.org>
Wed, 15 Nov 2023 09:03:17 +0000
(10:03 +0100)
committer
Dobrica Pavlinusic
<dpavlin@rot13.org>
Wed, 15 Nov 2023 20:26:18 +0000
(21:26 +0100)
upari.pl
patch
|
blob
|
history
diff --git a/upari.pl b/upari.pl
index 444b9b1..0ff27a0 100755 (executable)
--- a/upari.pl
+++ b/upari.pl
@@ -28,6 +28,7 @@ sub duplicate {
sub candidates {
my ( $num, $key_id, $limit_sim ) = @_;
sub candidates {
my ( $num, $key_id, $limit_sim ) = @_;
+ $limit_sim //= $ENV{LIMIT};
$limit_sim //= 0.9;
my @candidates;
$limit_sim //= 0.9;
my @candidates;
@@ -45,13 +46,13 @@ sub candidates {
#print "XXX use $num $key ",dump($use);
my $duplicate = grep { $use->{$_} > 1 } keys %$use;
if ( $duplicate ) {
#print "XXX use $num $key ",dump($use);
my $duplicate = grep { $use->{$_} > 1 } keys %$use;
if ( $duplicate ) {
- print "XXX suggest duplicate $num $key SKIP duplicate ",dump($use), $/;
+ print "XXX $limit_sim suggest duplicate $num $key SKIP duplicate ",dump($use), $/;
$stat->{suggest}->{duplicate}++;
} else {
push @candidates, { key => $key, s => $s };
}
} else {
$stat->{suggest}->{duplicate}++;
} else {
push @candidates, { key => $key, s => $s };
}
} else {
- print "XXX candidates $key missing\n";
+ print "XXX $limit_sim candidates $key missing\n";
}
}
}
}
}
}
@@ -129,7 +130,7 @@ foreach my $id ( sort keys %{ $stat->{_} } ) {
if ( $#u_v < 3 ) { # single, double
my ( $num, $key_id ) = split(/-/,$id,2);
if ( $#u_v < 3 ) { # single, double
my ( $num, $key_id ) = split(/-/,$id,2);
- my @candidates = candidates $num => $key_id, 0.7; # XXX 0.9 too high, 0.8 better, 0.7 too lax
+ my @candidates = candidates $num => $key_id; # , 0.7; # XXX 0.9 too high, 0.8 better, 0.7 too lax
if ( @candidates ) {
print "MERGE $num $key_id ", dump( @candidates ), ' val=', dump( \@val ), $/;
my @keys = map { $_->{key} } @candidates;
if ( @candidates ) {
print "MERGE $num $key_id ", dump( @candidates ), ' val=', dump( \@val ), $/;
my @keys = map { $_->{key} } @candidates;