projects
/
csv-join-similarity
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
d94a52c
)
collect A_ counts (original data stats) only on first loop
author
Dobrica Pavlinusic
<dpavlin@rot13.org>
Tue, 21 Nov 2023 16:14:38 +0000
(17:14 +0100)
committer
Dobrica Pavlinusic
<dpavlin@rot13.org>
Tue, 21 Nov 2023 16:14:38 +0000
(17:14 +0100)
upari.pl
patch
|
blob
|
history
diff --git a/upari.pl b/upari.pl
index 3430566..cbb178f 100755 (executable)
--- a/upari.pl
+++ b/upari.pl
@@ -10,6 +10,8 @@
use Data::Dump qw(dump);
use String::Similarity;
use Storable;
use String::Similarity;
use Storable;
+my $debug = $ENV{DEBUG};
+
my @files = qw( 1.csv 2.csv 3.csv 4.csv );
our $stat;
my @files = qw( 1.csv 2.csv 3.csv 4.csv );
our $stat;
@@ -20,7 +22,7 @@
sub duplicate {
my $use;
$use->{$_}++ foreach ( map { @{ $stat->{_}->{$_} } } @for );
my $duplicate = grep { $use->{$_} > 1 } keys %$use;
my $use;
$use->{$_}++ foreach ( map { @{ $stat->{_}->{$_} } } @for );
my $duplicate = grep { $use->{$_} > 1 } keys %$use;
- print "XXX use @for ",dump($use),$/ if $duplicate;
+ print "XXX use @for ",dump($use),$/ if $debug && $duplicate;
return $duplicate;
}
return $duplicate;
}
@@ -44,13 +46,13 @@
sub candidates {
#print "XXX use $num $key ",dump($use);
my $duplicate = grep { $use->{$_} > 1 } keys %$use;
if ( $duplicate ) {
#print "XXX use $num $key ",dump($use);
my $duplicate = grep { $use->{$_} > 1 } keys %$use;
if ( $duplicate ) {
- print "XXX $limit_sim suggest duplicate $num $key SKIP duplicate ",dump($use), $/;
+ print "XXX $limit_sim suggest duplicate $num $key SKIP duplicate ",dump($use), $/ if $debug;
$stat->{suggest}->{duplicate}++;
} else {
push @candidates, { key => $key, s => $s };
}
} else {
$stat->{suggest}->{duplicate}++;
} else {
push @candidates, { key => $key, s => $s };
}
} else {
- print "XXX $limit_sim candidates $key missing\n";
+ print "XXX $limit_sim candidates $key missing\n" if $debug;
}
}
}
}
}
}
@@ -121,6 +123,7 @@
foreach my $nr ( 1 .. 4 ) {
+my $first = 1;
# 0.9 - 0.7 -- 0.6 is too lax
foreach my $limit ( 0.7, 0.6 ) {
warn "XXX limit $limit\n";
# 0.9 - 0.7 -- 0.6 is too lax
foreach my $limit ( 0.7, 0.6 ) {
warn "XXX limit $limit\n";
@@ -135,15 +138,17 @@
foreach my $id ( sort keys %{ $stat->{_} } ) {
#print "SKIP[$id]";
next;
}
#print "SKIP[$id]";
next;
}
- $stat->{A_count}->{ scalar @val }++;
- $stat->{A_count_total}++;
-
- #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates
-
my $u;
$u->{$_}++ foreach @val;
my @u_v = sort keys %$u;
my $u;
$u->{$_}++ foreach @val;
my @u_v = sort keys %$u;
- $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates
+ if ( $first == 1 ) {
+ $stat->{A_count}->{ scalar @val }++;
+ $stat->{A_count_total}++;
+
+ #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates
+
+ $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates
+ }
if ( $#u_v < 3 ) { # single, double
my ( $num, $key_id ) = split(/-/,$id,2);
if ( $#u_v < 3 ) { # single, double
my ( $num, $key_id ) = split(/-/,$id,2);
@@ -204,7 +209,7 @@
foreach my $id ( sort keys %{ $stat->{_} } ) {
$stat->{"B${limit}_count_val"}->{ join(' ', @u_v ) }++; # without duplicates
}
$stat->{"B${limit}_count_val"}->{ join(' ', @u_v ) }++; # without duplicates
}
-
+$first++;
} # for $limit
print "# stat = ",dump( $stat );
} # for $limit
print "# stat = ",dump( $stat );