use String::Similarity;
use Storable;
+my $debug = $ENV{DEBUG};
+
my @files = qw( 1.csv 2.csv 3.csv 4.csv );
our $stat;
my $use;
$use->{$_}++ foreach ( map { @{ $stat->{_}->{$_} } } @for );
my $duplicate = grep { $use->{$_} > 1 } keys %$use;
- print "XXX use @for ",dump($use),$/ if $duplicate;
+ print "XXX use @for ",dump($use),$/ if $debug && $duplicate;
return $duplicate;
}
#print "XXX use $num $key ",dump($use);
my $duplicate = grep { $use->{$_} > 1 } keys %$use;
if ( $duplicate ) {
- print "XXX $limit_sim suggest duplicate $num $key SKIP duplicate ",dump($use), $/;
+ print "XXX $limit_sim suggest duplicate $num $key SKIP duplicate ",dump($use), $/ if $debug;
$stat->{suggest}->{duplicate}++;
} else {
push @candidates, { key => $key, s => $s };
}
} else {
- print "XXX $limit_sim candidates $key missing\n";
+ print "XXX $limit_sim candidates $key missing\n" if $debug;
}
}
}
+my $first = 1;
# 0.9 - 0.7 -- 0.6 is too lax (NOTE(review): 0.6 is still in the list below -- confirm)
foreach my $limit ( 0.7, 0.6 ) {
warn "XXX limit $limit\n";
#print "SKIP[$id]";
next;
}
- $stat->{A_count}->{ scalar @val }++;
- $stat->{A_count_total}++;
-
- #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates
-
my $u;
$u->{$_}++ foreach @val;
my @u_v = sort keys %$u;
- $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates
+ if ( $first == 1 ) {
+ $stat->{A_count}->{ scalar @val }++;
+ $stat->{A_count_total}++;
+
+ #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates
+
+ $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates
+ }
# Count this combination of unique values under the per-$limit "B" statistic,
# but only when the unique-value list is short.
# NOTE(review): `$#u_v < 3` holds for 0..3 unique values, which is wider than
# the "single, double" wording on the next line -- confirm which is intended.
if ( $#u_v < 3 ) { # single, double
my ( $num, $key_id ) = split(/-/,$id,2); # neither variable is used inside this block -- TODO confirm still needed
$stat->{"B${limit}_count_val"}->{ join(' ', @u_v ) }++; # without duplicates
}
-
+$first++;
} # for $limit
print "# stat = ",dump( $stat );