From f381e69257855f9528384b707b521ffda8cd488c Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Tue, 21 Nov 2023 17:14:38 +0100 Subject: [PATCH] collect A_ counts (original data stats) only on first loop --- upari.pl | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/upari.pl b/upari.pl index 3430566..cbb178f 100755 --- a/upari.pl +++ b/upari.pl @@ -10,6 +10,8 @@ use Data::Dump qw(dump); use String::Similarity; use Storable; +my $debug = $ENV{DEBUG}; + my @files = qw( 1.csv 2.csv 3.csv 4.csv ); our $stat; @@ -20,7 +22,7 @@ sub duplicate { my $use; $use->{$_}++ foreach ( map { @{ $stat->{_}->{$_} } } @for ); my $duplicate = grep { $use->{$_} > 1 } keys %$use; - print "XXX use @for ",dump($use),$/ if $duplicate; + print "XXX use @for ",dump($use),$/ if $debug && $duplicate; return $duplicate; } @@ -44,13 +46,13 @@ sub candidates { #print "XXX use $num $key ",dump($use); my $duplicate = grep { $use->{$_} > 1 } keys %$use; if ( $duplicate ) { - print "XXX $limit_sim suggest duplicate $num $key SKIP duplicate ",dump($use), $/; + print "XXX $limit_sim suggest duplicate $num $key SKIP duplicate ",dump($use), $/ if $debug; $stat->{suggest}->{duplicate}++; } else { push @candidates, { key => $key, s => $s }; } } else { - print "XXX $limit_sim candidates $key missing\n"; + print "XXX $limit_sim candidates $key missing\n" if $debug; } } } @@ -121,6 +123,7 @@ foreach my $nr ( 1 .. 4 ) { +my $first = 1; # 0.9 - 0.7 -- 0.6 is too lax foreach my $limit ( 0.7, 0.6 ) { warn "XXX limit $limit\n"; @@ -135,15 +138,17 @@ foreach my $id ( sort keys %{ $stat->{_} } ) { #print "SKIP[$id]"; next; } - $stat->{A_count}->{ scalar @val }++; - $stat->{A_count_total}++; - - #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates - my $u; $u->{$_}++ foreach @val; my @u_v = sort keys %$u; - $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates + if ( $first == 1 ) { + $stat->{A_count}->{ scalar @val }++; + $stat->{A_count_total}++; + + #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates + + $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates + } if ( $#u_v < 3 ) { # single, double my ( $num, $key_id ) = split(/-/,$id,2); @@ -204,7 +209,7 @@ foreach my $id ( sort keys %{ $stat->{_} } ) { $stat->{"B${limit}_count_val"}->{ join(' ', @u_v ) }++; # without duplicates } - +$first++; } # for $limit print "# stat = ",dump( $stat ); -- 2.20.1