From fd931e43468f06c89b8d2ac876283dc140350ac5 Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Tue, 14 Nov 2023 12:04:51 +0100 Subject: [PATCH] cleanup, collect unique_id --- upari.pl | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/upari.pl b/upari.pl index 72ef4aa..85e7285 100755 --- a/upari.pl +++ b/upari.pl @@ -39,18 +39,23 @@ foreach my $nr ( 1 .. 4 ) { ); my $num = uc $row->[0]; + if ( length $num < 3 ) { + print "IGNORE $nr ",dump($row->[ 0 .. 5 ]),$/; + $stat->{ignore}->{$nr}++; + next; + } my $key_id = $row->[1] . '-' . $row->[2]; - if ( $nr == 1 ) { - push @{ $keys->{ $key_id }->{ $num } }, uc $row->[0]; - } - if ( length($num) > 3 && $nr > 1 && ! exists $stat->{_}->{ $id } ) { - $stat->{error}->{new_exact}->{$nr}++; + $keys->{ $key_id }->{ $num }++; + + if ( $nr > 1 && ! exists $stat->{_}->{ $id } ) { + $stat->{new_exact}->{$nr}++; my @candidates; my $limit_sim = 0.9; foreach my $key ( keys %{ $keys->{ $key_id } } ) { + next if $key eq $num; my $s = similarity $num, $key, $limit_sim; #warn "# $num $key $s\n"; if ($s > $limit_sim ) { @@ -64,15 +69,13 @@ foreach my $nr ( 1 .. 4 ) { my $new_num = $candidates[$i]->{key}; $id = join('-',uc $new_num, $row->[1], $row->[2]); print "SIMILARITY$multi $nr $num -> $new_num\n"; - push @{ $keys->{ $key_id }->{ $id } }, uc $id; + $keys->{ $key_id }->{ $id }++; } } else { $stat->{similarity_none}->{$nr}++; - push @{ $keys->{ $key_id }->{ $num } }, uc $row->[0]; + $stat->{unique_id}->{$id}++; } - - } $stat->{exists}->{$nr}++ if exists $stat->{_}->{ $id }; -- 2.20.1