cleanup, collect unique_id
author Dobrica Pavlinusic <dpavlin@rot13.org>
Tue, 14 Nov 2023 11:04:51 +0000 (12:04 +0100)
committer Dobrica Pavlinusic <dpavlin@rot13.org>
Tue, 14 Nov 2023 11:04:51 +0000 (12:04 +0100)
upari.pl

index 72ef4aa..85e7285 100755 (executable)
--- a/upari.pl
+++ b/upari.pl
@@ -39,18 +39,23 @@ foreach my $nr ( 1 .. 4 ) {
                );
 
                my $num = uc $row->[0];
+               if ( length $num < 3 ) {
+                       print "IGNORE $nr ",dump($row->[ 0 .. 5 ]),$/;
+                       $stat->{ignore}->{$nr}++;
+                       next;
+               }
 
                my $key_id = $row->[1] . '-' . $row->[2];
 
-               if ( $nr == 1 ) {
-                       push @{ $keys->{ $key_id }->{ $num } }, uc $row->[0];
-               }
-               if ( length($num) > 3 && $nr > 1 && ! exists $stat->{_}->{ $id } ) {
-                       $stat->{error}->{new_exact}->{$nr}++;
+               $keys->{ $key_id }->{ $num }++;
+       
+               if ( $nr > 1 && ! exists $stat->{_}->{ $id } ) {
+                       $stat->{new_exact}->{$nr}++;
 
                        my @candidates;
                        my $limit_sim = 0.9;
                        foreach my $key ( keys %{ $keys->{ $key_id } } ) {
+                               next if $key eq $num;
                                my $s = similarity $num, $key, $limit_sim;
                                #warn "# $num $key $s\n";
                                if ($s > $limit_sim ) {
@@ -64,15 +69,13 @@ foreach my $nr ( 1 .. 4 ) {
                                        my $new_num = $candidates[$i]->{key};
                                        $id = join('-',uc $new_num, $row->[1], $row->[2]);
                                        print "SIMILARITY$multi $nr $num -> $new_num\n";
-                                       push @{ $keys->{ $key_id }->{ $id } }, uc $id;
+                                       $keys->{ $key_id }->{ $id }++;
                                }
                        } else {
                                $stat->{similarity_none}->{$nr}++;
-                               push @{ $keys->{ $key_id }->{ $num } }, uc $row->[0];
+                               $stat->{unique_id}->{$id}++;
                        }
 
-
-
                }
 
                $stat->{exists}->{$nr}++ if exists $stat->{_}->{ $id };