merge at end

author Dobrica Pavlinusic <dpavlin@rot13.org>

Tue, 14 Nov 2023 19:45:39 +0000 (20:45 +0100)

committer Dobrica Pavlinusic <dpavlin@rot13.org>

Tue, 14 Nov 2023 19:45:39 +0000 (20:45 +0100)
author Dobrica Pavlinusic <dpavlin@rot13.org>
Tue, 14 Nov 2023 19:45:39 +0000 (20:45 +0100)
committer Dobrica Pavlinusic <dpavlin@rot13.org>
Tue, 14 Nov 2023 19:45:39 +0000 (20:45 +0100)
diff --git a/upari.pl b/upari.pl

index 85e7285..beebedf 100755 (executable)
--- a/upari.pl
+++ b/upari.pl
@@ -17,6 +17,22 @@ my @files = qw( 1.csv 2.csv 3.csv 4.csv );
  my $stat;
  my $keys;
  
+sub candidates {
+       my ( $num, $key_id, $limit_sim ) = @_;
+       $limit_sim //= 0.9;
+
+       my @candidates;
+       foreach my $key ( keys %{ $keys->{ $key_id } } ) {
+               next if $key eq $num;
+               my $s = similarity $num, $key, $limit_sim;
+               #warn "# $num $key $s\n";
+               if ($s > $limit_sim ) {
+                       push @candidates, { key => $key, s => $s };
+               }
+       }
+       return @candidates;
+}
+
  my $keys_file = 'keys.storable';
  if ( -e $keys_file ) {
         $keys = retrieve($keys_file) or die "$keys_file: $!";
@@ -32,6 +48,8 @@ foreach my $nr ( 1 .. 4 ) {
         while (my $row = $csv->getline ($fh)) {
                 $stat->{lines}->{$nr}++;
                 $stat->{ $file }->{lines}++;
+               $row->[1] =~ s/\D+//g && print 'c1';
+               $row->[2] =~ s/\D+//g && print 'c2';
                 my $id = join('-',
                         uc $row->[0],
                         $row->[1],
@@ -47,21 +65,13 @@ foreach my $nr ( 1 .. 4 ) {
  
                 my $key_id = $row->[1] . '-' . $row->[2];
  
+               $stat->{A_key_id}->{$key_id}->{$nr}++;
+
                 $keys->{ $key_id }->{ $num }++;
-       
+=for later     
                 if ( $nr > 1 && ! exists $stat->{_}->{ $id } ) {
                         $stat->{new_exact}->{$nr}++;
-
-                       my @candidates;
-                       my $limit_sim = 0.9;
-                       foreach my $key ( keys %{ $keys->{ $key_id } } ) {
-                               next if $key eq $num;
-                               my $s = similarity $num, $key, $limit_sim;
-                               #warn "# $num $key $s\n";
-                               if ($s > $limit_sim ) {
-                                       push @candidates, { key => $key, s => $s };
-                               }
-                       }
+                       my @candidates = candidates $num => $key_id, 0.8; # 0.9 too strict
                         if ( @candidates ) {
                                 my $multi =  $#candidates > 0 ? 'multi' : '';
                                 $stat->{ 'similarity' . $multi }->{$nr}++;
@@ -77,7 +87,7 @@ foreach my $nr ( 1 .. 4 ) {
                         }
  
                 }
-
+=cut
                 $stat->{exists}->{$nr}++ if exists $stat->{_}->{ $id };
  
                 push @{ $stat->{_}->{ $id } }, $nr;
@@ -86,21 +96,100 @@ foreach my $nr ( 1 .. 4 ) {
  
  }
  
+
+
+
+
+
  print "# total = ",scalar keys %{ $stat->{_} }, $/;
  foreach my $id ( keys %{ $stat->{_} } ) {
-       my @val = @{ $stat->{_}->{$id} };
-       $stat->{count}->{ scalar @val }++;
-       $stat->{count_total}++;
+       my @val;
+       my $v = $stat->{_}->{$id};
+       if ( defined $v && ref $v eq 'ARRAY' ) {
+               @val = @$v;
+       } else {
+               print "SKIP[$id]";
+               next;
+       }
+       $stat->{A_count}->{ scalar @val }++;
+       $stat->{A_count_total}++;
  
-       $stat->{'00_count_val'}->{ join(' ', @val) }++; # with duplicates
+       #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates
  
         my $u;
         $u->{$_}++ foreach @val;
-       $stat->{count_val}->{ join(' ', sort keys %$u ) }++; # without duplicates
+       my @u_v = sort keys %$u;
+       $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates
+
+       if ( $#u_v < 2 ) { # single, double
+               my ( $num, $key_id ) = split(/-/,$id,2);
+               my @candidates = candidates $num => $key_id, 0.6; # XXX 0.9 too high, 0.8 better
+               if ( @candidates ) {
+                       print "MERGE $num $key_id ", dump( @candidates ), ' vals=', dump( @u_v ),$/;
+                       my @keys = map { $_->{key} } @candidates;
+                       my  $m_id = "$keys[0]-$key_id";
+                       foreach my $i ( 1 .. $#keys ) {
+                               my  $id = "$keys[$i]-$key_id";
+                               if ( ! exists $stat->{_}->{$id} ) {
+                                       print "ERROR: $num $key_id can't find $i $id";
  
+                               }
+                               my $o = delete $stat->{_}->{$id};
+                               warn "can't find $keys[$i]" if ! $o;
+                               foreach my $val ( @$o ) {
+                                       print '.';
+                                       push @{ $stat->{_}->{ $m_id } }, $val;
+                                       print "++ $m_id $val ";
+                               }
+                               print "result val=",dump( $stat->{_}->{ $m_id } ),$/;
+                       }
+                       $stat->{merge}++;
+               }
+
+       }
+}
+
+print "# total after merge = ",scalar keys %{ $stat->{_} }, $/;
+
+foreach my $id ( keys %{ $stat->{_} } ) {
+       my @val; # = @{ $stat->{_}->{$id} };
+       my $v = $stat->{_}->{$id};
+       if ( defined $v && ref $v eq 'ARRAY' ) {
+               @val = @$v;
+       } else {
+               print "SKIP[$id]";
+               next;
+       }
+       $stat->{B_count}->{ scalar @val }++;
+       $stat->{B_count_total}++;
+
+       #$stat->{B_count_val_dup}->{ join(' ', @val) }++; # with duplicates
+
+       my $u;
+       $u->{$_}++ foreach @val;
+       my @u_v = sort keys %$u;
+       $stat->{B_count_val}->{ join(' ', @u_v ) }++; # without duplicates
+}
+
+=for later
+foreach my $key ( keys %{ $stat->{unique_id} } ) {
+       #print "XXX unique_id $key\n";
+       print '.';
+       my ( $num, $key_id ) = split(/-/,$key,2);
+       my @candidates = candidates $num => $key_id, 0.8; # XXX lower
+       if ( @candidates ) {
+               print "## unique_id $num $key_id ",dump( @candidates ),$/;
+               $stat->{unique_id}++;
+       }
  }
+delete $stat->{unique_id};
+=cut
+
+
+
  print "# stat = ",dump( $stat );
  #print "# keys = ",dump( $keys );
  
+
  store $keys, $keys_file;
author	Dobrica Pavlinusic <dpavlin@rot13.org>
	Tue, 14 Nov 2023 19:45:39 +0000 (20:45 +0100)
committer	Dobrica Pavlinusic <dpavlin@rot13.org>
	Tue, 14 Nov 2023 19:45:39 +0000 (20:45 +0100)