From: Dobrica Pavlinusic <dpavlin@rot13.org>
Date: Tue, 14 Nov 2023 19:45:39 +0000 (+0100)
Subject: merge at end
X-Git-Url: http://git.rot13.org/?a=commitdiff_plain;h=1273962538c1f0c3574cb23d1d99490ffaa210d5;p=csv-join-similarity

merge at end
---

diff --git a/upari.pl b/upari.pl
index 85e7285..beebedf 100755
--- a/upari.pl
+++ b/upari.pl
@@ -17,6 +17,22 @@ my @files = qw( 1.csv 2.csv 3.csv 4.csv );
 my $stat;
 my $keys;
 
+sub candidates {
+	my ( $num, $key_id, $limit_sim ) = @_;
+	$limit_sim //= 0.9;
+
+	my @candidates;
+	foreach my $key ( keys %{ $keys->{ $key_id } } ) {
+		next if $key eq $num;
+		my $s = similarity $num, $key, $limit_sim;
+		#warn "# $num $key $s\n";
+		if ($s > $limit_sim ) {
+			push @candidates, { key => $key, s => $s };
+		}
+	}
+	return @candidates;
+}
+
 my $keys_file = 'keys.storable';
 if ( -e $keys_file ) {
 	$keys = retrieve($keys_file) or die "$keys_file: $!";
@@ -32,6 +48,8 @@ foreach my $nr ( 1 .. 4 ) {
 	while (my $row = $csv->getline ($fh)) {
 		$stat->{lines}->{$nr}++;
 		$stat->{ $file }->{lines}++;
+		$row->[1] =~ s/\D+//g && print 'c1';
+		$row->[2] =~ s/\D+//g && print 'c2';
 		my $id = join('-',
 			uc $row->[0],
 			$row->[1],
@@ -47,21 +65,13 @@ foreach my $nr ( 1 .. 4 ) {
 
 		my $key_id = $row->[1] . '-' . $row->[2];
 
+		$stat->{A_key_id}->{$key_id}->{$nr}++;
+
 		$keys->{ $key_id }->{ $num }++;
-	
+=for later	
 		if ( $nr > 1 && ! exists $stat->{_}->{ $id } ) {
 			$stat->{new_exact}->{$nr}++;
-
-			my @candidates;
-			my $limit_sim = 0.9;
-			foreach my $key ( keys %{ $keys->{ $key_id } } ) {
-				next if $key eq $num;
-				my $s = similarity $num, $key, $limit_sim;
-				#warn "# $num $key $s\n";
-				if ($s > $limit_sim ) {
-					push @candidates, { key => $key, s => $s };
-				}
-			}
+			my @candidates = candidates $num => $key_id, 0.8; # 0.9 too strict
 			if ( @candidates ) {
 				my $multi =  $#candidates > 0 ? 'multi' : '';
 				$stat->{ 'similarity' . $multi }->{$nr}++;
@@ -77,7 +87,7 @@ foreach my $nr ( 1 .. 4 ) {
 			}
 
 		}
-
+=cut
 		$stat->{exists}->{$nr}++ if exists $stat->{_}->{ $id };
 
 		push @{ $stat->{_}->{ $id } }, $nr;
@@ -86,21 +96,100 @@ foreach my $nr ( 1 .. 4 ) {
 
 }
 
+
+
+
+
+
 print "# total = ",scalar keys %{ $stat->{_} }, $/;
 foreach my $id ( keys %{ $stat->{_} } ) {
-	my @val = @{ $stat->{_}->{$id} };
-	$stat->{count}->{ scalar @val }++;
-	$stat->{count_total}++;
+	my @val;
+	my $v = $stat->{_}->{$id};
+	if ( defined $v && ref $v eq 'ARRAY' ) {
+		@val = @$v;
+	} else {
+		print "SKIP[$id]";
+		next;
+	}
+	$stat->{A_count}->{ scalar @val }++;
+	$stat->{A_count_total}++;
 
-	$stat->{'00_count_val'}->{ join(' ', @val) }++; # with duplicates
+	#$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates
 
 	my $u;
 	$u->{$_}++ foreach @val;
-	$stat->{count_val}->{ join(' ', sort keys %$u ) }++; # without duplicates
+	my @u_v = sort keys %$u;
+	$stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates
+
+	if ( $#u_v < 2 ) { # single, double
+		my ( $num, $key_id ) = split(/-/,$id,2);
+		my @candidates = candidates $num => $key_id, 0.6; # XXX 0.9 too high, 0.8 better
+		if ( @candidates ) {
+			print "MERGE $num $key_id ", dump( @candidates ), ' vals=', dump( @u_v ),$/;
+			my @keys = map { $_->{key} } @candidates;
+			my  $m_id = "$keys[0]-$key_id";
+			foreach my $i ( 1 .. $#keys ) {
+				my  $id = "$keys[$i]-$key_id";
+				if ( ! exists $stat->{_}->{$id} ) {
+			       		print "ERROR: $num $key_id can't find $i $id";
 
+				}
+				my $o = delete $stat->{_}->{$id};
+			       	warn "can't find $keys[$i]" if ! $o;
+				foreach my $val ( @$o ) {
+					print '.';
+					push @{ $stat->{_}->{ $m_id } }, $val;
+					print "++ $m_id $val ";
+				}
+				print "result val=",dump( $stat->{_}->{ $m_id } ),$/;
+			}
+			$stat->{merge}++;
+		}
+
+	}
+}
+
+print "# total after merge = ",scalar keys %{ $stat->{_} }, $/;
+
+foreach my $id ( keys %{ $stat->{_} } ) {
+	my @val; # = @{ $stat->{_}->{$id} };
+	my $v = $stat->{_}->{$id};
+	if ( defined $v && ref $v eq 'ARRAY' ) {
+		@val = @$v;
+	} else {
+		print "SKIP[$id]";
+		next;
+	}
+	$stat->{B_count}->{ scalar @val }++;
+	$stat->{B_count_total}++;
+
+	#$stat->{B_count_val_dup}->{ join(' ', @val) }++; # with duplicates
+
+	my $u;
+	$u->{$_}++ foreach @val;
+	my @u_v = sort keys %$u;
+	$stat->{B_count_val}->{ join(' ', @u_v ) }++; # without duplicates
+}
+
+=for later
+foreach my $key ( keys %{ $stat->{unique_id} } ) {
+	#print "XXX unique_id $key\n";
+	print '.';
+	my ( $num, $key_id ) = split(/-/,$key,2);
+	my @candidates = candidates $num => $key_id, 0.8; # XXX lower
+	if ( @candidates ) {
+		print "## unique_id $num $key_id ",dump( @candidates ),$/;
+		$stat->{unique_id}++;
+	}
 }
+delete $stat->{unique_id};
+=cut
+
+
+
 print "# stat = ",dump( $stat );
 #print "# keys = ",dump( $keys );
 
+
 store $keys, $keys_file;