From: Dobrica Pavlinusic Date: Tue, 14 Nov 2023 19:45:39 +0000 (+0100) Subject: merge at end X-Git-Url: http://git.rot13.org/?a=commitdiff_plain;h=1273962538c1f0c3574cb23d1d99490ffaa210d5;p=csv-join-similarity merge at end --- diff --git a/upari.pl b/upari.pl index 85e7285..beebedf 100755 --- a/upari.pl +++ b/upari.pl @@ -17,6 +17,22 @@ my @files = qw( 1.csv 2.csv 3.csv 4.csv ); my $stat; my $keys; +sub candidates { + my ( $num, $key_id, $limit_sim ) = @_; + $limit_sim //= 0.9; + + my @candidates; + foreach my $key ( keys %{ $keys->{ $key_id } } ) { + next if $key eq $num; + my $s = similarity $num, $key, $limit_sim; + #warn "# $num $key $s\n"; + if ($s > $limit_sim ) { + push @candidates, { key => $key, s => $s }; + } + } + return @candidates; +} + my $keys_file = 'keys.storable'; if ( -e $keys_file ) { $keys = retrieve($keys_file) or die "$keys_file: $!"; @@ -32,6 +48,8 @@ foreach my $nr ( 1 .. 4 ) { while (my $row = $csv->getline ($fh)) { $stat->{lines}->{$nr}++; $stat->{ $file }->{lines}++; + $row->[1] =~ s/\D+//g && print 'c1'; + $row->[2] =~ s/\D+//g && print 'c2'; my $id = join('-', uc $row->[0], $row->[1], @@ -47,21 +65,13 @@ foreach my $nr ( 1 .. 4 ) { my $key_id = $row->[1] . '-' . $row->[2]; + $stat->{A_key_id}->{$key_id}->{$nr}++; + $keys->{ $key_id }->{ $num }++; - +=for later if ( $nr > 1 && ! exists $stat->{_}->{ $id } ) { $stat->{new_exact}->{$nr}++; - - my @candidates; - my $limit_sim = 0.9; - foreach my $key ( keys %{ $keys->{ $key_id } } ) { - next if $key eq $num; - my $s = similarity $num, $key, $limit_sim; - #warn "# $num $key $s\n"; - if ($s > $limit_sim ) { - push @candidates, { key => $key, s => $s }; - } - } + my @candidates = candidates $num => $key_id, 0.8; # 0.9 too strict if ( @candidates ) { my $multi = $#candidates > 0 ? 'multi' : ''; $stat->{ 'similarity' . $multi }->{$nr}++; @@ -77,7 +87,7 @@ foreach my $nr ( 1 .. 4 ) { } } - +=cut $stat->{exists}->{$nr}++ if exists $stat->{_}->{ $id }; push @{ $stat->{_}->{ $id } }, $nr; @@ -86,21 +96,100 @@ foreach my $nr ( 1 .. 4 ) { } + + + + + print "# total = ",scalar keys %{ $stat->{_} }, $/; foreach my $id ( keys %{ $stat->{_} } ) { - my @val = @{ $stat->{_}->{$id} }; - $stat->{count}->{ scalar @val }++; - $stat->{count_total}++; + my @val; + my $v = $stat->{_}->{$id}; + if ( defined $v && ref $v eq 'ARRAY' ) { + @val = @$v; + } else { + print "SKIP[$id]"; + next; + } + $stat->{A_count}->{ scalar @val }++; + $stat->{A_count_total}++; - $stat->{'00_count_val'}->{ join(' ', @val) }++; # with duplicates + #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates my $u; $u->{$_}++ foreach @val; - $stat->{count_val}->{ join(' ', sort keys %$u ) }++; # without duplicates + my @u_v = sort keys %$u; + $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates + + if ( $#u_v < 2 ) { # single, double + my ( $num, $key_id ) = split(/-/,$id,2); + my @candidates = candidates $num => $key_id, 0.6; # XXX 0.9 too high, 0.8 better + if ( @candidates ) { + print "MERGE $num $key_id ", dump( @candidates ), ' vals=', dump( @u_v ),$/; + my @keys = map { $_->{key} } @candidates; + my $m_id = "$keys[0]-$key_id"; + foreach my $i ( 1 .. $#keys ) { + my $id = "$keys[$i]-$key_id"; + if ( ! exists $stat->{_}->{$id} ) { + print "ERROR: $num $key_id can't find $i $id"; + } + my $o = delete $stat->{_}->{$id}; + warn "can't find $keys[$i]" if ! $o; + foreach my $val ( @$o ) { + print '.'; + push @{ $stat->{_}->{ $m_id } }, $val; + print "++ $m_id $val "; + } + print "result val=",dump( $stat->{_}->{ $m_id } ),$/; + } + $stat->{merge}++; + } + + } +} + +print "# total after merge = ",scalar keys %{ $stat->{_} }, $/; + +foreach my $id ( keys %{ $stat->{_} } ) { + my @val; # = @{ $stat->{_}->{$id} }; + my $v = $stat->{_}->{$id}; + if ( defined $v && ref $v eq 'ARRAY' ) { + @val = @$v; + } else { + print "SKIP[$id]"; + next; + } + $stat->{B_count}->{ scalar @val }++; + $stat->{B_count_total}++; + + #$stat->{B_count_val_dup}->{ join(' ', @val) }++; # with duplicates + + my $u; + $u->{$_}++ foreach @val; + my @u_v = sort keys %$u; + $stat->{B_count_val}->{ join(' ', @u_v ) }++; # without duplicates +} + +=for later +foreach my $key ( keys %{ $stat->{unique_id} } ) { + #print "XXX unique_id $key\n"; + print '.'; + my ( $num, $key_id ) = split(/-/,$key,2); + my @candidates = candidates $num => $key_id, 0.8; # XXX lower + if ( @candidates ) { + print "## unique_id $num $key_id ",dump( @candidates ),$/; + $stat->{unique_id}++; + } } +delete $stat->{unique_id}; +=cut + + + print "# stat = ",dump( $stat ); #print "# keys = ",dump( $keys ); + store $keys, $keys_file;