my $stat;
my $keys;
+sub candidates {
+ my ( $num, $key_id, $limit_sim ) = @_;
+ $limit_sim //= 0.9;
+
+ my @candidates;
+ foreach my $key ( keys %{ $keys->{ $key_id } } ) {
+ next if $key eq $num;
+ my $s = similarity $num, $key, $limit_sim;
+ #warn "# $num $key $s\n";
+ if ($s > $limit_sim ) {
+ push @candidates, { key => $key, s => $s };
+ }
+ }
+ return @candidates;
+}
+
my $keys_file = 'keys.storable';
if ( -e $keys_file ) {
$keys = retrieve($keys_file) or die "$keys_file: $!";
while (my $row = $csv->getline ($fh)) {
$stat->{lines}->{$nr}++;
$stat->{ $file }->{lines}++;
+ $row->[1] =~ s/\D+//g && print 'c1';
+ $row->[2] =~ s/\D+//g && print 'c2';
my $id = join('-',
uc $row->[0],
$row->[1],
my $key_id = $row->[1] . '-' . $row->[2];
+ $stat->{A_key_id}->{$key_id}->{$nr}++;
+
$keys->{ $key_id }->{ $num }++;
-
+=for later
if ( $nr > 1 && ! exists $stat->{_}->{ $id } ) {
$stat->{new_exact}->{$nr}++;
-
- my @candidates;
- my $limit_sim = 0.9;
- foreach my $key ( keys %{ $keys->{ $key_id } } ) {
- next if $key eq $num;
- my $s = similarity $num, $key, $limit_sim;
- #warn "# $num $key $s\n";
- if ($s > $limit_sim ) {
- push @candidates, { key => $key, s => $s };
- }
- }
+ my @candidates = candidates $num => $key_id, 0.8; # 0.9 too strict
if ( @candidates ) {
my $multi = $#candidates > 0 ? 'multi' : '';
$stat->{ 'similarity' . $multi }->{$nr}++;
}
}
-
+=cut
$stat->{exists}->{$nr}++ if exists $stat->{_}->{ $id };
push @{ $stat->{_}->{ $id } }, $nr;
}
+
+
+
+
+
print "# total = ",scalar keys %{ $stat->{_} }, $/;
foreach my $id ( keys %{ $stat->{_} } ) {
- my @val = @{ $stat->{_}->{$id} };
- $stat->{count}->{ scalar @val }++;
- $stat->{count_total}++;
+ my @val;
+ my $v = $stat->{_}->{$id};
+ if ( defined $v && ref $v eq 'ARRAY' ) {
+ @val = @$v;
+ } else {
+ print "SKIP[$id]";
+ next;
+ }
+ $stat->{A_count}->{ scalar @val }++;
+ $stat->{A_count_total}++;
- $stat->{'00_count_val'}->{ join(' ', @val) }++; # with duplicates
+ #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates
my $u;
$u->{$_}++ foreach @val;
- $stat->{count_val}->{ join(' ', sort keys %$u ) }++; # without duplicates
+ my @u_v = sort keys %$u;
+ $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates
+
+ if ( $#u_v < 2 ) { # single, double
+ my ( $num, $key_id ) = split(/-/,$id,2);
+ my @candidates = candidates $num => $key_id, 0.6; # XXX 0.9 too high, 0.8 better
+ if ( @candidates ) {
+ print "MERGE $num $key_id ", dump( @candidates ), ' vals=', dump( @u_v ),$/;
+ my @keys = map { $_->{key} } @candidates;
+ my $m_id = "$keys[0]-$key_id";
+ foreach my $i ( 1 .. $#keys ) {
+ my $id = "$keys[$i]-$key_id";
+ if ( ! exists $stat->{_}->{$id} ) {
+ print "ERROR: $num $key_id can't find $i $id";
+ }
+ my $o = delete $stat->{_}->{$id};
+ warn "can't find $keys[$i]" if ! $o;
+ foreach my $val ( @$o ) {
+ print '.';
+ push @{ $stat->{_}->{ $m_id } }, $val;
+ print "++ $m_id $val ";
+ }
+ print "result val=",dump( $stat->{_}->{ $m_id } ),$/;
+ }
+ $stat->{merge}++;
+ }
+
+ }
+}
+
+print "# total after merge = ",scalar keys %{ $stat->{_} }, $/;
+
+foreach my $id ( keys %{ $stat->{_} } ) {
+ my @val; # = @{ $stat->{_}->{$id} };
+ my $v = $stat->{_}->{$id};
+ if ( defined $v && ref $v eq 'ARRAY' ) {
+ @val = @$v;
+ } else {
+ print "SKIP[$id]";
+ next;
+ }
+ $stat->{B_count}->{ scalar @val }++;
+ $stat->{B_count_total}++;
+
+ #$stat->{B_count_val_dup}->{ join(' ', @val) }++; # with duplicates
+
+ my $u;
+ $u->{$_}++ foreach @val;
+ my @u_v = sort keys %$u;
+ $stat->{B_count_val}->{ join(' ', @u_v ) }++; # without duplicates
+}
+
+=for later
+foreach my $key ( keys %{ $stat->{unique_id} } ) {
+ #print "XXX unique_id $key\n";
+ print '.';
+ my ( $num, $key_id ) = split(/-/,$key,2);
+ my @candidates = candidates $num => $key_id, 0.8; # XXX lower
+ if ( @candidates ) {
+ print "## unique_id $num $key_id ",dump( @candidates ),$/;
+ $stat->{unique_id}++;
+ }
}
+delete $stat->{unique_id};
+=cut
+
+
+
print "# stat = ",dump( $stat );
#print "# keys = ",dump( $keys );
+
store $keys, $keys_file;