use strict;
use autodie;
-# apt install libtext-csv-perl
+# apt install libtext-csv-perl libstring-similarity-perl
use Text::CSV;
use Data::Dump qw(dump);
+use String::Similarity;
my @files = qw( 1.csv 2.csv 3.csv 4.csv );
$row->[2],
);
- my $num = $row->[0]; $num =~ s/\D//g;
+ my $num = $row->[0]; #$num =~ s/\D//g;
- $stat->{num}->{len}->{ length($num) }++;
+ #$stat->{num}->{len}->{ length($num) }++;
- my $key_id = $row->[1] . '-' . $row->[2] . 's' . int( $row->[4] );
+ my $key_id = $row->[1] . '-' . $row->[2]; # . ',' . int( $row->[4] );
if ( $nr == 1 ) {
push @{ $keys->{ $key_id }->{ $num } }, uc $row->[0];
}
- if ( $num && $nr > 1 && ! exists $stat->{_}->{ $id } ) {
+ if ( length($num) > 3 && $nr > 1 && ! exists $stat->{_}->{ $id } ) {
$stat->{error}->{new_exact}->{$nr}++;
+
+ # Fuzzy fallback: score $num against every number already recorded
+ # under this $key_id and keep those scoring above 0.8.
+ my @candidates;
+ foreach my $key ( keys %{ $keys->{ $key_id } } ) {
+ # 0.4 is the optional lower limit handed to similarity(); it is
+ # below the 0.8 cut-off applied on the next lines.
+ my $s = similarity( $num, $key, 0.4 );
+ #warn "# $num $key $s\n";
+ if ( $s > 0.8 ) {
+ push @candidates, { key => $key, s => $s };
+ }
+ }
+ if ( @candidates == 1 ) {
+ # Exactly one close match: count it per file number and adopt it.
+ $stat->{similarity}->{$nr}++;
+ warn "# similarity $num = ",dump( @candidates ),$/;
+ # NOTE(review): this stores the whole { key, s } hash ref in $id,
+ # not a string; confirm whether the matched key was intended.
+ $id = $candidates[0];
+ } elsif ( @candidates > 1 ) {
+ # Ambiguous: two or more close matches. The previous test
+ # ($#candidates > 1) only fired for three or more, so exactly two
+ # candidates were dropped without any warning.
+ warn "# similarity IGNORED $num = ",dump( @candidates ),$/;
+ }
+
+
+
if ( exists $keys->{ $key_id }->{ $num } ) {
my @found = @{ $keys->{ $key_id }->{ $num } };
if ( $#found == 0 ) {
# Summarise $stat->{_}: for every id, bump the $stat->{count} bucket keyed by
# the number of elements stored under that id, and count the id itself in
# $stat->{count_total}.
foreach my $id ( keys %{ $stat->{_} } ) {
$stat->{count}->{ scalar @{ $stat->{_}->{$id} } }++;
$stat->{count_total}++;
# The per-value loop below has only a commented-out statement, so it
# currently has no effect.
+ foreach my $val ( @{ $stat->{_}->{$id} } ) {
+ #$stat->{val}->{$val}++ if $val > 1;
+ }
}
print "# stat = ",dump( $stat );
#print "# keys = ",dump( $keys );