git.rot13.org Git - csv-join-similarity/blob - upari.pl

   1 #!/usr/bin/perl
   2
   3 # TODO: skola <-> razred swap?
   4
   5 use warnings;
   6 use strict;
   7 use autodie;
   8
   9 # apt install libtext-csv-perl libstring-similarity-perl
  10 use Text::CSV;
  11 use Data::Dump qw(dump);
  12 use String::Similarity;
  13 use Storable;
  14
  15 my @files = qw( 1.csv 2.csv 3.csv 4.csv );
  16
  17 my $stat;
  18 my $keys;
  19
  20 sub candidates {
  21         my ( $num, $key_id, $limit_sim ) = @_;
  22         $limit_sim //= 0.9;
  23
  24         my @candidates;
  25         foreach my $key ( keys %{ $keys->{ $key_id } } ) {
  26                 next if $key eq $num;
  27                 my $s = similarity $num, $key, $limit_sim;
  28                 #warn "# $num $key $s\n";
  29                 if ($s > $limit_sim ) {
  30                         push @candidates, { key => $key, s => $s };
  31                 }
  32         }
  33         return @candidates;
  34 }
  35
  36 my $keys_file = 'keys.storable';
  37 if ( -e $keys_file ) {
  38         $keys = retrieve($keys_file) or die "$keys_file: $!";
  39         print "LOAD $keys_file", scalar keys %$keys, "\n";
  40 }
  41
  42 foreach my $nr ( 1 .. 4 ) {
  43         my $file = "$nr.csv";
  44         warn "# $file\n";
  45
  46         my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1 });
  47         open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
  48         while (my $row = $csv->getline ($fh)) {
  49                 $stat->{lines}->{$nr}++;
  50                 $stat->{ $file }->{lines}++;
  51                 $row->[1] =~ s/\D+//g && print 'c1';
  52                 $row->[2] =~ s/\D+//g && print 'c2';
  53                 my $id = join('-',
  54                         uc $row->[0],
  55                         $row->[1],
  56                         $row->[2],
  57                 );
  58
  59                 my $num = uc $row->[0];
  60                 if ( length $num < 3 ) {
  61                         print "IGNORE $nr ",dump($row->[ 0 .. 5 ]),$/;
  62                         $stat->{ignore}->{$nr}++;
  63                         next;
  64                 }
  65
  66                 my $key_id = $row->[1] . '-' . $row->[2];
  67
  68                 $stat->{A_key_id}->{$key_id}->{$nr}++;
  69
  70                 $keys->{ $key_id }->{ $num }++;
  71 =for later
  72                 if ( $nr > 1 && ! exists $stat->{_}->{ $id } ) {
  73                         $stat->{new_exact}->{$nr}++;
  74                         my @candidates = candidates $num => $key_id, 0.8; # 0.9 too strict
  75                         if ( @candidates ) {
  76                                 my $multi =  $#candidates > 0 ? 'multi' : '';
  77                                 $stat->{ 'similarity' . $multi }->{$nr}++;
  78                                 foreach my $i ( 0 .. $#candidates ) {
  79                                         my $new_num = $candidates[$i]->{key};
  80                                         $id = join('-',uc $new_num, $row->[1], $row->[2]);
  81                                         print "SIMILARITY$multi $nr $num -> $new_num\n";
  82                                         $keys->{ $key_id }->{ $id }++;
  83                                 }
  84                         } else {
  85                                 $stat->{similarity_none}->{$nr}++;
  86                                 $stat->{unique_id}->{$id}++;
  87                         }
  88
  89                 }
  90 =cut
  91                 $stat->{exists}->{$nr}++ if exists $stat->{_}->{ $id };
  92
  93                 push @{ $stat->{_}->{ $id } }, $nr;
  94         }
  95         close $fh;
  96
  97 }
  98
  99
 100
 101
 102
 103
 104 print "# total = ",scalar keys %{ $stat->{_} }, $/;
 105 foreach my $id ( keys %{ $stat->{_} } ) {
 106         my @val;
 107         my $v = $stat->{_}->{$id};
 108         if ( defined $v && ref $v eq 'ARRAY' ) {
 109                 @val = @$v;
 110         } else {
 111                 print "SKIP[$id]";
 112                 next;
 113         }
 114         $stat->{A_count}->{ scalar @val }++;
 115         $stat->{A_count_total}++;
 116
 117         #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates
 118
 119         my $u;
 120         $u->{$_}++ foreach @val;
 121         my @u_v = sort keys %$u;
 122         $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates
 123
 124         if ( $#u_v < 2 ) { # single, double
 125                 my ( $num, $key_id ) = split(/-/,$id,2);
 126                 my @candidates = candidates $num => $key_id, 0.6; # XXX 0.9 too high, 0.8 better
 127                 if ( @candidates ) {
 128                         print "MERGE $num $key_id ", dump( @candidates ), ' vals=', dump( @u_v ),$/;
 129                         my @keys = map { $_->{key} } @candidates;
 130                         my  $m_id = "$keys[0]-$key_id";
 131                         foreach my $i ( 1 .. $#keys ) {
 132                                 my  $id = "$keys[$i]-$key_id";
 133                                 if ( ! exists $stat->{_}->{$id} ) {
 134                                         print "ERROR: $num $key_id can't find $i $id";
 135
 136                                 }
 137                                 my $o = delete $stat->{_}->{$id};
 138                                 warn "can't find $keys[$i]" if ! $o;
 139                                 foreach my $val ( @$o ) {
 140                                         print '.';
 141                                         push @{ $stat->{_}->{ $m_id } }, $val;
 142                                         print "++ $m_id $val ";
 143                                 }
 144                                 print "result val=",dump( $stat->{_}->{ $m_id } ),$/;
 145                         }
 146                         $stat->{merge}++;
 147                 }
 148
 149         }
 150 }
 151
 152 print "# total after merge = ",scalar keys %{ $stat->{_} }, $/;
 153
 154 foreach my $id ( keys %{ $stat->{_} } ) {
 155         my @val; # = @{ $stat->{_}->{$id} };
 156         my $v = $stat->{_}->{$id};
 157         if ( defined $v && ref $v eq 'ARRAY' ) {
 158                 @val = @$v;
 159         } else {
 160                 print "SKIP[$id]";
 161                 next;
 162         }
 163         $stat->{B_count}->{ scalar @val }++;
 164         $stat->{B_count_total}++;
 165
 166         #$stat->{B_count_val_dup}->{ join(' ', @val) }++; # with duplicates
 167
 168         my $u;
 169         $u->{$_}++ foreach @val;
 170         my @u_v = sort keys %$u;
 171         $stat->{B_count_val}->{ join(' ', @u_v ) }++; # without duplicates
 172 }
 173
 174 =for later
 175 foreach my $key ( keys %{ $stat->{unique_id} } ) {
 176         #print "XXX unique_id $key\n";
 177         print '.';
 178         my ( $num, $key_id ) = split(/-/,$key,2);
 179         my @candidates = candidates $num => $key_id, 0.8; # XXX lower
 180         if ( @candidates ) {
 181                 print "## unique_id $num $key_id ",dump( @candidates ),$/;
 182                 $stat->{unique_id}++;
 183         }
 184 }
 185 delete $stat->{unique_id};
 186 =cut
 187
 188
 189
 190 print "# stat = ",dump( $stat );
 191 #print "# keys = ",dump( $keys );
 192
 193
 194 store $keys, $keys_file;
 195