git.rot13.org Git - csv-join-similarity/blob - upari.pl

   1 #!/usr/bin/perl
   2
   3 # TODO: skola <-> razred swap?
   4
   5 use warnings;
   6 use strict;
   7 use autodie;
   8
   9 # apt install libtext-csv-perl libstring-similarity-perl
  10 use Text::CSV;
  11 use Data::Dump qw(dump);
  12 use String::Similarity;
  13 use Storable;
  14
  15 my @files = qw( 1.csv 2.csv 3.csv 4.csv );
  16
  17 our $stat;
  18 my $keys;
  19
  20 sub duplicate {
  21         my @for = @_;
  22         my $use;
  23         $use->{$_}++ foreach ( map { @{ $stat->{_}->{$_} } } @for );
  24         my $duplicate = grep { $use->{$_} > 1 } keys %$use;
  25         print "XXX use @for ",dump($use),$/ if $duplicate;
  26         return $duplicate;
  27 }
  28
  29 sub candidates {
  30         my ( $num, $key_id, $limit_sim ) = @_;
  31         $limit_sim //= $ENV{LIMIT};
  32         $limit_sim //= 0.9;
  33
  34         my @candidates;
  35         foreach my $key ( sort keys %{ $keys->{ $key_id } } ) {
  36                 next if $key eq $num; # XXX don't return key from input
  37                 my $s = similarity $num, $key, $limit_sim;
  38                 #warn "# $num $key $s\n";
  39                 if ($s > $limit_sim ) {
  40                         if ( exists $stat->{_}->{"$key-$key_id"} ) {
  41                                 my $use;
  42                                 $use->{$_}++ foreach (
  43                                         @{ $stat->{_}->{"$num-$key_id"} },
  44                                         @{ $stat->{_}->{"$key-$key_id"} },
  45                                 );
  46                                 #print "XXX use $num $key ",dump($use);
  47                                 my $duplicate = grep { $use->{$_} > 1 } keys %$use;
  48                                 if ( $duplicate ) {
  49                                         print "XXX $limit_sim suggest duplicate  $num $key SKIP duplicate ",dump($use), $/;
  50                                         $stat->{suggest}->{duplicate}++;
  51                                 } else {
  52                                         push @candidates, { key => $key, s => $s };
  53                                 }
  54                         } else {
  55                                 print "XXX $limit_sim candidates $key missing\n";
  56                         }
  57                 }
  58         }
  59         return @candidates;
  60 }
  61
  62 my $keys_file = 'keys.storable';
  63 if ( -e $keys_file ) {
  64         #$keys = retrieve($keys_file) or die "$keys_file: $!";
  65         print "LOAD $keys_file", scalar keys %$keys, "\n";
  66 }
  67
  68 foreach my $nr ( 1 .. 4 ) {
  69         my $file = "$nr.csv";
  70         warn "# $file\n";
  71
  72         my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1 });
  73         open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
  74         while (my $row = $csv->getline ($fh)) {
  75                 $stat->{lines}->{$nr}++;
  76                 $stat->{ $file }->{lines}++;
  77                 $row->[1] =~ s/\D+//g && print 'c1';
  78                 $row->[2] =~ s/\D+//g && print 'c2';
  79                 my $id = join('-',
  80                         uc $row->[0],
  81                         $row->[1],
  82                         $row->[2],
  83                 );
  84
  85                 my $num = uc $row->[0];
  86                 if ( length $num < 3 ) {
  87                         print "IGNORE $nr ",dump($row->[ 0 .. 5 ]),$/;
  88                         $stat->{ignore}->{$nr}++;
  89                         next;
  90                 }
  91
  92                 my $key_id = $row->[1] . '-' . $row->[2];
  93
  94                 $stat->{A_key_id}->{$key_id}->{$nr}++;
  95
  96                 $keys->{ $key_id }->{ $num }++;
  97
  98                 $stat->{exists}->{$nr}++ if exists $stat->{_}->{ $id };
  99
 100                 push @{ $stat->{_}->{ $id } }, $nr;
 101         }
 102         close $fh;
 103
 104 }
 105
 106
 107
 108
 109
 110
 111 print "# total = ",scalar keys %{ $stat->{_} }, $/;
 112 foreach my $id ( sort keys %{ $stat->{_} } ) {
 113         my @val;
 114         my $v = $stat->{_}->{$id};
 115         if ( defined $v && ref $v eq 'ARRAY' ) {
 116                 @val = @$v;
 117         } else {
 118                 #print "SKIP[$id]";
 119                 next;
 120         }
 121         $stat->{A_count}->{ scalar @val }++;
 122         $stat->{A_count_total}++;
 123
 124         #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates
 125
 126         my $u;
 127         $u->{$_}++ foreach @val;
 128         my @u_v = sort keys %$u;
 129         $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates
 130
 131         if ( $#u_v < 3 ) { # single, double
 132                 my ( $num, $key_id ) = split(/-/,$id,2);
 133                 my @candidates = candidates $num => $key_id; #, 0.7; # XXX 0.9 too high, 0.8 better, 0.7 too lax
 134                 if ( @candidates ) {
 135                         print "MERGE $num $key_id ", dump( @candidates ), ' val=', dump( \@val ), $/;
 136                         my @keys = map { $_->{key} } @candidates;
 137                         my  $m_id = $id; # "$keys[0]-$key_id";
 138                         foreach my $i ( 0 .. $#keys ) {
 139                                 my  $id = "$keys[$i]-$key_id";
 140                                 if ( ! exists $stat->{_}->{$id} ) {
 141                                         print "ERROR: $num $key_id can't find $i $id";
 142                                 }
 143
 144                                 # XXX I298O-4743-7996
 145                                 if ( duplicate( $m_id => $id ) ) {
 146                                         print "XXX duplicate2 $m_id $id\n";
 147                                         $stat->{duplicate2}++;
 148                                         next;
 149
 150                                 }
 151
 152                                 my $o = delete $stat->{_}->{$id};
 153                                 warn "can't find $id" if ! $o;
 154                                 foreach my $val ( @$o ) {
 155                                         print '.';
 156                                         push @{ $stat->{_}->{ $m_id } }, $val;
 157                                         print "++ $m_id $val ";
 158                                         $stat->{merge_val}->{$val}++;
 159                                 }
 160                                 print "result val=",dump( $stat->{_}->{ $m_id } ), " result_elements=", scalar @{ $stat->{_}->{ $m_id } }, $/;
 161                         }
 162                         $stat->{merge}++;
 163                 }
 164
 165         }
 166 }
 167
 168 print "# total after merge = ",scalar keys %{ $stat->{_} }, $/;
 169
 170 foreach my $id ( sort keys %{ $stat->{_} } ) {
 171         my @val; # = @{ $stat->{_}->{$id} };
 172         my $v = $stat->{_}->{$id};
 173         if ( defined $v && ref $v eq 'ARRAY' ) {
 174                 @val = @$v;
 175         } else {
 176                 print "SKIP[$id]";
 177                 next;
 178         }
 179         $stat->{B_count}->{ scalar @val }++;
 180         $stat->{B_count_total}++;
 181
 182         #$stat->{B_count_val_dup}->{ join(' ', @val) }++; # with duplicates
 183
 184         my $u;
 185         $u->{$_}++ foreach @val;
 186         my @u_v = sort keys %$u;
 187         $stat->{B_count_val}->{ join(' ', @u_v ) }++; # without duplicates
 188 }
 189
 190
 191 print "# stat = ",dump( $stat );
 192 #print "# keys = ",dump( $keys );
 193
 194
 195 store $keys, $keys_file;
 196