git.rot13.org Git - csv-join-similarity/blob - upari.pl

   1 #!/usr/bin/perl
   2
   3 # TODO: skola <-> razred swap?
   4
   5 use warnings;
   6 use strict;
   7 use autodie;
   8
   9 # apt install libtext-csv-perl libstring-similarity-perl
  10 use Text::CSV;
  11 use Data::Dump qw(dump);
  12 use String::Similarity;
  13 use Storable;
  14
  15 my @files = qw( 1.csv 2.csv 3.csv 4.csv );
  16
  17 my $stat;
  18 my $keys;
  19
  20 my $keys_file = 'keys.storable';
  21 if ( -e $keys_file ) {
  22         $keys = retrieve($keys_file) or die "$keys_file: $!";
  23         print "LOAD $keys_file", scalar keys %$keys, "\n";
  24 }
  25
  26 foreach my $nr ( 1 .. 4 ) {
  27         my $file = "$nr.csv";
  28         warn "# $file\n";
  29
  30         my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1 });
  31         open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
  32         while (my $row = $csv->getline ($fh)) {
  33                 $stat->{lines}->{$nr}++;
  34                 $stat->{ $file }->{lines}++;
  35                 my $id = join('-',
  36                         uc $row->[0],
  37                         $row->[1],
  38                         $row->[2],
  39                 );
  40
  41                 my $num = uc $row->[0];
  42
  43                 my $key_id = $row->[1] . '-' . $row->[2];
  44
  45                 if ( $nr == 1 ) {
  46                         push @{ $keys->{ $key_id }->{ $num } }, uc $row->[0];
  47                 }
  48                 if ( length($num) > 3 && $nr > 1 && ! exists $stat->{_}->{ $id } ) {
  49                         $stat->{error}->{new_exact}->{$nr}++;
  50
  51                         my @candidates;
  52                         my $limit_sim = 0.9;
  53                         foreach my $key ( keys %{ $keys->{ $key_id } } ) {
  54                                 my $s = similarity $num, $key, $limit_sim;
  55                                 #warn "# $num $key $s\n";
  56                                 if ($s > $limit_sim ) {
  57                                         push @candidates, { key => $key, s => $s };
  58                                 }
  59                         }
  60                         if ( @candidates ) {
  61                                 my $multi =  $#candidates > 0 ? 'multi' : '';
  62                                 $stat->{ 'similarity' . $multi }->{$nr}++;
  63                                 foreach my $i ( 0 .. $#candidates ) {
  64                                         my $new_num = $candidates[$i]->{key};
  65                                         $id = join('-',uc $new_num, $row->[1], $row->[2]);
  66                                         print "SIMILARITY$multi $nr $num -> $new_num\n";
  67                                         push @{ $keys->{ $key_id }->{ $id } }, uc $id;
  68                                 }
  69                         } else {
  70                                 $stat->{similarity_none}->{$nr}++;
  71                                 push @{ $keys->{ $key_id }->{ $num } }, uc $row->[0];
  72                         }
  73
  74
  75
  76                 }
  77
  78                 $stat->{exists}->{$nr}++ if exists $stat->{_}->{ $id };
  79
  80                 push @{ $stat->{_}->{ $id } }, $nr;
  81         }
  82         close $fh;
  83
  84 }
  85
  86 print "# total = ",scalar keys %{ $stat->{_} }, $/;
  87 foreach my $id ( keys %{ $stat->{_} } ) {
  88         my @val = @{ $stat->{_}->{$id} };
  89         $stat->{count}->{ scalar @val }++;
  90         $stat->{count_total}++;
  91
  92         $stat->{'00_count_val'}->{ join(' ', @val) }++; # with duplicates
  93
  94         my $u;
  95         $u->{$_}++ foreach @val;
  96         $stat->{count_val}->{ join(' ', sort keys %$u ) }++; # without duplicates
  97
  98 }
  99 print "# stat = ",dump( $stat );
 100 #print "# keys = ",dump( $keys );
 101
 102 store $keys, $keys_file;
 103