3 # TODO: swap skola (school) <-> razred (class)?
9 # apt install libtext-csv-perl libstring-similarity-perl
11 use Data::Dump qw(dump);
12 use String::Similarity;
# Input CSV files. How a file is chosen per iteration is not visible in this
# excerpt -- presumably indexed by the file number of the main loop; confirm.
15 my @files = qw( 1.csv 2.csv 3.csv 4.csv );
# On-disk cache of the $keys table; the same path is passed to store at the
# end of the script.
20 my $keys_file = 'keys.storable';
# Reuse the table from a previous run when the cache file already exists.
21 if ( -e $keys_file ) {
# retrieve is presumably Storable's (its import is not visible here); a false
# return value aborts the script.
22 $keys = retrieve($keys_file) or die "$keys_file: $!";
# Report how many top-level keys were loaded.
# NOTE(review): the file name and the count are printed with no separator.
23 print "LOAD $keys_file", scalar keys %$keys, "\n";
# Main pass: $nr runs over the file numbers 1..4 and every CSV row is read,
# counted, and matched against the values already known for its group.
26 foreach my $nr ( 1 .. 4 ) {
# binary => 1 allows binary characters (including newlines) inside quoted
# fields; auto_diag => 1 makes Text::CSV report parse errors by itself.
30 my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1 });
# $file is assigned on a line not visible here -- presumably the entry of
# @files that belongs to $nr; confirm.
31 open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
# getline yields one array ref of fields per CSV record.
32 while (my $row = $csv->getline ($fh)) {
# Row counters, kept both per file number and per file name.
33 $stat->{lines}->{$nr}++;
34 $stat->{ $file }->{lines}++;
# Column 0, upper-cased, is the value that gets fuzzy-matched below.
41 my $num = uc $row->[0];
# Columns 1 and 2 form the group key; similarity candidates are only looked
# up among the values stored under the same group.
43 my $key_id = $row->[1] . '-' . $row->[2];
# Register this value under its group.
# NOTE(review): if this runs unconditionally before the similarity scan
# below, $num is already among the scanned keys and would match itself; the
# enclosing condition is not visible in this excerpt -- confirm.
46 push @{ $keys->{ $key_id }->{ $num } }, uc $row->[0];
# Only values longer than 3 characters, from the second file onwards, whose
# $id has not been recorded yet are treated as "new". $id is built on a line
# not visible here -- presumably the same join as in the candidate loop below.
48 if ( length($num) > 3 && $nr > 1 && ! exists $stat->{_}->{ $id } ) {
49 $stat->{error}->{new_exact}->{$nr}++;
# Compare the new value with every value already known for this group.
53 foreach my $key ( keys %{ $keys->{ $key_id } } ) {
# Third argument is String::Similarity's optional minimum-similarity hint
# (lets it give up early); the result ranges from 0 (different) to 1 (equal).
54 my $s = similarity $num, $key, $limit_sim;
55 #warn "# $num $key $s\n";
# Keep only matches strictly above the threshold.
56 if ($s > $limit_sim ) {
57 push @candidates, { key => $key, s => $s };
# 'multi' marks the case of more than one candidate; the counter name becomes
# 'similarity' or 'similaritymulti' accordingly.
61 my $multi = $#candidates > 0 ? 'multi' : '';
62 $stat->{ 'similarity' . $multi }->{$nr}++;
# For each candidate, rebuild the id with the candidate value in place of
# $num and log the substitution.
63 foreach my $i ( 0 .. $#candidates ) {
64 my $new_num = $candidates[$i]->{key};
# NOTE(review): $id is overwritten on every iteration, so after the loop it
# holds the id of the last candidate only.
65 $id = join('-',uc $new_num, $row->[1], $row->[2]);
66 print "SIMILARITY$multi $nr $num -> $new_num\n";
# NOTE(review): this stores under the full joined $id, whereas the other
# pushes into $keys->{$key_id} use the bare $num as key -- confirm intended.
67 push @{ $keys->{ $key_id }->{ $id } }, uc $id;
# No candidate above the threshold (presumably the else branch -- the branch
# structure is not visible here): count it and register $num as a new value.
70 $stat->{similarity_none}->{$nr}++;
71 push @{ $keys->{ $key_id }->{ $num } }, uc $row->[0];
# Count rows whose (possibly rewritten) id has been recorded before.
78 $stat->{exists}->{$nr}++ if exists $stat->{_}->{ $id };
# Remember in which file number this id occurred; repeated rows in the same
# file append the same $nr again.
80 push @{ $stat->{_}->{ $id } }, $nr;
# Summary: number of distinct ids collected in $stat->{_}.
86 print "# total = ",scalar keys %{ $stat->{_} }, $/;
# Build histograms over the list of file numbers recorded for each id.
87 foreach my $id ( keys %{ $stat->{_} } ) {
88 my @val = @{ $stat->{_}->{$id} };
# How many ids were seen N times in total (N = number of recorded entries).
89 $stat->{count}->{ scalar @val }++;
90 $stat->{count_total}++;
# Exact sequence of file numbers, repeats included.
# The '00_' prefix presumably makes this entry sort first in the dump.
92 $stat->{'00_count_val'}->{ join(' ', @val) }++; # with duplicates
# $u collects the distinct file numbers; it is declared on a line not visible
# here -- presumably reset for every id; confirm.
95 $u->{$_}++ foreach @val;
96 $stat->{count_val}->{ join(' ', sort keys %$u ) }++; # without duplicates
# Dump all collected statistics with Data::Dump.
99 print "# stat = ",dump( $stat );
100 #print "# keys = ",dump( $keys );
# Persist the key table to the cache file that is checked at startup.
# NOTE(review): the return value is not checked -- if this is Storable's
# store, an I/O failure would go unnoticed; confirm.
102 store $keys, $keys_file;