3 # TODO: skola <-> razred swap?
9 # apt install libtext-csv-perl libstring-similarity-perl
11 use Data::Dump qw(dump);
12 use String::Similarity;
15 my @files = qw( 1.csv 2.csv 3.csv 4.csv );
21 my ( $num, $key_id, $limit_sim ) = @_;
25 foreach my $key ( keys %{ $keys->{ $key_id } } ) {
27 my $s = similarity $num, $key, $limit_sim;
28 #warn "# $num $key $s\n";
29 if ($s > $limit_sim ) {
30 push @candidates, { key => $key, s => $s };
36 my $keys_file = 'keys.storable';
37 if ( -e $keys_file ) {
38 $keys = retrieve($keys_file) or die "$keys_file: $!";
39 print "LOAD $keys_file", scalar keys %$keys, "\n";
42 foreach my $nr ( 1 .. 4 ) {
46 my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1 });
47 open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
48 while (my $row = $csv->getline ($fh)) {
49 $stat->{lines}->{$nr}++;
50 $stat->{ $file }->{lines}++;
51 $row->[1] =~ s/\D+//g && print 'c1';
52 $row->[2] =~ s/\D+//g && print 'c2';
59 my $num = uc $row->[0];
60 if ( length $num < 3 ) {
61 print "IGNORE $nr ",dump($row->[ 0 .. 5 ]),$/;
62 $stat->{ignore}->{$nr}++;
66 my $key_id = $row->[1] . '-' . $row->[2];
68 $stat->{A_key_id}->{$key_id}->{$nr}++;
70 $keys->{ $key_id }->{ $num }++;
72 if ( $nr > 1 && ! exists $stat->{_}->{ $id } ) {
73 $stat->{new_exact}->{$nr}++;
74 my @candidates = candidates $num => $key_id, 0.8; # 0.9 too strict
76 my $multi = $#candidates > 0 ? 'multi' : '';
77 $stat->{ 'similarity' . $multi }->{$nr}++;
78 foreach my $i ( 0 .. $#candidates ) {
79 my $new_num = $candidates[$i]->{key};
80 $id = join('-',uc $new_num, $row->[1], $row->[2]);
81 print "SIMILARITY$multi $nr $num -> $new_num\n";
82 $keys->{ $key_id }->{ $id }++;
85 $stat->{similarity_none}->{$nr}++;
86 $stat->{unique_id}->{$id}++;
91 $stat->{exists}->{$nr}++ if exists $stat->{_}->{ $id };
93 push @{ $stat->{_}->{ $id } }, $nr;
104 print "# total = ",scalar keys %{ $stat->{_} }, $/;
105 foreach my $id ( keys %{ $stat->{_} } ) {
107 my $v = $stat->{_}->{$id};
108 if ( defined $v && ref $v eq 'ARRAY' ) {
114 $stat->{A_count}->{ scalar @val }++;
115 $stat->{A_count_total}++;
117 #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates
120 $u->{$_}++ foreach @val;
121 my @u_v = sort keys %$u;
122 $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates
124 if ( $#u_v < 2 ) { # single, double
125 my ( $num, $key_id ) = split(/-/,$id,2);
126 my @candidates = candidates $num => $key_id, 0.6; # XXX 0.9 too high, 0.8 better
128 print "MERGE $num $key_id ", dump( @candidates ), ' vals=', dump( @u_v ),$/;
129 my @keys = map { $_->{key} } @candidates;
130 my $m_id = "$keys[0]-$key_id";
131 foreach my $i ( 1 .. $#keys ) {
132 my $id = "$keys[$i]-$key_id";
133 if ( ! exists $stat->{_}->{$id} ) {
134 print "ERROR: $num $key_id can't find $i $id";
137 my $o = delete $stat->{_}->{$id};
138 warn "can't find $keys[$i]" if ! $o;
139 foreach my $val ( @$o ) {
141 push @{ $stat->{_}->{ $m_id } }, $val;
142 print "++ $m_id $val ";
144 print "result val=",dump( $stat->{_}->{ $m_id } ),$/;
152 print "# total after merge = ",scalar keys %{ $stat->{_} }, $/;
154 foreach my $id ( keys %{ $stat->{_} } ) {
155 my @val; # = @{ $stat->{_}->{$id} };
156 my $v = $stat->{_}->{$id};
157 if ( defined $v && ref $v eq 'ARRAY' ) {
163 $stat->{B_count}->{ scalar @val }++;
164 $stat->{B_count_total}++;
166 #$stat->{B_count_val_dup}->{ join(' ', @val) }++; # with duplicates
169 $u->{$_}++ foreach @val;
170 my @u_v = sort keys %$u;
171 $stat->{B_count_val}->{ join(' ', @u_v ) }++; # without duplicates
175 foreach my $key ( keys %{ $stat->{unique_id} } ) {
176 #print "XXX unique_id $key\n";
178 my ( $num, $key_id ) = split(/-/,$key,2);
179 my @candidates = candidates $num => $key_id, 0.8; # XXX lower
181 print "## unique_id $num $key_id ",dump( @candidates ),$/;
182 $stat->{unique_id}++;
185 delete $stat->{unique_id};
190 print "# stat = ",dump( $stat );
191 #print "# keys = ",dump( $keys );
194 store $keys, $keys_file;