3 # TODO: skola <-> razred swap?
9 # apt install libtext-csv-perl libstring-similarity-perl
11 use Data::Dump qw(dump);
12 use String::Similarity;
15 my @files = qw( 1.csv 2.csv 3.csv 4.csv );
23 $use->{$_}++ foreach ( map { @{ $stat->{_}->{$_} } } @for );
24 my $duplicate = grep { $use->{$_} > 1 } keys %$use;
25 print "XXX use @for ",dump($use),$/ if $duplicate;
30 my ( $num, $key_id, $limit_sim ) = @_;
31 $limit_sim //= $ENV{LIMIT};
35 foreach my $key ( sort keys %{ $keys->{ $key_id } } ) {
36 next if $key eq $num; # XXX don't return key from input
37 my $s = similarity $num, $key, $limit_sim;
38 #warn "# $num $key $s\n";
39 if ($s > $limit_sim ) {
40 if ( exists $stat->{_}->{"$key-$key_id"} ) {
42 $use->{$_}++ foreach (
43 @{ $stat->{_}->{"$num-$key_id"} },
44 @{ $stat->{_}->{"$key-$key_id"} },
46 #print "XXX use $num $key ",dump($use);
47 my $duplicate = grep { $use->{$_} > 1 } keys %$use;
49 print "XXX $limit_sim suggest duplicate $num $key SKIP duplicate ",dump($use), $/;
50 $stat->{suggest}->{duplicate}++;
52 push @candidates, { key => $key, s => $s };
55 print "XXX $limit_sim candidates $key missing\n";
62 my $keys_file = 'keys.storable';
63 if ( -e $keys_file ) {
64 #$keys = retrieve($keys_file) or die "$keys_file: $!";
65 print "LOAD $keys_file", scalar keys %$keys, "\n";
68 foreach my $nr ( 1 .. 4 ) {
72 my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1 });
73 open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
74 while (my $row = $csv->getline ($fh)) {
75 $stat->{lines}->{$nr}++;
76 $stat->{ $file }->{lines}++;
77 $row->[1] =~ s/\D+//g && print 'c1';
78 $row->[2] =~ s/\D+//g && print 'c2';
85 my $num = uc $row->[0];
86 if ( length $num < 3 ) {
87 print "IGNORE $nr ",dump($row->[ 0 .. 5 ]),$/;
88 $stat->{ignore}->{$nr}++;
92 my $key_id = $row->[1] . '-' . $row->[2];
94 $stat->{A_key_id}->{$key_id}->{$nr}++;
96 $keys->{ $key_id }->{ $num }++;
98 $stat->{exists}->{$nr}++ if exists $stat->{_}->{ $id };
100 push @{ $stat->{_}->{ $id } }, $nr;
111 print "# total = ",scalar keys %{ $stat->{_} }, $/;
112 foreach my $id ( sort keys %{ $stat->{_} } ) {
114 my $v = $stat->{_}->{$id};
115 if ( defined $v && ref $v eq 'ARRAY' ) {
121 $stat->{A_count}->{ scalar @val }++;
122 $stat->{A_count_total}++;
124 #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates
127 $u->{$_}++ foreach @val;
128 my @u_v = sort keys %$u;
129 $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates
131 if ( $#u_v < 3 ) { # single, double
132 my ( $num, $key_id ) = split(/-/,$id,2);
133 my @candidates = candidates $num => $key_id; #, 0.7; # XXX 0.9 too high, 0.8 better, 0.7 too lax
135 print "MERGE $num $key_id ", dump( @candidates ), ' val=', dump( \@val ), $/;
136 my @keys = map { $_->{key} } @candidates;
137 my $m_id = $id; # "$keys[0]-$key_id";
138 foreach my $i ( 0 .. $#keys ) {
139 my $id = "$keys[$i]-$key_id";
140 if ( ! exists $stat->{_}->{$id} ) {
141 print "ERROR: $num $key_id can't find $i $id";
144 # XXX I298O-4743-7996
145 if ( duplicate( $m_id => $id ) ) {
146 print "XXX duplicate2 $m_id $id\n";
147 $stat->{duplicate2}++;
152 my $o = delete $stat->{_}->{$id};
153 warn "can't find $id" if ! $o;
154 foreach my $val ( @$o ) {
156 push @{ $stat->{_}->{ $m_id } }, $val;
157 print "++ $m_id $val ";
158 $stat->{merge_val}->{$val}++;
160 print "result val=",dump( $stat->{_}->{ $m_id } ), " result_elements=", scalar @{ $stat->{_}->{ $m_id } }, $/;
168 print "# total after merge = ",scalar keys %{ $stat->{_} }, $/;
170 foreach my $id ( sort keys %{ $stat->{_} } ) {
171 my @val; # = @{ $stat->{_}->{$id} };
172 my $v = $stat->{_}->{$id};
173 if ( defined $v && ref $v eq 'ARRAY' ) {
179 $stat->{B_count}->{ scalar @val }++;
180 $stat->{B_count_total}++;
182 #$stat->{B_count_val_dup}->{ join(' ', @val) }++; # with duplicates
185 $u->{$_}++ foreach @val;
186 my @u_v = sort keys %$u;
187 $stat->{B_count_val}->{ join(' ', @u_v ) }++; # without duplicates
191 print "# stat = ",dump( $stat );
192 #print "# keys = ",dump( $keys );
195 store $keys, $keys_file;