3 # TODO: skola <-> razred swap?
9 # apt install libtext-csv-perl libstring-similarity-perl
11 use Data::Dump qw(dump);
12 use String::Similarity;
15 my @files = qw( 1.csv 2.csv 3.csv 4.csv );
23 $use->{$_}++ foreach ( map { @{ $stat->{_}->{$_} } } @for );
24 my $duplicate = grep { $use->{$_} > 1 } keys %$use;
25 print "XXX use @for ",dump($use),$/ if $duplicate;
30 my ( $num, $key_id, $limit_sim ) = @_;
31 $limit_sim //= $ENV{LIMIT};
35 foreach my $key ( sort keys %{ $keys->{ $key_id } } ) {
36 next if $key eq $num; # XXX don't return key from input
37 my $s = similarity $num, $key, $limit_sim;
38 #warn "# $num $key $s\n";
39 if ($s > $limit_sim ) {
40 if ( exists $stat->{_}->{"$key-$key_id"} ) {
42 $use->{$_}++ foreach (
43 @{ $stat->{_}->{"$num-$key_id"} },
44 @{ $stat->{_}->{"$key-$key_id"} },
46 #print "XXX use $num $key ",dump($use);
47 my $duplicate = grep { $use->{$_} > 1 } keys %$use;
49 print "XXX $limit_sim suggest duplicate $num $key SKIP duplicate ",dump($use), $/;
50 $stat->{suggest}->{duplicate}++;
52 push @candidates, { key => $key, s => $s };
55 print "XXX $limit_sim candidates $key missing\n";
60 for limit 0.6 to work, we need to prefer longer results over shorter ones:
61 MERGE A0246A 3078-8326 (
62 { key => "A4065A", s => 0.666666666666667 },
63 { key => "ANDREJA0246A", s => 0.666666666666667 },
65 .++ A0246A-3078-8326 1 .++ A0246A-3078-8326 2 result val=[3, 1, 2] result_elements=3
68 if ( $#candidates > 0 ) {
69 print "XXX candidates before = ",dump( \@candidates ),$/;
71 $b->{s} <=> $a->{s} # hi -> low
72 or length($b->{key}) <=> length($a->{key})
74 print "XXX candidates after = ",dump( \@candidates ),$/;
79 my $keys_file = 'keys.storable';
80 if ( -e $keys_file ) {
81 #$keys = retrieve($keys_file) or die "$keys_file: $!";
82 print "LOAD $keys_file", scalar keys %$keys, "\n";
85 foreach my $nr ( 1 .. 4 ) {
89 my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1 });
90 open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
91 while (my $row = $csv->getline ($fh)) {
92 $stat->{lines}->{$nr}++;
93 $stat->{ $file }->{lines}++;
94 $row->[1] =~ s/\D+//g && print 'c1';
95 $row->[2] =~ s/\D+//g && print 'c2';
102 my $num = uc $row->[0];
103 if ( length $num < 3 ) {
104 print "IGNORE $nr ",dump($row->[ 0 .. 5 ]),$/;
105 $stat->{ignore}->{$nr}++;
109 my $key_id = $row->[1] . '-' . $row->[2];
111 $stat->{A_key_id}->{$key_id}->{$nr}++;
113 $keys->{ $key_id }->{ $num }++;
115 $stat->{exists}->{$nr}++ if exists $stat->{_}->{ $id };
117 push @{ $stat->{_}->{ $id } }, $nr;
126 #foreach my $l ( 0 .. 5 ) { # 0.9 - 0.7 -- 0.6 is too lax
127 foreach my $l ( qw( 2 3 4 5 ) ) {
128 my $limit = 0.9 - "0.$l";
129 warn "XXX limit $limit\n";
131 print "# total = ",scalar keys %{ $stat->{_} }, $/;
132 foreach my $id ( sort keys %{ $stat->{_} } ) {
134 my $v = $stat->{_}->{$id};
135 if ( defined $v && ref $v eq 'ARRAY' ) {
141 $stat->{A_count}->{ scalar @val }++;
142 $stat->{A_count_total}++;
144 #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates
147 $u->{$_}++ foreach @val;
148 my @u_v = sort keys %$u;
149 $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates
151 if ( $#u_v < 3 ) { # single, double
152 my ( $num, $key_id ) = split(/-/,$id,2);
153 my @candidates = candidates $num => $key_id, $limit; #, 0.7; # XXX 0.9 too high, 0.8 better, 0.7 too lax
155 print "MERGE ",scalar @candidates, " $num $key_id ", dump( @candidates ), ' val=', dump( \@val ), $/;
156 my @keys = map { $_->{key} } @candidates;
157 my $m_id = $id; # "$keys[0]-$key_id";
158 foreach my $i ( 0 .. $#keys ) {
159 my $id = "$keys[$i]-$key_id";
160 if ( ! exists $stat->{_}->{$id} ) {
161 print "ERROR: $num $key_id can't find $i $id";
164 # XXX I298O-4743-7996
165 if ( duplicate( $m_id => $id ) ) {
166 print "XXX duplicate2 $m_id $id\n";
167 $stat->{duplicate2}++;
172 my $o = delete $stat->{_}->{$id};
173 warn "can't find $id" if ! $o;
174 foreach my $val ( @$o ) {
176 push @{ $stat->{_}->{ $m_id } }, $val;
177 print "++ $m_id $val ";
178 $stat->{merge_val}->{$val}++;
180 print "result val=",dump( $stat->{_}->{ $m_id } ), " result_elements=", scalar @{ $stat->{_}->{ $m_id } }, $/;
188 print "# total after merge $limit = ",scalar keys %{ $stat->{_} }, $/;
190 foreach my $id ( sort keys %{ $stat->{_} } ) {
191 my @val; # = @{ $stat->{_}->{$id} };
192 my $v = $stat->{_}->{$id};
193 if ( defined $v && ref $v eq 'ARRAY' ) {
199 $stat->{"B${limit}_count"}->{ scalar @val }++;
200 $stat->{"B${limit}_count_total"}++;
202 #$stat->{"B${limit}_count_val_dup"}->{ join(' ', @val) }++; # with duplicates
205 $u->{$_}++ foreach @val;
206 my @u_v = sort keys %$u;
207 $stat->{"B${limit}_count_val"}->{ join(' ', @u_v ) }++; # without duplicates
213 print "# stat = ",dump( $stat );
214 #print "# keys = ",dump( $keys );
217 store $keys, $keys_file;