7 # apt install libtext-csv-perl libstring-similarity-perl
9 use Data::Dump qw(dump);
10 use String::Similarity;
14 my $debug = $ENV{DEBUG};
16 my @files = qw( 1.csv 2.csv 3.csv 4.csv );
24 $use->{$_}++ foreach ( map { @{ $stat->{_}->{$_} } } @for );
25 my $duplicate = grep { $use->{$_} > 1 } keys %$use;
26 print "XXX use @for ",dump($use),$/ if $debug && $duplicate;
31 my ( $num, $key_id, $limit_sim ) = @_;
32 $limit_sim //= $ENV{LIMIT};
36 foreach my $key ( sort keys %{ $keys->{ $key_id } } ) {
37 next if $key eq $num; # XXX don't return key from input
38 my $s = similarity $num, $key, $limit_sim;
39 #warn "# $num $key $s\n";
40 if ($s > $limit_sim ) {
41 if ( exists $stat->{_}->{"$key-$key_id"} ) {
43 $use->{$_}++ foreach (
44 @{ $stat->{_}->{"$num-$key_id"} },
45 @{ $stat->{_}->{"$key-$key_id"} },
47 #print "XXX use $num $key ",dump($use);
48 my $duplicate = grep { $use->{$_} > 1 } keys %$use;
50 print "XXX $limit_sim suggest duplicate $num $key SKIP duplicate ",dump($use), $/ if $debug;
51 $stat->{suggest}->{duplicate}++;
53 push @candidates, { key => $key, s => $s };
56 print "XXX $limit_sim candidates $key missing\n" if $debug;
61 for limit 0.6 to work, we need to prefer longer results over shorter ones:
62 MERGE A0246A 3078-8326 (
63 { key => "A4065A", s => 0.666666666666667 },
64 { key => "ANDREJA0246A", s => 0.666666666666667 },
66 .++ A0246A-3078-8326 1 .++ A0246A-3078-8326 2 result val=[3, 1, 2] result_elements=3
69 if ( $#candidates > 0 ) {
70 #print "XXX candidates before = ",dump( \@candidates ),$/;
72 $b->{s} <=> $a->{s} # hi -> low
73 or length($b->{key}) <=> length($a->{key})
75 #print "XXX candidates after = ",dump( \@candidates ),$/;
80 my $keys_file = 'keys.storable';
81 if ( -e $keys_file ) {
82 #$keys = retrieve($keys_file) or die "$keys_file: $!";
83 print "LOAD $keys_file", scalar keys %$keys, "\n";
90 foreach my $val ( 1 .. 4 ) {
91 my $file = "$val.csv";
94 open(my $duplicate_fh, '>', "duplicate-$val.csv");
96 my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1 });
97 open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
98 while (my $row = $csv->getline ($fh)) {
99 $stat->{file}->{$file}->{lines}++;
100 $stat->{file}->{$file}->{columns}->{ $#$row }++;
102 if ( ! exists $data_headers[$val] ) {
103 $data_headers[$val] = $row;
107 my $c_id = $row->[0];
110 $row->[0] =~ s/[^\w\d]+//ig && $stat->{file}->{$file}->{corrupt_id}->{$c_id}++ && print 'c0';
111 $row->[1] =~ s/\D+//g && $stat->{file}->{$file}->{corrupt_s}->{$c_s}++ && print 'c1';
112 $row->[2] =~ s/\D+//g && $stat->{file}->{$file}->{corrupt_r}->{$c_r}++ && print 'c2';
119 my $num = uc $row->[0];
120 if ( length $num < 3 ) {
121 print "IGNORE $val ",dump($row->[ 0 .. 5 ]),$/;
122 $stat->{ignore}->{$val}++;
126 my $key_id = $row->[1] . '-' . $row->[2];
128 $stat->{A_key_id}->{$key_id}->{$val}++;
130 $keys->{ $key_id }->{ $num }++;
132 $stat->{exists}->{$val}++ if exists $stat->{_}->{ $id };
134 push @{ $stat->{_}->{ $id } }, $val;
136 if ( exists $data->{$key_id}->{$num}->{$val} ) {
137 $stat->{file}->{$file}->{duplicate_keyid_num}->{$val}++;
138 print "DUPLICATE $file $key_id $num $val\n";
140 my $old = $data->{$key_id}->{$num}->{$val};
141 print $duplicate_fh join(',', $file, @$old), "\n";
142 print $duplicate_fh join(',', $file, @$row), "\n";
143 print $duplicate_fh "\n";
146 foreach ( 0 .. $#$row ) {
147 if ( $old->[$_] ne $row->[$_] ) {
148 $diff->[$_] = [ $old->[$_], $row->[$_] ];
151 print "diff = ",dump($diff) if $diff;
152 #print "old=", dump( $data->{$key_id}->{$num}->{$val} ), $/;
153 #print "new=", dump( $row ), $/;
156 # remove .000000 from values
157 $data->{$key_id}->{$num}->{$val} = [ map { s/^(\d+)\.0+$/$1/g; $_; } @$row ];
167 # 0.9 - 0.7 -- 0.6 is too lax
168 foreach my $limit ( 0.7 ) { #, 0.6 ) {
169 warn "XXX limit $limit\n";
171 print "# total = ",scalar keys %{ $stat->{_} }, $/;
172 foreach my $id ( sort keys %{ $stat->{_} } ) {
174 my $v = $stat->{_}->{$id};
175 if ( defined $v && ref $v eq 'ARRAY' ) {
182 $u->{$_}++ foreach @val;
183 my @u_v = sort keys %$u;
185 $stat->{A_count}->{ scalar @val }++;
186 $stat->{A_count_total}++;
188 #$stat->{A_count_val_dup}->{ join(' ', @val) }++; # with duplicates
190 $stat->{A_count_val}->{ join(' ', @u_v ) }++; # without duplicates
193 if ( $#u_v < 3 ) { # single, double
194 my ( $num, $key_id ) = split(/-/,$id,2);
195 my @candidates = candidates $num => $key_id, $limit; #, 0.7; # XXX 0.9 too high, 0.8 better, 0.7 too lax
197 print "MERGE ",scalar @candidates, " $limit $num $key_id ", dump( @candidates ), ' val=', dump( \@val ), $/;
198 my @keys = map { $_->{key} } @candidates;
199 my $m_id = $id; # "$keys[0]-$key_id";
200 foreach my $i ( 0 .. $#keys ) {
201 my $id = "$keys[$i]-$key_id";
202 if ( ! exists $stat->{_}->{$id} ) {
203 print "ERROR: $num $key_id can't find $i $id";
206 # XXX I298O-4743-7996
207 if ( duplicate( $m_id => $id ) ) {
208 print "XXX duplicate2 $m_id $id\n";
209 $stat->{duplicate2}++;
214 my $o = delete $stat->{_}->{$id};
215 die "FATAL: can't find $id" if ! $o;
217 my ( $id_s, $s, $r ) = split('-', $id);
220 foreach my $val ( @$o ) {
222 push @{ $stat->{_}->{ $m_id } }, $val;
223 print "++ $m_id $val ";
224 $stat->{merge_val}->{$val}++;
227 die "ERROR merge: $val $id $m_id exists",dump( $merge_ids->{$val}->{$key_s}->{$id_s} ) if exists $merge_ids->{$val}->{$key_s}->{$id_s};
228 my $m_id_s = (split('-',$m_id,3))[0];
229 $merge_ids->{$val}->{$key_s}->{$id_s} = $m_id_s;
231 my $o_row = delete $data->{$key_s}->{$id_s}->{$val};
232 die "FATAL: $id | $m_id | data $key_s $id_s $val" unless $o_row;
233 $data->{$key_s}->{$m_id_s}->{$val} = $o_row;
236 my @not_empty = sort keys %{ $data->{$key_s}->{$id_s} };
237 die "FATAL: $id_s not empty" if @not_empty;
238 delete $data->{$key_s}->{$id_s}; # FIXME check before cleanup
240 print "result val=",dump( $stat->{_}->{ $m_id } ), " result_elements=", scalar @{ $stat->{_}->{ $m_id } }, $/;
248 print "# total after merge $limit = ",scalar keys %{ $stat->{_} }, $/;
250 foreach my $id ( sort keys %{ $stat->{_} } ) {
251 my @val; # = @{ $stat->{_}->{$id} };
252 my $v = $stat->{_}->{$id};
253 if ( defined $v && ref $v eq 'ARRAY' ) {
259 $stat->{"B${limit}_count"}->{ scalar @val }++;
260 $stat->{"B${limit}_count_total"}++;
262 #$stat->{"B${limit}_count_val_dup"}->{ join(' ', @val) }++; # with duplicates
265 $u->{$_}++ foreach @val;
266 my @u_v = sort keys %$u;
267 $stat->{"B${limit}_count_val"}->{ join(' ', @u_v ) }++; # without duplicates
273 print "# stat = ",dump( $stat );
274 #print "# keys = ",dump( $keys );
276 store $keys, $keys_file;
278 my $merge_file = 'merge.storable';
279 store $merge_ids, $merge_file;
280 #print "XXX merge_ids = ", dump($merge_ids);
282 my $out_file = 'merged.csv';
283 print "out_file $out_file";
284 open(my $out_fh, '>', $out_file);
285 foreach my $val ( 1 .. 4 ) {
286 print $out_fh join(',', map { unac_string('utf-8',$_) . '_' . $val } @{ $data_headers[$val] });
289 print $out_fh "broj_valova";
292 foreach my $key ( sort keys %$data ) {
294 foreach my $id ( sort keys %{ $data->{$key} } ) {
295 #print $out_fh "## $id ## ";
299 foreach my $val ( 1 .. 4 ) {
300 if ( my $id_data = $data->{$key}->{$id}->{$val} ) {
301 print $out_fh join(',', map { m/,/ ? qq{"$_"} : $_ } @$id_data);
304 print $out_fh ( ',' x $#{ $data_headers[$val] } ) ; # FIXME +1?
308 print $out_fh $broj_valova;
313 print "\n", -s $out_file, " bytes created\n";