From: Dobrica Pavlinusic
Date: Fri, 8 Dec 2023 10:42:39 +0000 (+0100)
Subject: check if old duplicate is longer and keep it
X-Git-Url: http://git.rot13.org/?a=commitdiff_plain;h=25e8bc9f18e1f10a9890f3aff05ba09ea19d1a42;p=csv-join-similarity

check if old duplicate is longer and keep it

not found in this dataset
---

diff --git a/upari.pl b/upari.pl
index 60eb772..1e7e1bd 100755
--- a/upari.pl
+++ b/upari.pl
@@ -135,12 +135,23 @@
 	foreach my $val ( 1 .. 4 ) {
 		if ( exists $data->{$key_id}->{$num}->{$val} ) {
 			$stat->{file}->{$file}->{duplicate_keyid_num}->{$val}++;
-			print "DUPLICATE $file $key_id $num $val\n";
-
+
 			my $old = $data->{$key_id}->{$num}->{$val};
 			print $duplicate_fh join(',', $file, @$old), "\n";
 			print $duplicate_fh join(',', $file, @$row), "\n";
 			print $duplicate_fh "\n";
+
+			# select row by longer length;
+			my $l_old = length dump $old;
+			my $l_row = length dump $row;
+
+			print "DUPLICATE $file $key_id $num $val len: $l_old < $l_row\n";
+
+			if ( $l_old > $l_row ) {
+				print "DUPLICATE KEEP old longer $l_old row (new only $l_row)\n";
+				next;
+			}
+
 =for diff
 			my $diff;
 			foreach ( 0 .. $#$row ) {