From: Dobrica Pavlinusic
Date: Wed, 22 Nov 2023 11:27:41 +0000 (+0100)
Subject: dump merged.csv
X-Git-Url: http://git.rot13.org/?a=commitdiff_plain;h=844c5b82db1e0b773f3db45ea1e4e016aeb28be3;p=csv-join-similarity

dump merged.csv
---
Review notes (not part of the commit message):
 - open/close of merged.csv are checked, so a failed write is no longer
   followed by a "bytes created" report.
 - a row missing from one input file is padded with one '_' per header
   column of that file (resolves the "FIXME +1?" in the last hunk).
 - the post-image blob id on the index line predates these two edits.

diff --git a/upari.pl b/upari.pl
index 9a5c3c4..0dbb777 100755
--- a/upari.pl
+++ b/upari.pl
@@ -83,6 +83,7 @@ if ( -e $keys_file ) {
 }
 
 our $data;
+our @data_headers;
 
 foreach my $val ( 1 .. 4 ) {
 	my $file = "$val.csv";
@@ -96,11 +97,14 @@ foreach my $val ( 1 .. 4 ) {
 
 		if ( ! exists $stat->{file}->{$file}->{header} ) {
 			$stat->{file}->{$file}->{header} = $row;
+			$data_headers[$val] = $row;
 			next;
 		}
 
+		my $c_id = $row->[0];
 		my $c_s = $row->[1];
 		my $c_r = $row->[2];
+		$row->[0] =~ s/[^A-Z0-9]+//g && $stat->{file}->{$file}->{corrupt_id}->{$c_id}++ && print 'c0';
 		$row->[1] =~ s/\D+//g && $stat->{file}->{$file}->{corrupt_s}->{$c_s}++ && print 'c1';
 		$row->[2] =~ s/\D+//g && $stat->{file}->{$file}->{corrupt_r}->{$c_r}++ && print 'c2';
 		my $id = join('-',
@@ -126,7 +130,7 @@ foreach my $val ( 1 .. 4 ) {
 
 		push @{ $stat->{_}->{ $id } }, $val;
 
-		$data->{$key_id}->{$val}->{$id} = $row;
+		$data->{$key_id}->{$num}->{$val} = $row;
 
 	}
 	close $fh;
@@ -198,7 +202,9 @@ foreach my $id ( sort keys %{ $stat->{_} } ) {
 
 			my $m_id_s = (split('-',$m_id,3))[0];
 			$merge_ids->{$val}->{$key_s}->{$id_s} = $m_id_s;
-			$data->{$key_s}->{$val}->{$id_s} = delete $data->{$key_s}->{$val}->{$m_id_s};
+			my $o_row = delete $data->{$key_s}->{$id_s}->{$val};
+			die "FATAL: $id | $m_id | data $key_s $id_s $val" unless $o_row;
+			$data->{$key_s}->{$m_id_s}->{$val} = $o_row;
 		}
 
 		print "result val=",dump( $stat->{_}->{ $m_id } ), " result_elements=", scalar @{ $stat->{_}->{ $m_id } }, $/;
@@ -242,7 +248,31 @@ store $keys, $keys_file;
 my $merge_file = 'merge.storable';
 store $merge_ids, $merge_file;
 
-__END__
-open(my $out_fh, '>', 'merged.csv');
-foreach my $val ( 1 .. 4 ) {
-	foreach my $key_s
+# Write every row kept in $data to merged.csv: one output line per id, with the rows of files 1 .. 4 side by side, separated by '|'.
+my $out_file = 'merged.csv';
+print "out_file $out_file";
+open(my $out_fh, '>', $out_file) || die "can't open $out_file: $!";
+print $out_fh join(',', map { @{ $data_headers[$_] } } 1 .. 4 ), "\n";
+
+foreach my $key ( sort keys %$data ) {
+	print " $key";
+	foreach my $id ( sort keys %{ $data->{$key} } ) {
+		print $out_fh "## $id ## ";
+		foreach my $val ( 1 .. 4 ) {
+			if ( my $id_data = $data->{$key}->{$id}->{$val} ) {
+				print $out_fh join(',', @$id_data);
+			} else {
+				# no row from this file: emit one '_' per header column, so the
+				# block has as many comma-separated fields as that file's header
+				print $out_fh join(',', ('_') x @{ $data_headers[$val] });
+			}
+			print $out_fh '|' if $val < 4;
+		}
+		print $out_fh "\n";
+	}
+	print $out_fh "\n";
+}
+# buffered write errors only surface at close, so check it before reporting the size
+close($out_fh) || die "can't close $out_file: $!";
+print "\n", -s $out_file, " bytes created\n";
+