new input file

[openface-maja] / p.pl
diff --git a/p.pl b/p.pl

index 0a85b1e..cdd2990 100755 (executable)
--- a/p.pl
+++ b/p.pl
@@ -3,8 +3,24 @@ use warnings;
  use strict;
  use autodie;
  use Data::Dump qw(dump);
+use Text::Unaccent;
+use POSIX qw(strftime);
+
+#my $filename = 'Intervali_Openface_novi_rad_Boris_export.txt';
+my $filename = 'originals_2023-07-09/Intervali_Openface_novi_rad_Boris_export.txt';
+
+my $debug = $ENV{DEBUG} || 0;
+
+# better time format for scp (don't have :)
+my $today = strftime("%Y-%m-%d_%H%M%S", localtime(time()));
+warn "# today $today";
+
+open(my $err_fh, '>', "out.$today.log");
+local $SIG{__WARN__} = sub {
+       print STDERR @_;
+       print $err_fh @_;
+};
  
-my $filename = 'Intervali_Openface_novi_rad_Boris_export.txt';
  
  sub col2nr {
         my @c = @_;
@@ -29,22 +45,81 @@ my @h2;
  my $h2nr;
  my $tell2;
  
+my @glob = glob 'originals_2023-07-07/*';
+my $map_un_back;
+my @glob_un = map {
+       my $un = lc( unac_string('utf-8', $_ ) );
+       $map_un_back->{ $un } = $_;
+       $un;
+} @glob;
+sub glob_id {
+       my $id = shift;
+       my $id_lc = lc( unac_string('utf-8', $id ) );
+       my $pos = -1;
+       my @found = grep { $pos++; m/$id_lc/ } @glob_un;
+       if ( @found ) {
+               if ( $#found == 0 ) {
+                       warn "XXX patt=$id pos=$pos found=",dump(@found);
+                       return $map_un_back->{$found[0]};
+                       #return $glob[$pos];
+               } else {
+                       warn "ERROR: pattern $id found $#found results, ignoring";
+                       return;
+               }
+       } else {
+               warn "not found $id";
+               return;
+       }
+}
+
+open(my $skipped_input, '>',"out.$today.skip");
+open(my $out_fh, '>', "out.$today.csv");
+sub out_csv {
+       my @a = @_;
+       print $out_fh join(",", map { m/\s/ ? qq{"$_"} : $_ } @a),"\n";
+}
+
+my @out_header;
  
  while(<$fh>) {
         chomp;
         next if m/^\s*$/;       # FIXME corrput data
         my @v = split(/\t/, $_);
-       warn "#[ $_ ]\n", dump( \@v );
+       warn "#[ $_ ]\n" if $debug;
+
+       my $id = $v[0];
+       if ( ! defined $id ) {
+               warn "ERROR: ID col 0 null [$_]";
+               next;
+       }
  
         if ( ! $last_id || $last_id ne $v[0] ) {
-               $file2 = glob 'originals_2023-07-07/' . $v[0] . '*.csv';
+               my $patt = 'originals_2023-07-07/' . $v[0] . '*.csv';
+               $file2 = glob $patt;
+               $file2 ||= glob 'originals_2023-07-07/' . $v[0] . '*.csv';
+               $file2 ||= glob 'originals_2023-07-07/' . unac_string('utf-8', $v[0] ) . '*.csv';
+               $file2 ||= glob_id $v[0];
+               # try to find file with same numbers
+               my $id_nrs = $v[0];
+               $id_nrs =~ s/\D//g;
+               $file2 ||= glob_id $id_nrs;
+
+               if ( ! defined $file2 ) {
+                       warn "SKIP ERROR: $patt glob didn't find anything for v[0]=$v[0] [$_]";
+                       print $skipped_input "$_\n";
+                       next;
+               }
  
-               next unless $file2; # FIXME
+               if ( ! -r $file2 ) {
+                       warn "ERROR: $patt glob file $file2 not readable: $! SKIPPING [$_]";
+                       print $skipped_input "$_\n";
+                       next;
+               }
  
                 warn "# file2 $file2 [$v[0]]";
                 open($fh2, '<', $file2);
                 $last_id = $v[0];
-warn "---0000 v=",dump( \@v ), " FROM >>>$_<<<\n";
+               @h2 = ();
         }
  
         sub col_v {
@@ -53,33 +128,42 @@ warn "---0000 v=",dump( \@v ), " FROM >>>$_<<<\n";
                 die "can't find $name" unless defined $i;
                 my $v = $v[$i];
                 die "can't find $i : $name in ",dump( \@v ) unless defined $v;
-               warn "## col_v $name -> $i -> $v",dump( \@v );
+               warn "## col_v $name -> $i -> $v",dump( \@v ) if $debug > 1;
                 return $v;
         }
  
         #my $start = $v[ $col2nr->{'Start (s)'} ];
         #my $stop  = $v[ $col2nr->{'Stop (s)'} ];
  
-       my $start = col_v 'Start (s)';
-       my $stop  = col_v 'Stop (s)';
+       #my $start = col_v 'Start (s)';
+       #my $stop  = col_v 'Stop (s)';
  
         my $start = $v[10];
         my $stop  = $v[11];
-       warn "# start: $start - stop: $stop\n";
+       warn "# start: $start - stop: $stop\n" if $debug;
  
         my @sum;
         my $count = 0;
  
         while(<$fh2>) {
                 chomp; s/\r+$//;
-               warn "## fh2 [ $_ ]\n";
+               warn "## fh2 [ $_ ]\n" if $debug;
                 #my @v2 = map { s/^\s+//; $_ } split(/\t/,$_);
-               my @v2 = map { s/^\s+//; $_ } split(/,\s+/,$_);
+               my @v2 = map { s/^\s+//; $_ } split(/[;,]\s*/,$_);
                 if ( ! @h2 ) {
                         @h2 = @v2;
                         $h2nr = col2nr @h2;
                         $tell2 = tell($fh2);
-                       warn "# h2nr ",dump( $h2nr );
+                       warn "# h2nr ",dump( $h2nr ) if $debug;
+                       if ( ! @out_header ) {
+                               push @out_header, @cols;
+                               push @out_header, map { $h2[$_] }
+                                       ( $h2nr->{'AU01_r'} .. $h2nr->{'AU45_r'} );
+                               push @out_header, map { $h2[$_] }
+                                       ( $h2nr->{'AU01_c'} .. $h2nr->{'AU45_c'} );
+                               warn "# out_header ",dump( \@out_header );
+                               out_csv @out_header;
+                       }
                         next;
                 }
  
@@ -98,9 +182,16 @@ warn "---0000 v=",dump( \@v ), " FROM >>>$_<<<\n";
                 #my $timestamp = col_v2 'timestamp';
                 my $timestamp = $v2[2];
  
-               $timestamp /= 1000 if $timestamp =~ m/^\d+$/;   # FIXME fix corrputed input data
+               #$timestamp /= 1000 if $timestamp =~ m/^\d+$/;  # FIXME fix corrputed input data
                 #warn "XXX filter $start - $stop from ",dump( $timestamp ); #$h2nr, \@v2 );
  
+               # Argument "1.324.400" isn't numeric
+               $timestamp =~ s/^(\d+)\.(\d\d\d)\.(\d\d\d)$/$1$2.$3/;
+
+               if ( $timestamp !~ m/^\d+(\.\d+)$/ ) {
+                       warn "timestamp [$timestamp] from $file2 not numeric [$_]";
+               }
+
                 if ( $timestamp < $start ) {
                         #warn "## $start > @v2";
                 } elsif ( $timestamp < $stop ) {
@@ -112,7 +203,7 @@ warn "---0000 v=",dump( \@v ), " FROM >>>$_<<<\n";
                         $count++;
                         #warn "XXX $count sum $h2nr->{'AU01_r'} .. $h2nr->{'AU45_c'} = ",dump( \@sum );
                 } else {
-                       warn "# seek $tell2";
+                       warn "# seek $tell2" if $debug;
                         seek $fh2, $tell2, 0;
  
                         die "count is 0" if $count == 0;
@@ -121,13 +212,13 @@ warn "---0000 v=",dump( \@v ), " FROM >>>$_<<<\n";
                         my @add_cols;
                         push @add_cols, map { $sum[$_] / $count } ( $h2nr->{'AU01_r'} .. $h2nr->{'AU45_r'} );
                         push @add_cols, map { $sum[$_] }          ( $h2nr->{'AU01_c'} .. $h2nr->{'AU45_c'} );
-                       warn "# add_cols = ",dump( \@add_cols );
-                       print join("\t", @v, @add_cols ), "\n";
+                       warn "# add_cols = ",dump( \@add_cols ) if $debug;
+                       out_csv @v, @add_cols;
  
                         @sum = ();
                         $count = 0;
  
-                       warn "# last";
+                       warn "# last" if $debug;
                         last;
                 }
         }