dee976514e8f9c8d596b0207d7b7f7597b3c4487
[MojoFacets.git] / lib / MojoFacets / Data.pm
1 package MojoFacets::Data;
2
3 use strict;
4 use warnings;
5
6 use base 'Mojolicious::Controller';
7
8 use Data::Dump qw(dump);
9 use File::Slurp;
10 use JSON;
11 use Encode;
12 use locale;
13 use File::Find;
14
15 our $loaded;
16 our $filters;
17
18 sub index {
19         my $self = shift;
20
21         my $path = $self->app->home->rel_dir('data');
22         die "no data dir $path" unless -d $path;
23
24         my @files;
25         find( sub {
26                 my $file = $File::Find::name;
27                 if ( -f $file && $file =~ m/\.(js(on)?|txt)$/ ) {
28                         $file =~ s/$path\/*//;
29                         push @files, $file;
30                 } else {
31                         warn "IGNORE: $file\n";
32                 }
33         }, $path);
34
35         @files = sort { lc $a cmp lc $b } @files;
36         my $size;
37         $size->{$_} = -s "$path/$_" foreach @files;
38
39         $self->render(
40                 files => [ @files ],
41                 size => $size,
42                 loaded => $loaded,
43                 filters => $filters,
44         );
45 }
46
47 sub _load_path {
48         my ( $self, $path ) = @_;
49
50         my $full_path = $self->app->home->rel_file( 'data/' . $path );
51         die "$full_path $!" unless -r $full_path;
52
53         if ( defined $loaded->{$path}->{data} ) {
54                 my $mtime = (stat($full_path))[9];
55                 return if $loaded->{$path}->{mtime} == $mtime;
56                 warn "reload $full_path, modified ", time() - $mtime, " seconds ago\n";
57         }
58
59         # we could use Mojo::JSON here, but it's too slow
60 #       $data = from_json read_file $path;
61         my $data = read_file $full_path;
62         warn "# data snippet: ", substr($data,0,200);
63         my @header;
64         if ( $path =~ m/\.js/ ) {
65                 Encode::_utf8_on($data);
66                 $data = from_json $data;
67         } elsif ( $path =~ m/\.txt/ ) {
68                 my @lines = split(/\r?\n/, $data);
69                 $data = { items => [] };
70
71                 my $header_line = shift @lines;
72                 my $multiline = $header_line =~ s/\^//g;
73                 @header = split(/\|/, $header_line );
74                 warn "# header ", dump( @header );
75                 while ( my $line = shift @lines ) {
76                         $line =~ s/\^//g;
77                         chomp $line;
78                         my @v = split(/\|/, $line);
79                         while ( @lines && $#v < $#header ) {
80                                 $line = $lines[0];
81                                 $line =~ s/\^//g;
82                                 chomp $line;
83                                 my @more_v = split(/\|/, $line);
84                                 if ( $#v + $#more_v > $#header ) {
85                                         warn "short line: ",dump( @v );
86                                         last;
87                                 }
88                                 shift @lines;
89                                 $v[ $#v ] .= shift @more_v if @more_v;
90                                 push @v, @more_v if @more_v;
91
92                                 if ( $#v > $#header ) {
93                                         die "# splice $#header ", dump( @v );
94                                         @v = splice @v, 0, $#header;
95                                 }
96                         }
97                         my $item;
98                         $item->{ $header[$_] || "f_$_" } = [ $v[$_] ] foreach ( 0 .. $#v );
99                         push @{ $data->{items} }, $item;
100                 }
101         } else {
102                 warn "file format unknown $path";
103         }
104
105         my $stats;
106
107         foreach my $e ( @{ $data->{items} } ) {
108                 foreach my $n ( keys %$e ) {
109                         $stats->{$n}->{count}++;
110                         my @v;
111                         if ( ref $e->{$n} eq 'ARRAY' ) {
112                                 $stats->{$n}->{array} += $#{ $e->{$n} } + 1;
113                                 @v = @{ $e->{$n} };
114                         } else {
115                                 @v = ( $e->{$n} );
116                         }
117
118                         foreach my $x ( @v ) {
119                                 $stats->{$n}->{numeric}++
120                                         if $x =~ m/^[-+]?([0-9]*\.[0-9]+|[0-9]+)$/;
121                                 $stats->{$n}->{empty}++
122                                         if length $x == 0; # faster than $x =~ m/^\s*$/;
123                         }
124
125                 }
126         }
127
128         foreach my $n ( keys %$stats ) {
129                 next unless defined $stats->{$n}->{array};
130                 delete $stats->{$n}->{array}
131                         if $stats->{$n}->{array} == $stats->{$n}->{count};
132         }
133
134         if ( ! @header ) {
135                 if ( defined $data->{header} ) {
136                         if ( ref $data->{header} eq 'ARRAY' ) {
137                                 @header = @{ $data->{header} };
138                         } else {
139                                 warn "header not array ", dump( $data->{header} );
140                         }
141                 }
142         }
143
144         @header =
145                 sort { $stats->{$b}->{count} <=> $stats->{$a}->{count} }
146                 grep { defined $stats->{$_}->{count} } keys %$stats
147                 unless @header;
148
149         warn dump($stats);
150
151         $loaded->{ $path } = {
152                 header => [ @header ],
153                 stats  => $stats,
154                 full_path => $full_path,
155                 size => -s $full_path,
156                 mtime => (stat($full_path))[9],
157                 data => $data,
158         };
159
160 }
161
162
163 sub load {
164         my $self = shift;
165
166         my @paths = $self->param('paths');
167         warn "# paths ", dump @paths;
168         $self->_load_path( $_ ) foreach @paths;
169
170         my $path = $self->param('path') || $self->redirect_to( '/data/index' );
171         warn "# path $path\n";
172         $self->session('path' => $path);
173         $self->_load_path( $path );
174
175         $self->session( 'header' => $loaded->{$path}->{header} );
176         if ( ! defined $loaded->{$path}->{columns} ) {
177                 $self->session( 'columns' => $loaded->{$path}->{header} );
178                 $self->session( 'order'   => $loaded->{$path}->{header}->[0] );
179                 $self->redirect_to( '/data/columns' );
180         } else {
181                 $self->session( 'columns' => $loaded->{$path}->{columns} );
182                 $self->session( 'order'   => $loaded->{$path}->{columns}->[0] );
183                 $self->redirect_to( '/data/items' );
184         }
185 }
186
187
188 sub _loaded {
189         my ( $self, $name ) = @_;
190         my $path = $self->session('path');
191         die "$path $name doesn't exist in loaded ",dump( $loaded )
192                 unless defined $loaded->{$path}->{$name};
193         return $loaded->{$path}->{$name};
194 }
195
196
197 sub _checked {
198         my $self = shift;
199         my $checked;
200         $checked->{$_}++ foreach @_;
201 #       warn "# _checked ",dump($checked);
202         return $checked;
203 }
204
205
206 sub columns {
207     my $self = shift;
208
209         if ( $self->param('columns') ) {
210                 $self->_perm_array('columns');
211                 $self->redirect_to('/data/items');
212         }
213
214         my $stats = $self->_loaded( 'stats' ); # || $self->redirect_to( '/data/index' );
215
216         my @columns;
217         @columns = grep { defined $stats->{$_}->{count} } @{ $self->session('columns') } if $self->session('columns');
218
219         foreach my $c ( @{ $self->session( 'header' ) } ) {
220                 push @columns, $c unless grep { /^\Q$c\E$/ } @columns;
221         }
222
223     $self->render(
224                 message => 'Select columns to display',
225                 stats => $stats,
226                 columns => \@columns,
227                 checked => $self->_checked( $self->_perm_array('columns') ),
228         );
229 }
230
231 sub _perm_array {
232     my ($self,$name) = @_;
233
234         my @array = $self->param($name);
235         my $path  = $self->session('path');
236
237         if ( @array ) {
238                 $self->session($name => [ @array ]);
239                 $loaded->{$path}->{$name} = [ @array ];
240         } elsif ( my $session = $self->session($name) ) {
241                 if ( ref $session eq 'ARRAY' ) {
242                         @array = @$session;
243                 } else {
244                         die "$name not array ",dump($session);
245                 }
246         }
247         #warn "# $name ",dump @array;
248         return @array;
249 }
250
251 sub _perm_scalar {
252     my ($self,$name,$default) = @_;
253
254         my $scalar = $self->param($name);
255
256         if ( defined $scalar ) {
257                 $self->session($name => $scalar);
258         } else {
259                 $scalar = $self->session($name);
260         }
261
262         if ( ! defined $scalar ) {
263                 $scalar = $default;
264                 die "no default for $name" unless defined $scalar;
265                 $self->session($name => $scalar);
266         }
267
268         warn "# _perm_scalar $name ",dump $scalar;
269         return $scalar;
270 }
271
272 sub filter {
273         my $self = shift;
274
275         my $name = $self->param('filter_name') || die "name?";
276         my @vals = $self->param('filter_vals');
277
278         $self->_remove_filter( $name );
279         $self->_filter_on_data( $name, @vals ) if @vals;
280
281         $self->session( 'offset' => 0 );
282
283         $self->redirect_to('/data/items');
284 }
285
286 sub _filter_on_data {
287         my ( $self, $name, @vals ) = @_;
288
289         my $path = $self->session('path');
290
291         if ( ! defined $loaded->{$path}->{stats}->{ $name } ) {
292                 warn "filter $name not found in data set";
293                 return;
294         }
295
296         $filters->{$name} = [ @vals ];
297         warn "_filter_on_data $name ", $#vals + 1, " values on $path\n";
298
299         my $filter_hash;
300         $filter_hash->{$_}++ foreach @vals;
301
302         #warn "# filter_hash ",dump( $filter_hash );
303
304         my $items = $self->_loaded('data')->{items};
305
306         my $include_missing = defined $filter_hash->{_missing};
307         my $filtered_items;
308
309         foreach my $i ( 0 .. $#$items ) {
310
311                 if ( defined $items->[$i]->{$name} ) {
312                         foreach my $v ( @{ $items->[$i]->{$name} } ) {
313                                 if ( defined $filter_hash->{ $v } ) {
314                                         $filtered_items->{$i}++;
315                                 }
316                         }
317                 } elsif ( $include_missing ) {
318                         $filtered_items->{$i}++;
319                 }
320         }
321
322         #warn "# filter $name ",dump($filtered_items);
323
324         $loaded->{$path}->{filters}->{$name} = $filtered_items;
325         warn "filter $name with ", scalar keys %$filtered_items, " items created\n";
326 }
327
328 sub _data_items {
329         my ( $self, $all ) = @_;
330         my $data = $self->_loaded( 'data' );
331
332         return @{ $data->{items} } if $all == 1;
333
334         my $filters = $self->_current_filters;
335         my $filter_value;
336         foreach my $f ( keys %$filters ) {
337                 foreach my $n ( @{ $filters->{$f} } ) {
338                         $filter_value->{$f}->{$n} = 1;
339                 }
340         }
341         my @items = @{ $data->{items} };
342         @items = grep {
343                 my $i = $_;
344                 my $pass = 1;
345                 foreach my $n ( keys %$filter_value ) {
346                         if ( ! exists $i->{$n} ) {
347                                 if ( defined $filter_value->{$n}->{_missing} ) {
348                                         $pass = 1;
349                                         next;
350                                 } else {
351                                         $pass = 0;
352                                         last;
353                                 }
354                         }
355                         # and match any of values in element
356                         my $have_values = 0;
357                         foreach my $v ( @{ $i->{$n} } ) { # FIXME not array?
358                                 $have_values ||= 1 if defined $filter_value->{$n}->{$v};
359                         }
360                         if ( ! $have_values ) {
361                                 $pass = 0;
362                                 last;
363                         }
364                 }
365                 $pass;
366         } @items if $filter_value;
367         return @items;
368 }
369
370
371 sub _current_filters {
372         my $self = shift;
373         my $current_filters;
374         $current_filters->{ $_ } = $filters->{ $_ }
375                 foreach (
376                         grep { defined $filters->{ $_ } }
377                         @{ $self->_loaded('header') }
378                 );
379         #warn "# current_filters ",dump($current_filters);
380         return $current_filters;
381 }
382
383 sub _data_sorted_by {
384         my ( $self, $order ) = @_;
385
386         my $path = $self->session('path');
387
388         warn "_data_sorted_by $order from $path";
389
390         if ( defined $loaded->{$path}->{sorted}->{$order} ) {
391                 return $loaded->{$path}->{sorted}->{$order};
392         }
393
394         my $data = $self->_loaded( 'data' );
395         my $numeric = $self->_is_numeric($order);
396         my $missing = $numeric ? 0 : '';
397         no warnings qw(numeric);
398         my $nr = 0;
399         my @sorted = map {
400                 $_->[0]
401         } sort {
402                 if ( $numeric ) {
403                         $a->[1] <=> $b->[1]
404                 } else {
405                         $a->[1] cmp $b->[1]
406                 }
407         } map {
408                 [ $nr++, exists $_->{$order} ? join('', @{$_->{$order}}) : $missing ]
409         } @{ $data->{items} }
410         ;
411
412         warn "sorted: $order numeric: $numeric items: ", $#sorted + 1, "\n";
413         #warn "# sorted ",dump( @sorted );
414
415         $loaded->{$path}->{sorted}->{$order} = [ @sorted ];
416 }
417
418
419 sub items {
420         my $self = shift;
421
422         my $path = $self->session('path');
423         $self->redirect_to('/data/index') unless defined $loaded->{ $path };
424
425         my @columns = $self->_perm_array('columns');
426         $self->redirect_to('/data/columns') unless @columns;
427         my $order   = $self->_perm_scalar('order', $columns[0]);
428         my $sort    = $self->_perm_scalar('sort', 'a');
429         my $offset  = $self->_perm_scalar('offset', 0);
430         my $limit   = $self->_perm_scalar('limit', 20);
431         $self->_perm_scalar('show', 'table');
432
433         # fix offset when changing limit
434         $offset = int( $offset / $limit ) * $limit;
435
436         my $sorted = $self->_data_sorted_by( $order );
437
438         my @filter_names;
439         if ( $filters ) {
440                 foreach my $name ( keys %$filters ) {
441                         if ( ! defined $loaded->{$path}->{stats}->{ $name } ) {
442                                 warn "skip filter $name not found in $path\n";
443                                 next;
444                         }
445                         push @filter_names, $name;
446                 }
447                 warn "filter_names ",dump( @filter_names );
448                 foreach my $name ( @filter_names ) {
449                         next if ref $loaded->{$path}->{filters}->{$name} eq 'ARRAY';
450                         $self->_filter_on_data( $name, @{ $filters->{$name} } );
451                 }
452         }
453
454         my $all_filters = join(' ',sort @filter_names,'order:',$order);
455
456 #       warn "# all_filters $all_filters ", dump( $loaded->{$path}->{filtered}->{$all_filters} );
457
458         if ( ! defined $loaded->{$path}->{filtered}->{$all_filters} ) {
459
460                 my $path_filters = $loaded->{$path}->{filters};
461
462                 warn "create combined filter for $all_filters\n";
463
464                 my @filtered;
465                 foreach my $i ( 0 .. $#$sorted ) {
466                         my $pos = $sorted->[$i];
467
468                         if ( $#filter_names == -1 ) {
469                                 push @filtered, $pos;
470                                 next;
471                         }
472
473                         my $skip = 0;
474                         foreach ( @filter_names ) {
475                                 $skip ||= 1 if ! defined $path_filters->{$_}->{$pos};
476                         }
477                         next if $skip;
478
479                         push @filtered, $pos;
480                 }
481
482                 $loaded->{$path}->{filtered}->{$all_filters} = [ @filtered ];
483         }
484
485         my $filtered = $loaded->{$path}->{filtered}->{$all_filters}
486                 if defined $loaded->{$path}->{filtered}->{$all_filters};
487
488         warn "all_filters $all_filters produced ", $#$filtered + 1, " items\n" if $filtered;
489
490         my $sorted_items;
491         my $data = $self->_loaded('data');
492         my $from_end = $sort eq 'd' ? $#$filtered : 0;
493         foreach ( 0 .. $limit ) {
494                 my $i = $_ + $offset;
495                 last unless defined $filtered->[$i];
496                 $i = $from_end - $i if $from_end;
497                 push @$sorted_items, $data->{items}->[ $filtered->[$i] ];
498         }
499
500         warn "# sorted_items ", $#$sorted_items + 1, " offset $offset limit $limit order $sort";
501
502         $self->render(
503                 order => $order,
504                 offset => $offset,
505                 limit => $limit,
506                 sorted => $sorted_items,
507                 columns => [ @columns ],
508                 rows => $#$filtered + 1,
509                 numeric => { map { $_, $self->_is_numeric($_) } @columns },
510                 filters => $self->_current_filters,
511         );
512
513 }
514
515
516 sub order {
517         my $self = shift;
518         $self->session('order', $self->param('order'));
519         $self->session('sort', $self->param('sort'));
520         $self->redirect_to('/data/items');
521 }
522
523 sub _is_numeric {
524         my ( $self, $name ) = @_;
525
526         my $stats = $self->_loaded( 'stats' );
527
528         # sort facet numerically if more >50% elements are numeric
529         my $count = $stats->{$name}->{count};
530         $count   -= $stats->{$name}->{empty} if defined $stats->{$name}->{empty};
531         defined $stats->{$name}->{numeric} &&
532                 $stats->{$name}->{numeric} > $count / 2;
533 }
534
535 sub _remove_filter {
536         my ($self,$name) = @_;
537         warn "_remove_filter $name\n";
538
539         my $path = $self->session('path');
540
541         delete $filters->{$name};
542         delete $loaded->{$path}->{filters}->{$name};
543         warn "filters left: ", keys %{ $loaded->{$path}->{filters} };
544
545         foreach (
546                         grep { /\b$name\b/ }
547                         keys %{ $loaded->{$path}->{filtered} }
548         ) {
549                 delete $loaded->{$path}->{filtered}->{$_};
550                 warn "remove filtered cache $_";
551         }
552 }
553
554 sub facet {
555         my $self = shift;
556
557         my $path = $self->session('path') || $self->redirect_to( '/data/index' );
558
559         if ( my $name = $self->param('remove') ) {
560                 $self->_remove_filter( $name );
561                 $self->redirect_to( '/data/items' );
562         }
563
564         my $facet;
565         my $name = $self->param('name') || die "no name";
566
567         my $all = $self->_perm_scalar('all', 1);
568         my $data = $self->_loaded('data');
569
570         my $filters = $self->_current_filters;
571         my $all_filters = join(' ',sort keys %$filters,'order:',$self->session('order'));
572         my $filtered = $loaded->{$path}->{filtered}->{$all_filters}
573                 if defined $loaded->{$path}->{filtered}->{$all_filters};
574
575         if ( ! $filtered || $all ) {
576                 $filtered = [ 0 .. $#{ $data->{items} } ];
577                 warn "filter all values\n";
578         } else {
579                 warn "filter using $all_filters\n";
580         }
581
582         foreach my $i ( @$filtered ) {
583                 my $item = $data->{items}->[$i];
584                 if ( ! exists $item->{$name} ) {
585                         $facet->{ _missing }++;
586                 } elsif ( ref $item->{$name} eq 'ARRAY' ) {
587                         $facet->{$_}++ foreach @{ $item->{$name} };
588                 } else {
589                         $facet->{ $item->{$name} }++;
590                 }
591         }
592
593 #       warn "# facet $name ",dump $facet;
594
595         my $checked;
596         my @facet_names =
597                   $all                      ? keys %$facet
598                 : defined $filters->{$name} ? @{ $filters->{$name} }
599                 : keys %$facet;
600
601         $checked = $self->_checked( @{ $filters->{$name} } ) if defined $filters->{$name};
602
603         my $numeric = $self->_is_numeric($name);
604
605         my $sort = $self->param('sort');
606         # sort numeric facets with more than 5 values ascending
607         $sort ||= $numeric && $#facet_names > 4 ? 'a' : 'c';
608
609         @facet_names = sort {
610                 my $result;
611                 if ( $sort eq 'a' ) {
612                         $result = $numeric ? $a <=> $b : lc $a cmp lc $b;
613                 } elsif ( $sort eq 'd' ) {
614                         $result = $numeric ? $b <=> $a : lc $b cmp lc $a;
615                 } elsif ( $sort eq 'c' ) {
616                         $result = ( $facet->{$b} || -1 ) <=> ( $facet->{$a} || -1 )
617                 } else {
618                         warn "unknown sort: $sort";
619                         $result = $a cmp $b;
620                 }
621                 $result = $a cmp $b unless defined $result; # FIXME cludge for numeric facets with invalid data
622                 $result;
623         } @facet_names;
624
625         $self->render( name => $name, facet => $facet, checked => $checked,
626                 facet_names => \@facet_names, sort => $sort, numeric => $numeric,
627         );
628 }
629
630 sub edit {
631         my $self = shift;
632         my $content = $self->param('content');
633
634         $self->render(
635                 content => $content
636         );
637 }
638
639 1;