correctly load multiple params for paths
[MojoFacets.git] / lib / MojoFacets / Import / HTMLTable.pm
1 package MojoFacets::Import::HTMLTable;
2
3 use warnings;
4 use strict;
5
6 use base 'Mojo::Base';
7
8 use HTML::TableExtract;
9 use File::Slurp;
10 use Data::Dump qw(dump);
11
12 __PACKAGE__->attr('full_path');
13
# Returns the two-element list ( '\.html$', 'directory' ).
# NOTE(review): presumably the first element is a filename pattern and the
# second names the kind of path this importer accepts -- confirm against
# the code that calls ext().
sub ext {
        return ( '\.html$', 'directory' );
}
15
# Returns a whitespace-normalized copy of each argument: leading and
# trailing whitespace is stripped and every run of two or more whitespace
# characters is collapsed to a single space.
#
# The arguments themselves are never modified. Inside map, $_ is an alias
# to the caller's value, so substituting on it directly would rewrite the
# caller's data and die on read-only literals; each value is copied first.
# Undefined values are passed through as undef without warnings.
sub __normalize_header {
        return map {
                my $label = $_;
                if ( defined $label ) {
                        $label =~ s/^\s+//s;
                        $label =~ s/\s+$//s;
                        $label =~ s/\s\s+/ /gs;
                }
                $label;
        } @_;
}
24
# Reads every *.html file directly inside $self->full_path, extracts its
# tables with HTML::TableExtract and turns the rows of each nine-column
# table into items.
#
# Returns a hash reference with three keys:
#   header     - array ref holding the normalized header labels taken from
#                the first row seen
#   file_stats - hash ref mapping file name to the number of items taken
#                from that file (undef when no item was produced)
#   items      - array ref of items, each a hash ref mapping a header label
#                to a one-element array ref holding the cell value
#                (undef when no item was produced)
#
# Progress and the resulting structure are reported on STDERR via warn.
sub data {
        my $self = shift;

        my $items;
        my $stats;
        my @header;

        # Core glob() splits its pattern on whitespace, so a full_path that
        # contains a space would be read as several separate patterns.
        # bsd_glob() treats the whole string as a single pattern.
        require File::Glob;

        foreach my $file ( File::Glob::bsd_glob( $self->full_path . '/*.html' ) ) {
                warn "# file $file\n";
                my $te = HTML::TableExtract->new(
                        keep_headers => 1,
                );

                $te->parse( scalar read_file $file );

                foreach my $ts ($te->tables) {
                        warn "# table coords ", join(',', $ts->coords), "\n";
                        warn "# hrow ", dump( $ts->hrow() ), "\n";
                        my @column_map = $ts->column_map;
                        warn "# column_map ", dump( @column_map );
                        # only tables with exactly nine columns are imported
                        next unless $#column_map == 8;
                        foreach my $row ($ts->rows) {
                                warn "# row ", dump( $row ),"\n";
                                # Until a file has produced its first item, the
                                # current row is checked as a possible header row.
                                if ( ! $stats->{$file} ) {
                                        if ( ! @header ) {
                                                # very first row seen: becomes the header
                                                @header = __normalize_header( @$row );
                                                warn "# new header ",dump(@header);
                                                $row = undef;
                                        } else {
                                                my $o = join('|', @header);
                                                my $n = join('|', __normalize_header(@$row));
                                                if ( $o eq $n ) {
                                                        warn "# same header again in $file skipping\n";
                                                        $row = undef;
                                                } else {
                                                        # row is kept and stored below under the
                                                        # existing header labels
                                                        warn "# header $n changed from $o in $file";
                                                }
                                        }
                                }

                                if ( $row ) {
                                        my $item;
                                        foreach my $i ( 0 .. $#$row ) {
                                                $item->{ $header[$i] } = [ $row->[$i] ];
                                        }
                                        $stats->{$file}++;
                                        warn "## item ",$stats->{$file}, ' ', dump($item);
                                        push @$items, $item;
                                }
                        }
                }

        }

        my $data = {
                header => [ @header ],
                file_stats => $stats,
                items => $items,
        };

        warn "# data ",dump( $data );
        return $data;
}
89
90 1