22aea57b3a56dfb7bcb3bd96f4ead29d2d73f221
[MojoFacets.git] / lib / MojoFacets / Import / HTMLTable.pm
1 package MojoFacets::Import::HTMLTable;
2
3 use warnings;
4 use strict;
5
6 use base 'Mojo::Base';
7
8 use HTML::TableExtract;
9 use File::Slurp;
10 use Data::Dump qw(dump);
11
12 __PACKAGE__->attr('dir');
13
14 sub data {
15         my $self = shift;
16
17         my $items;
18         my $stats;
19         my @header;
20
21         foreach my $file ( glob $self->dir . '/*.html' ) {
22                 warn "# file $file\n";
23                 my $te = HTML::TableExtract->new(
24                         keep_headers => 1,
25                 );
26
27                 $te->parse( scalar read_file $file );
28
29                 foreach my $ts ($te->tables) {
30                         warn "# table coords ", join(',', $ts->coords), "\n";
31                         warn "# hrow ", dump( $ts->hrow() ), "\n";
32                         my @column_map = $ts->column_map;
33                         warn "# column_map ", dump( @column_map );
34                         next unless $#column_map == 8;
35                         foreach my $row ($ts->rows) {
36                                 warn "# row ", dump( $row ),"\n";
37                                 if ( ! $stats->{$file} ) {
38                                         if ( ! @header ) {
39                                                 @header = @$row;
40                                                 warn "# new header ",dump(@header);
41                                                 $row = undef;
42                                         } else {
43                                                 my $o = join('|', @header);
44                                                 my $n = join('|', @$row);
45                                                 if ( $o eq $n ) {
46                                                         warn "# same header again in $file skipping\n";
47                                                         $row = undef;
48                                                 } else {
49                                                         warn "# header $n changed from $o in $file";
50                                                 }
51                                         }
52                                 };
53
54                                 if ( $row ) {
55                                         my $item;
56                                         foreach my $i ( 0 .. $#$row ) {
57                                                 $item->{ $header[$i] } = [ $row->[$i] ];
58                                         }
59                                         warn "## item ",dump($item);
60                                         push @$items, $item;
61
62                                         $stats->{$file}++;
63                                 }
64                         }
65                 }
66
67         }
68
69         my $data = {
70                 header => [ @header ],
71                 file_stats => $stats,
72                 items => $items,
73         };
74
75         warn "# data ",dump( $data );
76         return $data;
77 }
78
79 1