a575635f744d651c7110e209c3dd3c5e6abaffc1
[MojoFacets.git] / lib / MojoFacets / Import / HTMLTable.pm
1 package MojoFacets::Import::HTMLTable;
2
3 use warnings;
4 use strict;
5
6 use base 'Mojo::Base';
7
8 use HTML::TableExtract;
9 use File::Slurp;
10 use Data::Dump qw(dump);
11
# Declares the `dir` attribute; data() reads it as the directory whose
# '*.html' files are parsed.
12 __PACKAGE__->attr('dir');
13
# Parse every '*.html' file directly inside $self->dir and collect the rows
# of each table that has exactly nine columns.
#
# Takes no arguments besides the invocant.
#
# Returns a hash reference with three keys:
#   header - array ref with the cells of the first accepted row seen
#            (empty when nothing was parsed)
#   items  - array ref of row array refs, in the order they were encountered
#            (always an array ref, possibly empty)
#   stats  - hash ref mapping each file path to the number of rows that file
#            added to items (always a hash ref, possibly empty)
#
# Progress and debugging information is written with warn().
sub data {
        my $self = shift;

        # Start from empty containers so callers can always dereference
        # items and stats, even when no file or table matches.
        my $items = [];
        my $stats = {};
        my @header;

        # bsd_glob takes the pattern as a single string. The builtin glob
        # splits its argument on whitespace, which breaks directories that
        # have a space in their name.
        require File::Glob;

        foreach my $file ( File::Glob::bsd_glob( $self->dir . '/*.html' ) ) {
                warn "# file $file\n";
                my $te = HTML::TableExtract->new(
                        keep_headers => 1,
                );

                # Called fully qualified so the parse is unambiguous.
                $te->parse( scalar File::Slurp::read_file($file) );

                foreach my $ts ( $te->tables ) {
                        # Data::Dump::dump is spelled out in full so it cannot
                        # be confused with Perl's builtin dump.
                        warn "# table coords ", join(',', $ts->coords), "\n";
                        warn "# hrow ", Data::Dump::dump( $ts->hrow() ), "\n";
                        my @column_map = $ts->column_map;
                        warn "# column_map ", Data::Dump::dump( @column_map );

                        # Only tables with exactly nine columns (last index 8)
                        # are imported; everything else is ignored.
                        next unless $#column_map == 8;

                        ROW:
                        foreach my $row ( $ts->rows ) {
                                warn "# row ", Data::Dump::dump( $row ),"\n";

                                # Header detection only runs until the file has
                                # contributed its first data row. After that,
                                # every row of the file (including a header row
                                # repeated in a later table) is stored as an item.
                                if ( ! $stats->{$file} ) {
                                        if ( ! @header ) {
                                                # Very first accepted row becomes the header.
                                                @header = @$row;
                                                warn "# new header ",Data::Dump::dump(@header);
                                                next ROW;
                                        }

                                        my $o = join('|', @header);
                                        my $n = join('|', @$row);
                                        if ( $o eq $n ) {
                                                warn "# same header again in $file skipping\n";
                                                next ROW;
                                        }

                                        # A leading row that differs from the known
                                        # header is reported and kept as data.
                                        warn "# header $n changed from $o in $file";
                                }

                                push @$items, $row;
                                $stats->{$file}++;
                        }
                }

        }

        return {
                header => [ @header ],
                items => $items,
                stats => $stats,
        };
}
67
68 1