rewrite CSV parser to support "quoted" fields
[MojoFacets.git] / lib / MojoFacets / Import / HTMLTable.pm
1 package MojoFacets::Import::HTMLTable;
2
3 use warnings;
4 use strict;
5
6 use base 'Mojo::Base';
7
8 use HTML::TableExtract;
9 use File::Slurp;
10 use Data::Dump qw(dump);
11
# Accessor 'dir': data() reads it to build the '<dir>/*.html' pattern of files to import.
12 __PACKAGE__->attr('dir');
13
# Normalize a list of header cell strings.
#
# For every defined cell: strip leading whitespace, strip trailing
# whitespace, and collapse each run of two or more whitespace characters
# into a single space (a lone tab or newline inside the text is kept).
# Undefined cells are passed through as undef.
#
# Works on a copy of the arguments, so the caller's values are never
# modified and read-only values (e.g. string literals) are accepted.
#
# Returns the normalized list (its element count in scalar context).
sub __normalize_header {
        # Copy first: elements of @_ alias the caller's variables.
        my @cells = @_;

        foreach my $cell (@cells) {
                # Empty table cells may arrive as undef; leave them alone
                # instead of triggering "uninitialized value" warnings.
                next unless defined $cell;

                $cell =~ s/^\s+//s;
                $cell =~ s/\s+$//s;
                $cell =~ s/\s\s+/ /gs;
        }

        return @cells;
}
22
# Import table rows from every *.html file found directly under $self->dir.
#
# Each file is parsed with HTML::TableExtract (keep_headers => 1). Only
# tables whose column map has exactly 9 entries are used; all others are
# skipped. Until a file has contributed its first item, each row examined
# is treated as a header candidate:
#   - if no header is known yet, the row becomes the header and is skipped;
#   - if it equals the known header (after normalization), it is skipped;
#   - otherwise a warning is emitted and the row is imported as an item
#     under the existing header.
# Every other row becomes an item.
#
# Returns a hash ref with the keys:
#   header     - array ref of normalized header labels
#   file_stats - hash ref: file path => number of items taken from that file
#   items      - array ref of items; each item is a hash ref mapping a
#                header label to a one-element array ref with the cell value
#                (undef when no row was imported)
#
# Progress and the resulting structure are reported through warn().
sub data {
        my $self = shift;

        my $items;
        my $stats;
        my @header;

        # The builtin glob splits its pattern on whitespace, which breaks
        # directories with spaces in their name; bsd_glob (core File::Glob)
        # treats the whole string as a single pattern.
        require File::Glob;

        foreach my $file ( File::Glob::bsd_glob( $self->dir . '/*.html' ) ) {
                warn "# file $file\n";
                my $te = HTML::TableExtract->new(
                        keep_headers => 1,
                );

                $te->parse( scalar read_file $file );

                foreach my $ts ($te->tables) {
                        warn "# table coords ", join(',', $ts->coords), "\n";
                        warn "# hrow ", dump( $ts->hrow() ), "\n";
                        my @column_map = $ts->column_map;
                        warn "# column_map ", dump( @column_map );

                        # Only tables with exactly 9 columns are imported.
                        next unless $#column_map == 8;

                        foreach my $row ($ts->rows) {
                                warn "# row ", dump( $row ),"\n";

                                # No item counted for this file yet, so this
                                # row may be a header.
                                if ( ! $stats->{$file} ) {
                                        if ( ! @header ) {
                                                @header = __normalize_header( @$row );
                                                warn "# new header ",dump(@header);
                                                next;
                                        }

                                        my $o = join('|', @header);
                                        my $n = join('|', __normalize_header(@$row));
                                        if ( $o eq $n ) {
                                                warn "# same header again in $file skipping\n";
                                                next;
                                        }

                                        # Different first row: keep the known
                                        # header and import this row as data.
                                        warn "# header $n changed from $o in $file";
                                }

                                next unless $row;

                                # Each cell is stored as a one-element array
                                # ref, keyed by the header label at the same
                                # column position.
                                my $item;
                                foreach my $i ( 0 .. $#$row ) {
                                        $item->{ $header[$i] } = [ $row->[$i] ];
                                }
                                $stats->{$file}++;
                                warn "## item ",$stats->{$file}, ' ', dump($item);
                                push @$items, $item;
                        }
                }
        }

        my $data = {
                header => [ @header ],
                file_stats => $stats,
                items => $items,
        };

        warn "# data ",dump( $data );
        return $data;
}
87
88 1