use File::Slurp;
use Data::Dump qw(dump);
-__PACKAGE__->attr('dir');
+__PACKAGE__->attr('full_path'); # data() reads $self->full_path as the directory it globs for *.html files
+
+# Returns the two-element list ( '\.html$', 'directory' ); in scalar context
+# the caller receives only the last element.
+# NOTE(review): presumably a file-name pattern paired with the kind of input
+# this module handles -- confirm against the code that calls ext().
+sub ext {
+    return ( '\.html$', 'directory' );
+}
+
+# Normalizes table header cells: strips leading and trailing whitespace and
+# collapses every run of two or more whitespace characters into one space.
+# Takes the cells as a list and returns the cleaned values in the same order.
+#
+# Each cell is copied before editing. Inside map, $_ aliases the elements of
+# @_, which in turn alias the caller's values, so substituting on $_ directly
+# would rewrite the caller's row in place and die with "Modification of a
+# read-only value attempted" when the sub is handed literal strings.
+sub __normalize_header {
+    return map {
+        my $cell = $_;
+        if ( defined $cell ) {    # undefined cells pass through unchanged
+            $cell =~ s/^\s+//s;
+            $cell =~ s/\s+$//s;
+            $cell =~ s/\s\s+/ /gs;    # a single tab or newline is left as is
+        }
+        $cell;
+    } @_;
+}
sub data { # Scans the *.html files under full_path and returns a hash ref with header, file_stats and items; part of the body is not visible in this excerpt
my $self = shift;
my $stats; # per-file count of rows stored as items, keyed by file name
my @header; # header cells taken from the first row seen
- foreach my $file ( glob $self->dir . '/*.html' ) {
+ foreach my $file ( glob $self->full_path . '/*.html' ) { # one pass per HTML file directly under full_path
warn "# file $file\n";
my $te = HTML::TableExtract->new(
keep_headers => 1,
warn "# row ", dump( $row ),"\n";
if ( ! $stats->{$file} ) { # no row of this file has been stored yet, so this row may be a header
if ( ! @header ) {
- @header = @$row;
+ @header = __normalize_header( @$row ); # first header seen: keep its whitespace-normalized cells
warn "# new header ",dump(@header);
+ $row = undef; # a header row is not data; undef keeps it out of the items below
} else {
my $o = join('|', @header);
- my $n = join('|', @$row);
+ my $n = join('|', __normalize_header(@$row)); # normalize before comparing with the stored header
if ( $o eq $n ) {
warn "# same header again in $file skipping\n";
+ $row = undef; # repeated header row, drop it
} else {
warn "# header $n changed from $o in $file"; # NOTE(review): the row stays defined and is stored as an item below; the counter is only bumped for stored rows, so the first data row after a header appears to land here too -- confirm this is intended
- push @$items, $row;
- $stats->{$file}++;
}
}
- } else {
- push @$items, $row;
+ };
+
+ if ( $row ) { # the row survived the header checks
+ my $item;
+ foreach my $i ( 0 .. $#$row ) {
+ $item->{ $header[$i] } = [ $row->[$i] ]; # each cell is wrapped in a one-element array ref under its header name
+ }
$stats->{$file}++; # count the stored row for its file
+ warn "## item ",$stats->{$file}, ' ', dump($item);
+ push @$items, $item;
+
}
}
}
}
- return {
+ my $data = { # result: copy of the header cells, per-file row counts, and the item hashes
header => [ @header ],
+ file_stats => $stats,
items => $items,
- stats => $stats,
- }
+ };
+
+ warn "# data ",dump( $data );
+ return $data;
}
1