From: Dobrica Pavlinusic Date: Thu, 26 May 2011 12:40:29 +0000 (+0200) Subject: Merge branch 'master' of git.rot13.org:/git/MojoFacets X-Git-Url: http://git.rot13.org/?p=MojoFacets.git;a=commitdiff_plain;h=6ab2b22ee8e99850a3a8ea97f80465df2c2c8651;hp=3e1ecb5815eb15e076a82ccf9e64cd7e03011162 Merge branch 'master' of git.rot13.org:/git/MojoFacets --- diff --git a/Makefile.PL b/Makefile.PL index 29d5e24..a55b88a 100755 --- a/Makefile.PL +++ b/Makefile.PL @@ -13,6 +13,7 @@ requires 'HTML::TableExtract'; requires 'File::Path'; requires 'Text::Unaccent::PurePerl'; requires 'Statistics::Descriptive'; +requires 'Text::CSV'; features( 'profile' => [ diff --git a/bin/debian-install.sh b/bin/debian-install.sh index 019f9e6..d2f28e2 100755 --- a/bin/debian-install.sh +++ b/bin/debian-install.sh @@ -1,3 +1,3 @@ #!/bin/sh -x -sudo apt-get install libhtml-tableextract-perl libjson-perl libmodule-install-perl libstatistics-descriptive-perl +sudo apt-get install libhtml-tableextract-perl libjson-perl libmodule-install-perl libstatistics-descriptive-perl libmodule-install-perl diff --git a/lib/MojoFacets/Import/CSV.pm b/lib/MojoFacets/Import/CSV.pm index 6ab3395..260c002 100644 --- a/lib/MojoFacets/Import/CSV.pm +++ b/lib/MojoFacets/Import/CSV.pm @@ -5,90 +5,43 @@ use strict; use base 'Mojo::Base'; -use File::Slurp; +use Text::CSV; use Data::Dump qw(dump); -use Encode; __PACKAGE__->attr('full_path'); -my $null = ''; # FIXME undef? - -sub _split_line { - my ( $delimiter, $line ) = @_; - my @v; - while ( $line ) { - my $v; - if ( $line =~ s/^"// ) { - $line =~ s/""/_qq_/gc; - $line =~ s/^\s*([^"]*)\s*"\Q$delimiter\E?// || die "can't parse [$line] ",dump(@v); - $v = $1; - } elsif ( $line =~ s/^\s*([^\Q$delimiter\E]+)\s*\Q$delimiter\E?// ) { - $v = $1; - } elsif ( $line =~ s/^\s*\Q$delimiter\E// ) { - $v = $null; - } else { - die "can't parse [$line]\n"; - } - - $v =~ s/^\s*(.+?)\s*$/$1/; - $v = $null if $v eq '_qq_'; # "" field which is not first one - $v =~ s/_qq_/"/g; - $v =~ s/_LF_/\n/g; - push @v, $v; - } - - return @v; -} - sub data { my $self = shift; my $path = $self->full_path; - my $data = read_file $path, { binmode => ':raw' }; # FIXME configurable! my $encoding = 'utf-8'; if ( $path =~ m/\.(\w+).csv/i ) { $encoding = $1; } - warn "decoding ", length($data), " bytes using $encoding\n"; - $data = decode($encoding, $data); - # multi-line strings - while ( $data =~ s/(,"[^"]*)[\n\r]+([^"]*)/$1_LF_$2/sg ) { - warn "multi-line quoted CSV data found"; - } + my $data = { items => [] }; + my @header; - my @lines = split(/\r?\n/, $data); - $data = { items => [] }; + my $csv = Text::CSV->new ( { binary => 1, eol => $/ } ) + or die "Cannot use CSV: ".Text::CSV->error_diag (); - my $delimiter = ','; - - if ( $lines[0] !~ m/,/ ) { - if ( $lines[0] =~ m/;/ ) { - $delimiter = ';'; - } elsif ( $lines[0] !~ /;/ && $lines[1] =~ /;/ ) { - shift @lines; # FIXME skip non-header line - $delimiter = ';'; + open my $fh, "<:encoding($encoding)", $path or die "$path: $!"; + while ( my $row = $csv->getline( $fh ) ) { + if ( ! @header ) { + @header = @$row; + next; } - } - - warn "$path ", $#lines + 1, " lines encoding: $encoding delimiter:",dump($delimiter); - - my $header_line = shift @lines; - - my @header = _split_line( $delimiter, $header_line ); - warn "# header ",dump( @header ); - - while ( my $line = shift @lines ) { - chomp $line; - my @v = _split_line($delimiter, $line); my $item; - foreach my $i ( 0 .. $#v ) { - $item->{ $header[$i] || "f_$i" } = [ $v[$i] ]; + foreach my $i ( 0 .. $#{$row} ) { + $item->{ $header[$i] || "f_$i" } = [ $row->[$i] ]; } push @{ $data->{items} }, $item; } + $csv->eof or $csv->error_diag(); + close $fh; + $data->{header} = [ @header ]; return $data;