X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=lib%2FMojoFacets%2FImport%2FCSV.pm;h=e8fa19624cc809f514e3438226b3d8dde7f63d3f;hb=ce2e4a650a6a3c52ddccc2382488c6ff4a2ed763;hp=fc9b1ebc61500d686cf2f9e3fd9a7955561596e2;hpb=b3637d471c17dfa513a774a093adf5fcc51d8b35;p=MojoFacets.git diff --git a/lib/MojoFacets/Import/CSV.pm b/lib/MojoFacets/Import/CSV.pm index fc9b1eb..e8fa196 100644 --- a/lib/MojoFacets/Import/CSV.pm +++ b/lib/MojoFacets/Import/CSV.pm @@ -5,56 +5,74 @@ use strict; use base 'Mojo::Base'; -use File::Slurp; +use Text::CSV; use Data::Dump qw(dump); -use Encode; -__PACKAGE__->attr('path'); -__PACKAGE__->attr('full_path'); # FIXME remove full_path +__PACKAGE__->attr('full_path'); + +sub ext { '\.[ct]sv$' }; sub data { my $self = shift; - my $path = $self->full_path || $self->path; + my $path = $self->full_path; - my $data = read_file $path, { binmode => ':raw' }; # FIXME configurable! my $encoding = 'utf-8'; - if ( $path =~ m/\.(\w+).csv/i ) { + if ( $path =~ m/\.([\w\-]+).[ct]sv/i ) { $encoding = $1; } - warn "decoding ", length($data), " bytes using $encoding\n"; - $data = decode($encoding, $data); - my @lines = split(/\r?\n/, $data); - $data = { items => [] }; + my $data = { items => [] }; + my @header; - my $delimiter = qr/,/; + open my $fh, "<:encoding($encoding)", $path or die "$path: $!"; + my $first = <$fh>; + my $possible_delimiters; + while ( $first =~ s/(\W)// ) { + $possible_delimiters->{$1}++; + } + warn "# possible_delimiters = ",dump($possible_delimiters); + seek $fh,0,0; # rewind for Text::CSV + + my @sep_by_usage = sort { $possible_delimiters->{$b} <=> $possible_delimiters->{$a} } keys %$possible_delimiters; + my $sep_char = shift @sep_by_usage; + while ( $sep_char =~ m/^\s$/ ) { + last if $sep_char eq "\t" && $path =~ m/\.tsv$/i; + warn "## skip whitespace separator ",dump($sep_char); + $sep_char = shift @sep_by_usage; + } - if ( $lines[0] !~ /;/ && $lines[1] =~ /;/ ) { - shift @lines; # FIXME ship non-header line - $delimiter = qr/;/; + while ( $sep_char =~ m/^\"$/ ) { + warn "## skip quote separator ",dump($sep_char); + $sep_char = shift @sep_by_usage; } - warn "$path ", $#lines + 1, " lines encoding: $encoding delimiter:",dump($delimiter); + if ( $sep_char !~ m/,/ && $possible_delimiters->{','} && $path =~ m/\.csv/i ) { + $sep_char = ','; + warn "## csv file detected so prefer , as separator"; + } - my $header_line = shift @lines; + warn "sep_char = [$sep_char] for $path\n"; - my @header = map { s/^"(.+)"$/$1/; s/^\s*(.+?)\s*$/$1/; $_ } split( $delimiter, $header_line ); - warn "# header ",dump( @header ); + my $csv = Text::CSV->new ( { binary => 1, eol => $/, sep_char => $sep_char } ) + or die "Cannot use CSV: ".Text::CSV->error_diag (); - while ( my $line = shift @lines ) { - chomp $line; - my @v = split($delimiter, $line); + while ( my $row = $csv->getline( $fh ) ) { + if ( ! @header ) { + @header = @$row; + $header[0] =~ s/^#// if $path =~ m/\.tsv/i; # remove hash from 1st column + next; + } my $item; - foreach my $i ( 0 .. $#v ) { - my $v = $v[$i]; - $v =~ s/^"(.+)"$/$1/; - $v =~ s/^\s*(.+?)\s*$/$1/; - $item->{ $header[$i] || "f_$i" } = [ $v ]; + foreach my $i ( 0 .. $#{$row} ) { + $item->{ $header[$i] || "f_$i" } = [ $row->[$i] ]; } push @{ $data->{items} }, $item; } + $csv->eof or $csv->error_diag(); + close $fh; + $data->{header} = [ @header ]; return $data;