use warnings;
use strict;
use autodie;
use Digest::MD5 qw(md5_hex);
use Data::Dump qw(dump);

# Concatenate several ISI export files into a single output file, writing
# each distinct record only once.
#
# Input files come from @ARGV, or from the glob /tmp/isi.*-*.txt when no
# arguments are given.  Within a file, a record is every line up to the next
# blank line; header/trailer lines starting with FN, VR or EF are dropped and
# a single FN/VR header and EF trailer are written around the merged output.
# Two records are considered duplicates when the MD5 digest of their text is
# identical.
#
# A summary ($report) is dumped to STDERR at the end:
#   files          => [ input files, in processing order ]
#   file           => { $file => { records => N, duplicates => N } }
#   total_records  => number of records written to the output

my @files = @ARGV;
@files = glob '/tmp/isi.*-*.txt' unless @files;

# NOTE(review): $path is not declared anywhere in the visible part of this
# file; it is presumably assigned on a line not shown here -- confirm before
# running (under "use strict" the script will not compile without it).
open(my $out_fh, '>', $path);
print {$out_fh} "FN ISI Export Format\nVR 1.0\n";

my $rec    = '';    # text of the record currently being accumulated
my $nr     = 0;     # running number of records seen, duplicates included
my $md5    = {};    # digest => how many times that record has been seen
my $report = {};    # summary dumped at the end, see header comment

# Return the first number of the "N-M" part of a file name, or 0 when the
# name has no such part (0 is what an undefined value would compare as).
sub _range_start {
    my ($name) = @_;

    # List-context match instead of "my $x = $1 if ...", whose behaviour is
    # documented as undefined.
    my ($start) = $name =~ m{(\d+)-\d+};
    return defined $start ? $start : 0;
}

# Finish the record held in $rec: write it followed by $separator unless an
# identical record was already written, update the per-file counters for
# $file, and reset $rec for the next record.
my $finish_record = sub {
    my ( $file, $separator ) = @_;

    $nr++;
    my $digest = md5_hex($rec);
    if ( my $times = $md5->{$digest} ) {
        warn "duplicate $nr $digest $times\n";
        $report->{file}->{$file}->{duplicates}++;
    }
    else {
        print {$out_fh} $rec . $separator;
        $report->{file}->{$file}->{records}++;
        $report->{total_records}++;
    }

    $md5->{$digest}++;
    $rec = '';
    return;
};

# NOTE(review): the comparison expression of the original sort block is not
# present in the visible source; ascending numeric order on the captured
# number is assumed, with the file name as a tie-breaker so the order is
# deterministic -- confirm.
my %start_of = map { $_ => _range_start($_) } @files;
my @ordered  = sort { $start_of{$a} <=> $start_of{$b} or $a cmp $b } @files;

foreach my $file (@ordered) {
    warn $file;
    push @{ $report->{files} }, $file;

    open(my $fh, '<', $file);
    while ( my $line = <$fh> ) {
        next if $line =~ m/^(?:FN|VR|EF)/;

        if ( $line !~ m/^[\r\n]+$/s ) {
            $rec .= $line;
            next;
        }

        # A blank line terminates the pending record.  Blank lines with no
        # pending record (leading or repeated separators) are ignored so
        # they are not counted as empty records.
        next if $rec eq '';
        $finish_record->( $file, $line );
    }
    close $fh;

    # The file ended without a blank line after its last record: flush it
    # here so it neither leaks into the next file's first record nor gets
    # lost at the end of the run.
    if ( $rec ne '' ) {
        $rec .= "\n" unless $rec =~ m/\n\z/;
        $finish_record->( $file, "\n" );
    }
}

print {$out_fh} "EF\n";
close $out_fh;

# Output path and size on the first line, then the dumped report.
warn "# $path ", -s $path, "\n", dump($report), "\n";