don't try to decode records as iso-8859-1, read them as utf-8
[webpac2] / bin / isi-merge.pl
#!/usr/bin/perl

# isi-merge.pl - merge several ISI export files into one file, writing each
# distinct record only once.
#
# Usage: isi-merge.pl [file ...]
#
# Without arguments the files matching /tmp/isi.*-*.txt are merged.  Input is
# processed in numeric order of the first number of the "N-M" range found in
# each file name.  Records are blank-line separated chunks of text; FN, VR and
# EF lines of the inputs are dropped and a single header/footer is written to
# the output.  Records are compared by a digest of their raw bytes (input is
# read without any decoding layer).
#
# Environment:
#   SHA1  - false value (e.g. SHA1=0) selects MD5 digests; unset or true
#           selects SHA1 (default).
#
# Progress goes to STDERR: file name, then '.' per record written and 'd' per
# duplicate skipped; a dump of the per-file report is printed at the end.

use warnings;
use strict;
use autodie;
use Digest::MD5 qw(md5_hex);
use Digest::SHA1 qw(sha1_hex);
use Data::Dump qw(dump);

# A plain "|| 1" would make it impossible to turn SHA1 off with SHA1=0.
my $use_sha1 = defined $ENV{SHA1} ? $ENV{SHA1} : 1;

my @files = @ARGV;
@files = glob '/tmp/isi.*-*.txt' unless @files;

my $path = '/tmp/isi.full.txt';

warn "# ", scalar(@files), " files to $path sha:$use_sha1\n";

open(my $out_fh, '>', $path);
print $out_fh "FN ISI Export Format\nVR 1.0\n";

my %seen;      # digest => number of times the record was encountered
my $report;    # { files => [...], file => { $name => {...} }, total_records => N }

# Write $rec followed by $terminator to the output unless a record with the
# same digest was already seen; update progress output and the report for $file.
sub emit_record {
    my ( $file, $rec, $terminator ) = @_;

    my $digest = $use_sha1 ? sha1_hex($rec) : md5_hex($rec);

    if ( $seen{$digest}++ ) {
        print STDERR 'd';
        $report->{file}->{$file}->{duplicates}++;
    }
    else {
        print $out_fh $rec . $terminator;
        $report->{file}->{$file}->{records}++;
        $report->{total_records}++;
        print STDERR '.';
    }

    return;
}

# Sort key: first number of the "N-M" range in the file name, 0 if there is
# none (explicitly given files need not follow the naming scheme).
my %start_of;
foreach my $file (@files) {
    my ($start) = $file =~ m{(\d+)-\d+};
    $start_of{$file} = defined $start ? $start : 0;
}

foreach my $file ( sort { $start_of{$a} <=> $start_of{$b} } @files ) {
    print STDERR $file;

    push @{ $report->{files} }, $file;

    open(my $fh, '<', $file);

    # Record being assembled; kept per file so that an unterminated record
    # can never leak into the next file.
    my $rec = '';

    while ( my $line = <$fh> ) {
        next if $line =~ m/^(FN|VR|EF)/;

        if ( $line !~ m/^[\r\n]+$/s ) {
            $rec .= $line;
            next;
        }

        # Blank line ends a record; extra blank lines are not records.
        next unless length $rec;

        emit_record( $file, $rec, $line );
        $rec = '';
    }

    close $fh;

    # Last record was not followed by a blank line: flush it instead of
    # losing it or gluing it to the first record of the next file.
    if ( length $rec ) {
        $rec .= "\n" unless $rec =~ m/\n\z/;
        emit_record( $file, $rec, $rec =~ m/\r\n\z/ ? "\r\n" : "\n" );
    }

    warn "\n";
}

print $out_fh "EF\n";
close $out_fh;


warn "# $path ", -s $path, dump($report);
