X-Git-Url: http://git.rot13.org/?p=eprints3-migration.git;a=blobdiff_plain;f=mtoolkit%2Futf8-fix.pl;fp=mtoolkit%2Futf8-fix.pl;h=5054189c042d85980aa45f7f7369e8d9ff391767;hp=0000000000000000000000000000000000000000;hb=f8cf0c7e8ce2535598762fc1079d1be7cd09a0f6;hpb=a9d44e9916c9aa1ed8d125cf08711dce5afd9ad5 diff --git a/mtoolkit/utf8-fix.pl b/mtoolkit/utf8-fix.pl new file mode 100755 index 0000000..5054189 --- /dev/null +++ b/mtoolkit/utf8-fix.pl @@ -0,0 +1,68 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +use Data::Dump qw(dump); +use Encode; +use bytes; + +my $junk = '[\xC0-\xC4][\x80-\xC4]+(\xC2\x80|\xC3\x82)'; + +my $file = shift @ARGV; +my $stat; + +open(my $fh, '<:raw', $file) || die "$file: $!"; + +while(<$fh>) { + +my $orig = $_; + +# češki +s/\xC3\x83\xC2\x85\xC3\xA2\xC2\x84\xC2\xA2/ř/gs && $stat->{'r v'}++; +s/\xC3\x83\xC2\x83\xC3\x82\xC2\xBD/ý/gs && $stat->{'y /'}++; +s/\xC3\x83\xC2\x84\xC3\x8B\xC2\x86/Č/gs && $stat->{'Č ?'}++; + +s/\xC3\x83\xC2\x83\xC3\xA2\xC2\x80\xC2\xB0/É/gs && $stat->{'E actue'}++; +s/\xC3\x83\xC2\x83\xC3\xA2\xC2\x80\xC2\xA6/Å/gs && $stat->{'A circle'}++; +# "MAQBŪLI 'ĀRIF" (Potur Şāhidī). +s/\xC3\x83\xC2\x84\xC3\xA2\xC2\x82\xC2\xAC/Ā/gs && $stat->{'A line'}++; +s/\xC3\x83\xC2\x83\xC3\xA2\xC2\x80\xC2\x99/V/gs && $stat->{'V fake'}++; + +# njemački +s/\xC3\x83\xC2\x83\xC3\x85\xC2\xB8/ß/gs && $stat->{'ss'}++; + +s/$junk\xC2\x8D/č/gs && $stat->{'č'}++; +s/$junk\xC2\xA1/ć/gs && $stat->{'ć'}++; +s/$junk\xC2\xA1/š/gs && $stat->{'š'}++; +s/$junk\xC4\xA1/š/gs && $stat->{'š C4'}++; +s/$junk\xC2\xBE/ž/gs && $stat->{'ž'}++; +s/$junk\xC4\xBE/ž/gs && $stat->{'ž C4'}++; + +s/$junk\xC2\x98/đ/gs && $stat->{'đ'}++; +s/$junk\xC2\x90/Đ/gs && $stat->{'Đ'}++; + +s/$junk\xC2\x92/Č/gs && $stat->{'Č'}++; +s/$junk\xC2\xBD/Ž/gs && $stat->{'Ž'}++; +s/$junk\xC2\xA0/Š/gs && $stat->{'Š'}++; + +s/$junk\xC2\xAB/"/gs && $stat->{'" open ?'}++; + +s/$junk\xC2\xBE/"/gs && $stat->{'" open'}++; +s/$junk\xC2\x93/"/gs && $stat->{'" close'}++; + +s/$junk\xC2\xA2/'/gs && $stat->{"'"}++; +s/$junk\xC2\xAD/-/gs && $stat->{'-'}++; + +s/$junk\xC2\x9C/-/gs && $stat->{'--'}++; +s/$junk\xC2\x9D/-/gs && $stat->{'-- ?'}++; + + + my $e = $_; + if ( $e =~ m/([\xC0-\xC4][\x80-\xff]{4,8})/s ) { + warn "XXX ", dump($e), "\n$e\n"; + } + print $e; +} + +warn dump($stat);