#!/usr/bin/perl print < http://unicode.org/Public/UNIDATA/ EOF $dir = $ARGV[0] || '.'; open UDATA, $dir.'/UnicodeData.txt'; open BLOCKS, $dir.'/Blocks.txt'; $end = -1; $gap = 0; # get next block sub nblock { while () { last unless /^#/; } chomp; $lend = $end; ($beg,$end,$block) = /([0-9A-Z]*)..([0-9A-Z]*); (.*)/; $h = {}; $h->{beg} = $beg; $h->{end} = $end; $beg = hex $beg; $end = hex $end; $gap += $beg - 1 - $lend if 65536 > $beg; $h->{len} = $end - $beg + 1; $h->{nam} = $block; # print "$beg..$end($h->{len}): $block\n"; push @blocks, $h; } nblock; # UnicodeData.txt: # 0 number (hex) # 1 name S # 2 general category E # 3 canonical combining class N # 4 bidi class E # 5 decomposition mapping S # 6-8 numeric type and value E/N # 9 bidi_mirrored B # 10 old name S # 11 comment S # 12-14 upper/lower/titlecase mapping S while () { @l = split /;/; # last if '10000' eq $l[0]; # BMP only $num = hex $l[0]; while ($num > $end) { nblock; } die "$_ not in any block!" if $num < $beg; # if ( $l[1] =~ /, First>$/ ) { $n = ; ($e) = split /;/,$n; $e = hex($e); # print stderr "$l[1]..Last: $num .. $e\n"; die "end $e of $l[1] ($num) not in block $h->{nam}" if $e > $end; $h->{$l[2]} += $e - $num +1; next; } # gen cat $h->{$l[2]}++; # decomp if ($l[5]) { ($type) = ($l[5] =~ /<(.*)>/); $deco{$type||'(canonical)'}++; if ( ! $type ) { # decomp Canon $h->{'dC'}++; } } # has Upper if ($l[12]) { $h->{'uC'}++; } } print < http://unicode.org/Public/UNIDATA/UCD.html#Character_Decomposition_Mappings \$ EOF for (sort keys %deco) { print join("\t",$_,$deco{$_}),"\n"; } print "\$\n\n\n"; # 30 general categories + 2 specials @cat = ( 'Cn','Lu','Ll','Lt','Lm','Lo','Mn','Me','Mc','Nd','Nl','No','Zs','Zl','Zp', 'Cc','Cf','Co','Cs','Pc','Pd','Ps','Pe','Pi','Pf','Po','Sm','Sc','Sk','So' ); @add = ( 'uC','dC' ); # table major categories to blocks print "\n*\tmajor category/block table\n"; print <{$_}; $tot{$_} += $h->{$_}; $maj{substr($_,0,1)} += $h->{$_}; } $maj{'C'} += $h->{'Cn'} = $h->{len} - $ass; # unassigned $Cn += $h->{'Cn'} if 65536 > hex($h->{beg}); # in BMP for (@mcat) { $tot{$_} += $maj{$_}; print $maj{$_},"\t"; } for (@add) { $tot{$_} += $h->{$_}; print $h->{$_}||'0',"\t"; } print join("\t",$h->{beg},$h->{end},$h->{len},$h->{nam}),"\n"; } for (@mcat,@add) { print $tot{$_},"\t"; } print "\n"; print "\$\n"; print "BMP: nonblock $gap unassigned $Cn\n\n\n"; # list blocks to categories print < http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values \$ EOF for $h (@blocks) { print join("\t",$h->{nam},'b'.$h->{beg},'l'.$h->{len}),"\t"; for (@cat) { if ($h->{$_}) { print $_,$h->{$_},"\t"; } } print "\n"; } print "\$\n\n\n";