4 unicode block statistics of general character categories,
5 decomposition and uppercase mappings based on
6 Blocks.txt and UnicodeData.txt
7 > http://unicode.org/Public/UNIDATA/
13 $dir = $ARGV[0] || '.';   # data directory: first command-line argument, default current directory
# Open both input files read-only with the three-argument form and check the
# result: a missing or unreadable file now aborts with the system error
# instead of silently producing empty statistics. The bareword handles UDATA
# and BLOCKS are kept because the rest of the script reads from them.
14 open(UDATA, '<', $dir.'/UnicodeData.txt') or die "cannot open $dir/UnicodeData.txt: $!";
15 open(BLOCKS, '<', $dir.'/Blocks.txt') or die "cannot open $dir/Blocks.txt: $!";
# Fragment of the block-record reader (its enclosing definition is not visible here).
# Split a "BEG..END; Name" record held in $_ into begin, end and block name.
# NOTE(review): the two dots in the pattern are unescaped, so they match any two characters.
26 ($beg,$end,$block) = /([0-9A-Z]*)..([0-9A-Z]*); (.*)/;
# Add the code points lying between the previous block and this one to $gap,
# for blocks starting below 0x10000 only; $gap is later printed as "BMP: nonblock".
# NOTE(review): presumably $lend is the previous block's end and $beg/$end have been
# converted to numbers by now - neither is visible in this excerpt; confirm.
32 $gap += $beg - 1 - $lend if 65536 > $beg;
33 $h->{len} = $end - $beg + 1;   # inclusive number of code points in the block
35 # print "$beg..$end($h->{len}): $block\n";
# Fragment of the per-character pass. @l holds the fields of the current record
# (presumably one UnicodeData.txt line split on ';' - the split is not visible here).
# The notes below list field indexes; the trailing letters look like field types
# (presumably Enumerated / Numeric / String) - TODO confirm.
45 # 2 general category E
46 # 3 canonical combining class N
48 # 5 decomposition <type> mapping <E>S
49 # 6-8 numeric type and value E/N
53 # 12-14 upper/lower/titlecase mapping S
56 # last if '10000' eq $l[0]; # BMP only
# Call nblock (defined elsewhere) until the current code point $num is no longer
# past the current block's end; presumably nblock loads the next block record.
58 while ($num > $end) { nblock; }
# A code point below the current block's begin lies in a gap between blocks.
59 die "$_ not in any block!" if $num < $beg;
60 # <CJK Ideograph, First>
# Name field ending in ", First>" marks the first record of a code point range.
61 if ( $l[1] =~ /, First>$/ ) {
65 # print stderr "$l[1]..Last: $num .. $e\n";
# $e is the range's last code point (set in lines not visible here); the whole
# range must fit inside the current block.
66 die "end $e of $l[1] ($num) not in block $h->{nam}" if $e > $end;
# Count every code point of the range under its general category (field 2).
67 $h->{$l[2]} += $e - $num +1;
# Pull the tag between angle brackets out of the decomposition field; $type stays
# empty when the field carries no tag.
74 ($type) = ($l[5] =~ /<(.*)>/);
# Tally per decomposition tag; untagged entries are counted as '(canonical)'.
75 $deco{$type||'(canonical)'}++;
76 if ( ! $type ) { # no tag: handled as canonical decomposition
88 *\tdecomposition mappings
90 > http://unicode.org/Public/UNIDATA/UCD.html#Character_Decomposition_Mappings
94 foreach my $kind (sort keys %deco) { print join("\t", $kind, $deco{$kind}), "\n"; }   # one tab-separated "tag, count" row per decomposition tag, in sorted key order
98 # The 30 two-letter general category codes; the 2 special counters follow in @add
100 'Cn','Lu','Ll','Lt','Lm','Lo','Mn','Me','Mc','Nd','Nl','No','Zs','Zl','Zp',
101 'Cc','Cf','Co','Cs','Pc','Pd','Ps','Pe','Pi','Pf','Po','Sm','Sc','Sk','So'
103 @add = ( 'uC','dC' );   # extra per-block counters; presumably uC = has uppercase mapping, dC = has canonical decomposition (see the table description) - confirm
106 # Table of per-block character counts, one column per major category
107 print "\n*\tmajor category/block table\n";   # section heading of the report
109 Categories are letter, mark, numeric, punctuation, symbol, separator and other.
110 Additional columns give number of characters which have an uppercase and
111 canonical decomposition mapping, resp.
112 Final columns give begin and end, block length and name.
117 @mcat = ('L','M','N','P','S','Z','C');   # major categories = first letter of the two-letter category codes
119 'Let','Mar','Num','Pun','Sym','Sep','Oth',   # short labels for the seven major categories, in @mcat order
121 'beg','end','len','block'   # labels for block begin, end, length and name
128 $tot{$_} += $h->{$_};   # grand total for this category across blocks
129 $maj{substr($_,0,1)} += $h->{$_};   # fold the category into its major category (first letter)
131 $maj{'C'} += $h->{'Cn'} = $h->{len} - $ass; # unassigned = block length minus $ass (computed outside this excerpt); stored as Cn and added to major category C
132 $Cn += $h->{'Cn'} if 65536 > hex($h->{beg}); # unassigned total for blocks beginning below 0x10000 (BMP); $h->{beg} is a hex string
134 $tot{$_} += $maj{$_};   # grand total for this major category
138 $tot{$_} += $h->{$_};   # grand total for this column
139 print $h->{$_}||'0',"\t";   # print the block's count, showing 0 when it is empty
141 print join("\t",$h->{beg},$h->{end},$h->{len},$h->{nam}),"\n";   # final columns of the row: begin, end, length, name
143 for (@mcat,@add) { print $tot{$_},"\t"; } print "\n";   # totals row for the major-category and extra columns
145 print "BMP: nonblock $gap unassigned $Cn\n\n\n";   # summary: code points outside any block / unassigned, both for the BMP
147 # detailed listing: for each block, the general categories it contains with their counts
149 *\tdetailled block stats
151 > http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
156 print join("\t",$h->{nam},'b'.$h->{beg},'l'.$h->{len}),"\t";   # row start: block name, then begin prefixed with 'b' and length prefixed with 'l'
159 print $_,$h->{$_},"\t";   # the key in $_ immediately followed by its count for this block (the enclosing loop is not visible here)