+ // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. The crude heuristic is:
+ // - If we saw
+ // - at least one byte that starts a double-byte value (bytes that are rare in ISO-8859-1), or
+ // - over 5% of bytes that could be single-byte Katakana (also rare in ISO-8859-1),
+ // - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS
+ if ((sawDoubleByteStart || 20 * maybeSingleByteKatakanaCount > length) && canBeShiftJIS) {
+ return SHIFT_JIS;
+ }
+ // Otherwise, we default to ISO-8859-1 unless we know it can't be
+ if (canBeISO88591) {
+ return ISO88591;
+ }
+ // Otherwise, we take a wild guess with UTF-8
+ return UTF8;
+ }
+
+ private static int parseECIValue(BitSource bits) {
+ int firstByte = bits.readBits(8);
+ if ((firstByte & 0x80) == 0) {
+ // just one byte
+ return firstByte & 0x7F;
+ } else if ((firstByte & 0xC0) == 0x80) {
+ // two bytes
+ int secondByte = bits.readBits(8);
+ return ((firstByte & 0x3F) << 8) | secondByte;
+ } else if ((firstByte & 0xE0) == 0xC0) {
+ // three bytes
+ int secondThirdBytes = bits.readBits(16);
+ return ((firstByte & 0x1F) << 16) | secondThirdBytes;
+ }
+ throw new IllegalArgumentException("Bad ECI bits starting with byte " + firstByte);