+ if (utf8BytesLeft > 0) {
+ canBeUTF8 = false;
+ }
+
+ // Easy -- if assuming Shift_JIS and no evidence it can't be, done
+ if (canBeShiftJIS && ASSUME_SHIFT_JIS) {
+ return SHIFT_JIS;
+ }
+ if (canBeUTF8 && sawUTF8Start) {
+ return UTF8;
+ }
+ // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. The crude heuristic is:
+ // - If we saw
+ // - at least 3 bytes that start a double-byte value (bytes that are rare in ISO-8859-1), or
+ // - over 5% of bytes could be single-byte Katakana (also rare in ISO-8859-1),
+ // - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS
+ if (canBeShiftJIS && (maybeDoubleByteCount >= 3 || 20 * maybeSingleByteKatakanaCount > length)) {
+ return SHIFT_JIS;
+ }
+ // Otherwise, we default to ISO-8859-1 unless we know it can't be
+ if (!sawLatin1Supplement && canBeISO88591) {
+ return ISO88591;
+ }
+ // Otherwise, we take a wild guess with platform encoding
+ return PLATFORM_DEFAULT_ENCODING;