2 * DecodedBitStreamParser.cpp
5 * Created by Christian Brunschen on 20/05/2008.
6 * Copyright 2008 ZXing authors All rights reserved.
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
#include <zxing/qrcode/decoder/DecodedBitStreamParser.h>

#ifndef NO_ICONV
#include <iconv.h>
#endif

#include <sstream>
#include <string>
#include <vector>
27 // Required for compatibility. TODO: test on Symbian
28 #ifdef ZXING_ICONV_CONST
30 #define ICONV_CONST const
34 #define ICONV_CONST /**/
37 using namespace zxing;
44 const char DecodedBitStreamParser::ALPHANUMERIC_CHARS[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B',
45 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
46 'Y', 'Z', ' ', '$', '%', '*', '+', '-', '.', '/', ':'
49 const char *DecodedBitStreamParser::ASCII = "ASCII";
50 const char *DecodedBitStreamParser::ISO88591 = "ISO-8859-1";
51 const char *DecodedBitStreamParser::UTF8 = "UTF-8";
52 const char *DecodedBitStreamParser::SHIFT_JIS = "SHIFT_JIS";
53 const char *DecodedBitStreamParser::EUC_JP = "EUC-JP";
55 void DecodedBitStreamParser::append(std::string &result, const unsigned char *bufIn, size_t nIn, const char *src) {
61 iconv_t cd = iconv_open(UTF8, src);
62 const int maxOut = 4 * nIn + 1;
63 unsigned char* bufOut = new unsigned char[maxOut];
65 ICONV_CONST char *fromPtr = (ICONV_CONST char *)bufIn;
67 char *toPtr = (char *)bufOut;
71 size_t oneway = iconv(cd, &fromPtr, &nFrom, &toPtr, &nTo);
72 if (oneway == (size_t)(-1)) {
75 throw ReaderException("error converting characters");
80 int nResult = maxOut - nTo;
81 bufOut[nResult] = '\0';
82 result.append((const char *)bufOut);
85 result.append((const char *)bufIn, nIn);
89 void DecodedBitStreamParser::decodeKanjiSegment(Ref<BitSource> bits, std::string &result, int count) {
90 // Each character will require 2 bytes. Read the characters as 2-byte pairs
91 // and decode as Shift_JIS afterwards
92 size_t nBytes = 2 * count;
93 unsigned char* buffer = new unsigned char[nBytes];
96 // Each 13 bits encodes a 2-byte character
98 int twoBytes = bits->readBits(13);
99 int assembledTwoBytes = ((twoBytes / 0x0C0) << 8) | (twoBytes % 0x0C0);
100 if (assembledTwoBytes < 0x01F00) {
101 // In the 0x8140 to 0x9FFC range
102 assembledTwoBytes += 0x08140;
104 // In the 0xE040 to 0xEBBF range
105 assembledTwoBytes += 0x0C140;
107 buffer[offset] = (unsigned char)(assembledTwoBytes >> 8);
108 buffer[offset + 1] = (unsigned char)assembledTwoBytes;
113 append(result, buffer, nBytes, SHIFT_JIS);
117 void DecodedBitStreamParser::decodeByteSegment(Ref<BitSource> bits, std::string &result, int count) {
119 unsigned char* readBytes = new unsigned char[nBytes];
120 if (count << 3 > bits->available()) {
122 s << "Count too large: " << count;
124 throw ReaderException(s.str().c_str());
126 for (int i = 0; i < count; i++) {
127 readBytes[i] = (unsigned char)bits->readBits(8);
129 // The spec isn't clear on this mode; see
130 // section 6.4.5: t does not say which encoding to assuming
131 // upon decoding. I have seen ISO-8859-1 used as well as
132 // Shift_JIS -- without anything like an ECI designator to
134 const char *encoding = guessEncoding(readBytes, nBytes);
135 append(result, readBytes, nBytes, encoding);
139 void DecodedBitStreamParser::decodeNumericSegment(Ref<BitSource> bits, std::string &result, int count) {
141 unsigned char* bytes = new unsigned char[nBytes];
143 // Read three digits at a time
145 // Each 10 bits encodes three digits
146 int threeDigitsBits = bits->readBits(10);
147 if (threeDigitsBits >= 1000) {
149 s << "Illegal value for 3-digit unit: " << threeDigitsBits;
151 throw ReaderException(s.str().c_str());
153 bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits / 100];
154 bytes[i++] = ALPHANUMERIC_CHARS[(threeDigitsBits / 10) % 10];
155 bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits % 10];
159 // Two digits left over to read, encoded in 7 bits
160 int twoDigitsBits = bits->readBits(7);
161 if (twoDigitsBits >= 100) {
163 s << "Illegal value for 2-digit unit: " << twoDigitsBits;
165 throw ReaderException(s.str().c_str());
167 bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits / 10];
168 bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits % 10];
169 } else if (count == 1) {
170 // One digit left over to read
171 int digitBits = bits->readBits(4);
172 if (digitBits >= 10) {
174 s << "Illegal value for digit unit: " << digitBits;
176 throw ReaderException(s.str().c_str());
178 bytes[i++] = ALPHANUMERIC_CHARS[digitBits];
180 append(result, bytes, nBytes, ASCII);
184 void DecodedBitStreamParser::decodeAlphanumericSegment(Ref<BitSource> bits, std::string &result, int count) {
186 unsigned char* bytes = new unsigned char[nBytes];
188 // Read two characters at a time
190 int nextTwoCharsBits = bits->readBits(11);
191 bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits / 45];
192 bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits % 45];
196 bytes[i++] = ALPHANUMERIC_CHARS[bits->readBits(6)];
198 append(result, bytes, nBytes, ASCII);
203 DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) {
204 const bool ASSUME_SHIFT_JIS = false;
205 char const* const PLATFORM_DEFAULT_ENCODING="UTF-8";
207 // Does it start with the UTF-8 byte order mark? then guess it's UTF-8
208 if (length > 3 && bytes[0] == (unsigned char)0xEF && bytes[1] == (unsigned char)0xBB && bytes[2]
209 == (unsigned char)0xBF) {
212 // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
213 // which should be by far the most common encodings. ISO-8859-1
214 // should not have bytes in the 0x80 - 0x9F range, while Shift_JIS
215 // uses this as a first byte of a two-byte character. If we see this
216 // followed by a valid second byte in Shift_JIS, assume it is Shift_JIS.
217 // If we see something else in that second byte, we'll make the risky guess
219 bool canBeISO88591 = true;
220 bool canBeShiftJIS = true;
221 bool canBeUTF8 = true;
222 int utf8BytesLeft = 0;
223 int maybeDoubleByteCount = 0;
224 int maybeSingleByteKatakanaCount = 0;
225 bool sawLatin1Supplement = false;
226 bool sawUTF8Start = false;
227 bool lastWasPossibleDoubleByteStart = false;
229 i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
231 int value = bytes[i] & 0xFF;
234 if (value >= 0x80 && value <= 0xBF) {
235 if (utf8BytesLeft > 0) {
239 if (utf8BytesLeft > 0) {
242 if (value >= 0xC0 && value <= 0xFD) {
244 int valueCopy = value;
245 while ((valueCopy & 0x40) != 0) {
254 if (value >= 0xA1 && value <= 0xDF) {
255 // count the number of characters that might be a Shift_JIS single-byte Katakana character
256 if (!lastWasPossibleDoubleByteStart) {
257 maybeSingleByteKatakanaCount++;
260 if (!lastWasPossibleDoubleByteStart &&
261 ((value >= 0xF0 && value <= 0xFF) || value == 0x80 || value == 0xA0)) {
262 canBeShiftJIS = false;
264 if (((value >= 0x81 && value <= 0x9F) || (value >= 0xE0 && value <= 0xEF))) {
265 // These start double-byte characters in Shift_JIS. Let's see if it's followed by a valid
267 if (lastWasPossibleDoubleByteStart) {
268 // If we just checked this and the last byte for being a valid double-byte
269 // char, don't check starting on this byte. If this and the last byte
270 // formed a valid pair, then this shouldn't be checked to see if it starts
271 // a double byte pair of course.
272 lastWasPossibleDoubleByteStart = false;
274 // ... otherwise do check to see if this plus the next byte form a valid
275 // double byte pair encoding a character.
276 lastWasPossibleDoubleByteStart = true;
277 if (i >= length - 1) {
278 canBeShiftJIS = false;
280 int nextValue = bytes[i + 1] & 0xFF;
281 if (nextValue < 0x40 || nextValue > 0xFC) {
282 canBeShiftJIS = false;
284 maybeDoubleByteCount++;
286 // There is some conflicting information out there about which bytes can follow which in
287 // double-byte Shift_JIS characters. The rule above seems to be the one that matches practice.
291 lastWasPossibleDoubleByteStart = false;
294 if (utf8BytesLeft > 0) {
298 // Easy -- if assuming Shift_JIS and no evidence it can't be, done
299 if (canBeShiftJIS && ASSUME_SHIFT_JIS) {
302 if (canBeUTF8 && sawUTF8Start) {
305 // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. The crude heuristic is:
307 // - at least 3 bytes that starts a double-byte value (bytes that are rare in ISO-8859-1), or
308 // - over 5% of bytes could be single-byte Katakana (also rare in ISO-8859-1),
309 // - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS
310 if (canBeShiftJIS && (maybeDoubleByteCount >= 3 || 20 * maybeSingleByteKatakanaCount > length)) {
313 // Otherwise, we default to ISO-8859-1 unless we know it can't be
314 if (!sawLatin1Supplement && canBeISO88591) {
317 // Otherwise, we take a wild guess with platform encoding
318 return PLATFORM_DEFAULT_ENCODING;
321 string DecodedBitStreamParser::decode(ArrayRef<unsigned char> bytes, Version *version) {
323 Ref<BitSource> bits(new BitSource(bytes));
324 Mode *mode = &Mode::TERMINATOR;
326 // While still another segment to read...
327 if (bits->available() < 4) {
328 // OK, assume we're done. Really, a TERMINATOR mode should have been recorded here
329 mode = &Mode::TERMINATOR;
331 mode = &Mode::forBits(bits->readBits(4)); // mode is encoded by 4 bits
333 if (mode != &Mode::TERMINATOR) {
334 // How many characters will follow, encoded in this mode?
335 int count = bits->readBits(mode->getCharacterCountBits(version));
336 if (mode == &Mode::NUMERIC) {
337 decodeNumericSegment(bits, result, count);
338 } else if (mode == &Mode::ALPHANUMERIC) {
339 decodeAlphanumericSegment(bits, result, count);
340 } else if (mode == &Mode::BYTE) {
341 decodeByteSegment(bits, result, count);
342 } else if (mode == &Mode::KANJI) {
343 decodeKanjiSegment(bits, result, count);
345 throw ReaderException("Unsupported mode indicator");
348 } while (mode != &Mode::TERMINATOR);