2 * DecodedBitStreamParser.cpp
5 * Created by Christian Brunschen on 20/05/2008.
6 * Copyright 2008 ZXing authors All rights reserved.
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
21 #include "DecodedBitStreamParser.h"
27 using namespace common;
30 char DecodedBitStreamParser::ALPHANUMERIC_CHARS[] = {
31 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B',
32 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
33 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
34 ' ', '$', '%', '*', '+', '-', '.', '/', ':'
37 char *DecodedBitStreamParser::ASCII = "ASCII";
38 char *DecodedBitStreamParser::ISO88591 = "ISO-8859-1";
39 char *DecodedBitStreamParser::UTF8 = "UTF-8";
40 char *DecodedBitStreamParser::SHIFT_JIS = "SHIFT_JIS";
41 char *DecodedBitStreamParser::EUC_JP = "EUC-JP";
43 void DecodedBitStreamParser::append(ostream &ost,
45 size_t nIn, const char *src) {
50 iconv_t cd = iconv_open(UTF8, src);
51 int maxOut = 4 * nIn + 1;
52 unsigned char bufOut[maxOut];
54 char *fromPtr = (char *)bufIn;
56 char *toPtr = (char *)bufOut;
60 size_t oneway = iconv(cd, &fromPtr, &nFrom, &toPtr, &nTo);
61 if (oneway == (size_t)(-1)) {
62 throw new ReaderException("error converting characters");
67 int nResult = maxOut - nTo;
68 bufOut[nResult] = '\0';
73 void DecodedBitStreamParser::decodeKanjiSegment(Ref<BitSource> bits,
74 ostringstream &result,
76 // Each character will require 2 bytes. Read the characters as 2-byte pairs
77 // and decode as Shift_JIS afterwards
78 size_t nBytes = 2 * count;
79 unsigned char buffer[nBytes];
82 // Each 13 bits encodes a 2-byte character
83 int twoBytes = bits->readBits(13);
84 int assembledTwoBytes = ((twoBytes / 0x0C0) << 8) | (twoBytes % 0x0C0);
85 if (assembledTwoBytes < 0x01F00) {
86 // In the 0x8140 to 0x9FFC range
87 assembledTwoBytes += 0x08140;
89 // In the 0xE040 to 0xEBBF range
90 assembledTwoBytes += 0x0C140;
92 buffer[offset] = (unsigned char) (assembledTwoBytes >> 8);
93 buffer[offset + 1] = (unsigned char) assembledTwoBytes;
98 append(result, buffer, nBytes, SHIFT_JIS);
101 void DecodedBitStreamParser::decodeByteSegment(Ref<BitSource> bits,
102 ostringstream &result,
105 unsigned char readBytes[nBytes];
106 if (count << 3 > bits->available()) {
108 s << "Count too large: " << count;
109 throw new ReaderException(s.str().c_str());
111 for (int i = 0; i < count; i++) {
112 readBytes[i] = (unsigned char) bits->readBits(8);
114 // The spec isn't clear on this mode; see
115 // section 6.4.5: t does not say which encoding to assuming
116 // upon decoding. I have seen ISO-8859-1 used as well as
117 // Shift_JIS -- without anything like an ECI designator to
119 char *encoding = guessEncoding(readBytes, nBytes);
120 append(result, readBytes, nBytes, encoding);
123 void DecodedBitStreamParser::decodeNumericSegment(Ref<BitSource> bits,
124 ostringstream &result,
127 unsigned char bytes[nBytes];
129 // Read three digits at a time
131 // Each 10 bits encodes three digits
132 int threeDigitsBits = bits->readBits(10);
133 if (threeDigitsBits >= 1000) {
135 s << "Illegal value for 3-digit unit: " << threeDigitsBits;
136 throw new ReaderException(s.str().c_str());
138 bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits / 100];
139 bytes[i++] = ALPHANUMERIC_CHARS[(threeDigitsBits / 10) % 10];
140 bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits % 10];
144 // Two digits left over to read, encoded in 7 bits
145 int twoDigitsBits = bits->readBits(7);
146 if (twoDigitsBits >= 100) {
148 s << "Illegal value for 2-digit unit: " << twoDigitsBits;
149 throw new ReaderException(s.str().c_str());
151 bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits / 10];
152 bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits % 10];
153 } else if (count == 1) {
154 // One digit left over to read
155 int digitBits = bits->readBits(4);
156 if (digitBits >= 10) {
158 s << "Illegal value for digit unit: " << digitBits;
159 throw new ReaderException(s.str().c_str());
161 bytes[i++] = ALPHANUMERIC_CHARS[digitBits];
163 append(result, bytes, nBytes, ASCII);
166 void DecodedBitStreamParser::decodeAlphanumericSegment(Ref<BitSource> bits,
167 ostringstream &result,
170 unsigned char bytes[nBytes];
172 // Read two characters at a time
174 int nextTwoCharsBits = bits->readBits(11);
175 bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits / 45];
176 bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits % 45];
180 bytes[i++] = ALPHANUMERIC_CHARS[bits->readBits(6)];
182 append(result, bytes, nBytes, ASCII);
186 DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) {
187 // Does it start with the UTF-8 byte order mark? then guess it's UTF-8
188 if (length > 3 && bytes[0] == (unsigned char) 0xEF &&
189 bytes[1] == (unsigned char) 0xBB &&
190 bytes[2] == (unsigned char) 0xBF) {
193 // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
194 // which should be by far the most common encodings. ISO-8859-1
195 // should not have bytes in the 0x80 - 0x9F range, while Shift_JIS
196 // uses this as a first byte of a two-byte character. If we see this
197 // followed by a valid second byte in Shift_JIS, assume it is Shift_JIS.
198 // If we see something else in that second byte, we'll make the risky guess
200 bool canBeISO88591 = true;
201 bool lastWasPossibleDoubleByteStart = false;
202 for (int i = 0; i < length; i++) {
203 int value = bytes[i] & 0xFF;
204 if (value >= 0x80 && value <= 0x9F && i < length - 1) {
205 canBeISO88591 = false;
206 // ISO-8859-1 shouldn't use this, but before we decide it is Shift_JIS,
207 // just double check that it is followed by a byte that's valid in
208 // the Shift_JIS encoding
209 if (lastWasPossibleDoubleByteStart) {
210 // If we just checked this and the last byte for being a valid double-byte
211 // char, don't check starting on this byte. If the this and the last byte
212 // formed a valid pair, then this shouldn't be checked to see if it starts
213 // a double byte pair of course.
214 lastWasPossibleDoubleByteStart = false;
216 // ... otherwise do check to see if this plus the next byte form a valid
217 // double byte pair encoding a character.
218 lastWasPossibleDoubleByteStart = true;
219 int nextValue = bytes[i + 1] & 0xFF;
220 if ((value & 0x1) == 0) {
221 // if even, next value should be in [0x9F,0xFC]
222 // if not, we'll guess UTF-8
223 if (nextValue < 0x9F || nextValue > 0xFC) {
227 // if odd, next value should be in [0x40,0x9E]
228 // if not, we'll guess UTF-8
229 if (nextValue < 0x40 || nextValue > 0x9E) {
236 return canBeISO88591 ? ISO88591 : SHIFT_JIS;
// Entry point: decodes the segments packed into `bytes` and collects the
// resulting text in `result`.
//
// The bytes are wrapped in a BitSource. For each segment a 4-bit mode
// indicator is read (an exhausted stream is treated as TERMINATOR). For any
// other mode a character count is read, whose width comes from
// mode->getCharacterCountBits(version), and the segment is handed to the
// matching decode*Segment() helper. A mode other than NUMERIC, ALPHANUMERIC,
// BYTE or KANJI raises a heap-allocated ReaderException. The loop stops at
// the first TERMINATOR.
//
// NOTE(review): this block is incomplete in the reviewed text. Each line
// still carries a listing number, and the rest of the parameter list (which
// must declare `version`), the `do {` opener, the `else` lines, the closing
// braces and the statement that returns the accumulated string are all
// missing. The code below is left exactly as found -- restore it from the
// upstream file rather than from this listing.
240 string DecodedBitStreamParser::decode(ArrayRef<unsigned char> bytes,
242 ostringstream result;
243 Ref<BitSource> bits(new BitSource(bytes));
244 Mode *mode = &Mode::TERMINATOR;
246 // While still another segment to read...
247 if (bits->available() == 0) {
248 // OK, assume we're done. Really, a TERMINATOR mode should have been recorded here
249 mode = &Mode::TERMINATOR;
251 mode = &Mode::forBits(bits->readBits(4)); // mode is encoded by 4 bits
253 if (mode != &Mode::TERMINATOR) {
254 // How many characters will follow, encoded in this mode?
255 int count = bits->readBits(mode->getCharacterCountBits(version));
256 if (mode == &Mode::NUMERIC) {
257 decodeNumericSegment(bits, result, count);
258 } else if (mode == &Mode::ALPHANUMERIC) {
259 decodeAlphanumericSegment(bits, result, count);
260 } else if (mode == &Mode::BYTE) {
261 decodeByteSegment(bits, result, count);
262 } else if (mode == &Mode::KANJI) {
263 decodeKanjiSegment(bits, result, count);
265 throw new ReaderException("Unsupported mode indicator");
268 } while (mode != &Mode::TERMINATOR);