cpp/core/src/zxing/qrcode/decoder/DecodedBitStreamParser.cpp

   1 /*
   2  *  DecodedBitStreamParser.cpp
   3  *  zxing
   4  *
   5  *  Created by Christian Brunschen on 20/05/2008.
   6  *  Copyright 2008 ZXing authors All rights reserved.
   7  *
   8  * Licensed under the Apache License, Version 2.0 (the "License");
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20
#include <zxing/qrcode/decoder/DecodedBitStreamParser.h>
#include <iostream>
#include <sstream>
#include <vector>
#ifndef NO_ICONV
#include <iconv.h>
#endif
  26
  27 // Required for compatibility. TODO: test on Symbian
  28 #ifdef ZXING_ICONV_CONST
  29 #undef ICONV_CONST
  30 #define ICONV_CONST const
  31 #endif
  32
  33 #ifndef ICONV_CONST
  34 #define ICONV_CONST /**/
  35 #endif
  36
  37 using namespace zxing;
  38
  39 namespace zxing {
  40 namespace qrcode {
  41
  42 using namespace std;
  43
  44 const char DecodedBitStreamParser::ALPHANUMERIC_CHARS[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B',
  45     'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
  46     'Y', 'Z', ' ', '$', '%', '*', '+', '-', '.', '/', ':'
  47                                                           };
  48
  49 const char *DecodedBitStreamParser::ASCII = "ASCII";
  50 const char *DecodedBitStreamParser::ISO88591 = "ISO-8859-1";
  51 const char *DecodedBitStreamParser::UTF8 = "UTF-8";
  52 const char *DecodedBitStreamParser::SHIFT_JIS = "SHIFT_JIS";
  53 const char *DecodedBitStreamParser::EUC_JP = "EUC-JP";
  54
  55 void DecodedBitStreamParser::append(std::string &result, const unsigned char *bufIn, size_t nIn, const char *src) {
  56 #ifndef NO_ICONV
  57   if (nIn == 0) {
  58     return;
  59   }
  60
  61   iconv_t cd = iconv_open(UTF8, src);
  62   const int maxOut = 4 * nIn + 1;
  63   unsigned char* bufOut = new unsigned char[maxOut];
  64
  65   ICONV_CONST char *fromPtr = (ICONV_CONST char *)bufIn;
  66   size_t nFrom = nIn;
  67   char *toPtr = (char *)bufOut;
  68   size_t nTo = maxOut;
  69
  70   while (nFrom > 0) {
  71     size_t oneway = iconv(cd, &fromPtr, &nFrom, &toPtr, &nTo);
  72     if (oneway == (size_t)(-1)) {
  73       iconv_close(cd);
  74       delete[] bufOut;
  75       throw ReaderException("error converting characters");
  76     }
  77   }
  78   iconv_close(cd);
  79
  80   int nResult = maxOut - nTo;
  81   bufOut[nResult] = '\0';
  82   result.append((const char *)bufOut);
  83   delete[] bufOut;
  84  #else
  85   result.append((const char *)bufIn, nIn);
  86  #endif
  87 }
  88
  89 void DecodedBitStreamParser::decodeKanjiSegment(Ref<BitSource> bits, std::string &result, int count) {
  90   // Each character will require 2 bytes. Read the characters as 2-byte pairs
  91   // and decode as Shift_JIS afterwards
  92   size_t nBytes = 2 * count;
  93   unsigned char* buffer = new unsigned char[nBytes];
  94   int offset = 0;
  95   while (count > 0) {
  96     // Each 13 bits encodes a 2-byte character
  97
  98     int twoBytes = bits->readBits(13);
  99     int assembledTwoBytes = ((twoBytes / 0x0C0) << 8) | (twoBytes % 0x0C0);
 100     if (assembledTwoBytes < 0x01F00) {
 101       // In the 0x8140 to 0x9FFC range
 102       assembledTwoBytes += 0x08140;
 103     } else {
 104       // In the 0xE040 to 0xEBBF range
 105       assembledTwoBytes += 0x0C140;
 106     }
 107     buffer[offset] = (unsigned char)(assembledTwoBytes >> 8);
 108     buffer[offset + 1] = (unsigned char)assembledTwoBytes;
 109     offset += 2;
 110     count--;
 111   }
 112
 113   append(result, buffer, nBytes, SHIFT_JIS);
 114   delete[] buffer;
 115 }
 116
 117 void DecodedBitStreamParser::decodeByteSegment(Ref<BitSource> bits, std::string &result, int count) {
 118   int nBytes = count;
 119   unsigned char* readBytes = new unsigned char[nBytes];
 120   if (count << 3 > bits->available()) {
 121     ostringstream s;
 122     s << "Count too large: " << count;
 123     delete[] readBytes;
 124     throw ReaderException(s.str().c_str());
 125   }
 126   for (int i = 0; i < count; i++) {
 127     readBytes[i] = (unsigned char)bits->readBits(8);
 128   }
 129   // The spec isn't clear on this mode; see
 130   // section 6.4.5: t does not say which encoding to assuming
 131   // upon decoding. I have seen ISO-8859-1 used as well as
 132   // Shift_JIS -- without anything like an ECI designator to
 133   // give a hint.
 134   const char *encoding = guessEncoding(readBytes, nBytes);
 135   append(result, readBytes, nBytes, encoding);
 136   delete[] readBytes;
 137 }
 138
 139 void DecodedBitStreamParser::decodeNumericSegment(Ref<BitSource> bits, std::string &result, int count) {
 140   int nBytes = count;
 141   unsigned char* bytes = new unsigned char[nBytes];
 142   int i = 0;
 143   // Read three digits at a time
 144   while (count >= 3) {
 145     // Each 10 bits encodes three digits
 146     int threeDigitsBits = bits->readBits(10);
 147     if (threeDigitsBits >= 1000) {
 148       ostringstream s;
 149       s << "Illegal value for 3-digit unit: " << threeDigitsBits;
 150       delete[] bytes;
 151       throw ReaderException(s.str().c_str());
 152     }
 153     bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits / 100];
 154     bytes[i++] = ALPHANUMERIC_CHARS[(threeDigitsBits / 10) % 10];
 155     bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits % 10];
 156     count -= 3;
 157   }
 158   if (count == 2) {
 159     // Two digits left over to read, encoded in 7 bits
 160     int twoDigitsBits = bits->readBits(7);
 161     if (twoDigitsBits >= 100) {
 162       ostringstream s;
 163       s << "Illegal value for 2-digit unit: " << twoDigitsBits;
 164       delete[] bytes;
 165       throw ReaderException(s.str().c_str());
 166     }
 167     bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits / 10];
 168     bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits % 10];
 169   } else if (count == 1) {
 170     // One digit left over to read
 171     int digitBits = bits->readBits(4);
 172     if (digitBits >= 10) {
 173       ostringstream s;
 174       s << "Illegal value for digit unit: " << digitBits;
 175       delete[] bytes;
 176       throw ReaderException(s.str().c_str());
 177     }
 178     bytes[i++] = ALPHANUMERIC_CHARS[digitBits];
 179   }
 180   append(result, bytes, nBytes, ASCII);
 181   delete[] bytes;
 182 }
 183
 184 void DecodedBitStreamParser::decodeAlphanumericSegment(Ref<BitSource> bits, std::string &result, int count) {
 185   int nBytes = count;
 186   unsigned char* bytes = new unsigned char[nBytes];
 187   int i = 0;
 188   // Read two characters at a time
 189   while (count > 1) {
 190     int nextTwoCharsBits = bits->readBits(11);
 191     bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits / 45];
 192     bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits % 45];
 193     count -= 2;
 194   }
 195   if (count == 1) {
 196     bytes[i++] = ALPHANUMERIC_CHARS[bits->readBits(6)];
 197   }
 198   append(result, bytes, nBytes, ASCII);
 199   delete[] bytes;
 200 }
 201
 202 const char *
 203 DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) {
 204   const bool ASSUME_SHIFT_JIS = false;
 205   char const* const PLATFORM_DEFAULT_ENCODING="UTF-8";
 206
 207   // Does it start with the UTF-8 byte order mark? then guess it's UTF-8
 208   if (length > 3 && bytes[0] == (unsigned char)0xEF && bytes[1] == (unsigned char)0xBB && bytes[2]
 209       == (unsigned char)0xBF) {
 210     return UTF8;
 211   }
 212   // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
 213   // which should be by far the most common encodings. ISO-8859-1
 214   // should not have bytes in the 0x80 - 0x9F range, while Shift_JIS
 215   // uses this as a first byte of a two-byte character. If we see this
 216   // followed by a valid second byte in Shift_JIS, assume it is Shift_JIS.
 217   // If we see something else in that second byte, we'll make the risky guess
 218   // that it's UTF-8.
 219   bool canBeISO88591 = true;
 220   bool canBeShiftJIS = true;
 221   bool canBeUTF8 = true;
 222   int utf8BytesLeft = 0;
 223   int maybeDoubleByteCount = 0;
 224   int maybeSingleByteKatakanaCount = 0;
 225   bool sawLatin1Supplement = false;
 226   bool sawUTF8Start = false;
 227   bool lastWasPossibleDoubleByteStart = false;
 228   for (int i = 0;
 229        i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
 230        i++) {
 231     int value = bytes[i] & 0xFF;
 232
 233     // UTF-8 stuff
 234     if (value >= 0x80 && value <= 0xBF) {
 235       if (utf8BytesLeft > 0) {
 236         utf8BytesLeft--;
 237       }
 238     } else {
 239       if (utf8BytesLeft > 0) {
 240         canBeUTF8 = false;
 241       }
 242       if (value >= 0xC0 && value <= 0xFD) {
 243         sawUTF8Start = true;
 244         int valueCopy = value;
 245         while ((valueCopy & 0x40) != 0) {
 246           utf8BytesLeft++;
 247           valueCopy <<= 1;
 248         }
 249       }
 250     }
 251
 252     // Shift_JIS stuff
 253
 254     if (value >= 0xA1 && value <= 0xDF) {
 255       // count the number of characters that might be a Shift_JIS single-byte Katakana character
 256       if (!lastWasPossibleDoubleByteStart) {
 257         maybeSingleByteKatakanaCount++;
 258       }
 259     }
 260     if (!lastWasPossibleDoubleByteStart &&
 261         ((value >= 0xF0 && value <= 0xFF) || value == 0x80 || value == 0xA0)) {
 262       canBeShiftJIS = false;
 263     }
 264     if (((value >= 0x81 && value <= 0x9F) || (value >= 0xE0 && value <= 0xEF))) {
 265       // These start double-byte characters in Shift_JIS. Let's see if it's followed by a valid
 266       // second byte.
 267       if (lastWasPossibleDoubleByteStart) {
 268         // If we just checked this and the last byte for being a valid double-byte
 269         // char, don't check starting on this byte. If this and the last byte
 270         // formed a valid pair, then this shouldn't be checked to see if it starts
 271         // a double byte pair of course.
 272         lastWasPossibleDoubleByteStart = false;
 273       } else {
 274         // ... otherwise do check to see if this plus the next byte form a valid
 275         // double byte pair encoding a character.
 276         lastWasPossibleDoubleByteStart = true;
 277         if (i >= length - 1) {
 278           canBeShiftJIS = false;
 279         } else {
 280           int nextValue = bytes[i + 1] & 0xFF;
 281           if (nextValue < 0x40 || nextValue > 0xFC) {
 282             canBeShiftJIS = false;
 283           } else {
 284             maybeDoubleByteCount++;
 285           }
 286           // There is some conflicting information out there about which bytes can follow which in
 287           // double-byte Shift_JIS characters. The rule above seems to be the one that matches practice.
 288         }
 289       }
 290     } else {
 291       lastWasPossibleDoubleByteStart = false;
 292     }
 293   }
 294   if (utf8BytesLeft > 0) {
 295     canBeUTF8 = false;
 296   }
 297
 298   // Easy -- if assuming Shift_JIS and no evidence it can't be, done
 299   if (canBeShiftJIS && ASSUME_SHIFT_JIS) {
 300     return SHIFT_JIS;
 301   }
 302   if (canBeUTF8 && sawUTF8Start) {
 303     return UTF8;
 304   }
 305   // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. The crude heuristic is:
 306   // - If we saw
 307   //   - at least 3 bytes that starts a double-byte value (bytes that are rare in ISO-8859-1), or
 308   //   - over 5% of bytes could be single-byte Katakana (also rare in ISO-8859-1),
 309   // - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS
 310   if (canBeShiftJIS && (maybeDoubleByteCount >= 3 || 20 * maybeSingleByteKatakanaCount > length)) {
 311     return SHIFT_JIS;
 312   }
 313   // Otherwise, we default to ISO-8859-1 unless we know it can't be
 314   if (!sawLatin1Supplement && canBeISO88591) {
 315     return ISO88591;
 316   }
 317   // Otherwise, we take a wild guess with platform encoding
 318   return PLATFORM_DEFAULT_ENCODING;
 319 }
 320
 321 string DecodedBitStreamParser::decode(ArrayRef<unsigned char> bytes, Version *version) {
 322   string result;
 323   Ref<BitSource> bits(new BitSource(bytes));
 324   Mode *mode = &Mode::TERMINATOR;
 325   do {
 326     // While still another segment to read...
 327     if (bits->available() < 4) {
 328       // OK, assume we're done. Really, a TERMINATOR mode should have been recorded here
 329       mode = &Mode::TERMINATOR;
 330     } else {
 331       mode = &Mode::forBits(bits->readBits(4)); // mode is encoded by 4 bits
 332     }
 333     if (mode != &Mode::TERMINATOR) {
 334       // How many characters will follow, encoded in this mode?
 335       int count = bits->readBits(mode->getCharacterCountBits(version));
 336       if (mode == &Mode::NUMERIC) {
 337         decodeNumericSegment(bits, result, count);
 338       } else if (mode == &Mode::ALPHANUMERIC) {
 339         decodeAlphanumericSegment(bits, result, count);
 340       } else if (mode == &Mode::BYTE) {
 341         decodeByteSegment(bits, result, count);
 342       } else if (mode == &Mode::KANJI) {
 343         decodeKanjiSegment(bits, result, count);
 344       } else {
 345         throw ReaderException("Unsupported mode indicator");
 346       }
 347     }
 348   } while (mode != &Mode::TERMINATOR);
 349   return result;
 350 }
 351
 352 }
 353 }