cpp/core/src/zxing/qrcode/decoder/DecodedBitStreamParser.cpp

   1 /*
   2  *  DecodedBitStreamParser.cpp
   3  *  zxing
   4  *
   5  *  Created by Christian Brunschen on 20/05/2008.
   6  *  Copyright 2008 ZXing authors All rights reserved.
   7  *
   8  * Licensed under the Apache License, Version 2.0 (the "License");
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20
#include <zxing/qrcode/decoder/DecodedBitStreamParser.h>

#include <iconv.h>

#include <iostream>
#include <vector>
  24
  25 // Required for compatibility. TODO: test on Symbian
  26 #ifdef ZXING_ICONV_CONST
  27 #undef ICONV_CONST
  28 #define ICONV_CONST const
  29 #endif
  30
  31 #ifndef ICONV_CONST
  32 #define ICONV_CONST /**/
  33 #endif
  34
  35 using namespace zxing;
  36
  37 namespace zxing {
  38 namespace qrcode {
  39
  40 using namespace std;
  41
  42 const char DecodedBitStreamParser::ALPHANUMERIC_CHARS[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B',
  43     'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
  44     'Y', 'Z', ' ', '$', '%', '*', '+', '-', '.', '/', ':'
  45                                                           };
  46
  47 const char *DecodedBitStreamParser::ASCII = "ASCII";
  48 const char *DecodedBitStreamParser::ISO88591 = "ISO-8859-1";
  49 const char *DecodedBitStreamParser::UTF8 = "UTF-8";
  50 const char *DecodedBitStreamParser::SHIFT_JIS = "SHIFT_JIS";
  51 const char *DecodedBitStreamParser::EUC_JP = "EUC-JP";
  52
  53 void DecodedBitStreamParser::append(ostream &ost, const unsigned char *bufIn, size_t nIn, const char *src) {
  54   if (nIn == 0) {
  55     return;
  56   }
  57
  58   iconv_t cd = iconv_open(UTF8, src);
  59   const int maxOut = 4 * nIn + 1;
  60   unsigned char* bufOut = new unsigned char[maxOut];
  61
  62   ICONV_CONST char *fromPtr = (ICONV_CONST char *)bufIn;
  63   size_t nFrom = nIn;
  64   char *toPtr = (char *)bufOut;
  65   size_t nTo = maxOut;
  66
  67   while (nFrom > 0) {
  68     size_t oneway = iconv(cd, &fromPtr, &nFrom, &toPtr, &nTo);
  69     if (oneway == (size_t)(-1)) {
  70       iconv_close(cd);
  71       delete[] bufOut;
  72       throw ReaderException("error converting characters");
  73     }
  74   }
  75   iconv_close(cd);
  76
  77   int nResult = maxOut - nTo;
  78   bufOut[nResult] = '\0';
  79
  80   ost << bufOut;
  81   delete[] bufOut;
  82 }
  83
  84 void DecodedBitStreamParser::decodeKanjiSegment(Ref<BitSource> bits, ostringstream &result, int count) {
  85   // Each character will require 2 bytes. Read the characters as 2-byte pairs
  86   // and decode as Shift_JIS afterwards
  87   size_t nBytes = 2 * count;
  88   unsigned char* buffer = new unsigned char[nBytes];
  89   int offset = 0;
  90   while (count > 0) {
  91     // Each 13 bits encodes a 2-byte character
  92
  93     int twoBytes = bits->readBits(13);
  94     int assembledTwoBytes = ((twoBytes / 0x0C0) << 8) | (twoBytes % 0x0C0);
  95     if (assembledTwoBytes < 0x01F00) {
  96       // In the 0x8140 to 0x9FFC range
  97       assembledTwoBytes += 0x08140;
  98     } else {
  99       // In the 0xE040 to 0xEBBF range
 100       assembledTwoBytes += 0x0C140;
 101     }
 102     buffer[offset] = (unsigned char)(assembledTwoBytes >> 8);
 103     buffer[offset + 1] = (unsigned char)assembledTwoBytes;
 104     offset += 2;
 105     count--;
 106   }
 107
 108   append(result, buffer, nBytes, SHIFT_JIS);
 109   delete[] buffer;
 110 }
 111
 112 void DecodedBitStreamParser::decodeByteSegment(Ref<BitSource> bits, ostringstream &result, int count) {
 113   int nBytes = count;
 114   unsigned char* readBytes = new unsigned char[nBytes];
 115   if (count << 3 > bits->available()) {
 116     ostringstream s;
 117     s << "Count too large: " << count;
 118     delete[] readBytes;
 119     throw ReaderException(s.str().c_str());
 120   }
 121   for (int i = 0; i < count; i++) {
 122     readBytes[i] = (unsigned char)bits->readBits(8);
 123   }
 124   // The spec isn't clear on this mode; see
 125   // section 6.4.5: t does not say which encoding to assuming
 126   // upon decoding. I have seen ISO-8859-1 used as well as
 127   // Shift_JIS -- without anything like an ECI designator to
 128   // give a hint.
 129   const char *encoding = guessEncoding(readBytes, nBytes);
 130   append(result, readBytes, nBytes, encoding);
 131   delete[] readBytes;
 132 }
 133
 134 void DecodedBitStreamParser::decodeNumericSegment(Ref<BitSource> bits, ostringstream &result, int count) {
 135   int nBytes = count;
 136   unsigned char* bytes = new unsigned char[nBytes];
 137   int i = 0;
 138   // Read three digits at a time
 139   while (count >= 3) {
 140     // Each 10 bits encodes three digits
 141     int threeDigitsBits = bits->readBits(10);
 142     if (threeDigitsBits >= 1000) {
 143       ostringstream s;
 144       s << "Illegal value for 3-digit unit: " << threeDigitsBits;
 145       delete[] bytes;
 146       throw ReaderException(s.str().c_str());
 147     }
 148     bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits / 100];
 149     bytes[i++] = ALPHANUMERIC_CHARS[(threeDigitsBits / 10) % 10];
 150     bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits % 10];
 151     count -= 3;
 152   }
 153   if (count == 2) {
 154     // Two digits left over to read, encoded in 7 bits
 155     int twoDigitsBits = bits->readBits(7);
 156     if (twoDigitsBits >= 100) {
 157       ostringstream s;
 158       s << "Illegal value for 2-digit unit: " << twoDigitsBits;
 159       delete[] bytes;
 160       throw ReaderException(s.str().c_str());
 161     }
 162     bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits / 10];
 163     bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits % 10];
 164   } else if (count == 1) {
 165     // One digit left over to read
 166     int digitBits = bits->readBits(4);
 167     if (digitBits >= 10) {
 168       ostringstream s;
 169       s << "Illegal value for digit unit: " << digitBits;
 170       delete[] bytes;
 171       throw ReaderException(s.str().c_str());
 172     }
 173     bytes[i++] = ALPHANUMERIC_CHARS[digitBits];
 174   }
 175   append(result, bytes, nBytes, ASCII);
 176   delete[] bytes;
 177 }
 178
 179 void DecodedBitStreamParser::decodeAlphanumericSegment(Ref<BitSource> bits, ostringstream &result, int count) {
 180   int nBytes = count;
 181   unsigned char* bytes = new unsigned char[nBytes];
 182   int i = 0;
 183   // Read two characters at a time
 184   while (count > 1) {
 185     int nextTwoCharsBits = bits->readBits(11);
 186     bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits / 45];
 187     bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits % 45];
 188     count -= 2;
 189   }
 190   if (count == 1) {
 191     bytes[i++] = ALPHANUMERIC_CHARS[bits->readBits(6)];
 192   }
 193   append(result, bytes, nBytes, ASCII);
 194   delete[] bytes;
 195 }
 196
 197 const char *
 198 DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) {
 199   // Does it start with the UTF-8 byte order mark? then guess it's UTF-8
 200   if (length > 3 && bytes[0] == (unsigned char)0xEF && bytes[1] == (unsigned char)0xBB && bytes[2]
 201       == (unsigned char)0xBF) {
 202     return UTF8;
 203   }
 204   // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
 205   // which should be by far the most common encodings. ISO-8859-1
 206   // should not have bytes in the 0x80 - 0x9F range, while Shift_JIS
 207   // uses this as a first byte of a two-byte character. If we see this
 208   // followed by a valid second byte in Shift_JIS, assume it is Shift_JIS.
 209   // If we see something else in that second byte, we'll make the risky guess
 210   // that it's UTF-8.
 211   bool canBeISO88591 = true;
 212   bool lastWasPossibleDoubleByteStart = false;
 213   for (int i = 0; i < length; i++) {
 214     int value = bytes[i] & 0xFF;
 215     if (value >= 0x80 && value <= 0x9F && i < length - 1) {
 216       canBeISO88591 = false;
 217       // ISO-8859-1 shouldn't use this, but before we decide it is Shift_JIS,
 218       // just double check that it is followed by a byte that's valid in
 219       // the Shift_JIS encoding
 220       if (lastWasPossibleDoubleByteStart) {
 221         // If we just checked this and the last byte for being a valid double-byte
 222         // char, don't check starting on this byte. If the this and the last byte
 223         // formed a valid pair, then this shouldn't be checked to see if it starts
 224         // a double byte pair of course.
 225         lastWasPossibleDoubleByteStart = false;
 226       } else {
 227         // ... otherwise do check to see if this plus the next byte form a valid
 228         // double byte pair encoding a character.
 229         lastWasPossibleDoubleByteStart = true;
 230         int nextValue = bytes[i + 1] & 0xFF;
 231         if ((value & 0x1) == 0) {
 232           // if even, next value should be in [0x9F,0xFC]
 233           // if not, we'll guess UTF-8
 234           if (nextValue < 0x9F || nextValue > 0xFC) {
 235             return UTF8;
 236           }
 237         } else {
 238           // if odd, next value should be in [0x40,0x9E]
 239           // if not, we'll guess UTF-8
 240           if (nextValue < 0x40 || nextValue > 0x9E) {
 241             return UTF8;
 242           }
 243         }
 244       }
 245     }
 246   }
 247   return canBeISO88591 ? ISO88591 : SHIFT_JIS;
 248 }
 249
 250 string DecodedBitStreamParser::decode(ArrayRef<unsigned char> bytes, Version *version) {
 251   ostringstream result;
 252   Ref<BitSource> bits(new BitSource(bytes));
 253   Mode *mode = &Mode::TERMINATOR;
 254   do {
 255     // While still another segment to read...
 256     if (bits->available() < 4) {
 257       // OK, assume we're done. Really, a TERMINATOR mode should have been recorded here
 258       mode = &Mode::TERMINATOR;
 259     } else {
 260       mode = &Mode::forBits(bits->readBits(4)); // mode is encoded by 4 bits
 261     }
 262     if (mode != &Mode::TERMINATOR) {
 263       // How many characters will follow, encoded in this mode?
 264       int count = bits->readBits(mode->getCharacterCountBits(version));
 265       if (mode == &Mode::NUMERIC) {
 266         decodeNumericSegment(bits, result, count);
 267       } else if (mode == &Mode::ALPHANUMERIC) {
 268         decodeAlphanumericSegment(bits, result, count);
 269       } else if (mode == &Mode::BYTE) {
 270         decodeByteSegment(bits, result, count);
 271       } else if (mode == &Mode::KANJI) {
 272         decodeKanjiSegment(bits, result, count);
 273       } else {
 274         throw ReaderException("Unsupported mode indicator");
 275       }
 276     }
 277   } while (mode != &Mode::TERMINATOR);
 278   return result.str();
 279 }
 280
 281 }
 282 }