cpp/core/src/qrcode/decoder/DecodedBitStreamParser.cpp

   1 /*
   2  *  DecodedBitStreamParser.cpp
   3  *  zxing
   4  *
   5  *  Created by Christian Brunschen on 20/05/2008.
   6  *  Copyright 2008 ZXing authors All rights reserved.
   7  *
   8  * Licensed under the Apache License, Version 2.0 (the "License");
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20
#include "DecodedBitStreamParser.h"
#include <iostream>
#include <vector>
  23
  24 namespace qrcode {
  25   namespace decoder {
  26
  27     using namespace common;
  28     using namespace std;
  29
  30     char DecodedBitStreamParser::ALPHANUMERIC_CHARS[] = {
  31       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B',
  32       'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
  33       'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  34       ' ', '$', '%', '*', '+', '-', '.', '/', ':'
  35     };
  36
  37     char *DecodedBitStreamParser::ASCII = "ASCII";
  38     char *DecodedBitStreamParser::ISO88591 = "ISO-8859-1";
  39     char *DecodedBitStreamParser::UTF8 = "UTF-8";
  40     char *DecodedBitStreamParser::SHIFT_JIS = "SHIFT_JIS";
  41     char *DecodedBitStreamParser::EUC_JP = "EUC-JP";
  42
  43     void DecodedBitStreamParser::append(ostream &ost,
  44                                         unsigned char *bufIn,
  45                                         size_t nIn, const char *src) {
  46       if (nIn == 0) {
  47         return;
  48       }
  49
  50       iconv_t cd = iconv_open(UTF8, src);
  51       int maxOut = 4 * nIn + 1;
  52       unsigned char bufOut[maxOut];
  53
  54       char *fromPtr = (char *)bufIn;
  55       size_t nFrom = nIn;
  56       char *toPtr = (char *)bufOut;
  57       size_t nTo = maxOut;
  58
  59       while (nFrom > 0) {
  60         size_t oneway = iconv(cd, &fromPtr, &nFrom, &toPtr, &nTo);
  61         if (oneway == (size_t)(-1)) {
  62           throw new ReaderException("error converting characters");
  63         }
  64       }
  65       iconv_close(cd);
  66
  67       int nResult = maxOut - nTo;
  68       bufOut[nResult] = '\0';
  69
  70       ost << bufOut;
  71     }
  72
  73     void DecodedBitStreamParser::decodeKanjiSegment(Ref<BitSource> bits,
  74                                                     ostringstream &result,
  75                                                     int count) {
  76       // Each character will require 2 bytes. Read the characters as 2-byte pairs
  77       // and decode as Shift_JIS afterwards
  78       size_t nBytes = 2 * count;
  79       unsigned char buffer[nBytes];
  80       int offset = 0;
  81       while (count > 0) {
  82         // Each 13 bits encodes a 2-byte character
  83         int twoBytes = bits->readBits(13);
  84         int assembledTwoBytes = ((twoBytes / 0x0C0) << 8) | (twoBytes % 0x0C0);
  85         if (assembledTwoBytes < 0x01F00) {
  86           // In the 0x8140 to 0x9FFC range
  87           assembledTwoBytes += 0x08140;
  88         } else {
  89           // In the 0xE040 to 0xEBBF range
  90           assembledTwoBytes += 0x0C140;
  91         }
  92         buffer[offset] = (unsigned char) (assembledTwoBytes >> 8);
  93         buffer[offset + 1] = (unsigned char) assembledTwoBytes;
  94         offset += 2;
  95         count--;
  96       }
  97
  98       append(result, buffer, nBytes, SHIFT_JIS);
  99     }
 100
 101     void DecodedBitStreamParser::decodeByteSegment(Ref<BitSource> bits,
 102                                                    ostringstream &result,
 103                                                    int count) {
 104       int nBytes = count;
 105       unsigned char readBytes[nBytes];
 106       if (count << 3 > bits->available()) {
 107         ostringstream s;
 108         s << "Count too large: " << count;
 109         throw new ReaderException(s.str().c_str());
 110       }
 111       for (int i = 0; i < count; i++) {
 112         readBytes[i] = (unsigned char) bits->readBits(8);
 113       }
 114       // The spec isn't clear on this mode; see
 115       // section 6.4.5: t does not say which encoding to assuming
 116       // upon decoding. I have seen ISO-8859-1 used as well as
 117       // Shift_JIS -- without anything like an ECI designator to
 118       // give a hint.
 119       char *encoding = guessEncoding(readBytes, nBytes);
 120       append(result, readBytes, nBytes, encoding);
 121     }
 122
 123     void DecodedBitStreamParser::decodeNumericSegment(Ref<BitSource> bits,
 124                                                       ostringstream &result,
 125                                                       int count) {
 126       int nBytes = count;
 127       unsigned char bytes[nBytes];
 128       int i = 0;
 129       // Read three digits at a time
 130       while (count >= 3) {
 131         // Each 10 bits encodes three digits
 132         int threeDigitsBits = bits->readBits(10);
 133         if (threeDigitsBits >= 1000) {
 134           ostringstream s;
 135           s << "Illegal value for 3-digit unit: " << threeDigitsBits;
 136           throw new ReaderException(s.str().c_str());
 137         }
 138         bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits / 100];
 139         bytes[i++] = ALPHANUMERIC_CHARS[(threeDigitsBits / 10) % 10];
 140         bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits % 10];
 141         count -= 3;
 142       }
 143       if (count == 2) {
 144         // Two digits left over to read, encoded in 7 bits
 145         int twoDigitsBits = bits->readBits(7);
 146         if (twoDigitsBits >= 100) {
 147           ostringstream s;
 148           s << "Illegal value for 2-digit unit: " << twoDigitsBits;
 149           throw new ReaderException(s.str().c_str());
 150         }
 151         bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits / 10];
 152         bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits % 10];
 153       } else if (count == 1) {
 154         // One digit left over to read
 155         int digitBits = bits->readBits(4);
 156         if (digitBits >= 10) {
 157           ostringstream s;
 158           s << "Illegal value for digit unit: " << digitBits;
 159           throw new ReaderException(s.str().c_str());
 160         }
 161         bytes[i++] = ALPHANUMERIC_CHARS[digitBits];
 162       }
 163       append(result, bytes, nBytes, ASCII);
 164     }
 165
 166     void DecodedBitStreamParser::decodeAlphanumericSegment(Ref<BitSource> bits,
 167                                                            ostringstream &result,
 168                                                            int count) {
 169       int nBytes = count;
 170       unsigned char bytes[nBytes];
 171       int i = 0;
 172       // Read two characters at a time
 173       while (count > 1) {
 174         int nextTwoCharsBits = bits->readBits(11);
 175         bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits / 45];
 176         bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits % 45];
 177         count -= 2;
 178       }
 179       if (count == 1) {
 180         bytes[i++] = ALPHANUMERIC_CHARS[bits->readBits(6)];
 181       }
 182       append(result, bytes, nBytes, ASCII);
 183     }
 184
 185     char *
 186     DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) {
 187       // Does it start with the UTF-8 byte order mark? then guess it's UTF-8
 188       if (length > 3 && bytes[0] == (unsigned char) 0xEF &&
 189           bytes[1] == (unsigned char) 0xBB &&
 190           bytes[2] == (unsigned char) 0xBF) {
 191         return UTF8;
 192       }
 193       // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
 194       // which should be by far the most common encodings. ISO-8859-1
 195       // should not have bytes in the 0x80 - 0x9F range, while Shift_JIS
 196       // uses this as a first byte of a two-byte character. If we see this
 197       // followed by a valid second byte in Shift_JIS, assume it is Shift_JIS.
 198       // If we see something else in that second byte, we'll make the risky guess
 199       // that it's UTF-8.
 200       bool canBeISO88591 = true;
 201       bool lastWasPossibleDoubleByteStart = false;
 202       for (int i = 0; i < length; i++) {
 203         int value = bytes[i] & 0xFF;
 204         if (value >= 0x80 && value <= 0x9F && i < length - 1) {
 205           canBeISO88591 = false;
 206           // ISO-8859-1 shouldn't use this, but before we decide it is Shift_JIS,
 207           // just double check that it is followed by a byte that's valid in
 208           // the Shift_JIS encoding
 209           if (lastWasPossibleDoubleByteStart) {
 210             // If we just checked this and the last byte for being a valid double-byte
 211             // char, don't check starting on this byte. If the this and the last byte
 212             // formed a valid pair, then this shouldn't be checked to see if it starts
 213             // a double byte pair of course.
 214             lastWasPossibleDoubleByteStart = false;
 215           } else {
 216             // ... otherwise do check to see if this plus the next byte form a valid
 217             // double byte pair encoding a character.
 218             lastWasPossibleDoubleByteStart = true;
 219             int nextValue = bytes[i + 1] & 0xFF;
 220             if ((value & 0x1) == 0) {
 221               // if even, next value should be in [0x9F,0xFC]
 222               // if not, we'll guess UTF-8
 223               if (nextValue < 0x9F || nextValue > 0xFC) {
 224                 return UTF8;
 225               }
 226             } else {
 227               // if odd, next value should be in [0x40,0x9E]
 228               // if not, we'll guess UTF-8
 229               if (nextValue < 0x40 || nextValue > 0x9E) {
 230                 return UTF8;
 231               }
 232             }
 233           }
 234         }
 235       }
 236       return canBeISO88591 ? ISO88591 : SHIFT_JIS;
 237     }
 238
 239
 240     string DecodedBitStreamParser::decode(ArrayRef<unsigned char> bytes,
 241                                           Version *version) {
 242       ostringstream result;
 243       Ref<BitSource> bits(new BitSource(bytes));
 244       Mode *mode = &Mode::TERMINATOR;
 245       do {
 246         // While still another segment to read...
 247         if (bits->available() == 0) {
 248           // OK, assume we're done. Really, a TERMINATOR mode should have been recorded here
 249           mode = &Mode::TERMINATOR;
 250         } else {
 251           mode = &Mode::forBits(bits->readBits(4)); // mode is encoded by 4 bits
 252         }
 253         if (mode != &Mode::TERMINATOR) {
 254           // How many characters will follow, encoded in this mode?
 255           int count = bits->readBits(mode->getCharacterCountBits(version));
 256           if (mode == &Mode::NUMERIC) {
 257             decodeNumericSegment(bits, result, count);
 258           } else if (mode == &Mode::ALPHANUMERIC) {
 259             decodeAlphanumericSegment(bits, result, count);
 260           } else if (mode == &Mode::BYTE) {
 261             decodeByteSegment(bits, result, count);
 262           } else if (mode == &Mode::KANJI) {
 263             decodeKanjiSegment(bits, result, count);
 264           } else {
 265             throw new ReaderException("Unsupported mode indicator");
 266           }
 267         }
 268       } while (mode != &Mode::TERMINATOR);
 269       return result.str();
 270     }
 271
 272   }
 273 }