csharp/qrcode/decoder/DecodedBitStreamParser.cs

   1 /*\r
   2 * Copyright 2007 ZXing authors\r
   3 *\r
   4 * Licensed under the Apache License, Version 2.0 (the "License");\r
   5 * you may not use this file except in compliance with the License.\r
   6 * You may obtain a copy of the License at\r
   7 *\r
   8 *      http://www.apache.org/licenses/LICENSE-2.0\r
   9 *\r
  10 * Unless required by applicable law or agreed to in writing, software\r
  11 * distributed under the License is distributed on an "AS IS" BASIS,\r
  12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r
  13 * See the License for the specific language governing permissions and\r
  14 * limitations under the License.\r
  15 */\r
  16 using System;\r
  17 using ReaderException = com.google.zxing.ReaderException;\r
  18 using BitSource = com.google.zxing.common.BitSource;\r
  19 using CharacterSetECI = com.google.zxing.common.CharacterSetECI;\r
  20 using DecoderResult = com.google.zxing.common.DecoderResult;\r
  21 namespace com.google.zxing.qrcode.decoder\r
  22 {\r
  23         \r
  24         /// <summary> <p>QR Codes can encode text as bits in one of several modes, and can use multiple modes\r
  25         /// in one QR Code. This class decodes the bits back into text.</p>\r
  26         /// \r
  27         /// <p>See ISO 18004:2006, 6.4.3 - 6.4.7</p>\r
  28         /// \r
  29         /// </summary>\r
  30         /// <author>  Sean Owen\r
  31         /// </author>\r
  32         /// <author>www.Redivivus.in (suraj.supekar@redivivus.in) - Ported from ZXING Java Source \r
  33         /// </author>\r
  34         sealed class DecodedBitStreamParser\r
  35         {\r
  36                 \r
  37                 /// <summary> See ISO 18004:2006, 6.4.4 Table 5</summary>\r
  38                 //UPGRADE_NOTE: Final was removed from the declaration of 'ALPHANUMERIC_CHARS'. "ms-help://MS.VSCC.v80/dv_commoner/local/redirect.htm?index='!DefaultContextWindowIndex'&keyword='jlca1003'"\r
  39                 private static readonly char[] ALPHANUMERIC_CHARS = new char[]{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', ' ', '$', '%', '*', '+', '-', '.', '/', ':'};\r
  40                 private const System.String SHIFT_JIS = "SJIS";\r
  41                 private const System.String EUC_JP = "EUC_JP";\r
  42                 private static bool ASSUME_SHIFT_JIS;\r
  43                 private const System.String UTF8 = "UTF8";\r
  44         // Redivivus.in Java to c# Porting update\r
  45         // 30/01/2010 \r
  46         // Commented & Added        \r
  47         private const System.String ISO88591 = "ISO-8859-1";\r
  48                 \r
  49                 private DecodedBitStreamParser()\r
  50                 {\r
  51                 }\r
  52                 \r
  53                 internal static DecoderResult decode(sbyte[] bytes, Version version, ErrorCorrectionLevel ecLevel)\r
  54                 {\r
  55                         BitSource bits = new BitSource(bytes);\r
  56                         System.Text.StringBuilder result = new System.Text.StringBuilder(50);\r
  57                         CharacterSetECI currentCharacterSetECI = null;\r
  58                         bool fc1InEffect = false;\r
  59                         System.Collections.ArrayList byteSegments = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(1));\r
  60                         Mode mode;\r
  61                         do \r
  62                         {\r
  63                                 // While still another segment to read...\r
  64                                 if (bits.available() < 4)\r
  65                                 {\r
  66                                         // OK, assume we're done. Really, a TERMINATOR mode should have been recorded here\r
  67                                         mode = Mode.TERMINATOR;\r
  68                                 }\r
  69                                 else\r
  70                                 {\r
  71                                         try\r
  72                                         {\r
  73                                                 mode = Mode.forBits(bits.readBits(4)); // mode is encoded by 4 bits\r
  74                                         }\r
  75                                         catch (System.ArgumentException iae)\r
  76                                         {\r
  77                                                 throw ReaderException.Instance;\r
  78                                         }\r
  79                                 }\r
  80                                 if (!mode.Equals(Mode.TERMINATOR))\r
  81                                 {\r
  82                                         if (mode.Equals(Mode.FNC1_FIRST_POSITION) || mode.Equals(Mode.FNC1_SECOND_POSITION))\r
  83                                         {\r
  84                                                 // We do little with FNC1 except alter the parsed result a bit according to the spec\r
  85                                                 fc1InEffect = true;\r
  86                                         }\r
  87                                         else if (mode.Equals(Mode.STRUCTURED_APPEND))\r
  88                                         {\r
  89                                                 // not really supported; all we do is ignore it\r
  90                                                 // Read next 8 bits (symbol sequence #) and 8 bits (parity data), then continue\r
  91                                                 bits.readBits(16);\r
  92                                         }\r
  93                                         else if (mode.Equals(Mode.ECI))\r
  94                                         {\r
  95                                                 // Count doesn't apply to ECI\r
  96                                                 int value_Renamed = parseECIValue(bits);\r
  97                                                 currentCharacterSetECI = CharacterSetECI.getCharacterSetECIByValue(value_Renamed);\r
  98                                                 if (currentCharacterSetECI == null)\r
  99                                                 {\r
 100                                                         throw ReaderException.Instance;\r
 101                                                 }\r
 102                                         }\r
 103                                         else\r
 104                                         {\r
 105                                                 // How many characters will follow, encoded in this mode?\r
 106                                                 int count = bits.readBits(mode.getCharacterCountBits(version));\r
 107                                                 if (mode.Equals(Mode.NUMERIC))\r
 108                                                 {\r
 109                                                         decodeNumericSegment(bits, result, count);\r
 110                                                 }\r
 111                                                 else if (mode.Equals(Mode.ALPHANUMERIC))\r
 112                                                 {\r
 113                                                         decodeAlphanumericSegment(bits, result, count, fc1InEffect);\r
 114                                                 }\r
 115                                                 else if (mode.Equals(Mode.BYTE))\r
 116                                                 {\r
 117                                                         decodeByteSegment(bits, result, count, currentCharacterSetECI, byteSegments);\r
 118                                                 }\r
 119                                                 else if (mode.Equals(Mode.KANJI))\r
 120                                                 {\r
 121                                                         decodeKanjiSegment(bits, result, count);\r
 122                                                 }\r
 123                                                 else\r
 124                                                 {\r
 125                                                         throw ReaderException.Instance;\r
 126                                                 }\r
 127                                         }\r
 128                                 }\r
 129                         }\r
 130                         while (!mode.Equals(Mode.TERMINATOR));\r
 131                         \r
 132                         return new DecoderResult(bytes, result.ToString(), (byteSegments.Count == 0)?null:byteSegments, ecLevel);\r
 133                 }\r
 134                 \r
 135                 private static void  decodeKanjiSegment(BitSource bits, System.Text.StringBuilder result, int count)\r
 136                 {\r
 137                         // Each character will require 2 bytes. Read the characters as 2-byte pairs\r
 138                         // and decode as Shift_JIS afterwards\r
 139                         sbyte[] buffer = new sbyte[2 * count];\r
 140                         int offset = 0;\r
 141                         while (count > 0)\r
 142                         {\r
 143                                 // Each 13 bits encodes a 2-byte character\r
 144                                 int twoBytes = bits.readBits(13);\r
 145                                 int assembledTwoBytes = ((twoBytes / 0x0C0) << 8) | (twoBytes % 0x0C0);\r
 146                                 if (assembledTwoBytes < 0x01F00)\r
 147                                 {\r
 148                                         // In the 0x8140 to 0x9FFC range\r
 149                                         assembledTwoBytes += 0x08140;\r
 150                                 }\r
 151                                 else\r
 152                                 {\r
 153                                         // In the 0xE040 to 0xEBBF range\r
 154                                         assembledTwoBytes += 0x0C140;\r
 155                                 }\r
 156                                 buffer[offset] = (sbyte) (assembledTwoBytes >> 8);\r
 157                                 buffer[offset + 1] = (sbyte) assembledTwoBytes;\r
 158                                 offset += 2;\r
 159                                 count--;\r
 160                         }\r
 161                         // Shift_JIS may not be supported in some environments:\r
 162                         try\r
 163                         {\r
 164                                 //UPGRADE_TODO: The differences in the Format  of parameters for constructor 'java.lang.String.String'  may cause compilation errors.  "ms-help://MS.VSCC.v80/dv_commoner/local/redirect.htm?index='!DefaultContextWindowIndex'&keyword='jlca1092'"\r
 165                                 result.Append(System.Text.Encoding.GetEncoding(SHIFT_JIS).GetString(SupportClass.ToByteArray(buffer)));\r
 166                         }\r
 167                         catch (System.IO.IOException uee)\r
 168                         {\r
 169                                 throw ReaderException.Instance;\r
 170                         }\r
 171                 }\r
 172                 \r
 173                 private static void  decodeByteSegment(BitSource bits, System.Text.StringBuilder result, int count, CharacterSetECI currentCharacterSetECI, System.Collections.ArrayList byteSegments)\r
 174                 {\r
 175                         sbyte[] readBytes = new sbyte[count];\r
 176                         if (count << 3 > bits.available())\r
 177                         {\r
 178                                 throw ReaderException.Instance;\r
 179                         }\r
 180                         for (int i = 0; i < count; i++)\r
 181                         {\r
 182                                 readBytes[i] = (sbyte) bits.readBits(8);\r
 183                         }\r
 184                         System.String encoding;\r
 185                         if (currentCharacterSetECI == null)\r
 186                         {\r
 187                                 // The spec isn't clear on this mode; see\r
 188                                 // section 6.4.5: t does not say which encoding to assuming\r
 189                                 // upon decoding. I have seen ISO-8859-1 used as well as\r
 190                                 // Shift_JIS -- without anything like an ECI designator to\r
 191                                 // give a hint.\r
 192                                 encoding = guessEncoding(readBytes);\r
 193                         }\r
 194                         else\r
 195                         {\r
 196                                 encoding = currentCharacterSetECI.EncodingName;\r
 197                         }\r
 198                         try\r
 199                         {\r
 200                                 //UPGRADE_TODO: The differences in the Format  of parameters for constructor 'java.lang.String.String'  may cause compilation errors.  "ms-help://MS.VSCC.v80/dv_commoner/local/redirect.htm?index='!DefaultContextWindowIndex'&keyword='jlca1092'"\r
 201                                 result.Append(System.Text.Encoding.GetEncoding(encoding).GetString(SupportClass.ToByteArray(readBytes)));\r
 202                         }\r
 203                         catch (System.IO.IOException uce)\r
 204                         {\r
 205                                 throw ReaderException.Instance;\r
 206                         }\r
 207                         byteSegments.Add(SupportClass.ToByteArray(readBytes));\r
 208                 }\r
 209                 \r
 210                 private static void  decodeAlphanumericSegment(BitSource bits, System.Text.StringBuilder result, int count, bool fc1InEffect)\r
 211                 {\r
 212                         // Read two characters at a time\r
 213                         int start = result.Length;\r
 214                         while (count > 1)\r
 215                         {\r
 216                                 int nextTwoCharsBits = bits.readBits(11);\r
 217                                 result.Append(ALPHANUMERIC_CHARS[nextTwoCharsBits / 45]);\r
 218                                 result.Append(ALPHANUMERIC_CHARS[nextTwoCharsBits % 45]);\r
 219                                 count -= 2;\r
 220                         }\r
 221                         if (count == 1)\r
 222                         {\r
 223                                 // special case: one character left\r
 224                                 result.Append(ALPHANUMERIC_CHARS[bits.readBits(6)]);\r
 225                         }\r
 226                         // See section 6.4.8.1, 6.4.8.2\r
 227                         if (fc1InEffect)\r
 228                         {\r
 229                                 // We need to massage the result a bit if in an FNC1 mode:\r
 230                                 for (int i = start; i < result.Length; i++)\r
 231                                 {\r
 232                                         if (result[i] == '%')\r
 233                                         {\r
 234                                                 if (i < result.Length - 1 && result[i + 1] == '%')\r
 235                                                 {\r
 236                                                         // %% is rendered as %\r
 237                                                         result.Remove(i + 1, 1);\r
 238                                                 }\r
 239                                                 else\r
 240                                                 {\r
 241                                                         // In alpha mode, % should be converted to FNC1 separator 0x1D\r
 242                                                         result[i] = (char) 0x1D;\r
 243                                                 }\r
 244                                         }\r
 245                                 }\r
 246                         }\r
 247                 }\r
 248                 \r
 249                 private static void  decodeNumericSegment(BitSource bits, System.Text.StringBuilder result, int count)\r
 250                 {\r
 251                         // Read three digits at a time\r
 252                         while (count >= 3)\r
 253                         {\r
 254                                 // Each 10 bits encodes three digits\r
 255                                 int threeDigitsBits = bits.readBits(10);\r
 256                                 if (threeDigitsBits >= 1000)\r
 257                                 {\r
 258                                         throw ReaderException.Instance;\r
 259                                 }\r
 260                                 result.Append(ALPHANUMERIC_CHARS[threeDigitsBits / 100]);\r
 261                                 result.Append(ALPHANUMERIC_CHARS[(threeDigitsBits / 10) % 10]);\r
 262                                 result.Append(ALPHANUMERIC_CHARS[threeDigitsBits % 10]);\r
 263                                 count -= 3;\r
 264                         }\r
 265                         if (count == 2)\r
 266                         {\r
 267                                 // Two digits left over to read, encoded in 7 bits\r
 268                                 int twoDigitsBits = bits.readBits(7);\r
 269                                 if (twoDigitsBits >= 100)\r
 270                                 {\r
 271                                         throw ReaderException.Instance;\r
 272                                 }\r
 273                                 result.Append(ALPHANUMERIC_CHARS[twoDigitsBits / 10]);\r
 274                                 result.Append(ALPHANUMERIC_CHARS[twoDigitsBits % 10]);\r
 275                         }\r
 276                         else if (count == 1)\r
 277                         {\r
 278                                 // One digit left over to read\r
 279                                 int digitBits = bits.readBits(4);\r
 280                                 if (digitBits >= 10)\r
 281                                 {\r
 282                                         throw ReaderException.Instance;\r
 283                                 }\r
 284                                 result.Append(ALPHANUMERIC_CHARS[digitBits]);\r
 285                         }\r
 286                 }\r
 287                 \r
 288                 private static System.String guessEncoding(sbyte[] bytes)\r
 289                 {\r
 290                         if (ASSUME_SHIFT_JIS)\r
 291                         {\r
 292                                 return SHIFT_JIS;\r
 293                         }\r
 294                         // Does it start with the UTF-8 byte order mark? then guess it's UTF-8\r
 295                         if (bytes.Length > 3 && bytes[0] == (sbyte) SupportClass.Identity(0xEF) && bytes[1] == (sbyte) SupportClass.Identity(0xBB) && bytes[2] == (sbyte) SupportClass.Identity(0xBF))\r
 296                         {\r
 297                                 return UTF8;\r
 298                         }\r
 299                         // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,\r
 300                         // which should be by far the most common encodings. ISO-8859-1\r
 301                         // should not have bytes in the 0x80 - 0x9F range, while Shift_JIS\r
 302                         // uses this as a first byte of a two-byte character. If we see this\r
 303                         // followed by a valid second byte in Shift_JIS, assume it is Shift_JIS.\r
 304                         // If we see something else in that second byte, we'll make the risky guess\r
 305                         // that it's UTF-8.\r
 306                         int length = bytes.Length;\r
 307                         bool canBeISO88591 = true;\r
 308                         bool canBeShiftJIS = true;\r
 309                         int maybeDoubleByteCount = 0;\r
 310                         int maybeSingleByteKatakanaCount = 0;\r
 311                         bool sawLatin1Supplement = false;\r
 312                         bool lastWasPossibleDoubleByteStart = false;\r
 313                         for (int i = 0; i < length && (canBeISO88591 || canBeShiftJIS); i++)\r
 314                         {\r
 315                                 int value_Renamed = bytes[i] & 0xFF;\r
 316                                 if ((value_Renamed == 0xC2 || value_Renamed == 0xC3) && i < length - 1)\r
 317                                 {\r
 318                                         // This is really a poor hack. The slightly more exotic characters people might want to put in\r
 319                                         // a QR Code, by which I mean the Latin-1 supplement characters (e.g. u-umlaut) have encodings\r
 320                                         // that start with 0xC2 followed by [0xA0,0xBF], or start with 0xC3 followed by [0x80,0xBF].\r
 321                                         int nextValue = bytes[i + 1] & 0xFF;\r
 322                                         if (nextValue <= 0xBF && ((value_Renamed == 0xC2 && nextValue >= 0xA0) || (value_Renamed == 0xC3 && nextValue >= 0x80)))\r
 323                                         {\r
 324                                                 sawLatin1Supplement = true;\r
 325                                         }\r
 326                                 }\r
 327                                 if (value_Renamed >= 0x7F && value_Renamed <= 0x9F)\r
 328                                 {\r
 329                                         canBeISO88591 = false;\r
 330                                 }\r
 331                                 if (value_Renamed >= 0xA1 && value_Renamed <= 0xDF)\r
 332                                 {\r
 333                                         // count the number of characters that might be a Shift_JIS single-byte Katakana character\r
 334                                         if (!lastWasPossibleDoubleByteStart)\r
 335                                         {\r
 336                                                 maybeSingleByteKatakanaCount++;\r
 337                                         }\r
 338                                 }\r
 339                                 if (!lastWasPossibleDoubleByteStart && ((value_Renamed >= 0xF0 && value_Renamed <= 0xFF) || value_Renamed == 0x80 || value_Renamed == 0xA0))\r
 340                                 {\r
 341                                         canBeShiftJIS = false;\r
 342                                 }\r
 343                                 if (((value_Renamed >= 0x81 && value_Renamed <= 0x9F) || (value_Renamed >= 0xE0 && value_Renamed <= 0xEF)))\r
 344                                 {\r
 345                                         // These start double-byte characters in Shift_JIS. Let's see if it's followed by a valid\r
 346                                         // second byte.\r
 347                                         if (lastWasPossibleDoubleByteStart)\r
 348                                         {\r
 349                                                 // If we just checked this and the last byte for being a valid double-byte\r
 350                                                 // char, don't check starting on this byte. If this and the last byte\r
 351                                                 // formed a valid pair, then this shouldn't be checked to see if it starts\r
 352                                                 // a double byte pair of course.\r
 353                                                 lastWasPossibleDoubleByteStart = false;\r
 354                                         }\r
 355                                         else\r
 356                                         {\r
 357                                                 // ... otherwise do check to see if this plus the next byte form a valid\r
 358                                                 // double byte pair encoding a character.\r
 359                                                 lastWasPossibleDoubleByteStart = true;\r
 360                                                 if (i >= bytes.Length - 1)\r
 361                                                 {\r
 362                                                         canBeShiftJIS = false;\r
 363                                                 }\r
 364                                                 else\r
 365                                                 {\r
 366                                                         int nextValue = bytes[i + 1] & 0xFF;\r
 367                                                         if (nextValue < 0x40 || nextValue > 0xFC)\r
 368                                                         {\r
 369                                                                 canBeShiftJIS = false;\r
 370                                                         }\r
 371                                                         else\r
 372                                                         {\r
 373                                                                 maybeDoubleByteCount++;\r
 374                                                         }\r
 375                                                         // There is some conflicting information out there about which bytes can follow which in\r
 376                                                         // double-byte Shift_JIS characters. The rule above seems to be the one that matches practice.\r
 377                                                 }\r
 378                                         }\r
 379                                 }\r
 380                                 else\r
 381                                 {\r
 382                                         lastWasPossibleDoubleByteStart = false;\r
 383                                 }\r
 384                         }\r
 385                         // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. The crude heuristic is:\r
 386                         // - If we saw\r
 387                         //   - at least three byte that starts a double-byte value (bytes that are rare in ISO-8859-1), or\r
 388                         //   - over 5% of bytes that could be single-byte Katakana (also rare in ISO-8859-1),\r
 389                         // - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS\r
 390                         if (canBeShiftJIS && (maybeDoubleByteCount >= 3 || 20 * maybeSingleByteKatakanaCount > length))\r
 391                         {\r
 392                                 return SHIFT_JIS;\r
 393                         }\r
 394                         // Otherwise, we default to ISO-8859-1 unless we know it can't be\r
 395                         if (!sawLatin1Supplement && canBeISO88591)\r
 396                         {\r
 397                                 return ISO88591;\r
 398                         }\r
 399                         // Otherwise, we take a wild guess with UTF-8\r
 400                         return UTF8;\r
 401                 }\r
 402                 \r
 403                 private static int parseECIValue(BitSource bits)\r
 404                 {\r
 405                         int firstByte = bits.readBits(8);\r
 406                         if ((firstByte & 0x80) == 0)\r
 407                         {\r
 408                                 // just one byte\r
 409                                 return firstByte & 0x7F;\r
 410                         }\r
 411                         else if ((firstByte & 0xC0) == 0x80)\r
 412                         {\r
 413                                 // two bytes\r
 414                                 int secondByte = bits.readBits(8);\r
 415                                 return ((firstByte & 0x3F) << 8) | secondByte;\r
 416                         }\r
 417                         else if ((firstByte & 0xE0) == 0xC0)\r
 418                         {\r
 419                                 // three bytes\r
 420                                 int secondThirdBytes = bits.readBits(16);\r
 421                                 return ((firstByte & 0x1F) << 16) | secondThirdBytes;\r
 422                         }\r
 423                         throw new System.ArgumentException("Bad ECI bits starting with byte " + firstByte);\r
 424                 }\r
 425                 static DecodedBitStreamParser()\r
 426                 {\r
 427                         {\r
 428                 // Redivivus.in Java to c# Porting update\r
 429                 // 30/01/2010 \r
 430                 // Commented & Added\r
 431                                 //System.String platformDefault = System_Renamed.getProperty("file.encoding");\r
 432                                 //ASSUME_SHIFT_JIS = SHIFT_JIS.ToUpper().Equals(platformDefault.ToUpper()) || EUC_JP.ToUpper().Equals(platformDefault.ToUpper());\r
 433                 ASSUME_SHIFT_JIS = false;\r
 434                         }\r
 435                 }\r
 436         }\r
 437 }