1 files changed, 313 insertions, 0 deletions
diff --git a/src/ui/encoding/TextEncodingDetect.cpp b/src/ui/encoding/TextEncodingDetect.cpp
new file mode 100644
index 00000000..22ae5897
--- /dev/null
+++ b/src/ui/encoding/TextEncodingDetect.cpp
@@ -0,0 +1,313 @@
+//
+// Copyright 2015-2016 Jonathan Bennett <[email protected]>
+//
+// https://www.autoitscript.com
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+// Includes
+#include "TextEncodingDetect.h"
+
+using namespace AutoIt::Common;
+
+static const unsigned char TextEncodingDetect_UTF16_BOM_LE[] = {
+    (unsigned char)(0xFF), (unsigned char)(0xFE)};
+static const unsigned char TextEncodingDetect_UTF16_BOM_BE[] = {
+    (unsigned char)(0xFE), (unsigned char)(0xFF)};
+static const unsigned char TextEncodingDetect_UTF8_BOM[] = {
+    (unsigned char)(0xEF), (unsigned char)(0xBB), (unsigned char)(0xBF)};
+
+const unsigned char *TextEncodingDetect::utf16_bom_le_ =
+    TextEncodingDetect_UTF16_BOM_LE;
+const unsigned char *TextEncodingDetect::utf16_bom_be_ =
+    TextEncodingDetect_UTF16_BOM_BE;
+const unsigned char *TextEncodingDetect::utf8_bom_ =
+    TextEncodingDetect_UTF8_BOM;
+
+///////////////////////////////////////////////////////////////////////////////
+// Constructor()
+// Default constructor
+///////////////////////////////////////////////////////////////////////////////
+
+TextEncodingDetect::TextEncodingDetect() {
+  // By default, assume nulls can't appear in ANSI/ASCII/UTF8 text files
+  null_suggests_binary_ = true;
+
+  // Set defaults for utf16 detection based the use of odd/even nulls
+  utf16_expected_null_percent_ = 70;
+  utf16_unexpected_null_percent_ = 10;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Set the percentages used in utf16 detection using nulls.
+///////////////////////////////////////////////////////////////////////////////
+
+void TextEncodingDetect::SetUtf16UnexpectedNullPercent(int percent) {
+  if (percent > 0 && percent < 100) utf16_expected_null_percent_ = percent;
+}
+
+void TextEncodingDetect::SetUtf16ExpectedNullPercent(int percent) {
+  if (percent > 0 && percent < 100) utf16_unexpected_null_percent_ = percent;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Simple function to return the length of the BOM for a particular encoding
+// mode.
+///////////////////////////////////////////////////////////////////////////////
+
+int TextEncodingDetect::GetBOMLengthFromEncodingMode(Encoding encoding) {
+  int length = 0;
+
+  if (encoding == UTF16_BE_BOM || encoding == UTF16_LE_BOM)
+    length = 2;
+  else if (encoding == UTF8_BOM)
+    length = 3;
+
+  return length;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Checks if a buffer contains a valid BOM and returns the encoding based on it.
+// Returns encoding "None" if there is no BOM.
+///////////////////////////////////////////////////////////////////////////////
+
+TextEncodingDetect::Encoding TextEncodingDetect::CheckBOM(
+    const unsigned char *pBuffer, size_t size) {
+  // Check for BOM
+  if (size >= 2 && pBuffer[0] == utf16_bom_le_[0] &&
+      pBuffer[1] == utf16_bom_le_[1]) {
+    return UTF16_LE_BOM;
+  } else if (size >= 2 && pBuffer[0] == utf16_bom_be_[0] &&
+             pBuffer[1] == utf16_bom_be_[1]) {
+    return UTF16_BE_BOM;
+  } else if (size >= 3 && pBuffer[0] == utf8_bom_[0] &&
+             pBuffer[1] == utf8_bom_[1] && pBuffer[2] == utf8_bom_[2]) {
+    return UTF8_BOM;
+  } else {
+    return None;
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Checks if a buffer contains a valid BOM and returns the encoding based on it.
+// If it doesn't contain a BOM it tries to guess what the encoding is or
+// "None" if it just looks like binary data.
+///////////////////////////////////////////////////////////////////////////////
+
+TextEncodingDetect::Encoding TextEncodingDetect::DetectEncoding(
+    const unsigned char *pBuffer, size_t size) const {
+  // First check if we have a BOM and return that if so
+  Encoding encoding = CheckBOM(pBuffer, size);
+  if (encoding != None) return encoding;
+
+  // Now check for valid UTF8
+  encoding = CheckUTF8(pBuffer, size);
+  if (encoding != None) return encoding;
+
+  // Now try UTF16
+  encoding = CheckUTF16NewlineChars(pBuffer, size);
+  if (encoding != None) return encoding;
+
+  encoding = CheckUTF16ASCII(pBuffer, size);
+  if (encoding != None) return encoding;
+
+  // ANSI or None (binary) then
+  if (!DoesContainNulls(pBuffer, size))
+    return ANSI;
+  else {
+    // Found a null, return based on the preference in null_suggests_binary_
+    if (null_suggests_binary_)
+      return None;
+    else
+      return ANSI;
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Checks if a buffer contains valid utf8. Returns:
+// None - not valid utf8
+// UTF8_NOBOM - valid utf8 encodings and multibyte sequences
+// ASCII - Only data in the 0-127 range.
+///////////////////////////////////////////////////////////////////////////////
+
+TextEncodingDetect::Encoding TextEncodingDetect::CheckUTF8(
+    const unsigned char *pBuffer, size_t size) const {
+  // UTF8 Valid sequences
+  // 0xxxxxxx  ASCII
+  // 110xxxxx 10xxxxxx  2-byte
+  // 1110xxxx 10xxxxxx 10xxxxxx  3-byte
+  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  4-byte
+  //
+  // Width in UTF8
+  // Decimal		Width
+  // 0-127		1 byte
+  // 194-223		2 bytes
+  // 224-239		3 bytes
+  // 240-244		4 bytes
+  //
+  // Subsequent chars are in the range 128-191
+
+  bool only_saw_ascii_range = true;
+  size_t pos = 0;
+  int more_chars;
+
+  while (pos < size) {
+    unsigned char ch = pBuffer[pos++];
+
+    if (ch == 0 && null_suggests_binary_) {
+      return None;
+    } else if (ch <= 127) {
+      // 1 byte
+      more_chars = 0;
+    } else if (ch >= 194 && ch <= 223) {
+      // 2 Byte
+      more_chars = 1;
+    } else if (ch >= 224 && ch <= 239) {
+      // 3 Byte
+      more_chars = 2;
+    } else if (ch >= 240 && ch <= 244) {
+      // 4 Byte
+      more_chars = 3;
+    } else {
+      return None;  // Not utf8
+    }
+
+    // Check secondary chars are in range if we are expecting any
+    while (more_chars && pos < size) {
+      only_saw_ascii_range = false;  // Seen non-ascii chars now
+
+      ch = pBuffer[pos++];
+      if (ch < 128 || ch > 191) return None;  // Not utf8
+
+      --more_chars;
+    }
+  }
+
+  // If we get to here then only valid UTF-8 sequences have been processed
+
+  // If we only saw chars in the range 0-127 then we can't assume UTF8 (the
+  // caller will need to decide)
+  if (only_saw_ascii_range)
+    return ASCII;
+  else
+    return UTF8_NOBOM;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Checks if a buffer contains text that looks like utf16 by scanning for
+// newline chars that would be present even in non-english text.
+// Returns:
+// None - not valid utf16
+// UTF16_LE_NOBOM - looks like utf16 le
+// UTF16_BE_NOBOM - looks like utf16 be
+///////////////////////////////////////////////////////////////////////////////
+
+TextEncodingDetect::Encoding TextEncodingDetect::CheckUTF16NewlineChars(
+    const unsigned char *pBuffer, size_t size) {
+  if (size < 2) return None;
+
+  // Reduce size by 1 so we don't need to worry about bounds checking for pairs
+  // of bytes
+  size--;
+
+  int le_control_chars = 0;
+  int be_control_chars = 0;
+  unsigned char ch1, ch2;
+
+  size_t pos = 0;
+  while (pos < size) {
+    ch1 = pBuffer[pos++];
+    ch2 = pBuffer[pos++];
+
+    if (ch1 == 0) {
+      if (ch2 == 0x0a || ch2 == 0x0d) ++be_control_chars;
+    } else if (ch2 == 0) {
+      if (ch1 == 0x0a || ch1 == 0x0d) ++le_control_chars;
+    }
+
+    // If we are getting both LE and BE control chars then this file is not
+    // utf16
+    if (le_control_chars && be_control_chars) return None;
+  }
+
+  if (le_control_chars)
+    return UTF16_LE_NOBOM;
+  else if (be_control_chars)
+    return UTF16_BE_NOBOM;
+  else
+    return None;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Checks if a buffer contains text that looks like utf16. This is done based
+// the use of nulls which in ASCII/script like text can be useful to identify.
+// Returns:
+// None - not valid utf16
+// UTF16_LE_NOBOM - looks like utf16 le
+// UTF16_BE_NOBOM - looks like utf16 be
+///////////////////////////////////////////////////////////////////////////////
+
+TextEncodingDetect::Encoding TextEncodingDetect::CheckUTF16ASCII(
+    const unsigned char *pBuffer, size_t size) const {
+  int num_odd_nulls = 0;
+  int num_even_nulls = 0;
+
+  // Get even nulls
+  size_t pos = 0;
+  while (pos < size) {
+    if (pBuffer[pos] == 0) num_even_nulls++;
+
+    pos += 2;
+  }
+
+  // Get odd nulls
+  pos = 1;
+  while (pos < size) {
+    if (pBuffer[pos] == 0) num_odd_nulls++;
+
+    pos += 2;
+  }
+
+  double even_null_threshold = (num_even_nulls * 2.0) / size;
+  double odd_null_threshold = (num_odd_nulls * 2.0) / size;
+  double expected_null_threshold = utf16_expected_null_percent_ / 100.0;
+  double unexpected_null_threshold = utf16_unexpected_null_percent_ / 100.0;
+
+  // Lots of odd nulls, low number of even nulls
+  if (even_null_threshold < unexpected_null_threshold &&
+      odd_null_threshold > expected_null_threshold)
+    return UTF16_LE_NOBOM;
+
+  // Lots of even nulls, low number of odd nulls
+  if (odd_null_threshold < unexpected_null_threshold &&
+      even_null_threshold > expected_null_threshold)
+    return UTF16_BE_NOBOM;
+
+  // Don't know
+  return None;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Checks if a buffer contains any nulls. Used to check for binary vs text data.
+///////////////////////////////////////////////////////////////////////////////
+
+bool TextEncodingDetect::DoesContainNulls(const unsigned char *pBuffer,
+                                          size_t size) {
+  size_t pos = 0;
+  while (pos < size) {
+    if (pBuffer[pos++] == 0) return true;
+  }
+
+  return false;
+}