aboutsummaryrefslogtreecommitdiffstats
path: root/src/ui/encoding/TextEncodingDetect.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/ui/encoding/TextEncodingDetect.cpp')
-rw-r--r--src/ui/encoding/TextEncodingDetect.cpp313
1 files changed, 313 insertions, 0 deletions
diff --git a/src/ui/encoding/TextEncodingDetect.cpp b/src/ui/encoding/TextEncodingDetect.cpp
new file mode 100644
index 00000000..22ae5897
--- /dev/null
+++ b/src/ui/encoding/TextEncodingDetect.cpp
@@ -0,0 +1,313 @@
+//
+// Copyright 2015-2016 Jonathan Bennett <[email protected]>
+//
+// https://www.autoitscript.com
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+// Includes
+#include "TextEncodingDetect.h"
+
+using namespace AutoIt::Common;
+
+// Byte order mark signatures. The arrays are local to this file; the class
+// reaches them through the static member pointers defined below.
+static const unsigned char TextEncodingDetect_UTF16_BOM_LE[] = {0xFF, 0xFE};
+static const unsigned char TextEncodingDetect_UTF16_BOM_BE[] = {0xFE, 0xFF};
+static const unsigned char TextEncodingDetect_UTF8_BOM[] = {0xEF, 0xBB, 0xBF};
+
+// Static members of TextEncodingDetect, each pointing at the first byte of
+// the matching signature above.
+const unsigned char *TextEncodingDetect::utf16_bom_le_ =
+    TextEncodingDetect_UTF16_BOM_LE;
+const unsigned char *TextEncodingDetect::utf16_bom_be_ =
+    TextEncodingDetect_UTF16_BOM_BE;
+const unsigned char *TextEncodingDetect::utf8_bom_ =
+    TextEncodingDetect_UTF8_BOM;
+
+///////////////////////////////////////////////////////////////////////////////
+// Constructor()
+// Default constructor. Installs the default detection settings:
+//  - nulls are treated as a sign of binary data (ANSI/ASCII/UTF8 text files
+//    are assumed not to contain them)
+//  - utf16 detection based on the use of odd/even nulls uses 70% as the
+//    "expected" and 10% as the "unexpected" null percentage
+///////////////////////////////////////////////////////////////////////////////
+
+TextEncodingDetect::TextEncodingDetect()
+    : null_suggests_binary_(true),
+      utf16_expected_null_percent_(70),
+      utf16_unexpected_null_percent_(10) {}
+
+///////////////////////////////////////////////////////////////////////////////
+// Set the percentages used in utf16 detection using nulls.
+// Only values strictly between 0 and 100 are accepted; anything else is
+// ignored and the current setting is kept.
+///////////////////////////////////////////////////////////////////////////////
+
+// Sets the "unexpected" null percentage, i.e. the limit that the share of
+// nulls on the wrong byte position must stay below in CheckUTF16ASCII().
+// (Previously this setter overwrote the *expected* percentage by mistake.)
+void TextEncodingDetect::SetUtf16UnexpectedNullPercent(int percent) {
+  if (percent <= 0 || percent >= 100) return;
+
+  utf16_unexpected_null_percent_ = percent;
+}
+
+// Sets the "expected" null percentage, i.e. the share of nulls on the
+// matching byte position that CheckUTF16ASCII() must see exceeded before it
+// reports utf16. Only values strictly between 0 and 100 are accepted; anything
+// else is ignored and the current setting is kept.
+// (Previously this setter overwrote the *unexpected* percentage by mistake.)
+void TextEncodingDetect::SetUtf16ExpectedNullPercent(int percent) {
+  if (percent <= 0 || percent >= 100) return;
+
+  utf16_expected_null_percent_ = percent;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Returns the number of bytes occupied by the BOM of the given encoding mode:
+//   UTF16_LE_BOM / UTF16_BE_BOM - 2
+//   UTF8_BOM                    - 3
+//   anything else               - 0
+///////////////////////////////////////////////////////////////////////////////
+
+int TextEncodingDetect::GetBOMLengthFromEncodingMode(Encoding encoding) {
+  switch (encoding) {
+    case UTF16_LE_BOM:
+    case UTF16_BE_BOM:
+      return 2;
+
+    case UTF8_BOM:
+      return 3;
+
+    default:
+      return 0;
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Looks for a BOM at the very start of the buffer and returns the matching
+// encoding (UTF16_LE_BOM, UTF16_BE_BOM or UTF8_BOM).
+// Returns encoding "None" if the buffer does not start with a BOM.
+///////////////////////////////////////////////////////////////////////////////
+
+TextEncodingDetect::Encoding TextEncodingDetect::CheckBOM(
+    const unsigned char *pBuffer, size_t size) {
+  // The shortest BOM is two bytes long, so anything smaller cannot have one
+  if (size < 2) return None;
+
+  const unsigned char first = pBuffer[0];
+  const unsigned char second = pBuffer[1];
+
+  if (first == utf16_bom_le_[0] && second == utf16_bom_le_[1])
+    return UTF16_LE_BOM;
+
+  if (first == utf16_bom_be_[0] && second == utf16_bom_be_[1])
+    return UTF16_BE_BOM;
+
+  // The utf8 BOM needs a third byte
+  const bool has_utf8_bom = size >= 3 && first == utf8_bom_[0] &&
+                            second == utf8_bom_[1] &&
+                            pBuffer[2] == utf8_bom_[2];
+
+  return has_utf8_bom ? UTF8_BOM : None;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Works out the encoding of a buffer. The checks run in this order and the
+// first one that gives an answer other than "None" wins:
+//   1. BOM
+//   2. valid utf8 (which also covers plain ASCII)
+//   3. utf16 newline chars
+//   4. utf16 odd/even null distribution
+// If none of them match the result is ANSI, unless the buffer contains nulls
+// and null_suggests_binary_ is set, in which case it is "None" (binary).
+///////////////////////////////////////////////////////////////////////////////
+
+TextEncodingDetect::Encoding TextEncodingDetect::DetectEncoding(
+    const unsigned char *pBuffer, size_t size) const {
+  // Walk the detectors from most to least specific
+  Encoding encoding = CheckBOM(pBuffer, size);
+  if (encoding == None) encoding = CheckUTF8(pBuffer, size);
+  if (encoding == None) encoding = CheckUTF16NewlineChars(pBuffer, size);
+  if (encoding == None) encoding = CheckUTF16ASCII(pBuffer, size);
+
+  if (encoding != None) return encoding;
+
+  // Nothing matched: ANSI, or binary when nulls are present and count as a
+  // sign of binary data
+  const bool looks_binary =
+      null_suggests_binary_ && DoesContainNulls(pBuffer, size);
+
+  return looks_binary ? None : ANSI;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Checks if a buffer contains valid utf8. Returns:
+// None - not valid utf8 (or a null was found and null_suggests_binary_ is set)
+// UTF8_NOBOM - valid utf8 encodings and multibyte sequences
+// ASCII - Only data in the 0-127 range.
+//
+// A multibyte sequence that is cut off by the end of the buffer is not treated
+// as an error, but its lead byte still counts as non-ASCII data.
+///////////////////////////////////////////////////////////////////////////////
+
+TextEncodingDetect::Encoding TextEncodingDetect::CheckUTF8(
+    const unsigned char *pBuffer, size_t size) const {
+  // UTF8 Valid sequences
+  // 0xxxxxxx ASCII
+  // 110xxxxx 10xxxxxx 2-byte
+  // 1110xxxx 10xxxxxx 10xxxxxx 3-byte
+  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4-byte
+  //
+  // Width in UTF8
+  // Decimal Width
+  // 0-127 1 byte
+  // 194-223 2 bytes
+  // 224-239 3 bytes
+  // 240-244 4 bytes
+  //
+  // Subsequent chars are in the range 128-191
+
+  bool only_saw_ascii_range = true;
+  size_t pos = 0;
+
+  while (pos < size) {
+    const unsigned char lead = pBuffer[pos++];
+    int more_chars = 0;
+
+    if (lead == 0 && null_suggests_binary_) {
+      return None;
+    } else if (lead <= 127) {
+      // 1 byte
+      more_chars = 0;
+    } else if (lead >= 194 && lead <= 223) {
+      // 2 Byte
+      more_chars = 1;
+    } else if (lead >= 224 && lead <= 239) {
+      // 3 Byte
+      more_chars = 2;
+    } else if (lead >= 240 && lead <= 244) {
+      // 4 Byte
+      more_chars = 3;
+    } else {
+      return None;  // Not utf8
+    }
+
+    // A lead byte of a multibyte sequence is itself outside the ASCII range.
+    // Record that here rather than while reading the continuation bytes, so
+    // that a buffer ending right after the lead byte is not reported as ASCII.
+    if (more_chars > 0) only_saw_ascii_range = false;
+
+    // Check secondary chars are in range if we are expecting any
+    while (more_chars > 0 && pos < size) {
+      const unsigned char trail = pBuffer[pos++];
+      if (trail < 128 || trail > 191) return None;  // Not utf8
+
+      --more_chars;
+    }
+  }
+
+  // If we get to here then only valid UTF-8 sequences have been processed
+
+  // If we only saw chars in the range 0-127 then we can't assume UTF8 (the
+  // caller will need to decide)
+  return only_saw_ascii_range ? ASCII : UTF8_NOBOM;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Scans the buffer two bytes at a time looking for CR (0x0d) or LF (0x0a)
+// stored as a utf16 code unit, i.e. paired with a null byte. These newline
+// chars would be present even in non-english text.
+// Returns:
+// None - no utf16 newlines seen, or both byte orders seen (not utf16)
+// UTF16_LE_NOBOM - newline byte first, null second
+// UTF16_BE_NOBOM - null first, newline byte second
+///////////////////////////////////////////////////////////////////////////////
+
+TextEncodingDetect::Encoding TextEncodingDetect::CheckUTF16NewlineChars(
+    const unsigned char *pBuffer, size_t size) {
+  if (size < 2) return None;
+
+  // Index of the final byte. Pairs only start below it, so the second byte of
+  // a pair is always inside the buffer.
+  const size_t last_index = size - 1;
+
+  int le_hits = 0;
+  int be_hits = 0;
+
+  for (size_t i = 0; i < last_index; i += 2) {
+    const unsigned char first = pBuffer[i];
+    const unsigned char second = pBuffer[i + 1];
+
+    const bool first_is_newline = (first == 0x0a || first == 0x0d);
+    const bool second_is_newline = (second == 0x0a || second == 0x0d);
+
+    if (first == 0 && second_is_newline) {
+      ++be_hits;
+    } else if (second == 0 && first_is_newline) {
+      ++le_hits;
+    }
+
+    // Newlines in both byte orders means this is not utf16
+    if (le_hits != 0 && be_hits != 0) return None;
+  }
+
+  if (le_hits != 0) return UTF16_LE_NOBOM;
+  if (be_hits != 0) return UTF16_BE_NOBOM;
+
+  return None;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Checks if a buffer contains text that looks like utf16. This is done based
+// on the use of nulls, which in ASCII/script like text can be useful to
+// identify: the share of nulls at even and at odd byte positions is compared
+// against utf16_expected_null_percent_ and utf16_unexpected_null_percent_.
+// Returns:
+// None - not valid utf16 (always the case for buffers shorter than 2 bytes)
+// UTF16_LE_NOBOM - looks like utf16 le
+// UTF16_BE_NOBOM - looks like utf16 be
+///////////////////////////////////////////////////////////////////////////////
+
+TextEncodingDetect::Encoding TextEncodingDetect::CheckUTF16ASCII(
+    const unsigned char *pBuffer, size_t size) const {
+  // A utf16 code unit needs two bytes. Bailing out here also keeps the ratio
+  // calculation below from dividing by zero on an empty buffer and stops a
+  // single null byte from being reported as utf16.
+  if (size < 2) return None;
+
+  size_t num_even_nulls = 0;
+  size_t num_odd_nulls = 0;
+
+  // Count the nulls at even and odd byte positions in one pass
+  for (size_t pos = 0; pos < size; ++pos) {
+    if (pBuffer[pos] != 0) continue;
+
+    if (pos % 2 == 0)
+      ++num_even_nulls;
+    else
+      ++num_odd_nulls;
+  }
+
+  // Each parity covers roughly half of the buffer, hence the factor of 2
+  const double even_null_ratio = (num_even_nulls * 2.0) / size;
+  const double odd_null_ratio = (num_odd_nulls * 2.0) / size;
+  const double expected_null_threshold = utf16_expected_null_percent_ / 100.0;
+  const double unexpected_null_threshold =
+      utf16_unexpected_null_percent_ / 100.0;
+
+  // Lots of odd nulls, low number of even nulls
+  if (even_null_ratio < unexpected_null_threshold &&
+      odd_null_ratio > expected_null_threshold)
+    return UTF16_LE_NOBOM;
+
+  // Lots of even nulls, low number of odd nulls
+  if (odd_null_ratio < unexpected_null_threshold &&
+      even_null_ratio > expected_null_threshold)
+    return UTF16_BE_NOBOM;
+
+  // Don't know
+  return None;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Returns true if at least one of the first "size" bytes of the buffer is a
+// null, false otherwise. Used to check for binary vs text data.
+///////////////////////////////////////////////////////////////////////////////
+
+bool TextEncodingDetect::DoesContainNulls(const unsigned char *pBuffer,
+                                          size_t size) {
+  const unsigned char *const end = pBuffer + size;
+
+  for (const unsigned char *cursor = pBuffer; cursor != end; ++cursor) {
+    if (*cursor == 0) return true;
+  }
+
+  return false;
+}