// // Copyright 2015-2016 Jonathan Bennett // // https://www.autoitscript.com // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Includes #include "TextEncodingDetect.h" using namespace AutoIt::Common; static const unsigned char TextEncodingDetect_UTF16_BOM_LE[] = { (unsigned char)(0xFF), (unsigned char)(0xFE)}; static const unsigned char TextEncodingDetect_UTF16_BOM_BE[] = { (unsigned char)(0xFE), (unsigned char)(0xFF)}; static const unsigned char TextEncodingDetect_UTF8_BOM[] = { (unsigned char)(0xEF), (unsigned char)(0xBB), (unsigned char)(0xBF)}; const unsigned char *TextEncodingDetect::utf16_bom_le_ = TextEncodingDetect_UTF16_BOM_LE; const unsigned char *TextEncodingDetect::utf16_bom_be_ = TextEncodingDetect_UTF16_BOM_BE; const unsigned char *TextEncodingDetect::utf8_bom_ = TextEncodingDetect_UTF8_BOM; /////////////////////////////////////////////////////////////////////////////// // Constructor() // Default constructor /////////////////////////////////////////////////////////////////////////////// TextEncodingDetect::TextEncodingDetect() { // By default, assume nulls can't appear in ANSI/ASCII/UTF8 text files null_suggests_binary_ = true; // Set defaults for utf16 detection based the use of odd/even nulls utf16_expected_null_percent_ = 70; utf16_unexpected_null_percent_ = 10; } /////////////////////////////////////////////////////////////////////////////// // Set the percentages used in utf16 detection using nulls. /////////////////////////////////////////////////////////////////////////////// void TextEncodingDetect::SetUtf16UnexpectedNullPercent(int percent) { if (percent > 0 && percent < 100) utf16_expected_null_percent_ = percent; } void TextEncodingDetect::SetUtf16ExpectedNullPercent(int percent) { if (percent > 0 && percent < 100) utf16_unexpected_null_percent_ = percent; } /////////////////////////////////////////////////////////////////////////////// // Simple function to return the length of the BOM for a particular encoding // mode. /////////////////////////////////////////////////////////////////////////////// int TextEncodingDetect::GetBOMLengthFromEncodingMode(Encoding encoding) { int length = 0; if (encoding == UTF16_BE_BOM || encoding == UTF16_LE_BOM) length = 2; else if (encoding == UTF8_BOM) length = 3; return length; } /////////////////////////////////////////////////////////////////////////////// // Checks if a buffer contains a valid BOM and returns the encoding based on it. // Returns encoding "None" if there is no BOM. /////////////////////////////////////////////////////////////////////////////// TextEncodingDetect::Encoding TextEncodingDetect::CheckBOM( const unsigned char *pBuffer, size_t size) { // Check for BOM if (size >= 2 && pBuffer[0] == utf16_bom_le_[0] && pBuffer[1] == utf16_bom_le_[1]) { return UTF16_LE_BOM; } else if (size >= 2 && pBuffer[0] == utf16_bom_be_[0] && pBuffer[1] == utf16_bom_be_[1]) { return UTF16_BE_BOM; } else if (size >= 3 && pBuffer[0] == utf8_bom_[0] && pBuffer[1] == utf8_bom_[1] && pBuffer[2] == utf8_bom_[2]) { return UTF8_BOM; } else { return None; } } /////////////////////////////////////////////////////////////////////////////// // Checks if a buffer contains a valid BOM and returns the encoding based on it. // If it doesn't contain a BOM it tries to guess what the encoding is or // "None" if it just looks like binary data. /////////////////////////////////////////////////////////////////////////////// TextEncodingDetect::Encoding TextEncodingDetect::DetectEncoding( const unsigned char *pBuffer, size_t size) const { // First check if we have a BOM and return that if so Encoding encoding = CheckBOM(pBuffer, size); if (encoding != None) return encoding; // Now check for valid UTF8 encoding = CheckUTF8(pBuffer, size); if (encoding != None) return encoding; // Now try UTF16 encoding = CheckUTF16NewlineChars(pBuffer, size); if (encoding != None) return encoding; encoding = CheckUTF16ASCII(pBuffer, size); if (encoding != None) return encoding; // ANSI or None (binary) then if (!DoesContainNulls(pBuffer, size)) return ANSI; else { // Found a null, return based on the preference in null_suggests_binary_ if (null_suggests_binary_) return None; else return ANSI; } } /////////////////////////////////////////////////////////////////////////////// // Checks if a buffer contains valid utf8. Returns: // None - not valid utf8 // UTF8_NOBOM - valid utf8 encodings and multibyte sequences // ASCII - Only data in the 0-127 range. /////////////////////////////////////////////////////////////////////////////// TextEncodingDetect::Encoding TextEncodingDetect::CheckUTF8( const unsigned char *pBuffer, size_t size) const { // UTF8 Valid sequences // 0xxxxxxx ASCII // 110xxxxx 10xxxxxx 2-byte // 1110xxxx 10xxxxxx 10xxxxxx 3-byte // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4-byte // // Width in UTF8 // Decimal Width // 0-127 1 byte // 194-223 2 bytes // 224-239 3 bytes // 240-244 4 bytes // // Subsequent chars are in the range 128-191 bool only_saw_ascii_range = true; size_t pos = 0; int more_chars; while (pos < size) { unsigned char ch = pBuffer[pos++]; if (ch == 0 && null_suggests_binary_) { return None; } else if (ch <= 127) { // 1 byte more_chars = 0; } else if (ch >= 194 && ch <= 223) { // 2 Byte more_chars = 1; } else if (ch >= 224 && ch <= 239) { // 3 Byte more_chars = 2; } else if (ch >= 240 && ch <= 244) { // 4 Byte more_chars = 3; } else { return None; // Not utf8 } // Check secondary chars are in range if we are expecting any while (more_chars && pos < size) { only_saw_ascii_range = false; // Seen non-ascii chars now ch = pBuffer[pos++]; if (ch < 128 || ch > 191) return None; // Not utf8 --more_chars; } } // If we get to here then only valid UTF-8 sequences have been processed // If we only saw chars in the range 0-127 then we can't assume UTF8 (the // caller will need to decide) if (only_saw_ascii_range) return ASCII; else return UTF8_NOBOM; } /////////////////////////////////////////////////////////////////////////////// // Checks if a buffer contains text that looks like utf16 by scanning for // newline chars that would be present even in non-english text. // Returns: // None - not valid utf16 // UTF16_LE_NOBOM - looks like utf16 le // UTF16_BE_NOBOM - looks like utf16 be /////////////////////////////////////////////////////////////////////////////// TextEncodingDetect::Encoding TextEncodingDetect::CheckUTF16NewlineChars( const unsigned char *pBuffer, size_t size) { if (size < 2) return None; // Reduce size by 1 so we don't need to worry about bounds checking for pairs // of bytes size--; int le_control_chars = 0; int be_control_chars = 0; unsigned char ch1, ch2; size_t pos = 0; while (pos < size) { ch1 = pBuffer[pos++]; ch2 = pBuffer[pos++]; if (ch1 == 0) { if (ch2 == 0x0a || ch2 == 0x0d) ++be_control_chars; } else if (ch2 == 0) { if (ch1 == 0x0a || ch1 == 0x0d) ++le_control_chars; } // If we are getting both LE and BE control chars then this file is not // utf16 if (le_control_chars && be_control_chars) return None; } if (le_control_chars) return UTF16_LE_NOBOM; else if (be_control_chars) return UTF16_BE_NOBOM; else return None; } /////////////////////////////////////////////////////////////////////////////// // Checks if a buffer contains text that looks like utf16. This is done based // the use of nulls which in ASCII/script like text can be useful to identify. // Returns: // None - not valid utf16 // UTF16_LE_NOBOM - looks like utf16 le // UTF16_BE_NOBOM - looks like utf16 be /////////////////////////////////////////////////////////////////////////////// TextEncodingDetect::Encoding TextEncodingDetect::CheckUTF16ASCII( const unsigned char *pBuffer, size_t size) const { int num_odd_nulls = 0; int num_even_nulls = 0; // Get even nulls size_t pos = 0; while (pos < size) { if (pBuffer[pos] == 0) num_even_nulls++; pos += 2; } // Get odd nulls pos = 1; while (pos < size) { if (pBuffer[pos] == 0) num_odd_nulls++; pos += 2; } double even_null_threshold = (num_even_nulls * 2.0) / size; double odd_null_threshold = (num_odd_nulls * 2.0) / size; double expected_null_threshold = utf16_expected_null_percent_ / 100.0; double unexpected_null_threshold = utf16_unexpected_null_percent_ / 100.0; // Lots of odd nulls, low number of even nulls if (even_null_threshold < unexpected_null_threshold && odd_null_threshold > expected_null_threshold) return UTF16_LE_NOBOM; // Lots of even nulls, low number of odd nulls if (odd_null_threshold < unexpected_null_threshold && even_null_threshold > expected_null_threshold) return UTF16_BE_NOBOM; // Don't know return None; } /////////////////////////////////////////////////////////////////////////////// // Checks if a buffer contains any nulls. Used to check for binary vs text data. /////////////////////////////////////////////////////////////////////////////// bool TextEncodingDetect::DoesContainNulls(const unsigned char *pBuffer, size_t size) { size_t pos = 0; while (pos < size) { if (pBuffer[pos++] == 0) return true; } return false; }