diff options
author | Saturneric <[email protected]> | 2022-07-23 12:27:31 +0000 |
---|---|---|
committer | Saturneric <[email protected]> | 2022-07-23 12:27:31 +0000 |
commit | e8c8e13835ef6faa7d6122cd89f4915cccac3d49 (patch) | |
tree | 85efb71562e00fc116e8789fd9d9bdd1d4f3fbcd | |
parent | feat(ui): add tab to show gnupg info (diff) | |
download | GpgFrontend-dev/2.0.8/main.tar.gz GpgFrontend-dev/2.0.8/main.zip |
refactor(third_party): remove encoding-detectdev/2.0.8/main
-rw-r--r-- | third_party/encoding-detect/TextEncodingDetect.cpp | 313 | ||||
-rw-r--r-- | third_party/encoding-detect/TextEncodingDetect.h | 85 |
2 files changed, 0 insertions, 398 deletions
diff --git a/third_party/encoding-detect/TextEncodingDetect.cpp b/third_party/encoding-detect/TextEncodingDetect.cpp deleted file mode 100644 index 22ae5897..00000000 --- a/third_party/encoding-detect/TextEncodingDetect.cpp +++ /dev/null @@ -1,313 +0,0 @@ -// -// Copyright 2015-2016 Jonathan Bennett <[email protected]> -// -// https://www.autoitscript.com -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -// Includes -#include "TextEncodingDetect.h" - -using namespace AutoIt::Common; - -static const unsigned char TextEncodingDetect_UTF16_BOM_LE[] = { - (unsigned char)(0xFF), (unsigned char)(0xFE)}; -static const unsigned char TextEncodingDetect_UTF16_BOM_BE[] = { - (unsigned char)(0xFE), (unsigned char)(0xFF)}; -static const unsigned char TextEncodingDetect_UTF8_BOM[] = { - (unsigned char)(0xEF), (unsigned char)(0xBB), (unsigned char)(0xBF)}; - -const unsigned char *TextEncodingDetect::utf16_bom_le_ = - TextEncodingDetect_UTF16_BOM_LE; -const unsigned char *TextEncodingDetect::utf16_bom_be_ = - TextEncodingDetect_UTF16_BOM_BE; -const unsigned char *TextEncodingDetect::utf8_bom_ = - TextEncodingDetect_UTF8_BOM; - -/////////////////////////////////////////////////////////////////////////////// -// Constructor() -// Default constructor -/////////////////////////////////////////////////////////////////////////////// - -TextEncodingDetect::TextEncodingDetect() { - // By default, assume nulls can't appear in ANSI/ASCII/UTF8 text files - null_suggests_binary_ = true; - - // Set defaults for utf16 detection based the use of odd/even nulls - utf16_expected_null_percent_ = 70; - utf16_unexpected_null_percent_ = 10; -} - -/////////////////////////////////////////////////////////////////////////////// -// Set the percentages used in utf16 detection using nulls. -/////////////////////////////////////////////////////////////////////////////// - -void TextEncodingDetect::SetUtf16UnexpectedNullPercent(int percent) { - if (percent > 0 && percent < 100) utf16_expected_null_percent_ = percent; -} - -void TextEncodingDetect::SetUtf16ExpectedNullPercent(int percent) { - if (percent > 0 && percent < 100) utf16_unexpected_null_percent_ = percent; -} - -/////////////////////////////////////////////////////////////////////////////// -// Simple function to return the length of the BOM for a particular encoding -// mode. -/////////////////////////////////////////////////////////////////////////////// - -int TextEncodingDetect::GetBOMLengthFromEncodingMode(Encoding encoding) { - int length = 0; - - if (encoding == UTF16_BE_BOM || encoding == UTF16_LE_BOM) - length = 2; - else if (encoding == UTF8_BOM) - length = 3; - - return length; -} - -/////////////////////////////////////////////////////////////////////////////// -// Checks if a buffer contains a valid BOM and returns the encoding based on it. -// Returns encoding "None" if there is no BOM. -/////////////////////////////////////////////////////////////////////////////// - -TextEncodingDetect::Encoding TextEncodingDetect::CheckBOM( - const unsigned char *pBuffer, size_t size) { - // Check for BOM - if (size >= 2 && pBuffer[0] == utf16_bom_le_[0] && - pBuffer[1] == utf16_bom_le_[1]) { - return UTF16_LE_BOM; - } else if (size >= 2 && pBuffer[0] == utf16_bom_be_[0] && - pBuffer[1] == utf16_bom_be_[1]) { - return UTF16_BE_BOM; - } else if (size >= 3 && pBuffer[0] == utf8_bom_[0] && - pBuffer[1] == utf8_bom_[1] && pBuffer[2] == utf8_bom_[2]) { - return UTF8_BOM; - } else { - return None; - } -} - -/////////////////////////////////////////////////////////////////////////////// -// Checks if a buffer contains a valid BOM and returns the encoding based on it. -// If it doesn't contain a BOM it tries to guess what the encoding is or -// "None" if it just looks like binary data. -/////////////////////////////////////////////////////////////////////////////// - -TextEncodingDetect::Encoding TextEncodingDetect::DetectEncoding( - const unsigned char *pBuffer, size_t size) const { - // First check if we have a BOM and return that if so - Encoding encoding = CheckBOM(pBuffer, size); - if (encoding != None) return encoding; - - // Now check for valid UTF8 - encoding = CheckUTF8(pBuffer, size); - if (encoding != None) return encoding; - - // Now try UTF16 - encoding = CheckUTF16NewlineChars(pBuffer, size); - if (encoding != None) return encoding; - - encoding = CheckUTF16ASCII(pBuffer, size); - if (encoding != None) return encoding; - - // ANSI or None (binary) then - if (!DoesContainNulls(pBuffer, size)) - return ANSI; - else { - // Found a null, return based on the preference in null_suggests_binary_ - if (null_suggests_binary_) - return None; - else - return ANSI; - } -} - -/////////////////////////////////////////////////////////////////////////////// -// Checks if a buffer contains valid utf8. Returns: -// None - not valid utf8 -// UTF8_NOBOM - valid utf8 encodings and multibyte sequences -// ASCII - Only data in the 0-127 range. -/////////////////////////////////////////////////////////////////////////////// - -TextEncodingDetect::Encoding TextEncodingDetect::CheckUTF8( - const unsigned char *pBuffer, size_t size) const { - // UTF8 Valid sequences - // 0xxxxxxx ASCII - // 110xxxxx 10xxxxxx 2-byte - // 1110xxxx 10xxxxxx 10xxxxxx 3-byte - // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4-byte - // - // Width in UTF8 - // Decimal Width - // 0-127 1 byte - // 194-223 2 bytes - // 224-239 3 bytes - // 240-244 4 bytes - // - // Subsequent chars are in the range 128-191 - - bool only_saw_ascii_range = true; - size_t pos = 0; - int more_chars; - - while (pos < size) { - unsigned char ch = pBuffer[pos++]; - - if (ch == 0 && null_suggests_binary_) { - return None; - } else if (ch <= 127) { - // 1 byte - more_chars = 0; - } else if (ch >= 194 && ch <= 223) { - // 2 Byte - more_chars = 1; - } else if (ch >= 224 && ch <= 239) { - // 3 Byte - more_chars = 2; - } else if (ch >= 240 && ch <= 244) { - // 4 Byte - more_chars = 3; - } else { - return None; // Not utf8 - } - - // Check secondary chars are in range if we are expecting any - while (more_chars && pos < size) { - only_saw_ascii_range = false; // Seen non-ascii chars now - - ch = pBuffer[pos++]; - if (ch < 128 || ch > 191) return None; // Not utf8 - - --more_chars; - } - } - - // If we get to here then only valid UTF-8 sequences have been processed - - // If we only saw chars in the range 0-127 then we can't assume UTF8 (the - // caller will need to decide) - if (only_saw_ascii_range) - return ASCII; - else - return UTF8_NOBOM; -} - -/////////////////////////////////////////////////////////////////////////////// -// Checks if a buffer contains text that looks like utf16 by scanning for -// newline chars that would be present even in non-english text. -// Returns: -// None - not valid utf16 -// UTF16_LE_NOBOM - looks like utf16 le -// UTF16_BE_NOBOM - looks like utf16 be -/////////////////////////////////////////////////////////////////////////////// - -TextEncodingDetect::Encoding TextEncodingDetect::CheckUTF16NewlineChars( - const unsigned char *pBuffer, size_t size) { - if (size < 2) return None; - - // Reduce size by 1 so we don't need to worry about bounds checking for pairs - // of bytes - size--; - - int le_control_chars = 0; - int be_control_chars = 0; - unsigned char ch1, ch2; - - size_t pos = 0; - while (pos < size) { - ch1 = pBuffer[pos++]; - ch2 = pBuffer[pos++]; - - if (ch1 == 0) { - if (ch2 == 0x0a || ch2 == 0x0d) ++be_control_chars; - } else if (ch2 == 0) { - if (ch1 == 0x0a || ch1 == 0x0d) ++le_control_chars; - } - - // If we are getting both LE and BE control chars then this file is not - // utf16 - if (le_control_chars && be_control_chars) return None; - } - - if (le_control_chars) - return UTF16_LE_NOBOM; - else if (be_control_chars) - return UTF16_BE_NOBOM; - else - return None; -} - -/////////////////////////////////////////////////////////////////////////////// -// Checks if a buffer contains text that looks like utf16. This is done based -// the use of nulls which in ASCII/script like text can be useful to identify. -// Returns: -// None - not valid utf16 -// UTF16_LE_NOBOM - looks like utf16 le -// UTF16_BE_NOBOM - looks like utf16 be -/////////////////////////////////////////////////////////////////////////////// - -TextEncodingDetect::Encoding TextEncodingDetect::CheckUTF16ASCII( - const unsigned char *pBuffer, size_t size) const { - int num_odd_nulls = 0; - int num_even_nulls = 0; - - // Get even nulls - size_t pos = 0; - while (pos < size) { - if (pBuffer[pos] == 0) num_even_nulls++; - - pos += 2; - } - - // Get odd nulls - pos = 1; - while (pos < size) { - if (pBuffer[pos] == 0) num_odd_nulls++; - - pos += 2; - } - - double even_null_threshold = (num_even_nulls * 2.0) / size; - double odd_null_threshold = (num_odd_nulls * 2.0) / size; - double expected_null_threshold = utf16_expected_null_percent_ / 100.0; - double unexpected_null_threshold = utf16_unexpected_null_percent_ / 100.0; - - // Lots of odd nulls, low number of even nulls - if (even_null_threshold < unexpected_null_threshold && - odd_null_threshold > expected_null_threshold) - return UTF16_LE_NOBOM; - - // Lots of even nulls, low number of odd nulls - if (odd_null_threshold < unexpected_null_threshold && - even_null_threshold > expected_null_threshold) - return UTF16_BE_NOBOM; - - // Don't know - return None; -} - -/////////////////////////////////////////////////////////////////////////////// -// Checks if a buffer contains any nulls. Used to check for binary vs text data. -/////////////////////////////////////////////////////////////////////////////// - -bool TextEncodingDetect::DoesContainNulls(const unsigned char *pBuffer, - size_t size) { - size_t pos = 0; - while (pos < size) { - if (pBuffer[pos++] == 0) return true; - } - - return false; -} diff --git a/third_party/encoding-detect/TextEncodingDetect.h b/third_party/encoding-detect/TextEncodingDetect.h deleted file mode 100644 index 6d861716..00000000 --- a/third_party/encoding-detect/TextEncodingDetect.h +++ /dev/null @@ -1,85 +0,0 @@ -#pragma once -#ifndef TEXT_ENCODING_DETECT_H_ -#define TEXT_ENCODING_DETECT_H_ - -// -// Copyright 2015 Jonathan Bennett <[email protected]> -// -// https://www.autoitscript.com -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -// Includes -#include <stddef.h> - -namespace AutoIt::Common { -class TextEncodingDetect { - public: - enum Encoding { - None, // Unknown or binary - ANSI, // 0-255 - ASCII, // 0-127 - UTF8_BOM, // UTF8 with BOM - UTF8_NOBOM, // UTF8 without BOM - UTF16_LE_BOM, // UTF16 LE with BOM - UTF16_LE_NOBOM, // UTF16 LE without BOM - UTF16_BE_BOM, // UTF16-BE with BOM - UTF16_BE_NOBOM, // UTF16-BE without BOM - }; - - TextEncodingDetect(); - ~TextEncodingDetect() = default; - - static Encoding CheckBOM( - const unsigned char *pBuffer, - size_t size); // Just check if there is a BOM and return - Encoding DetectEncoding(const unsigned char *pBuffer, size_t size) - const; // Check BOM and also guess if there is no BOM - static int GetBOMLengthFromEncodingMode( - Encoding encoding); // Just return the BOM length of a given mode - - void SetNullSuggestsBinary(bool null_suggests_binary) { - null_suggests_binary_ = null_suggests_binary; - } - void SetUtf16UnexpectedNullPercent(int percent); - void SetUtf16ExpectedNullPercent(int percent); - - private: - TextEncodingDetect(const TextEncodingDetect &); - const TextEncodingDetect &operator=(const TextEncodingDetect &); - - static const unsigned char *utf16_bom_le_; - static const unsigned char *utf16_bom_be_; - static const unsigned char *utf8_bom_; - - bool null_suggests_binary_; - int utf16_expected_null_percent_; - int utf16_unexpected_null_percent_; - - Encoding CheckUTF8(const unsigned char *pBuffer, - size_t size) const; // Check for valid UTF8 with no BOM - static Encoding CheckUTF16NewlineChars( - const unsigned char *pBuffer, - size_t size); // Check for valid UTF16 with no BOM via control chars - Encoding CheckUTF16ASCII(const unsigned char *pBuffer, size_t size) - const; // Check for valid UTF16 with no BOM via null distribution - static bool DoesContainNulls(const unsigned char *pBuffer, - size_t size); // Check for nulls -}; - -} // namespace AutoIt::Common - -////////////////////////////////////////////////////////////////////// - -#endif |