From 4ff310c7e445c2b39d6ffda2b75a097d50fabaae Mon Sep 17 00:00:00 2001 From: Vincent Richard Date: Fri, 21 May 2010 07:41:15 +0000 Subject: [PATCH] Always encode special charsets. --- src/utility/stringUtils.cpp | 18 ++++++++++ src/word.cpp | 28 ++++++--------- src/wordEncoder.cpp | 66 ++++++++++++++++++++++++++++++++--- tests/parser/textTest.cpp | 33 +++++++++++++++++- vmime/utility/stringUtils.hpp | 8 +++++ vmime/wordEncoder.hpp | 17 +++++++-- 6 files changed, 144 insertions(+), 26 deletions(-) diff --git a/src/utility/stringUtils.cpp b/src/utility/stringUtils.cpp index abae8f9a..7f61a7d3 100644 --- a/src/utility/stringUtils.cpp +++ b/src/utility/stringUtils.cpp @@ -151,6 +151,24 @@ string::size_type stringUtils::countASCIIchars } +string::size_type stringUtils::findFirstNonASCIIchar + (const string::const_iterator begin, const string::const_iterator end) +{ + string::size_type pos = string::npos; + + for (string::const_iterator i = begin ; i != end ; ++i) + { + if (!parserHelpers::isAscii(*i)) + { + pos = i - begin; + break; + } + } + + return pos; +} + + const string stringUtils::unquote(const string& str) { if (str.length() < 2) diff --git a/src/word.cpp b/src/word.cpp index 667f1fbb..9d0177fa 100644 --- a/src/word.cpp +++ b/src/word.cpp @@ -336,30 +336,22 @@ void word::generate(utility::outputStream& os, const string::size_type maxLineLe if (state == NULL) state = &defaultGeneratorState; - // Calculate the number of ASCII chars to check whether encoding is needed - // and _which_ encoding to use. - const string::size_type asciiCount = - utility::stringUtils::countASCIIchars(m_buffer.begin(), m_buffer.end()); + // Find out if encoding is forced or required by contents + charset + bool encodingNeeded = (flags & text::FORCE_ENCODING) != 0; - bool noEncoding = (flags & text::FORCE_NO_ENCODING) || - (!(flags & text::FORCE_ENCODING) && asciiCount == m_buffer.length()); - - if (!(flags & text::FORCE_NO_ENCODING) && - m_buffer.find_first_of("\n\r") != string::npos) - { - // Force encoding when there are only ASCII chars, but there is - // also at least one of '\n' or '\r' (header fields) - noEncoding = false; - } + if (encodingNeeded == false) + encodingNeeded = wordEncoder::isEncodingNeeded(m_buffer, m_charset); + else if ((flags & text::FORCE_NO_ENCODING) != 0) + encodingNeeded = false; // If possible and requested (with flag), quote the buffer (no folding is performed). // Quoting is possible if and only if: - // - the whole buffer is ASCII-only + // - the buffer does not need to be encoded // - the buffer does not contain quoting character (") // - there is enough remaining space on the current line to hold the whole buffer - if (!noEncoding && + if (!encodingNeeded && (flags & text::QUOTE_IF_POSSIBLE) && - asciiCount == m_buffer.length() && + !encodingNeeded && m_buffer.find('"') == string::npos && (curLineLength + 2 /* 2 x " */ + m_buffer.length()) < maxLineLength) { @@ -367,7 +359,7 @@ void word::generate(utility::outputStream& os, const string::size_type maxLineLe curLineLength += 2 + m_buffer.length(); } // We will fold lines without encoding them. - else if (noEncoding) + else if (!encodingNeeded) { string::const_iterator lastWSpos = m_buffer.end(); // last white-space position string::const_iterator curLineStart = m_buffer.begin(); // current line start diff --git a/src/wordEncoder.cpp b/src/wordEncoder.cpp index 154b4efc..cc8292f8 100644 --- a/src/wordEncoder.cpp +++ b/src/wordEncoder.cpp @@ -260,17 +260,75 @@ wordEncoder::Encoding wordEncoder::getEncoding() const } +// Explicitly force encoding for some charsets +struct CharsetEncodingEntry +{ + CharsetEncodingEntry(const std::string& charset_, const wordEncoder::Encoding encoding_) + : charset(charset_), encoding(encoding_) + { + } + + std::string charset; + wordEncoder::Encoding encoding; +}; + +CharsetEncodingEntry g_charsetEncodingMap[] = +{ + // Use QP encoding for ISO-8859-x charsets + CharsetEncodingEntry("iso-8859", wordEncoder::ENCODING_QP), + CharsetEncodingEntry("iso8859", wordEncoder::ENCODING_QP), + + // RFC-1468 states: + // " ISO-2022-JP may also be used in MIME Part 2 headers. The "B" + // encoding should be used with ISO-2022-JP text. " + // Use Base64 encoding for all ISO-2022 charsets. + CharsetEncodingEntry("iso-2022", wordEncoder::ENCODING_B64), + CharsetEncodingEntry("iso2022", wordEncoder::ENCODING_B64), + + // Last entry is not used + CharsetEncodingEntry("", wordEncoder::ENCODING_AUTO) +}; + + +// static +bool wordEncoder::isEncodingNeeded(const string& buffer, const charset& charset) +{ + // Special treatment for some charsets + const string cset = utility::stringUtils::toLower(charset.getName()); + + for (unsigned int i = 0 ; i < (sizeof(g_charsetEncodingMap) / sizeof(g_charsetEncodingMap[0])) - 1 ; ++i) + { + if (cset.find(g_charsetEncodingMap[i].charset) != string::npos) + { + if (g_charsetEncodingMap[i].encoding != wordEncoder::ENCODING_AUTO) + return true; + } + } + + // No encoding is needed if the buffer only contains ASCII chars + if (utility::stringUtils::findFirstNonASCIIchar(buffer.begin(), buffer.end()) != string::npos) + return true; + + // Force encoding when there are only ASCII chars, but there is + // also at least one of '\n' or '\r' (header fields) + if (buffer.find_first_of("\n\r") != string::npos) + return true; + + return false; +} + + // static wordEncoder::Encoding wordEncoder::guessBestEncoding (const string& buffer, const charset& charset) { - // If the charset is ISO-8859-x, set to QP encoding + // Special treatment for some charsets const string cset = utility::stringUtils::toLower(charset.getName()); - if (cset.find("iso-8859") != string::npos || - cset.find("iso8859") != string::npos) + for (unsigned int i = 0 ; i < (sizeof(g_charsetEncodingMap) / sizeof(g_charsetEncodingMap[0])) - 1 ; ++i) { - return ENCODING_QP; + if (cset.find(g_charsetEncodingMap[i].charset) != string::npos) + return g_charsetEncodingMap[i].encoding; } // Use Base64 if more than 40% non-ASCII, or Quoted-Printable else (default) diff --git a/tests/parser/textTest.cpp b/tests/parser/textTest.cpp index 5c9b5213..4a7e394f 100644 --- a/tests/parser/textTest.cpp +++ b/tests/parser/textTest.cpp @@ -44,6 +44,8 @@ VMIME_TEST_SUITE_BEGIN VMIME_TEST(testWordGenerateSpace) VMIME_TEST(testWordGenerateSpace2) VMIME_TEST(testWordGenerateMultiBytes) + VMIME_TEST(testWordGenerateQuote) + VMIME_TEST(testWordGenerateSpecialCharsets) VMIME_TEST_LIST_END @@ -335,9 +337,38 @@ VMIME_TEST_SUITE_BEGIN VASSERT_EQ("1", "=?utf-8?Q?aaa?==?utf-8?Q?=C3=A9?==?utf-8?Q?zzz?=", cleanGeneratedWords(vmime::word("aaa\xc3\xa9zzz", vmime::charset("utf-8")).generate(16))); - VASSERT_EQ("1", "=?utf-8?Q?aaa=C3=A9?==?utf-8?Q?zzz?=", + VASSERT_EQ("2", "=?utf-8?Q?aaa=C3=A9?==?utf-8?Q?zzz?=", cleanGeneratedWords(vmime::word("aaa\xc3\xa9zzz", vmime::charset("utf-8")).generate(17))); } + void testWordGenerateQuote() + { + std::string str; + vmime::utility::outputStreamStringAdapter os(str); + + // ASCII-only text is quotable + str.clear(); + vmime::word("Quoted text").generate(os, 1000, 0, NULL, vmime::text::QUOTE_IF_POSSIBLE, NULL); + VASSERT_EQ("1", "\"Quoted text\"", cleanGeneratedWords(str)); + + // Text with CR/LF is not quotable + str.clear(); + vmime::word("Non-quotable\ntext", "us-ascii").generate(os, 1000, 0, NULL, vmime::text::QUOTE_IF_POSSIBLE, NULL); + VASSERT_EQ("2", "=?us-ascii?Q?Non-quotable=0Atext?=", cleanGeneratedWords(str)); + + // Text with non-ASCII chars is not quotable + str.clear(); + vmime::word("Non-quotable text \xc3\xa9").generate(os, 1000, 0, NULL, vmime::text::QUOTE_IF_POSSIBLE, NULL); + VASSERT_EQ("3", "=?UTF-8?Q?Non-quotable_text_=C3=A9?=", cleanGeneratedWords(str)); + } + + void testWordGenerateSpecialCharsets() + { + // ISO-2022-JP only uses 7-bit chars but should be encoded in Base64 + VASSERT_EQ("1", "=?iso-2022-jp?B?XlskQiVRITwlPSVKJWshJiU9JVUlSCUmJSclIl5bKEI=?=", + cleanGeneratedWords(vmime::word("^[$B%Q!<%=%J%k!&%=%U%H%&%'%\"^[(B", + vmime::charset("iso-2022-jp")).generate(100))); + } + VMIME_TEST_SUITE_END diff --git a/vmime/utility/stringUtils.hpp b/vmime/utility/stringUtils.hpp index b6589dbc..a8270d32 100644 --- a/vmime/utility/stringUtils.hpp +++ b/vmime/utility/stringUtils.hpp @@ -104,6 +104,14 @@ public: */ static string::size_type countASCIIchars(const string::const_iterator begin, const string::const_iterator end); + /** Returns the position of the first non 7-bit US-ASCII character in a string. + * + * @param begin start position + * @param end end position + * @return position since begin, or string::npos + */ + static string::size_type findFirstNonASCIIchar(const string::const_iterator begin, const string::const_iterator end); + /** Convert the specified value to a string value. * * @param value to convert diff --git a/vmime/wordEncoder.hpp b/vmime/wordEncoder.hpp index 17ca8081..1a492ea6 100644 --- a/vmime/wordEncoder.hpp +++ b/vmime/wordEncoder.hpp @@ -73,12 +73,23 @@ public: */ Encoding getEncoding() const; -private: + /** Test whether RFC-2047 encoding is needed. + * + * @param buffer buffer to analyze + * @param charset charset of the buffer + * @return true if encoding is needed, false otherwise. + */ + static bool isEncodingNeeded(const string& buffer, const charset& charset); + /** Guess the best RFC-2047 encoding to use for the specified buffer. + * + * @param buffer buffer to analyze + * @param charset charset of the buffer + * @return RFC-2047 encoding + */ static Encoding guessBestEncoding(const string& buffer, const charset& charset); - void guessBestEncoding(); - +private: string m_buffer; string::size_type m_pos;