Always encode special charsets.
This commit is contained in:
parent
b2b01b8c26
commit
4ff310c7e4
@ -151,6 +151,24 @@ string::size_type stringUtils::countASCIIchars
|
||||
}
|
||||
|
||||
|
||||
// Scans [begin, end) and returns the offset (relative to begin) of the first
// character rejected by parserHelpers::isAscii(), or string::npos when every
// character of the range is accepted.
string::size_type stringUtils::findFirstNonASCIIchar
	(const string::const_iterator begin, const string::const_iterator end)
{
	string::const_iterator cur = begin;

	while (cur != end)
	{
		// The first offending character ends the scan
		if (!parserHelpers::isAscii(*cur))
			return cur - begin;

		++cur;
	}

	// Reached the end without meeting a non-ASCII character
	return string::npos;
}
|
||||
|
||||
|
||||
const string stringUtils::unquote(const string& str)
|
||||
{
|
||||
if (str.length() < 2)
|
||||
|
28
src/word.cpp
28
src/word.cpp
@ -336,30 +336,22 @@ void word::generate(utility::outputStream& os, const string::size_type maxLineLe
|
||||
if (state == NULL)
|
||||
state = &defaultGeneratorState;
|
||||
|
||||
// Calculate the number of ASCII chars to check whether encoding is needed
|
||||
// and _which_ encoding to use.
|
||||
const string::size_type asciiCount =
|
||||
utility::stringUtils::countASCIIchars(m_buffer.begin(), m_buffer.end());
|
||||
// Find out if encoding is forced or required by contents + charset
|
||||
bool encodingNeeded = (flags & text::FORCE_ENCODING) != 0;
|
||||
|
||||
bool noEncoding = (flags & text::FORCE_NO_ENCODING) ||
|
||||
(!(flags & text::FORCE_ENCODING) && asciiCount == m_buffer.length());
|
||||
|
||||
if (!(flags & text::FORCE_NO_ENCODING) &&
|
||||
m_buffer.find_first_of("\n\r") != string::npos)
|
||||
{
|
||||
// Force encoding when there are only ASCII chars, but there is
|
||||
// also at least one of '\n' or '\r' (header fields)
|
||||
noEncoding = false;
|
||||
}
|
||||
if (encodingNeeded == false)
|
||||
encodingNeeded = wordEncoder::isEncodingNeeded(m_buffer, m_charset);
|
||||
else if ((flags & text::FORCE_NO_ENCODING) != 0)
|
||||
encodingNeeded = false;
|
||||
|
||||
// If possible and requested (with flag), quote the buffer (no folding is performed).
|
||||
// Quoting is possible if and only if:
|
||||
// - the whole buffer is ASCII-only
|
||||
// - the buffer does not need to be encoded
|
||||
// - the buffer does not contain quoting character (")
|
||||
// - there is enough remaining space on the current line to hold the whole buffer
|
||||
if (!noEncoding &&
|
||||
if (!encodingNeeded &&
|
||||
(flags & text::QUOTE_IF_POSSIBLE) &&
|
||||
asciiCount == m_buffer.length() &&
|
||||
!encodingNeeded &&
|
||||
m_buffer.find('"') == string::npos &&
|
||||
(curLineLength + 2 /* 2 x " */ + m_buffer.length()) < maxLineLength)
|
||||
{
|
||||
@ -367,7 +359,7 @@ void word::generate(utility::outputStream& os, const string::size_type maxLineLe
|
||||
curLineLength += 2 + m_buffer.length();
|
||||
}
|
||||
// We will fold lines without encoding them.
|
||||
else if (noEncoding)
|
||||
else if (!encodingNeeded)
|
||||
{
|
||||
string::const_iterator lastWSpos = m_buffer.end(); // last white-space position
|
||||
string::const_iterator curLineStart = m_buffer.begin(); // current line start
|
||||
|
@ -260,17 +260,75 @@ wordEncoder::Encoding wordEncoder::getEncoding() const
|
||||
}
|
||||
|
||||
|
||||
// Explicitly force encoding for some charsets
|
||||
struct CharsetEncodingEntry
|
||||
{
|
||||
CharsetEncodingEntry(const std::string& charset_, const wordEncoder::Encoding encoding_)
|
||||
: charset(charset_), encoding(encoding_)
|
||||
{
|
||||
}
|
||||
|
||||
std::string charset;
|
||||
wordEncoder::Encoding encoding;
|
||||
};
|
||||
|
||||
// Table of charsets for which a specific encoding is forced.
//
// Each entry holds a lower-case fragment that is searched for inside the
// lower-cased charset name. The table is only ever read, so it is declared
// const with internal linkage: it cannot be modified by accident and its
// symbol cannot clash with another translation unit.
static const CharsetEncodingEntry g_charsetEncodingMap[] =
{
	// Use QP encoding for ISO-8859-x charsets
	CharsetEncodingEntry("iso-8859", wordEncoder::ENCODING_QP),
	CharsetEncodingEntry("iso8859", wordEncoder::ENCODING_QP),

	// RFC-1468 states:
	//    " ISO-2022-JP may also be used in MIME Part 2 headers. The "B"
	//      encoding should be used with ISO-2022-JP text. "
	// Use Base64 encoding for all ISO-2022 charsets.
	CharsetEncodingEntry("iso-2022", wordEncoder::ENCODING_B64),
	CharsetEncodingEntry("iso2022", wordEncoder::ENCODING_B64),

	// Sentinel: lookups iterate over (element count - 1) entries, so this
	// last entry is never examined. It must stay last.
	CharsetEncodingEntry("", wordEncoder::ENCODING_AUTO)
};
|
||||
|
||||
|
||||
// static
|
||||
bool wordEncoder::isEncodingNeeded(const string& buffer, const charset& charset)
|
||||
{
|
||||
// Special treatment for some charsets
|
||||
const string cset = utility::stringUtils::toLower(charset.getName());
|
||||
|
||||
for (unsigned int i = 0 ; i < (sizeof(g_charsetEncodingMap) / sizeof(g_charsetEncodingMap[0])) - 1 ; ++i)
|
||||
{
|
||||
if (cset.find(g_charsetEncodingMap[i].charset) != string::npos)
|
||||
{
|
||||
if (g_charsetEncodingMap[i].encoding != wordEncoder::ENCODING_AUTO)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// No encoding is needed if the buffer only contains ASCII chars
|
||||
if (utility::stringUtils::findFirstNonASCIIchar(buffer.begin(), buffer.end()) != string::npos)
|
||||
return true;
|
||||
|
||||
// Force encoding when there are only ASCII chars, but there is
|
||||
// also at least one of '\n' or '\r' (header fields)
|
||||
if (buffer.find_first_of("\n\r") != string::npos)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// static
|
||||
wordEncoder::Encoding wordEncoder::guessBestEncoding
|
||||
(const string& buffer, const charset& charset)
|
||||
{
|
||||
// If the charset is ISO-8859-x, set to QP encoding
|
||||
// Special treatment for some charsets
|
||||
const string cset = utility::stringUtils::toLower(charset.getName());
|
||||
|
||||
if (cset.find("iso-8859") != string::npos ||
|
||||
cset.find("iso8859") != string::npos)
|
||||
for (unsigned int i = 0 ; i < (sizeof(g_charsetEncodingMap) / sizeof(g_charsetEncodingMap[0])) - 1 ; ++i)
|
||||
{
|
||||
return ENCODING_QP;
|
||||
if (cset.find(g_charsetEncodingMap[i].charset) != string::npos)
|
||||
return g_charsetEncodingMap[i].encoding;
|
||||
}
|
||||
|
||||
// Use Base64 if more than 40% non-ASCII, or Quoted-Printable else (default)
|
||||
|
@ -44,6 +44,8 @@ VMIME_TEST_SUITE_BEGIN
|
||||
VMIME_TEST(testWordGenerateSpace)
|
||||
VMIME_TEST(testWordGenerateSpace2)
|
||||
VMIME_TEST(testWordGenerateMultiBytes)
|
||||
VMIME_TEST(testWordGenerateQuote)
|
||||
VMIME_TEST(testWordGenerateSpecialCharsets)
|
||||
VMIME_TEST_LIST_END
|
||||
|
||||
|
||||
@ -335,9 +337,38 @@ VMIME_TEST_SUITE_BEGIN
|
||||
VASSERT_EQ("1", "=?utf-8?Q?aaa?==?utf-8?Q?=C3=A9?==?utf-8?Q?zzz?=",
|
||||
cleanGeneratedWords(vmime::word("aaa\xc3\xa9zzz", vmime::charset("utf-8")).generate(16)));
|
||||
|
||||
VASSERT_EQ("1", "=?utf-8?Q?aaa=C3=A9?==?utf-8?Q?zzz?=",
|
||||
VASSERT_EQ("2", "=?utf-8?Q?aaa=C3=A9?==?utf-8?Q?zzz?=",
|
||||
cleanGeneratedWords(vmime::word("aaa\xc3\xa9zzz", vmime::charset("utf-8")).generate(17)));
|
||||
}
|
||||
|
||||
void testWordGenerateQuote()
|
||||
{
|
||||
std::string str;
|
||||
vmime::utility::outputStreamStringAdapter os(str);
|
||||
|
||||
// ASCII-only text is quotable
|
||||
str.clear();
|
||||
vmime::word("Quoted text").generate(os, 1000, 0, NULL, vmime::text::QUOTE_IF_POSSIBLE, NULL);
|
||||
VASSERT_EQ("1", "\"Quoted text\"", cleanGeneratedWords(str));
|
||||
|
||||
// Text with CR/LF is not quotable
|
||||
str.clear();
|
||||
vmime::word("Non-quotable\ntext", "us-ascii").generate(os, 1000, 0, NULL, vmime::text::QUOTE_IF_POSSIBLE, NULL);
|
||||
VASSERT_EQ("2", "=?us-ascii?Q?Non-quotable=0Atext?=", cleanGeneratedWords(str));
|
||||
|
||||
// Text with non-ASCII chars is not quotable
|
||||
str.clear();
|
||||
vmime::word("Non-quotable text \xc3\xa9").generate(os, 1000, 0, NULL, vmime::text::QUOTE_IF_POSSIBLE, NULL);
|
||||
VASSERT_EQ("3", "=?UTF-8?Q?Non-quotable_text_=C3=A9?=", cleanGeneratedWords(str));
|
||||
}
|
||||
|
||||
void testWordGenerateSpecialCharsets()
|
||||
{
|
||||
// ISO-2022-JP only uses 7-bit chars but should be encoded in Base64
|
||||
VASSERT_EQ("1", "=?iso-2022-jp?B?XlskQiVRITwlPSVKJWshJiU9JVUlSCUmJSclIl5bKEI=?=",
|
||||
cleanGeneratedWords(vmime::word("^[$B%Q!<%=%J%k!&%=%U%H%&%'%\"^[(B",
|
||||
vmime::charset("iso-2022-jp")).generate(100)));
|
||||
}
|
||||
|
||||
VMIME_TEST_SUITE_END
|
||||
|
||||
|
@ -104,6 +104,14 @@ public:
|
||||
*/
|
||||
static string::size_type countASCIIchars(const string::const_iterator begin, const string::const_iterator end);
|
||||
|
||||
/** Returns the position of the first non 7-bit US-ASCII character in a string.
|
||||
*
|
||||
* @param begin start position
|
||||
* @param end end position
|
||||
* @return offset (relative to begin) of the first non-ASCII character, or string::npos if there is none
|
||||
*/
|
||||
static string::size_type findFirstNonASCIIchar(const string::const_iterator begin, const string::const_iterator end);
|
||||
|
||||
/** Convert the specified value to a string value.
|
||||
*
|
||||
* @param value to convert
|
||||
|
@ -73,12 +73,23 @@ public:
|
||||
*/
|
||||
Encoding getEncoding() const;
|
||||
|
||||
private:
|
||||
/** Test whether RFC-2047 encoding is needed.
|
||||
*
|
||||
* @param buffer buffer to analyze
|
||||
* @param charset charset of the buffer
|
||||
* @return true if encoding is needed, false otherwise.
|
||||
*/
|
||||
static bool isEncodingNeeded(const string& buffer, const charset& charset);
|
||||
|
||||
/** Guess the best RFC-2047 encoding to use for the specified buffer.
|
||||
*
|
||||
* @param buffer buffer to analyze
|
||||
* @param charset charset of the buffer
|
||||
* @return RFC-2047 encoding
|
||||
*/
|
||||
static Encoding guessBestEncoding(const string& buffer, const charset& charset);
|
||||
|
||||
void guessBestEncoding();
|
||||
|
||||
private:
|
||||
|
||||
string m_buffer;
|
||||
string::size_type m_pos;
|
||||
|
Loading…
Reference in New Issue
Block a user