Fixed bug #1096610: non-integral number of chars in RFC-2047 encoded words.

author: Vincent Richard <[email protected]> 2006-10-02 13:44:00 +0000
committer: Vincent Richard <[email protected]> 2006-10-02 13:44:00 +0000
commit: b79a6ad89013e4aba947df043ad26cbe7fbab5a5 (patch)
tree: 352ecfc74b409c71f68e906630940bda71350083
parent: Attachment [file]name. (diff)
download: vmime-b79a6ad89013e4aba947df043ad26cbe7fbab5a5.tar.gz
vmime-b79a6ad89013e4aba947df043ad26cbe7fbab5a5.zip
6 files changed, 448 insertions, 116 deletions
diff --git a/ChangeLog b/ChangeLog
index 85763a65..5b4a4b94 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,6 +2,12 @@
 VERSION 0.8.1cvs
 ================
 
+2006-10-02  Vincent Richard  <[email protected]>
+
+ * word, wordEncoder: fixed bug #1096610, which caused encoding of a
+   non-integral number of characters (and thus generation of
+   incorrectly-formed words) with multi-byte charsets.
+
 2006-07-13  Vincent Richard  <[email protected]>
 
  * Fixed bugs in MHTML code: 'CID' prefix should not be case-sensitive;
diff --git a/SConstruct b/SConstruct
index c719c292..ed5a1b2e 100644
--- a/SConstruct
+++ b/SConstruct
@@ -140,6 +140,7 @@ libvmime_sources = [
 	'textPart.hpp',
 	'types.hpp',
 	'word.cpp', 'word.hpp',
+	'wordEncoder.cpp', 'wordEncoder.hpp',
 	'vmime.hpp',
 	# ==============================  Utility  =============================
 	'utility/childProcess.hpp',
diff --git a/src/word.cpp b/src/word.cpp
index 58133577..39059dc0 100644
--- a/src/word.cpp
+++ b/src/word.cpp
@@ -32,6 +32,8 @@
 #include "vmime/encoderB64.hpp"
 #include "vmime/encoderQP.hpp"
 
+#include "vmime/wordEncoder.hpp"
+
 
 namespace vmime
 {
@@ -481,12 +483,11 @@ void word::generate(utility::outputStream& os, const string::size_type maxLineLe
 				? maxLineLength
 				: std::min(maxLineLength, static_cast <string::size_type>(76));
 
-		// Base64 if more than 60% non-ascii, quoted-printable else (default)
-		const string::size_type asciiPercent = (m_buffer.length() == 0 ? 100 : (100 * asciiCount) / m_buffer.length());
-		const string::value_type encoding = (asciiPercent <= 40) ? 'B' : 'Q';
+		wordEncoder wordEnc(m_buffer, m_charset);
 
-		string wordStart("=?" + m_charset.getName() + "?" + encoding + "?");
-		string wordEnd("?=");
+		const string wordStart("=?" + m_charset.getName() + "?" +
+			(wordEnc.getEncoding() == wordEncoder::ENCODING_B64 ? 'B' : 'Q') + "?");
+		const string wordEnd("?=");
 
 		const string::size_type minWordLength = wordStart.length() + wordEnd.length();
 		const string::size_type maxLineLength2 = (maxLineLength3 < minWordLength + 1)
@@ -520,125 +521,40 @@ void word::generate(utility::outputStream& os, const string::size_type maxLineLe
 		}
 
 		// Encode and fold input buffer
-		string::const_iterator pos = m_buffer.begin();
-		string::size_type remaining = m_buffer.length();
-
-		encoder* theEncoder = NULL;
-
-		if (encoding == 'B') theEncoder = new encoderB64;
-		else theEncoder = new encoderQP;
-
-		string qpEncodedBuffer;
-
-		if (encoding == 'Q')
-		{
-			theEncoder->getProperties()["rfc2047"] = true;
-
-			// In the case of Quoted-Printable encoding, we cannot simply encode input
-			// buffer line by line. So, we encode the whole buffer and we will fold it
-			// in the next loop...
-			utility::inputStreamStringAdapter in(m_buffer);
-			utility::outputStreamStringAdapter out(qpEncodedBuffer);
-
-			theEncoder->encode(in, out);
-
-			pos = qpEncodedBuffer.begin();
-			remaining = qpEncodedBuffer.length();
-		}
-
-#if 1
 		if (curLineLength != 1 && !isFirstWord)
 		{
 			os << " "; // Separate from previous word
 			++curLineLength;
 		}
-#endif
 
-		for ( ; remaining ; )
+		for (unsigned int i = 0 ; ; ++i)
 		{
-			// Start a new encoded word
-			os << wordStart;
-			curLineLength += minWordLength;
-
 			// Compute the number of encoded chars that will fit on this line
-			const string::size_type fit = maxLineLength2 - curLineLength;
-
-			// Base-64 encoding
-			if (encoding == 'B')
-			{
-				// TODO: WARNING! "Any encoded word which encodes a non-integral
-				// number of characters or octets is incorrectly formed."
-
-				// Here, we have a formula to compute the maximum number of source
-				// characters to encode knowing the maximum number of encoded chars
-				// (with Base64, 3 bytes of input provide 4 bytes of output).
-				string::size_type count = (fit > 1) ? ((fit - 1) * 3) / 4 : 1;
-				if (count > remaining) count = remaining;
+			const string::size_type fit = maxLineLength2 - minWordLength
+				- (i == 0 ? curLineLength : NEW_LINE_SEQUENCE_LENGTH);
 
-				utility::inputStreamStringAdapter in
-					(m_buffer, pos - m_buffer.begin(), pos - m_buffer.begin() + count);
+			// Get the next encoded chunk
+			const string chunk = wordEnc.getNextChunk(fit);
 
-				curLineLength += theEncoder->encode(in, os);
+			if (chunk.empty())
+				break;
 
-				pos += count;
-				remaining -= count;
-			}
-			// Quoted-Printable encoding
-			else
+			// Start a new encoded word
+			if (i != 0)
 			{
-				// TODO: WARNING! "Any encoded word which encodes a non-integral
-				// number of characters or octets is incorrectly formed."
-
-				// All we have to do here is to take a certain number of character
-				// (that is less than or equal to "fit") from the QP encoded buffer,
-				// but we also make sure not to fold a "=XY" encoded char.
-				const string::const_iterator qpEnd = qpEncodedBuffer.end();
-				string::const_iterator lastFoldPos = pos;
-				string::const_iterator p = pos;
-				string::size_type n = 0;
-
-				while (n < fit && p != qpEnd)
-				{
-					if (*p == '=')
-					{
-						if (n + 3 >= fit)
-						{
-							lastFoldPos = p;
-							break;
-						}
-
-						p += 3;
-						n += 3;
-					}
-					else
-					{
-						++p;
-						++n;
-					}
-				}
-
-				if (lastFoldPos == pos)
-					lastFoldPos = p;
-
-				os << string(pos, lastFoldPos);
+				os << NEW_LINE_SEQUENCE;
+				curLineLength = NEW_LINE_SEQUENCE_LENGTH;
+			}
 
-				curLineLength += (lastFoldPos - pos) + 1;
+			os << wordStart;
+			curLineLength += minWordLength;
 
-				pos += n;
-				remaining -= n;
-			}
+			os << chunk;
+			curLineLength += chunk.length();
 
 			// End of the encoded word
 			os << wordEnd;
-
-			if (remaining)
-			{
-				os << NEW_LINE_SEQUENCE;
-				curLineLength = NEW_LINE_SEQUENCE_LENGTH;
-			}
 		}
-
-		delete (theEncoder);
 	}
 
 	if (newLinePos)
diff --git a/src/wordEncoder.cpp b/src/wordEncoder.cpp
new file mode 100644
index 00000000..e854eac6
--- /dev/null
+++ b/src/wordEncoder.cpp
@@ -0,0 +1,290 @@
+//
+// VMime library (http://www.vmime.org)
+// Copyright (C) 2002-2006 Vincent Richard <[email protected]>
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+//
+// Linking this library statically or dynamically with other modules is making
+// a combined work based on this library.  Thus, the terms and conditions of
+// the GNU General Public License cover the whole combination.
+//
+
+#include "vmime/wordEncoder.hpp"
+
+#include "vmime/exception.hpp"
+#include "vmime/charsetConverter.hpp"
+#include "vmime/encoderB64.hpp"
+#include "vmime/encoderQP.hpp"
+
+#include "vmime/utility/stringUtils.hpp"
+
+
+namespace vmime
+{
+
+// Sets up encoding of 'buffer' (expressed in 'charset'); tries to work on a UTF-8 copy of the text.
+wordEncoder::wordEncoder(const string& buffer, const charset& charset, const Encoding encoding)
+	: m_buffer(buffer), m_pos(0), m_length(buffer.length()), m_charset(charset), m_encoding(encoding)
+{
+	try
+	{
+		string utf8Buffer;
+
+		vmime::charset::convert
+			(buffer, utf8Buffer, charset, vmime::charset(charsets::UTF_8));
+
+		m_buffer = utf8Buffer; // from here on, m_buffer/m_length describe the UTF-8 copy
+		m_length = utf8Buffer.length();
+
+		m_simple = false; // getNextChunk() will cut chunks one UTF-8 character at a time
+	}
+	catch (exceptions::charset_conv_error&)
+	{
+		// Conversion to UTF-8 failed: m_buffer still holds the original bytes
+		// (set in the initializer list), so fall back on simple (byte-wise) encoding.
+		m_simple = true;
+	}
+
+	if (m_encoding == ENCODING_AUTO) // no encoding forced by the caller
+		m_encoding = guessBestEncoding(buffer, charset); // guess from the original (unconverted) buffer
+
+	if (m_encoding == ENCODING_B64)
+	{
+		m_encoder = vmime::create <encoderB64>();
+	}
+	else // ENCODING_QP
+	{
+		m_encoder = vmime::create <encoderQP>();
+		m_encoder->getProperties()["rfc2047"] = true; // presumably selects the RFC-2047 'Q' variant of QP -- confirm in encoderQP
+	}
+}
+
+// Returns the length, in bytes, of the UTF-8 character starting at buffer[pos] ('length' is the total buffer length).
+static const string::size_type getUTF8CharLength
+	(const string& buffer, const string::size_type pos, const string::size_type length)
+{
+	// Number of extra (trailing) bytes in a UTF-8 char, indexed by its leading byte
+	static const unsigned char UTF8_EXTRA_BYTES[256] =
+	{
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xC0-0xCF
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xD0-0xDF
+		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xE0-0xEF
+		3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5  // 0xF0-0xFF
+	};
+
+	const unsigned char c = buffer[pos]; // leading byte (caller must ensure pos < buffer size)
+	const unsigned char n = UTF8_EXTRA_BYTES[c];
+
+	if (n < length - pos) // the leading byte and its 'n' extra bytes all lie within the buffer
+		return n + 1;
+	else // truncated sequence: treat the leading byte as a 1-byte character
+		return 1;
+}
+
+// Returns the next encoded chunk, or an empty string once all input is consumed; 'maxLength' is a soft limit on the encoded size.
+const string wordEncoder::getNextChunk(const string::size_type maxLength)
+{
+	const string::size_type remaining = m_length - m_pos;
+
+	if (remaining == 0)
+		return string();
+
+	vmime::string chunk;
+	vmime::utility::outputStreamStringAdapter chunkStream(chunk);
+
+	// Simple encoding
+	if (m_simple)
+	{
+		// WARNING! Simple encoding can encode a non-integral number of
+		// characters and then may generate incorrectly-formed words!
+
+		if (m_encoding == ENCODING_B64)
+		{
+			// Here, we have a formula to compute the maximum number of source
+			// bytes to encode knowing the maximum number of encoded chars. In
+			// Base64 encoding, 3 bytes of input provide 4 bytes of output.
+			const string::size_type inputCount =
+				std::min(remaining, (maxLength > 1) ? ((maxLength - 1) * 3) / 4 : 1);
+
+			// Encode chunk
+			utility::inputStreamStringAdapter in(m_buffer, m_pos, m_pos + inputCount);
+
+			m_encoder->encode(in, chunkStream);
+			m_pos += inputCount;
+		}
+		else // ENCODING_QP
+		{
+			// Compute how many input bytes are needed to reach an output
+			// string length of 'maxLength' bytes. In Quoted-Printable
+			// encoding, encoded bytes take 3 bytes.
+			string::size_type inputCount = 0;
+			string::size_type outputCount = 0;
+
+			while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining))
+			{
+				const unsigned char c = m_buffer[m_pos + inputCount];
+				bool encoded = true;
+
+				switch (c)
+				{
+				case ',':
+				case ';':
+				case ':':
+				case '_':
+				case '=':
+
+					encoded = true;
+					break;
+
+				default:
+
+					if (c >= 33 && c <= 126 && c != 61)
+						encoded = false;
+
+					break;
+				}
+
+				inputCount++;
+				outputCount += (encoded ? 3 : 1);
+			}
+
+			// Encode chunk
+			utility::inputStreamStringAdapter in(m_buffer, m_pos, m_pos + inputCount);
+
+			m_encoder->encode(in, chunkStream);
+			m_pos += inputCount;
+		}
+	}
+	// Fully RFC-compliant encoding
+	else
+	{
+		charsetConverter conv(vmime::charset(charsets::UTF_8), m_charset);
+
+		string::size_type inputCount = 0;
+		string::size_type outputCount = 0;
+		string encodeBuffer;
+
+		while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining))
+		{
+			// Get the next UTF8 character
+			const string::size_type inputCharLength =
+				getUTF8CharLength(m_buffer, m_pos + inputCount, m_length);
+
+			const string inputChar(m_buffer.begin() + m_pos + inputCount,
+				m_buffer.begin() + m_pos + inputCount + inputCharLength);
+
+			// Convert back to original encoding
+			string encodeBytes;
+			conv.convert(inputChar, encodeBytes);
+
+			encodeBuffer += encodeBytes;
+
+			// Estimate the encoded size of everything accumulated so far
+			if (m_encoding == ENCODING_B64)
+			{
+				outputCount = std::max(static_cast <string::size_type>(4),
+					(encodeBuffer.length() * 4) / 3); // whole chunk, not only the last character
+			}
+			else // ENCODING_QP
+			{
+				for (string::size_type i = 0, n = encodeBytes.length() ; i < n ; ++i)
+				{
+					const unsigned char c = encodeBytes[i];
+					bool encoded = true;
+
+					switch (c)
+					{
+					case ',':
+					case ';':
+					case ':':
+					case '_':
+					case '=':
+
+						encoded = true;
+						break;
+
+					default:
+
+						if (c >= 33 && c <= 126 && c != 61)
+							encoded = false;
+
+						break;
+					}
+
+					outputCount += (encoded ? 3 : 1);
+				}
+			}
+
+			inputCount += inputCharLength;
+		}
+
+		// Encode chunk
+		utility::inputStreamStringAdapter in(encodeBuffer);
+
+		m_encoder->encode(in, chunkStream);
+		m_pos += inputCount;
+	}
+
+	return chunk;
+}
+
+// Returns the encoding in use (the constructor replaces ENCODING_AUTO with the guessed encoding).
+const wordEncoder::Encoding wordEncoder::getEncoding() const
+{
+	return m_encoding;
+}
+
+// Chooses between Quoted-Printable and Base64 for 'buffer', based on the charset name and the share of ASCII chars.
+// static
+const wordEncoder::Encoding wordEncoder::guessBestEncoding
+	(const string& buffer, const charset& charset)
+{
+	// If the charset is ISO-8859-x, set to QP encoding
+	const string cset = utility::stringUtils::toLower(charset.getName());
+
+	if (cset.find("iso-8859") != string::npos ||
+	    cset.find("iso8859") != string::npos)
+	{
+		return ENCODING_QP;
+	}
+
+	// Use Base64 if more than 40% non-ASCII, or Quoted-Printable else (default)
+	const string::size_type asciiCount =
+		utility::stringUtils::countASCIIchars(buffer.begin(), buffer.end());
+
+	const string::size_type asciiPercent = // an empty buffer counts as 100% ASCII (avoids dividing by zero)
+		(buffer.length() == 0 ? 100 : (100 * asciiCount) / buffer.length());
+
+	if (asciiPercent < 60)
+		return ENCODING_B64;
+	else
+		return ENCODING_QP;
+}
+
+
+} // vmime
+
diff --git a/tests/parser/textTest.cpp b/tests/parser/textTest.cpp
index 011ec294..adcd014a 100644
--- a/tests/parser/textTest.cpp
+++ b/tests/parser/textTest.cpp
@@ -41,9 +41,34 @@ VMIME_TEST_SUITE_BEGIN
 		VMIME_TEST(testWordConstructors)
 		VMIME_TEST(testWordParse)
 		VMIME_TEST(testWordGenerate)
+		VMIME_TEST(testWordGenerateMultiBytes)
 	VMIME_TEST_LIST_END
 
 
+	static const vmime::string getDisplayText(const vmime::text& t) // concatenates the buffers of all words in 't'
+	{
+		vmime::string res;
+
+		for (int i = 0 ; i < t.getWordCount() ; ++i)
+			res += t.getWordAt(i)->getBuffer(); // no separator is inserted between words
+
+		return res;
+	}
+	// Joins the lines of a generated (folded) string into a single line, passing each line through trim().
+	static const vmime::string cleanGeneratedWords(const std::string& str)
+	{
+		std::istringstream iss(str);
+
+		std::string res;
+		std::string x; // current line, without its '\n' terminator
+
+		while (std::getline(iss, x))
+			res += vmime::utility::stringUtils::trim(x); // presumably strips surrounding whitespace (incl. '\r') -- confirm in stringUtils
+
+		return res;
+	}
+
+
 	void testConstructors()
 	{
 		vmime::text t1;
@@ -171,16 +196,6 @@ VMIME_TEST_SUITE_BEGIN
 		// TODO
 	}
 
-	static const vmime::string getDisplayText(const vmime::text& t)
-	{
-		vmime::string res;
-
-		for (int i = 0 ; i < t.getWordCount() ; ++i)
-			res += t.getWordAt(i)->getBuffer();
-
-		return res;
-	}
-
 	void testDisplayForm()
 	{
 #define DISPLAY_FORM(x) getDisplayText(*vmime::text::decodeAndUnfold(x))
@@ -254,5 +269,15 @@ VMIME_TEST_SUITE_BEGIN
 			vmime::word("\xf1\xf2\xf3\xf4\xf5", vmime::charset("foo")).generate());
 	}
 
+	void testWordGenerateMultiBytes()
+	{
+		// Ensure we don't encode a non-integral number of characters: the 2-byte "\xc3\xa9" must stay within one encoded word
+		VASSERT_EQ("1", "=?utf-8?Q?aaa?==?utf-8?Q?=C3=A9?==?utf-8?Q?zzz?=",
+			cleanGeneratedWords(vmime::word("aaa\xc3\xa9zzz", vmime::charset("utf-8")).generate(16)));
+
+		VASSERT_EQ("2", "=?utf-8?Q?aaa=C3=A9?==?utf-8?Q?zzz?=",
+			cleanGeneratedWords(vmime::word("aaa\xc3\xa9zzz", vmime::charset("utf-8")).generate(17)));
+	}
+
 VMIME_TEST_SUITE_END
 
diff --git a/vmime/wordEncoder.hpp b/vmime/wordEncoder.hpp
new file mode 100644
index 00000000..584904ca
--- /dev/null
+++ b/vmime/wordEncoder.hpp
@@ -0,0 +1,94 @@
+//
+// VMime library (http://www.vmime.org)
+// Copyright (C) 2002-2006 Vincent Richard <[email protected]>
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+//
+// Linking this library statically or dynamically with other modules is making
+// a combined work based on this library.  Thus, the terms and conditions of
+// the GNU General Public License cover the whole combination.
+//
+
+#ifndef VMIME_WORDENCODER_HPP_INCLUDED
+#define VMIME_WORDENCODER_HPP_INCLUDED
+
+
+#include "vmime/charset.hpp"
+
+
+namespace vmime
+{
+
+
+class encoder;
+
+
+/** Encodes words following RFC-2047, one chunk at a time.
+  */
+
+class wordEncoder
+{
+public:
+
+	/** Available encodings for RFC-2047. */
+	enum Encoding
+	{
+		ENCODING_AUTO,   /**< Let the encoder choose between QP and Base64. */
+		ENCODING_QP,     /**< Quoted-Printable ('Q' encoded words). */
+		ENCODING_B64     /**< Base64 ('B' encoded words). */
+	};
+
+	/** Construct an encoder for the specified text, expressed in the specified charset. */
+	wordEncoder(const string& buffer, const charset& charset, const Encoding encoding = ENCODING_AUTO);
+
+
+	/** Return the next chunk in the word.
+	  *
+	  * @param maxLength maximal length of the chunk
+	  * @return next chunk, of maximal length 'maxLength' if possible
+	  */
+	const string getNextChunk(const string::size_type maxLength);
+
+	/** Return the encoding used.
+	  *
+	  * @return encoding
+	  */
+	const Encoding getEncoding() const;
+
+private:
+
+	/** Guess the best encoding (QP or Base64) for the specified buffer
+	  * and charset; used when ENCODING_AUTO is requested. */
+	static const Encoding guessBestEncoding(const string& buffer, const charset& charset);
+
+
+	string m_buffer;
+	string::size_type m_pos;
+	string::size_type m_length;
+
+	bool m_simple;
+
+	charset m_charset;
+	Encoding m_encoding;
+
+	ref <encoder> m_encoder;
+};
+
+
+} // vmime
+
+
+#endif // VMIME_WORDENCODER_HPP_INCLUDED
+
author	Vincent Richard <[email protected]>	2006-10-02 13:44:00 +0000
committer	Vincent Richard <[email protected]>	2006-10-02 13:44:00 +0000
commit	b79a6ad89013e4aba947df043ad26cbe7fbab5a5 (patch)
tree	352ecfc74b409c71f68e906630940bda71350083
parent	Attachment [file]name. (diff)
download	vmime-b79a6ad89013e4aba947df043ad26cbe7fbab5a5.tar.gz vmime-b79a6ad89013e4aba947df043ad26cbe7fbab5a5.zip