Fixed #149: don't loose charset when fixing invalid broken words.

author: Vincent Richard <[email protected]> 2016-11-05 12:31:54 +0000
committer: Vincent Richard <[email protected]> 2016-11-05 12:31:54 +0000
commit: 5424aa2381b76464ead401090558820f5d9d270b (patch)
tree: f12ba33b48ca55348542ed8c903f59ac2e8bf9c0
parent: Merge branch 'master' of https://github.com/kisli/vmime (diff)
download: vmime-5424aa2381b76464ead401090558820f5d9d270b.tar.gz
vmime-5424aa2381b76464ead401090558820f5d9d270b.zip
2 files changed, 97 insertions, 21 deletions
diff --git a/src/vmime/text.cpp b/src/vmime/text.cpp
index 0722be02..939f4f81 100644
--- a/src/vmime/text.cpp
+++ b/src/vmime/text.cpp
@@ -417,10 +417,9 @@ void text::fixBrokenWords(std::vector <shared_ptr <word> >& words)
 
 	// Fix words which encode a non-integral number of characters.
 	// This is not RFC-compliant, but we should be able to recover from it.
-	for (size_t i = 0, n = words.size() - 1 ; i < n ; ++i)
+	for (size_t i = 0, n = words.size() ; i < n - 1 ; ++i)
 	{
 		shared_ptr <word> w1 = words[i];
-		shared_ptr <word> w2 = words[i + 1];
 
 		// Check whether the word is valid
 		bool valid = false;
@@ -436,22 +435,46 @@ void text::fixBrokenWords(std::vector <shared_ptr <word> >& words)
 		}
 
 		// If the current word is not valid, try to grab some bytes
-		// from the next word, to see whether it becomes valid.
+		// from the next words, to see whether it becomes valid.
 		if (!valid)
 		{
 			string buffer(w1->getBuffer());
-			buffer += w2->getBuffer();
+			size_t mergeWords = 1;  // number of adjacent words to merge
+
+			for (size_t j = i + 1 ; j < n ; ++j)
+			{
+				shared_ptr <word> nextWord = words[j];
+
+				if (nextWord->getCharset() != w1->getCharset())
+					break;
+
+				buffer += nextWord->getBuffer();
+				++mergeWords;
+			}
+
+			if (mergeWords == 1)
+			{
+				// No adjacent word with same charset found
+				continue;
+			}
 
 			string::size_type firstInvalidByte;
 			valid = w1->getCharset().isValidText(buffer, &firstInvalidByte);
 
-			// Current word with additional bytes from the next word
-			// is now valid: adjust buffers of both words.
+			// Current word with additional bytes from the next words
+			// is now valid: adjust buffers of words.
 			w1->setBuffer(string(buffer.begin(), buffer.begin() + firstInvalidByte));
-			w2->setBuffer(string(buffer.begin() + firstInvalidByte, buffer.end()));
+			words[i + 1]->setBuffer(string(buffer.begin() + firstInvalidByte, buffer.end()));
+
+			// Remove unused words
+			for (size_t j = 0 ; j < mergeWords - 2 ; ++j)
+			{
+				words.erase(words.begin() + i + 2);
+				--n;
+			}
 
 			// If the next word is now empty, remove it
-			if (w2->getBuffer().empty())
+			if (words[i + 1]->getBuffer().empty())
 			{
 				words.erase(words.begin() + i + 1);
 				--n;
diff --git a/tests/parser/textTest.cpp b/tests/parser/textTest.cpp
index 978d9145..530c83b6 100644
--- a/tests/parser/textTest.cpp
+++ b/tests/parser/textTest.cpp
@@ -62,6 +62,7 @@ VMIME_TEST_SUITE_BEGIN(textTest)
 
 		VMIME_TEST(testWronglyPaddedB64Words)
 		VMIME_TEST(testFixBrokenWords)
+		VMIME_TEST(testUnknownCharset)
 	VMIME_TEST_LIST_END
 
 
@@ -632,19 +633,21 @@ VMIME_TEST_SUITE_BEGIN(textTest)
 			("=?utf-8?Q?Gwena=C3?="
 			 "=?utf-8?Q?=ABl?=", &outText);
 
-		VASSERT_EQ("1", "Gwena\xebl",
-			outText.getConvertedText(vmime::charset("iso-8859-1"), opts));
+		VASSERT_EQ("1.1", 1, outText.getWordCount());
+		VASSERT_EQ("1.2", "Gwena\xc3\xabl", outText.getWordAt(0)->getBuffer());
+		VASSERT_EQ("1.3", vmime::charset("utf-8"), outText.getWordAt(0)->getCharset());
 
 		// Test case 2
 		vmime::text::decodeAndUnfold
 			("=?utf-8?B?5Lit6Yu85qmf5qKw6JGj5LqL5pyDMTAz5bm056ysMDXlsYbn?="
 			 "=?utf-8?B?rKwwN+asoeitsOeoiw==?=", &outText);
 
-		VASSERT_EQ("2", "\xe4\xb8\xad\xe9\x8b\xbc\xe6\xa9\x9f\xe6\xa2\xb0"
+		VASSERT_EQ("2.1", 1, outText.getWordCount());
+		VASSERT_EQ("2.2", "\xe4\xb8\xad\xe9\x8b\xbc\xe6\xa9\x9f\xe6\xa2\xb0"
 			"\xe8\x91\xa3\xe4\xba\x8b\xe6\x9c\x83\x31\x30\x33\xe5\xb9\xb4"
 			"\xe7\xac\xac\x30\x35\xe5\xb1\x86\xe7\xac\xac\x30\x37\xe6\xac"
-			"\xa1\xe8\xad\xb0\xe7\xa8\x8b",
-			outText.getConvertedText(vmime::charset("utf-8")));
+			"\xa1\xe8\xad\xb0\xe7\xa8\x8b", outText.getWordAt(0)->getBuffer());
+		VASSERT_EQ("2.3", vmime::charset("utf-8"), outText.getWordAt(0)->getCharset());
 
 		// Test case 3 (a character spanning over 3 words: 'を' = E3 82 92)
 		vmime::text::decodeAndUnfold
@@ -652,15 +655,65 @@ VMIME_TEST_SUITE_BEGIN(textTest)
 			 "=?utf-8?Q?=82?="
 			 "=?utf-8?Q?=92xyz?=", &outText);
 
-		std::string out;  // decode as UTF-16 then rencode to UTF-8 for easier comparison
-		vmime::charset::convert(
-			outText.getConvertedText(vmime::charset("utf-16"), opts),
-			out,
-			vmime::charset("utf-16"),
-			vmime::charset("utf-8")
-		);
+		VASSERT_EQ("3.1", 1, outText.getWordCount());
+		VASSERT_EQ("3.2", "abc\xe3\x82\x92xyz", outText.getWordAt(0)->getBuffer());
+		VASSERT_EQ("3.3", vmime::charset("utf-8"), outText.getWordAt(0)->getCharset());
+
+		// Test case 4 (remains invalid)
+		vmime::text::decodeAndUnfold
+			("=?utf-8?Q?abc=E3?="
+			 "=?utf-8?Q?=82?="
+			 "=?utf-8?Q?xy?="
+			 "=?utf-8?Q?z?=", &outText);
+
+		VASSERT_EQ("4.1", 2, outText.getWordCount());
+		VASSERT_EQ("4.2", "abc", outText.getWordAt(0)->getBuffer());
+		VASSERT_EQ("4.3", vmime::charset("utf-8"), outText.getWordAt(0)->getCharset());
+		VASSERT_EQ("4.4", "\xe3\x82xyz", outText.getWordAt(1)->getBuffer());
+		VASSERT_EQ("4.5", vmime::charset("utf-8"), outText.getWordAt(1)->getCharset());
+
+		// Test case 5 (remains partially invalid)
+		vmime::text::decodeAndUnfold
+			("=?utf-8?Q?abc=E3?="
+			 "=?utf-8?Q?=82?="
+			 "=?utf-8?Q?\x92xy?="
+			 "=?utf-8?Q?z\xc3?=", &outText);
+
+		VASSERT_EQ("5.1", 2, outText.getWordCount());
+		VASSERT_EQ("5.2", "abc\xe3\x82\x92xyz", outText.getWordAt(0)->getBuffer());
+		VASSERT_EQ("5.3", vmime::charset("utf-8"), outText.getWordAt(0)->getCharset());
+		VASSERT_EQ("5.4", "\xc3", outText.getWordAt(1)->getBuffer());
+		VASSERT_EQ("5.5", vmime::charset("utf-8"), outText.getWordAt(1)->getCharset());
+	}
+
+	void testUnknownCharset()
+	{
+		vmime::text t;
+		vmime::text::decodeAndUnfold("=?gb2312?B?wdaRY8PA?=", &t);
+
+		VASSERT_EQ("1.1", 1, t.getWordCount());
+		VASSERT_EQ("1.2", "\xc1\xd6\x91\x63\xc3\xc0", t.getWordAt(0)->getBuffer());
+		VASSERT_EQ("1.3", vmime::charset("gb2312"), t.getWordAt(0)->getCharset());
+
+
+
+		vmime::parsingContext ctx;
+
+		const vmime::string hfieldBuffer = "From: '=?gb2312?B?wdaRY8PA?=' <[email protected]>";
+
+		vmime::shared_ptr <vmime::headerField> hfield =
+			vmime::headerField::parseNext(ctx, hfieldBuffer, 0, hfieldBuffer.size());
+
+		vmime::shared_ptr <vmime::mailbox> hvalue =
+			hfield->getValue <vmime::mailbox>();
 
-		VASSERT_EQ("3", "abc\xe3\x82\x92xyz", out);
+		VASSERT_EQ("2.1", 3, hvalue->getName().getWordCount());
+		VASSERT_EQ("2.2", "'", hvalue->getName().getWordAt(0)->getBuffer());
+		VASSERT_EQ("2.3", vmime::charset("us-ascii"), hvalue->getName().getWordAt(0)->getCharset());
+		VASSERT_EQ("2.4", "\xc1\xd6\x91\x63\xc3\xc0", hvalue->getName().getWordAt(1)->getBuffer());
+		VASSERT_EQ("2.5", vmime::charset("gb2312"), hvalue->getName().getWordAt(1)->getCharset());
+		VASSERT_EQ("2.6", "'", hvalue->getName().getWordAt(2)->getBuffer());
+		VASSERT_EQ("2.7", vmime::charset("us-ascii"), hvalue->getName().getWordAt(2)->getCharset());
 	}
 
 VMIME_TEST_SUITE_END
author	Vincent Richard <[email protected]>	2016-11-05 12:31:54 +0000
committer	Vincent Richard <[email protected]>	2016-11-05 12:31:54 +0000
commit	5424aa2381b76464ead401090558820f5d9d270b (patch)
tree	f12ba33b48ca55348542ed8c903f59ac2e8bf9c0
parent	Merge branch 'master' of https://github.com/kisli/vmime (diff)
download	vmime-5424aa2381b76464ead401090558820f5d9d270b.tar.gz vmime-5424aa2381b76464ead401090558820f5d9d270b.zip