vmime: prevent loss of a space during text::createFromString (#306)

```
mailbox(text("Test München West", charsets::UTF_8), "[email protected]").generate();
```

produces

```
=?us-ascii?Q?Test_?= =?utf-8?Q?M=C3=BCnchen?= =?us-ascii?Q?West?= <[email protected]>
```

The first space, between ``Test`` and ``München``, is encoded as an underscore along with the first word: ``Test_``. The second space, between ``München`` and ``West``, is encoded with neither of the two words and is thus lost. Decoding the text results in ``Test MünchenWest`` instead of ``Test München West``.

This is caused by how ``vmime::text::createFromString()`` handles transitions between 7-bit and 8-bit words: if an 8-bit word follows a 7-bit word, a space is appended to the previous word. The opposite case, a 7-bit word following an 8-bit word, *lacks* this behaviour.

When one fixes this problem, a follow-up issue appears: ``text::createFromString("a b\xFFc d")`` tokenizes the input into ``m_words={word("a "), word("b\xFFc ", utf8), word("d")}``. This "right-side alignment" of the whitespace is a problem for ``word::generate()``: as per RFC 2047, spaces between adjacent encoded words are just separators and are not meant to be displayed, whereas a space between an encoded word and regular ASCII text is not just a separator but is also meant to be displayed. When ``word::generate()`` outputs the b-word (``b\xFFc ``), it would have to strip one space, but only when there is a transition from an encoded word to an unencoded word. ``word::generate()`` does not know whether the following word ``d`` will be encoded or unencoded.

The idea now is to change the tokenization of ``text::createFromString`` such that whitespace is at the *start* of words rather than at the end. With that, ``word::generate()`` need not know anything about the next word, but only about the *previous* one.

Thus, in this patch,

1. The tokenization of ``text::createFromString`` is changed to left-align spaces, and the function is fixed to account for the missing space on transition.
2. ``word::generate`` learns how to steal a space character.
3. Testcases are adjusted to account for the shifted position of the space.

Fixes: #283, #284

Co-authored-by: Vincent Richard <[email protected]>
author: Jan Engelhardt <[email protected]> 2024-05-21 13:55:06 +0000
committer: GitHub <[email protected]> 2024-05-21 13:55:06 +0000
commit: d296c2d1d590f8b4f619d9c555ff24ddecec1614 (patch)
tree: 49ab8eafbebcd761f9fd304dd888c47187b090c1 /tests/parser/textTest.cpp
parent: tests: switch a byte sequence in textTest (#305) (diff)
download: vmime-d296c2d1d590f8b4f619d9c555ff24ddecec1614.tar.gz
vmime-d296c2d1d590f8b4f619d9c555ff24ddecec1614.zip
1 files changed, 5 insertions, 5 deletions
diff --git a/tests/parser/textTest.cpp b/tests/parser/textTest.cpp
index 6d7fe44d..cbb1740a 100644
--- a/tests/parser/textTest.cpp
+++ b/tests/parser/textTest.cpp
@@ -189,11 +189,11 @@ VMIME_TEST_SUITE_BEGIN(textTest)
 		t2.createFromString(s2, c2);
 
 		VASSERT_EQ("2.1", 3, t2.getWordCount());
-		VASSERT_EQ("2.2", "some ASCII characters and special chars: ", t2.getWordAt(0)->getBuffer());
+		VASSERT_EQ("2.2", "some ASCII characters and special chars:", t2.getWordAt(0)->getBuffer());
 		VASSERT_EQ("2.3", vmime::charset(vmime::charsets::US_ASCII), t2.getWordAt(0)->getCharset());
 		VASSERT_EQ("2.4", "\xc3\xa4\xd0\xb0", t2.getWordAt(1)->getBuffer());
 		VASSERT_EQ("2.5", c2, t2.getWordAt(1)->getCharset());
-		VASSERT_EQ("2.6", "and then more ASCII chars.", t2.getWordAt(2)->getBuffer());
+		VASSERT_EQ("2.6", " and then more ASCII chars.", t2.getWordAt(2)->getBuffer());
 		VASSERT_EQ("2.7", vmime::charset(vmime::charsets::US_ASCII), t2.getWordAt(2)->getCharset());
 	}
 
@@ -512,9 +512,9 @@ VMIME_TEST_SUITE_BEGIN(textTest)
 		text.createFromString("Achim Br\xc3\xa4ndt", vmime::charsets::UTF_8);
 
 		VASSERT_EQ("1", 2, text.getWordCount());
-		VASSERT_EQ("2", "Achim ", text.getWordAt(0)->getBuffer());
+		VASSERT_EQ("2", "Achim", text.getWordAt(0)->getBuffer());
 		VASSERT_EQ("3", "us-ascii", text.getWordAt(0)->getCharset());
-		VASSERT_EQ("4", "Br\xc3\xa4ndt", text.getWordAt(1)->getBuffer());
+		VASSERT_EQ("4", " Br\xc3\xa4ndt", text.getWordAt(1)->getBuffer());
 		VASSERT_EQ("5", "utf-8", text.getWordAt(1)->getCharset());
 
 		// Generate
@@ -534,7 +534,7 @@ VMIME_TEST_SUITE_BEGIN(textTest)
 
 		// Space MUST be encoded inside a word
 		vmime::mailbox mbox(vmime::text("Achim Br\xc3\xa4ndt", vmime::charsets::UTF_8), "[email protected]");
-		VASSERT_EQ("generate1", "=?us-ascii?Q?Achim_?= =?utf-8?Q?Br=C3=A4ndt?= <[email protected]>", mbox.generate());
+		VASSERT_EQ("generate1", "=?us-ascii?Q?Achim?= =?utf-8?Q?_Br=C3=A4ndt?= <[email protected]>", mbox.generate());
 
 		vmime::text txt;
 		txt.appendWord(vmime::make_shared <vmime::word>("Achim ", "us-ascii"));
author	Jan Engelhardt <[email protected]>	2024-05-21 13:55:06 +0000
committer	GitHub <[email protected]>	2024-05-21 13:55:06 +0000
commit	d296c2d1d590f8b4f619d9c555ff24ddecec1614 (patch)
tree	49ab8eafbebcd761f9fd304dd888c47187b090c1 /tests/parser/textTest.cpp
parent	tests: switch a byte sequence in textTest (#305) (diff)
download	vmime-d296c2d1d590f8b4f619d9c555ff24ddecec1614.tar.gz vmime-d296c2d1d590f8b4f619d9c555ff24ddecec1614.zip