From d296c2d1d590f8b4f619d9c555ff24ddecec1614 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@inai.de>
Date: Tue, 21 May 2024 15:55:06 +0200
Subject: vmime: prevent loss of a space during text::createFromString (#306)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

```
mailbox(text("Test München West", charsets::UTF_8), "test@example.com").generate();
```

produces

```
=?us-ascii?Q?Test_?= =?utf-8?Q?M=C3=BCnchen?= =?us-ascii?Q?West?= <test@example.com>
```

The first space between ``Test`` and ``München`` is encoded as an
underscore along with the first word: ``Test_``. The second space
between ``München`` and ``West`` is encoded with neither of the two
words and thus lost. Decoding the text results in ``Test
MünchenWest`` instead of ``Test München West``.

This is caused by how ``vmime::text::createFromString()`` handles
transitions between 7-bit and 8-bit words: If an 8-bit word follows a
7-bit word, a space is appended to the previous word. The opposite
case of a 7-bit word following an 8-bit word *misses* this behaviour.

When one fixes this problem, a follow-up issue appears:

``text::createFromString("a b\xFFc d")`` tokenizes the input into
``m_words={word("a "), word("b\xFFc ", utf8), word("d")}``. This
"right-side alignment" nature of the whitespace is a problem for
word::generate():

As per RFC 2047, spaces between adjacent encoded words are just
separators and are not meant to be displayed. A space between an
encoded word and regular ASCII text, however, is not just a separator
but is also meant to be displayed.

When word::generate() outputs the b-word, it would have to strip one
space, but only when there is a transition from encoded-word to
unencoded word. word::generate() does not know whether d will be
encoded or unencoded.

The idea now is that we could change the tokenization of
``text::createFromString`` such that whitespace is at the *start* of
words rather than at the end. With that, word::generate() need not
know anything about the next word, but rather only the *previous*
one.

Thus, in this patch,

1. The tokenization of ``text::createFromString`` is changed to
   left-align spaces and the function is fixed to account for
   the missing space on transition.
2. ``word::generate`` learns how to steal a space character.
3. Testcases are adjusted to account for the shifted
   position of the space.

Fixes: #283, #284

Co-authored-by: Vincent Richard <vincent@vincent-richard.net>
---
 src/vmime/text.cpp | 91 ++++++++++++++++++------------------------------------
 src/vmime/word.cpp |  3 ++
 2 files changed, 33 insertions(+), 61 deletions(-)

(limited to 'src')
diff --git a/src/vmime/text.cpp b/src/vmime/text.cpp
index a6172633..08d27bb9 100644
--- a/src/vmime/text.cpp
+++ b/src/vmime/text.cpp
@@ -285,84 +285,53 @@ void text::createFromString(const string& in, const charset& ch) {
 		asciiPercent = (in.length() == 0 ? 100 : (100 * asciiCount) / in.length());
 	}
 
-	// If there are "too much" non-ASCII chars, encode everything
+	// If there are "too much" non-ASCII chars, produce just one
+	// vmime::word. Because encoding happens word-wise, all of the input
+	// gets encoded.
+
 	if (alwaysEncode || asciiPercent < 60) {  // less than 60% ASCII chars
 
 		appendWord(make_shared <word>(in, ch));
+		return;
 
-	// Else, only encode words which need it
-	} else {
-
-		bool is8bit = false;     // is the current word 8-bit?
-		bool prevIs8bit = false; // is previous word 8-bit?
-		unsigned int count = 0;  // total number of words
-
-		for (size_t end = in.size(), pos = 0, start = 0 ; ; ) {
-
-			if (pos == end || parserHelpers::isSpace(in[pos])) {
-
-				const string chunk(in.begin() + start, in.begin() + pos);
-
-				if (pos != end) {
-					++pos;
-				}
-
-				if (is8bit) {
-
-					if (count && prevIs8bit) {
-
-						// No need to create a new encoded word, just append
-						// the current word to the previous one.
-						shared_ptr <word> w = getWordAt(getWordCount() - 1);
-						w->getBuffer() += " " + chunk;
-
-					} else {
-
-						if (count) {
-							shared_ptr <word> w = getWordAt(getWordCount() - 1);
-							w->getBuffer() += ' ';
-						}
-
-						appendWord(make_shared <word>(chunk, ch));
+	}
 
-						prevIs8bit = true;
-						++count;
-					}
+	// Else, only encode words which need it
 
-				} else {
+	size_t end = in.size(), pos = 0;
+	bool is8bit = false;     // is the current word 8-bit?
+	bool prevIs8bit = false; // is previous word 8-bit?
+	unsigned int count = 0;  // total number of words
 
-					if (count && !prevIs8bit) {
+	do {
+		size_t start = pos;
 
-						shared_ptr <word> w = getWordAt(getWordCount() - 1);
-						w->getBuffer() += " " + chunk;
+		for (; parserHelpers::isSpace(in[pos]); )
+			++pos;
 
-					} else {
+		for (; pos < end && !parserHelpers::isSpace(in[pos]); ++pos)
+			is8bit |= !parserHelpers::isAscii(in[pos]);
 
-						appendWord(make_shared <word>(chunk, charset(charsets::US_ASCII)));
+		// All chunks will have whitespace (if any) at front, never at the end
+		const string chunk(in.begin() + start, in.begin() + pos);
 
-						prevIs8bit = false;
-						++count;
-					}
-				}
+		if (prevIs8bit == is8bit && count > 0) {
 
-				if (pos == end) {
-					break;
-				}
+			// same bitness as previous word; merge
+			auto w = getWordAt(getWordCount() - 1);
+			w->getBuffer() += chunk;
 
-				is8bit = false;
-				start = pos;
+		} else {
 
-			} else if (!parserHelpers::isAscii(in[pos])) {
+			appendWord(make_shared <word>(chunk, charset(is8bit ? ch : charsets::US_ASCII)));
+			++count;
 
-				is8bit = true;
-				++pos;
+		}
 
-			} else {
+		prevIs8bit = is8bit;
+		is8bit = false;
 
-				++pos;
-			}
-		}
-	}
+	} while (pos < end);
 }
 
 
diff --git a/src/vmime/word.cpp b/src/vmime/word.cpp
index d4bf85cf..a16b4b51 100644
--- a/src/vmime/word.cpp
+++ b/src/vmime/word.cpp
@@ -755,6 +755,9 @@ void word::generate(
 		if (!startNewLine && !state->isFirstWord && !state->lastCharIsSpace) {
 
 			os << " "; // Separate from previous word
+			if (!state->prevWordIsEncoded && m_buffer[0] == ' ')
+				wordEnc.getNextChunk(1);
+
 			++curLineLength;
 
 			state->lastCharIsSpace = true;
-- 
cgit v1.2.3