diff options
author | Jan Engelhardt <[email protected]> | 2024-05-21 13:55:06 +0000 |
---|---|---|
committer | GitHub <[email protected]> | 2024-05-21 13:55:06 +0000 |
commit | d296c2d1d590f8b4f619d9c555ff24ddecec1614 (patch) | |
tree | 49ab8eafbebcd761f9fd304dd888c47187b090c1 /src | |
parent | tests: switch a byte sequence in textTest (#305) (diff) | |
download | vmime-d296c2d1d590f8b4f619d9c555ff24ddecec1614.tar.gz vmime-d296c2d1d590f8b4f619d9c555ff24ddecec1614.zip |
vmime: prevent loss of a space during text::createFromString (#306)
```
mailbox(text("Test München West", charsets::UTF_8), "[email protected]").generate();
```
produces
```
=?us-ascii?Q?Test_?= =?utf-8?Q?M=C3=BCnchen?= =?us-ascii?Q?West?= <[email protected]>
```
The first space between ``Test`` and ``München`` is encoded as an
underscore along with the first word: ``Test_``. The second space
between ``München`` and ``West`` is encoded with neither of the two
words and thus lost. Decoding the text results in ``Test
MünchenWest`` instead of ``Test München West``.
This is caused by how ``vmime::text::createFromString()`` handles
transitions between 7-bit and 8-bit words: If an 8-bit word follows a
7-bit word, a space is appended to the previous word. In the opposite
case, a 7-bit word following an 8-bit word, no space is appended.
When one fixes this problem, a follow-up issue appears:
``text::createFromString("a b\xFFc d")`` tokenizes the input into
``m_words={word("a "), word("b\xFFc ", utf8), word("d")}``. This
"right-side alignment" nature of the whitespace is a problem for
word::generate():
As per RFC 2047, spaces between adjacent encoded words are just
separators but not meant to be displayed. A space between an encoded
word and a regular ASCII text is not just a separator but also meant
to be displayed.
When word::generate() outputs the ``b\xFFc `` word, it would have to strip
its trailing space, but only when there is a transition from an encoded
word to an unencoded word. However, word::generate() does not know whether
the following word (``d``) will be encoded or unencoded.
The idea now is that we could change the tokenization of
``text::createFromString`` such that whitespace is at the *start* of
words rather than at the end. With that, word::generate() need not
know anything about the next word, but rather only the *previous*
one.
Thus, in this patch,
1. The tokenization of ``text::createFromString`` is changed to
left-align spaces and the function is fixed to account for
the missing space on transition.
2. ``word::generate`` learns how to steal a space character.
3. Testcases are adjusted to account for the shifted
position of the space.
Fixes: #283, #284
Co-authored-by: Vincent Richard <[email protected]>
Diffstat (limited to 'src')
-rw-r--r-- | src/vmime/text.cpp | 91 | ||||
-rw-r--r-- | src/vmime/word.cpp | 3 |
2 files changed, 33 insertions, 61 deletions
diff --git a/src/vmime/text.cpp b/src/vmime/text.cpp index a6172633..08d27bb9 100644 --- a/src/vmime/text.cpp +++ b/src/vmime/text.cpp @@ -285,84 +285,53 @@ void text::createFromString(const string& in, const charset& ch) { asciiPercent = (in.length() == 0 ? 100 : (100 * asciiCount) / in.length()); } - // If there are "too much" non-ASCII chars, encode everything + // If there are "too much" non-ASCII chars, produce just one + // vmime::word. Because encoding happens word-wise, all of the input + // gets encoded. + if (alwaysEncode || asciiPercent < 60) { // less than 60% ASCII chars appendWord(make_shared <word>(in, ch)); + return; - // Else, only encode words which need it - } else { - - bool is8bit = false; // is the current word 8-bit? - bool prevIs8bit = false; // is previous word 8-bit? - unsigned int count = 0; // total number of words - - for (size_t end = in.size(), pos = 0, start = 0 ; ; ) { - - if (pos == end || parserHelpers::isSpace(in[pos])) { - - const string chunk(in.begin() + start, in.begin() + pos); - - if (pos != end) { - ++pos; - } - - if (is8bit) { - - if (count && prevIs8bit) { - - // No need to create a new encoded word, just append - // the current word to the previous one. - shared_ptr <word> w = getWordAt(getWordCount() - 1); - w->getBuffer() += " " + chunk; - - } else { - - if (count) { - shared_ptr <word> w = getWordAt(getWordCount() - 1); - w->getBuffer() += ' '; - } - - appendWord(make_shared <word>(chunk, ch)); + } - prevIs8bit = true; - ++count; - } + // Else, only encode words which need it - } else { + size_t end = in.size(), pos = 0; + bool is8bit = false; // is the current word 8-bit? + bool prevIs8bit = false; // is previous word 8-bit? 
+ unsigned int count = 0; // total number of words - if (count && !prevIs8bit) { + do { + size_t start = pos; - shared_ptr <word> w = getWordAt(getWordCount() - 1); - w->getBuffer() += " " + chunk; + for (; parserHelpers::isSpace(in[pos]); ) + ++pos; - } else { + for (; pos < end && !parserHelpers::isSpace(in[pos]); ++pos) + is8bit |= !parserHelpers::isAscii(in[pos]); - appendWord(make_shared <word>(chunk, charset(charsets::US_ASCII))); + // All chunks will have whitespace (if any) at front, never at the end + const string chunk(in.begin() + start, in.begin() + pos); - prevIs8bit = false; - ++count; - } - } + if (prevIs8bit == is8bit && count > 0) { - if (pos == end) { - break; - } + // same bitness as previous word; merge + auto w = getWordAt(getWordCount() - 1); + w->getBuffer() += chunk; - is8bit = false; - start = pos; + } else { - } else if (!parserHelpers::isAscii(in[pos])) { + appendWord(make_shared <word>(chunk, charset(is8bit ? ch : charsets::US_ASCII))); + ++count; - is8bit = true; - ++pos; + } - } else { + prevIs8bit = is8bit; + is8bit = false; - ++pos; - } - } - } + } while (pos < end); } diff --git a/src/vmime/word.cpp b/src/vmime/word.cpp index d4bf85cf..a16b4b51 100644 --- a/src/vmime/word.cpp +++ b/src/vmime/word.cpp @@ -755,6 +755,9 @@ void word::generate( if (!startNewLine && !state->isFirstWord && !state->lastCharIsSpace) { os << " "; // Separate from previous word + if (!state->prevWordIsEncoded && m_buffer[0] == ' ') + wordEnc.getNextChunk(1); + ++curLineLength; state->lastCharIsSpace = true; |