aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author: Jan Engelhardt <[email protected]> 2024-05-21 13:55:06 +0000
committer: GitHub <[email protected]> 2024-05-21 13:55:06 +0000
commit: d296c2d1d590f8b4f619d9c555ff24ddecec1614 (patch)
tree: 49ab8eafbebcd761f9fd304dd888c47187b090c1 /src
parent: tests: switch a byte sequence in textTest (#305) (diff)
download: vmime-d296c2d1d590f8b4f619d9c555ff24ddecec1614.tar.gz
vmime-d296c2d1d590f8b4f619d9c555ff24ddecec1614.zip
vmime: prevent loss of a space during text::createFromString (#306)
```
mailbox(text("Test München West", charsets::UTF_8), "[email protected]").generate();
```

produces

```
=?us-ascii?Q?Test_?= =?utf-8?Q?M=C3=BCnchen?= =?us-ascii?Q?West?= <[email protected]>
```

The first space between ``Test`` and ``München`` is encoded as an underscore along with the first word: ``Test_``. The second space between ``München`` and ``West`` is encoded with neither of the two words and thus lost. Decoding the text results in ``Test MünchenWest`` instead of ``Test München West``.

This is caused by how ``vmime::text::createFromString()`` handles transitions between 7-bit and 8-bit words: If an 8-bit word follows a 7-bit word, a space is appended to the previous word. The opposite case of a 7-bit word following an 8-bit word *misses* this behaviour.

When one fixes this problem, a follow-up issue appears: ``text::createFromString("a b\xFFc d")`` tokenizes the input into ``m_words={word("a "), word("b\xFFc ", utf8), word("d")}``. This "right-side alignment" nature of the whitespace is a problem for word::generate(): As per RFC 2047, spaces between adjacent encoded words are just separators but not meant to be displayed. A space between an encoded word and a regular ASCII text is not just a separator but also meant to be displayed. When word::generate() outputs the b-word, it would have to strip one space, but only when there is a transition from encoded-word to unencoded word. word::generate() does not know whether d will be encoded or unencoded.

The idea now is that we could change the tokenization of ``text::createFromString`` such that whitespace is at the *start* of words rather than at the end. With that, word::generate() need not know anything about the next word, but rather only the *previous* one.

Thus, in this patch,

1. The tokenization of ``text::createFromString`` is changed to left-align spaces and the function is fixed to account for the missing space on transition.
2. ``word::generate`` learns how to steal a space character.
3. Testcases are adjusted to account for the shifted position of the space.

Fixes: #283, #284

Co-authored-by: Vincent Richard <[email protected]>
Diffstat (limited to 'src')
-rw-r--r-- src/vmime/text.cpp 91
-rw-r--r-- src/vmime/word.cpp 3
2 files changed, 33 insertions, 61 deletions
diff --git a/src/vmime/text.cpp b/src/vmime/text.cpp
index a6172633..08d27bb9 100644
--- a/src/vmime/text.cpp
+++ b/src/vmime/text.cpp
@@ -285,84 +285,53 @@ void text::createFromString(const string& in, const charset& ch) {
asciiPercent = (in.length() == 0 ? 100 : (100 * asciiCount) / in.length());
}
- // If there are "too much" non-ASCII chars, encode everything
+ // If there are "too much" non-ASCII chars, produce just one
+ // vmime::word. Because encoding happens word-wise, all of the input
+ // gets encoded.
+
if (alwaysEncode || asciiPercent < 60) { // less than 60% ASCII chars
appendWord(make_shared <word>(in, ch));
+ return;
- // Else, only encode words which need it
- } else {
-
- bool is8bit = false; // is the current word 8-bit?
- bool prevIs8bit = false; // is previous word 8-bit?
- unsigned int count = 0; // total number of words
-
- for (size_t end = in.size(), pos = 0, start = 0 ; ; ) {
-
- if (pos == end || parserHelpers::isSpace(in[pos])) {
-
- const string chunk(in.begin() + start, in.begin() + pos);
-
- if (pos != end) {
- ++pos;
- }
-
- if (is8bit) {
-
- if (count && prevIs8bit) {
-
- // No need to create a new encoded word, just append
- // the current word to the previous one.
- shared_ptr <word> w = getWordAt(getWordCount() - 1);
- w->getBuffer() += " " + chunk;
-
- } else {
-
- if (count) {
- shared_ptr <word> w = getWordAt(getWordCount() - 1);
- w->getBuffer() += ' ';
- }
-
- appendWord(make_shared <word>(chunk, ch));
+ }
- prevIs8bit = true;
- ++count;
- }
+ // Else, only encode words which need it
- } else {
+ size_t end = in.size(), pos = 0;
+ bool is8bit = false; // is the current word 8-bit?
+ bool prevIs8bit = false; // is previous word 8-bit?
+ unsigned int count = 0; // total number of words
- if (count && !prevIs8bit) {
+ do {
+ size_t start = pos;
- shared_ptr <word> w = getWordAt(getWordCount() - 1);
- w->getBuffer() += " " + chunk;
+ for (; parserHelpers::isSpace(in[pos]); )
+ ++pos;
- } else {
+ for (; pos < end && !parserHelpers::isSpace(in[pos]); ++pos)
+ is8bit |= !parserHelpers::isAscii(in[pos]);
- appendWord(make_shared <word>(chunk, charset(charsets::US_ASCII)));
+ // All chunks will have whitespace (if any) at front, never at the end
+ const string chunk(in.begin() + start, in.begin() + pos);
- prevIs8bit = false;
- ++count;
- }
- }
+ if (prevIs8bit == is8bit && count > 0) {
- if (pos == end) {
- break;
- }
+ // same bitness as previous word; merge
+ auto w = getWordAt(getWordCount() - 1);
+ w->getBuffer() += chunk;
- is8bit = false;
- start = pos;
+ } else {
- } else if (!parserHelpers::isAscii(in[pos])) {
+ appendWord(make_shared <word>(chunk, charset(is8bit ? ch : charsets::US_ASCII)));
+ ++count;
- is8bit = true;
- ++pos;
+ }
- } else {
+ prevIs8bit = is8bit;
+ is8bit = false;
- ++pos;
- }
- }
- }
+ } while (pos < end);
}
diff --git a/src/vmime/word.cpp b/src/vmime/word.cpp
index d4bf85cf..a16b4b51 100644
--- a/src/vmime/word.cpp
+++ b/src/vmime/word.cpp
@@ -755,6 +755,9 @@ void word::generate(
if (!startNewLine && !state->isFirstWord && !state->lastCharIsSpace) {
os << " "; // Separate from previous word
+ if (!state->prevWordIsEncoded && m_buffer[0] == ' ')
+ wordEnc.getNextChunk(1);
+
++curLineLength;
state->lastCharIsSpace = true;