From 1901c6fdb60849e75ad1a52d405694c9e769e2ba Mon Sep 17 00:00:00 2001 From: Vincent Richard Date: Tue, 15 Mar 2005 10:30:42 +0000 Subject: [PATCH] Moved word parsing from 'text' class to 'word' class. --- ChangeLog | 5 + src/text.cpp | 522 ++------------------------------------------ src/word.cpp | 582 ++++++++++++++++++++++++++++++++++++++++++++++++- vmime/text.hpp | 2 - vmime/word.hpp | 20 +- 5 files changed, 622 insertions(+), 509 deletions(-) diff --git a/ChangeLog b/ChangeLog index 0fdeac69..566a105c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -2,6 +2,11 @@ VERSION 0.6.4cvs ================ +2005-03-15 Vincent Richard + + * text.{cpp|hpp}, word.{cpp|hpp}: moved word parsing from 'text' class + to 'word' class, which now inherits from 'component'. + 2005-03-14 Vincent Richard * removed singleton<> and singletonManager classes: useless and quite diff --git a/src/text.cpp b/src/text.cpp index e510da12..97a58647 100644 --- a/src/text.cpp +++ b/src/text.cpp @@ -19,13 +19,8 @@ #include "vmime/text.hpp" -#include "vmime/utility/stringUtils.hpp" #include "vmime/parserHelpers.hpp" -#include "vmime/encoder.hpp" -#include "vmime/encoderB64.hpp" -#include "vmime/encoderQP.hpp" - namespace vmime { @@ -70,12 +65,18 @@ text::~text() void text::parse(const string& buffer, const string::size_type position, const string::size_type end, string::size_type* newPosition) { - decodeAndUnfold(buffer.begin() + position, buffer.begin() + end, *this); + removeAllWords(); - setParsedBounds(position, end); + string::size_type newPos; + + const std::vector words = word::parseMultiple(buffer, position, end, &newPos); + + copy_vector(words, m_words); + + setParsedBounds(position, newPos); if (newPosition) - *newPosition = end; + *newPosition = newPos; } @@ -331,327 +332,8 @@ void text::encodeAndFold(utility::outputStream& os, const string::size_type maxL for (int wi = 0 ; wi < getWordCount() ; ++wi) { - const word& w = *getWordAt(wi); - const string& buffer = w.getBuffer(); - - // Calculate the number of ASCII chars to check whether encoding is needed - // and _which_ encoding to use. - const string::size_type asciiCount = - utility::stringUtils::countASCIIchars(buffer.begin(), buffer.end()); - - bool noEncoding = (flags & FORCE_NO_ENCODING) || - (!(flags & FORCE_ENCODING) && asciiCount == buffer.length()); - - if (noEncoding) - { - // We will fold lines without encoding them. - - string::const_iterator lastWSpos = buffer.end(); // last white-space position - string::const_iterator curLineStart = buffer.begin(); // current line start - - string::const_iterator p = buffer.begin(); - const string::const_iterator end = buffer.end(); - - bool finished = false; - bool newLine = false; - - while (!finished) - { - for ( ; p != end ; ++p, ++curLineLength) - { - // Exceeded maximum line length, but we have found a white-space - // where we can cut the line... - if (curLineLength >= maxLineLength && lastWSpos != end) - break; - - if (*p == ' ' || *p == '\t') - { - // Remember the position of this white-space character - lastWSpos = p; - } - } - - if (p != end) - ++curLineLength; - - //if (p == end || curLineLength >= maxLineLength) - { - if (p == end || lastWSpos == end) - { - // If we are here, it means that we have found no whitespace - // before the first "maxLineLength" characters. In this case, - // we write the full line no matter of the max line length... - - if (!newLine && p != end && lastWSpos == end && - wi != 0 && curLineStart == buffer.begin()) - { - // Here, we are continuing on the line of previous encoded - // word, but there is not even enough space to put the - // first word of this line, so we start a new line. - if (flags & NO_NEW_LINE_SEQUENCE) - { - os << CRLF; - curLineLength = 0; - } - else - { - os << NEW_LINE_SEQUENCE; - curLineLength = NEW_LINE_SEQUENCE_LENGTH; - } - - p = curLineStart; - lastWSpos = end; - newLine = true; - } - else - { - os << string(curLineStart, p); - - if (p == end) - { - finished = true; - } - else - { - if (flags & NO_NEW_LINE_SEQUENCE) - { - os << CRLF; - curLineLength = 0; - } - else - { - os << NEW_LINE_SEQUENCE; - curLineLength = NEW_LINE_SEQUENCE_LENGTH; - } - - curLineStart = p; - lastWSpos = end; - newLine = true; - } - } - } - else - { - // In this case, there will not be enough space on the line for all the - // characters _after_ the last white-space; so we cut the line at this - // last white-space. - -#if 1 - if (curLineLength != 1 && wi != 0) - os << " "; // Separate from previous word -#endif - - os << string(curLineStart, lastWSpos); - - if (flags & NO_NEW_LINE_SEQUENCE) - { - os << CRLF; - curLineLength = 0; - } - else - { - os << NEW_LINE_SEQUENCE; - curLineLength = NEW_LINE_SEQUENCE_LENGTH; - } - - curLineStart = lastWSpos + 1; - - p = lastWSpos + 1; - lastWSpos = end; - newLine = true; - } - } - } - } - /* - RFC #2047: - 4. Encodings - - Initially, the legal values for "encoding" are "Q" and "B". These - encodings are described below. The "Q" encoding is recommended for - use when most of the characters to be encoded are in the ASCII - character set; otherwise, the "B" encoding should be used. - Nevertheless, a mail reader which claims to recognize 'encoded-word's - MUST be able to accept either encoding for any character set which it - supports. - */ - else - { - // We will encode _AND_ fold lines - - /* - RFC #2047: - 2. Syntax of encoded-words - - " While there is no limit to the length of a multiple-line header - field, each line of a header field that contains one or more - 'encoded-word's is limited to 76 characters. " - */ - - const string::size_type maxLineLength3 = - (maxLineLength == lineLengthLimits::infinite) - ? maxLineLength - : std::min(maxLineLength, static_cast (76)); - - // Base64 if more than 60% non-ascii, quoted-printable else (default) - const string::size_type asciiPercent = (100 * asciiCount) / buffer.length(); - const string::value_type encoding = (asciiPercent <= 40) ? 'B' : 'Q'; - - string wordStart("=?" + w.getCharset().getName() + "?" + encoding + "?"); - string wordEnd("?="); - - const string::size_type minWordLength = wordStart.length() + wordEnd.length(); - const string::size_type maxLineLength2 = (maxLineLength3 < minWordLength + 1) - ? maxLineLength3 + minWordLength + 1 : maxLineLength3; - - // Checks whether remaining space on this line is usable. If too few - // characters can be encoded, start a new line. - bool startNewLine = true; - - if (curLineLength + 2 < maxLineLength2) - { - const string::size_type remainingSpaceOnLine = maxLineLength2 - curLineLength - 2; - - if (remainingSpaceOnLine < minWordLength + 10) - { - // Space for no more than 10 encoded chars! - // It is not worth while to continue on this line... - startNewLine = true; - } - else - { - // OK, there is enough usable space on the current line. - startNewLine = false; - } - } - - if (startNewLine) - { - os << NEW_LINE_SEQUENCE; - curLineLength = NEW_LINE_SEQUENCE_LENGTH; - } - - // Encode and fold input buffer - string::const_iterator pos = buffer.begin(); - string::size_type remaining = buffer.length(); - - encoder* theEncoder; - - if (encoding == 'B') theEncoder = new encoderB64; - else theEncoder = new encoderQP; - - string qpEncodedBuffer; - - if (encoding == 'Q') - { - theEncoder->getProperties()["rfc2047"] = true; - - // In the case of Quoted-Printable encoding, we cannot simply encode input - // buffer line by line. So, we encode the whole buffer and we will fold it - // in the next loop... - utility::inputStreamStringAdapter in(buffer); - utility::outputStreamStringAdapter out(qpEncodedBuffer); - - theEncoder->encode(in, out); - - pos = qpEncodedBuffer.begin(); - remaining = qpEncodedBuffer.length(); - } - -#if 1 - if (curLineLength != 1 && wi != 0) - { - os << " "; // Separate from previous word - ++curLineLength; - } -#endif - - for ( ; remaining ; ) - { - // Start a new encoded word - os << wordStart; - curLineLength += minWordLength; - - // Compute the number of encoded chars that will fit on this line - const string::size_type fit = maxLineLength2 - curLineLength; - - // Base-64 encoding - if (encoding == 'B') - { - // TODO: WARNING! "Any encoded word which encodes a non-integral - // number of characters or octets is incorrectly formed." - - // Here, we have a formula to compute the maximum number of source - // characters to encode knowing the maximum number of encoded chars - // (with Base64, 3 bytes of input provide 4 bytes of output). - string::size_type count = (fit > 1) ? ((fit - 1) * 3) / 4 : 1; - if (count > remaining) count = remaining; - - utility::inputStreamStringAdapter in - (buffer, pos - buffer.begin(), pos - buffer.begin() + count); - - curLineLength += theEncoder->encode(in, os); - - pos += count; - remaining -= count; - } - // Quoted-Printable encoding - else - { - // TODO: WARNING! "Any encoded word which encodes a non-integral - // number of characters or octets is incorrectly formed." - - // All we have to do here is to take a certain number of character - // (that is less than or equal to "fit") from the QP encoded buffer, - // but we also make sure not to fold a "=XY" encoded char. - const string::const_iterator qpEnd = qpEncodedBuffer.end(); - string::const_iterator lastFoldPos = pos; - string::const_iterator p = pos; - string::size_type n = 0; - - while (n < fit && p != qpEnd) - { - if (*p == '=') - { - if (n + 3 >= fit) - { - lastFoldPos = p; - break; - } - - p += 3; - n += 3; - } - else - { - ++p; - ++n; - } - } - - if (lastFoldPos == pos) - lastFoldPos = p; - - os << string(pos, lastFoldPos); - - curLineLength += (lastFoldPos - pos) + 1; - - pos += n; - remaining -= n; - } - - // End of the encoded word - os << wordEnd; - - if (remaining) - { - os << NEW_LINE_SEQUENCE; - curLineLength = NEW_LINE_SEQUENCE_LENGTH; - } - } - - delete (theEncoder); - } + getWordAt(wi)->generate(os, maxLineLength, curLineLength, + &curLineLength, flags, (wi == 0)); } if (lastLineLength) @@ -665,187 +347,21 @@ text* text::decodeAndUnfold(const string& in, text* generateInExisting) out->removeAllWords(); - decodeAndUnfold(in.begin(), in.end(), *out); + const std::vector words = word::parseMultiple(in, 0, in.length(), NULL); + + copy_vector(words, out->m_words); return (out); } -void text::decodeAndUnfold(const string::const_iterator& inStart, const string::const_iterator& inEnd, text& out) -{ - // NOTE: See RFC-2047, Pages 11-12 for knowing about handling - // of white-spaces between encoded words. - - out.removeAllWords(); - - string::const_iterator p = inStart; - const string::const_iterator end = inEnd; - - const charset defaultCharset(charsets::US_ASCII); - charset prevWordCharset(defaultCharset); - - bool prevIsEncoded = false; - - string::const_iterator prevPos = p; - - for ( ; ; ) - { - if (p == end) // || *p == '\n') - { - string::const_iterator textEnd = p; - - if (textEnd != inStart && *(textEnd - 1) == '\r') - --textEnd; - - if (textEnd != prevPos) - { - if (!out.isEmpty() && prevWordCharset == defaultCharset) - { - out.getWordAt(out.getWordCount() - 1)->getBuffer() += string(prevPos, textEnd); - } - else - { - prevWordCharset = defaultCharset; - out.appendWord(new word(string(prevPos, textEnd), defaultCharset)); - prevIsEncoded = false; - } - } - - if (p == end) - { - // Finished - break; - } - - // Skip the new-line character - prevPos = ++p; - } - else if (*p == '=' && (p + 1) != end && *(p + 1) == '?') - { - string::const_iterator wordPos = p; - p += 2; // skip '=?' - - if (p != end) - { - const string::const_iterator charsetPos = p; - - for ( ; p != end && *p != '?' ; ++p); - - if (p != end) // a charset is specified - { - const string::const_iterator charsetEnd = p; - const string::const_iterator encPos = ++p; // skip '?' - - for ( ; p != end && *p != '?' ; ++p); - - if (p != end) // an encoding is specified - { - //const string::const_iterator encEnd = p; - const string::const_iterator dataPos = ++p; // skip '?' - - for ( ; p != end && !(*p == '?' && *(p + 1) == '=') ; ++p); - - if (p != end) // some data is specified - { - const string::const_iterator dataEnd = p; - p += 2; // skip '?=' - - encoder* theEncoder = NULL; - - // Base-64 encoding - if (*encPos == 'B' || *encPos == 'b') - { - theEncoder = new encoderB64; - } - // Quoted-Printable encoding - else if (*encPos == 'Q' || *encPos == 'q') - { - theEncoder = new encoderQP; - theEncoder->getProperties()["rfc2047"] = true; - } - - if (theEncoder) - { - // Decode text - string decodedBuffer; - - utility::inputStreamStringAdapter ein(string(dataPos, dataEnd)); - utility::outputStreamStringAdapter eout(decodedBuffer); - - theEncoder->decode(ein, eout); - delete (theEncoder); - - // Append all the unencoded text before this word - if (prevPos != wordPos) - { - string::const_iterator p = prevPos; - - if (prevIsEncoded) - { - // Check whether there are only white-spaces between - // the two encoded words - for ( ; (p != wordPos) && parserHelpers::isspace(*p) ; ++p); - } - - if (p != wordPos) // if not empty - { - if (!out.isEmpty() && prevWordCharset == defaultCharset) - { - out.getWordAt(out.getWordCount() - 1)-> - getBuffer() += string(prevPos, wordPos); - } - else - { - out.appendWord(new word - (string(prevPos, wordPos), defaultCharset)); - - prevWordCharset = defaultCharset; - } - } - } - - // Append this fresh decoded word to output text - charset thisCharset(string(charsetPos, charsetEnd)); - - if (!out.isEmpty() && prevWordCharset == thisCharset) - { - out.getWordAt(out.getWordCount() - 1)-> - getBuffer() += decodedBuffer; - } - else - { - prevWordCharset = thisCharset; - out.appendWord(new word(decodedBuffer, thisCharset)); - } - - // This word has been decoded: we can advance in the input buffer - prevPos = p; - prevIsEncoded = true; - } - else - { - // Unknown encoding: can't decode this word, we will - // treat this word as ordinary text (RFC-2047, Page 9). - } - } - } - } - } - } - else - { - ++p; - } - - for ( ; p != end && *p != '=' && *p != '\n' ; ++p); - } -} - - const std::vector text::getChildComponents() const { - // TODO: 'word' should inherit from 'component' - return std::vector (); + std::vector list; + + copy_vector(m_words, list); + + return (list); } diff --git a/src/word.cpp b/src/word.cpp index 6801fb47..7fde1fab 100644 --- a/src/word.cpp +++ b/src/word.cpp @@ -18,6 +18,15 @@ // #include "vmime/word.hpp" +#include "vmime/text.hpp" + +#include "vmime/utility/stringUtils.hpp" +#include "vmime/utility/smartPtr.hpp" +#include "vmime/parserHelpers.hpp" + +#include "vmime/encoder.hpp" +#include "vmime/encoderB64.hpp" +#include "vmime/encoderQP.hpp" namespace vmime @@ -31,7 +40,7 @@ word::word() word::word(const word& w) - : m_buffer(w.m_buffer), m_charset(w.m_charset) + : component(), m_buffer(w.m_buffer), m_charset(w.m_charset) { } @@ -48,6 +57,567 @@ word::word(const string& buffer, const charset& charset) } +word* word::parseNext(const string& buffer, const string::size_type position, + const string::size_type end, string::size_type* newPosition, + bool prevIsEncoded, bool* isEncoded, bool isFirst) +{ + string::size_type pos = position; + + // Ignore white-spaces: + // - before the first word + // - between two encoded words + // - after the last word + while (pos < end && parserHelpers::isspace(buffer[pos])) + ++pos; + + string::size_type startPos = pos; + string unencoded; + + while (pos < end) + { + // End of line: does not occur in the middle of an encoded word. This is + // used to remove folding white-spaces from unencoded text. + if (buffer[pos] == '\n') + { + string::size_type endPos = pos; + + if (pos > position && buffer[pos - 1] == '\r') + --endPos; + + while (pos != end && parserHelpers::isspace(buffer[pos])) + ++pos; + + unencoded += string(buffer.begin() + startPos, buffer.begin() + endPos); + unencoded += ' '; + + startPos = pos; + } + // Start of an encoded word + else if (pos + 6 < end && // 6 = "=?(.+)?(.*)?=" + buffer[pos] == '=' && buffer[pos + 1] == '?') + { + // Check whether there is some unencoded text before + unencoded += string(buffer.begin() + startPos, buffer.begin() + pos); + + if (!unencoded.empty()) + { + word* w = new word(unencoded, charset(charsets::US_ASCII)); + w->setParsedBounds(position, pos); + + if (newPosition) + *newPosition = pos; + + if (isEncoded) + *isEncoded = false; + + return (w); + } + + // ...else find the finish sequence '?=' and return an encoded word + const string::size_type wordStart = pos; + + pos += 4; + + while (pos < end) + { + if (buffer[pos] == '\n') + { + // End of line not allowed in the middle of an encoded word: + // treat this text as unencoded text (see *). + break; + } + else if (buffer[pos] == '?' && pos + 1 < end && buffer[pos + 1] == '=') + { + // Found the finish sequence + break; + } + + ++pos; + } + + if (pos == end) // not a valid word (no finish sequence) + continue; + else if (buffer[pos] == '\n') // (*) + continue; + + pos += 2; // ?= + + word* w = new word(); + w->parse(buffer, wordStart, pos, NULL); + + if (newPosition) + *newPosition = pos; + + if (isEncoded) + *isEncoded = true; + + return (w); + } + + ++pos; + } + + // Treat unencoded text at the end of the buffer + if (end != startPos) + { + if (startPos != pos && !isFirst && prevIsEncoded) + unencoded += ' '; + + unencoded += string(buffer.begin() + startPos, buffer.begin() + end); + + word* w = new word(unencoded, charset(charsets::US_ASCII)); + w->setParsedBounds(position, end); + + if (newPosition) + *newPosition = end; + + if (isEncoded) + *isEncoded = false; + + return (w); + } + + return (NULL); +} + + +const std::vector word::parseMultiple(const string& buffer, const string::size_type position, + const string::size_type end, string::size_type* newPosition) +{ + std::vector res; + word* w = NULL; + + string::size_type pos = position; + + bool prevIsEncoded = false; + + while ((w = word::parseNext(buffer, pos, end, &pos, prevIsEncoded, &prevIsEncoded, (w == NULL))) != NULL) + res.push_back(w); + + if (newPosition) + *newPosition = pos; + + return (res); +} + + +void word::parse(const string& buffer, const string::size_type position, + const string::size_type end, string::size_type* newPosition) +{ + if (position + 6 < end && // 6 = "=?(.+)?(.*)?=" + buffer[position] == '=' && buffer[position + 1] == '?') + { + string::const_iterator p = buffer.begin() + position + 2; + const string::const_iterator pend = buffer.begin() + end; + + const string::const_iterator charsetPos = p; + + for ( ; p != pend && *p != '?' ; ++p); + + if (p != pend) // a charset is specified + { + const string::const_iterator charsetEnd = p; + const string::const_iterator encPos = ++p; // skip '?' + + for ( ; p != pend && *p != '?' ; ++p); + + if (p != pend) // an encoding is specified + { + //const string::const_iterator encEnd = p; + const string::const_iterator dataPos = ++p; // skip '?' + + for ( ; p != pend && !(*p == '?' && *(p + 1) == '=') ; ++p); + + if (p != pend) // some data is specified + { + const string::const_iterator dataEnd = p; + p += 2; // skip '?=' + + encoder* theEncoder = NULL; + + // Base-64 encoding + if (*encPos == 'B' || *encPos == 'b') + { + theEncoder = new encoderB64; + } + // Quoted-Printable encoding + else if (*encPos == 'Q' || *encPos == 'q') + { + theEncoder = new encoderQP; + theEncoder->getProperties()["rfc2047"] = true; + } + + if (theEncoder) + { + // Decode text + string decodedBuffer; + + utility::inputStreamStringAdapter ein(string(dataPos, dataEnd)); + utility::outputStreamStringAdapter eout(decodedBuffer); + + theEncoder->decode(ein, eout); + delete (theEncoder); + + m_buffer = decodedBuffer; + m_charset = charset(string(charsetPos, charsetEnd)); + + setParsedBounds(position, p - buffer.begin()); + + if (newPosition) + *newPosition = (p - buffer.begin()); + + return; + } + } + } + } + } + + // Unknown encoding or malformed encoded word: treat the buffer as ordinary text (RFC-2047, Page 9). + m_buffer = string(buffer.begin() + position, buffer.begin() + end); + m_charset = charsets::US_ASCII; + + setParsedBounds(position, end); + + if (newPosition) + *newPosition = end; +} + + +void word::generate(utility::outputStream& os, const string::size_type maxLineLength, + const string::size_type curLinePos, string::size_type* newLinePos) const +{ + generate(os, maxLineLength, curLinePos, newLinePos, 0, true); +} + + +void word::generate(utility::outputStream& os, const string::size_type maxLineLength, + const string::size_type curLinePos, string::size_type* newLinePos, const int flags, + const bool isFirstWord) const +{ + string::size_type curLineLength = curLinePos; + + // Calculate the number of ASCII chars to check whether encoding is needed + // and _which_ encoding to use. + const string::size_type asciiCount = + utility::stringUtils::countASCIIchars(m_buffer.begin(), m_buffer.end()); + + bool noEncoding = (flags & text::FORCE_NO_ENCODING) || + (!(flags & text::FORCE_ENCODING) && asciiCount == m_buffer.length()); + + if (noEncoding) + { + // We will fold lines without encoding them. + + string::const_iterator lastWSpos = m_buffer.end(); // last white-space position + string::const_iterator curLineStart = m_buffer.begin(); // current line start + + string::const_iterator p = m_buffer.begin(); + const string::const_iterator end = m_buffer.end(); + + bool finished = false; + bool newLine = false; + + while (!finished) + { + for ( ; p != end ; ++p, ++curLineLength) + { + // Exceeded maximum line length, but we have found a white-space + // where we can cut the line... + if (curLineLength >= maxLineLength && lastWSpos != end) + break; + + if (*p == ' ' || *p == '\t') + { + // Remember the position of this white-space character + lastWSpos = p; + } + } + + if (p != end) + ++curLineLength; + + if (p == end || lastWSpos == end) + { + // If we are here, it means that we have found no whitespace + // before the first "maxLineLength" characters. In this case, + // we write the full line no matter of the max line length... + + if (!newLine && p != end && lastWSpos == end && + !isFirstWord && curLineStart == m_buffer.begin()) + { + // Here, we are continuing on the line of previous encoded + // word, but there is not even enough space to put the + // first word of this line, so we start a new line. + if (flags & text::NO_NEW_LINE_SEQUENCE) + { + os << CRLF; + curLineLength = 0; + } + else + { + os << NEW_LINE_SEQUENCE; + curLineLength = NEW_LINE_SEQUENCE_LENGTH; + } + + p = curLineStart; + lastWSpos = end; + newLine = true; + } + else + { + os << string(curLineStart, p); + + if (p == end) + { + finished = true; + } + else + { + if (flags & text::NO_NEW_LINE_SEQUENCE) + { + os << CRLF; + curLineLength = 0; + } + else + { + os << NEW_LINE_SEQUENCE; + curLineLength = NEW_LINE_SEQUENCE_LENGTH; + } + + curLineStart = p; + lastWSpos = end; + newLine = true; + } + } + } + else + { + // In this case, there will not be enough space on the line for all the + // characters _after_ the last white-space; so we cut the line at this + // last white-space. + +#if 1 + if (curLineLength != 1 && !isFirstWord) + os << " "; // Separate from previous word +#endif + + os << string(curLineStart, lastWSpos); + + if (flags & text::NO_NEW_LINE_SEQUENCE) + { + os << CRLF; + curLineLength = 0; + } + else + { + os << NEW_LINE_SEQUENCE; + curLineLength = NEW_LINE_SEQUENCE_LENGTH; + } + + curLineStart = lastWSpos + 1; + + p = lastWSpos + 1; + lastWSpos = end; + newLine = true; + } + } + } + /* + RFC #2047: + 4. Encodings + + Initially, the legal values for "encoding" are "Q" and "B". These + encodings are described below. The "Q" encoding is recommended for + use when most of the characters to be encoded are in the ASCII + character set; otherwise, the "B" encoding should be used. + Nevertheless, a mail reader which claims to recognize 'encoded-word's + MUST be able to accept either encoding for any character set which it + supports. + */ + else + { + // We will encode _AND_ fold lines + + /* + RFC #2047: + 2. Syntax of encoded-words + + " While there is no limit to the length of a multiple-line header + field, each line of a header field that contains one or more + 'encoded-word's is limited to 76 characters. " + */ + + const string::size_type maxLineLength3 = + (maxLineLength == lineLengthLimits::infinite) + ? maxLineLength + : std::min(maxLineLength, static_cast (76)); + + // Base64 if more than 60% non-ascii, quoted-printable else (default) + const string::size_type asciiPercent = (m_buffer.length() == 0 ? 100 : (100 * asciiCount) / m_buffer.length()); + const string::value_type encoding = (asciiPercent <= 40) ? 'B' : 'Q'; + + string wordStart("=?" + m_charset.getName() + "?" + encoding + "?"); + string wordEnd("?="); + + const string::size_type minWordLength = wordStart.length() + wordEnd.length(); + const string::size_type maxLineLength2 = (maxLineLength3 < minWordLength + 1) + ? maxLineLength3 + minWordLength + 1 : maxLineLength3; + + // Checks whether remaining space on this line is usable. If too few + // characters can be encoded, start a new line. + bool startNewLine = true; + + if (curLineLength + 2 < maxLineLength2) + { + const string::size_type remainingSpaceOnLine = maxLineLength2 - curLineLength - 2; + + if (remainingSpaceOnLine < minWordLength + 10) + { + // Space for no more than 10 encoded chars! + // It is not worth while to continue on this line... + startNewLine = true; + } + else + { + // OK, there is enough usable space on the current line. + startNewLine = false; + } + } + + if (startNewLine) + { + os << NEW_LINE_SEQUENCE; + curLineLength = NEW_LINE_SEQUENCE_LENGTH; + } + + // Encode and fold input buffer + string::const_iterator pos = m_buffer.begin(); + string::size_type remaining = m_buffer.length(); + + encoder* theEncoder = NULL; + + if (encoding == 'B') theEncoder = new encoderB64; + else theEncoder = new encoderQP; + + string qpEncodedBuffer; + + if (encoding == 'Q') + { + theEncoder->getProperties()["rfc2047"] = true; + + // In the case of Quoted-Printable encoding, we cannot simply encode input + // buffer line by line. So, we encode the whole buffer and we will fold it + // in the next loop... + utility::inputStreamStringAdapter in(m_buffer); + utility::outputStreamStringAdapter out(qpEncodedBuffer); + + theEncoder->encode(in, out); + + pos = qpEncodedBuffer.begin(); + remaining = qpEncodedBuffer.length(); + } + +#if 1 + if (curLineLength != 1 && !isFirstWord) + { + os << " "; // Separate from previous word + ++curLineLength; + } +#endif + + for ( ; remaining ; ) + { + // Start a new encoded word + os << wordStart; + curLineLength += minWordLength; + + // Compute the number of encoded chars that will fit on this line + const string::size_type fit = maxLineLength2 - curLineLength; + + // Base-64 encoding + if (encoding == 'B') + { + // TODO: WARNING! "Any encoded word which encodes a non-integral + // number of characters or octets is incorrectly formed." + + // Here, we have a formula to compute the maximum number of source + // characters to encode knowing the maximum number of encoded chars + // (with Base64, 3 bytes of input provide 4 bytes of output). + string::size_type count = (fit > 1) ? ((fit - 1) * 3) / 4 : 1; + if (count > remaining) count = remaining; + + utility::inputStreamStringAdapter in + (m_buffer, pos - m_buffer.begin(), pos - m_buffer.begin() + count); + + curLineLength += theEncoder->encode(in, os); + + pos += count; + remaining -= count; + } + // Quoted-Printable encoding + else + { + // TODO: WARNING! "Any encoded word which encodes a non-integral + // number of characters or octets is incorrectly formed." + + // All we have to do here is to take a certain number of character + // (that is less than or equal to "fit") from the QP encoded buffer, + // but we also make sure not to fold a "=XY" encoded char. + const string::const_iterator qpEnd = qpEncodedBuffer.end(); + string::const_iterator lastFoldPos = pos; + string::const_iterator p = pos; + string::size_type n = 0; + + while (n < fit && p != qpEnd) + { + if (*p == '=') + { + if (n + 3 >= fit) + { + lastFoldPos = p; + break; + } + + p += 3; + n += 3; + } + else + { + ++p; + ++n; + } + } + + if (lastFoldPos == pos) + lastFoldPos = p; + + os << string(pos, lastFoldPos); + + curLineLength += (lastFoldPos - pos) + 1; + + pos += n; + remaining -= n; + } + + // End of the encoded word + os << wordEnd; + + if (remaining) + { + os << NEW_LINE_SEQUENCE; + curLineLength = NEW_LINE_SEQUENCE_LENGTH; + } + } + + delete (theEncoder); + } + + if (newLinePos) + *newLinePos = curLineLength; +} + + #if VMIME_WIDE_CHAR_SUPPORT const wstring word::getDecodedText() const @@ -77,8 +647,10 @@ word& word::operator=(const string& s) } -void word::copyFrom(const word& w) +void word::copyFrom(const component& other) { + const word& w = dynamic_cast (other); + m_buffer = w.m_buffer; m_charset = w.m_charset; } @@ -142,4 +714,10 @@ void word::setBuffer(const string& buffer) } +const std::vector word::getChildComponents() const +{ + return std::vector (); +} + + } // vmime diff --git a/vmime/text.hpp b/vmime/text.hpp index a7a6a793..4af9c495 100644 --- a/vmime/text.hpp +++ b/vmime/text.hpp @@ -205,8 +205,6 @@ public: private: - static void decodeAndUnfold(const string::const_iterator& inStart, const string::const_iterator& inEnd, text& out); - std::vector m_words; }; diff --git a/vmime/word.hpp b/vmime/word.hpp index c5e07423..b0a9a035 100644 --- a/vmime/word.hpp +++ b/vmime/word.hpp @@ -21,6 +21,7 @@ #define VMIME_WORD_HPP_INCLUDED +#include "vmime/component.hpp" #include "vmime/charset.hpp" @@ -32,7 +33,7 @@ namespace vmime * some text encoded into one specified charset. */ -class word +class word : public component { public: @@ -93,7 +94,7 @@ public: * * @param other other word to copy data from */ - void copyFrom(const word& other); + void copyFrom(const component& other); /** Clone this word. * @@ -101,6 +102,21 @@ public: */ word* clone() const; + + using component::parse; + using component::generate; + + void parse(const string& buffer, const string::size_type position, const string::size_type end, string::size_type* newPosition = NULL); + void generate(utility::outputStream& os, const string::size_type maxLineLength = lineLengthLimits::infinite, const string::size_type curLinePos = 0, string::size_type* newLinePos = NULL) const; + + void generate(utility::outputStream& os, const string::size_type maxLineLength, const string::size_type curLinePos, string::size_type* newLinePos, const int flags, const bool isFirstWord) const; + + const std::vector getChildComponents() const; + + static word* parseNext(const string& buffer, const string::size_type position, const string::size_type end, string::size_type* newPosition, bool prevIsEncoded, bool* isEncoded, bool isFirst); + + static const std::vector parseMultiple(const string& buffer, const string::size_type position, const string::size_type end, string::size_type* newPosition); + private: // The "m_buffer" of this word holds the data, and this data is encoded