diff --git a/ChangeLog b/ChangeLog index 85763a65..5b4a4b94 100644 --- a/ChangeLog +++ b/ChangeLog @@ -2,6 +2,12 @@ VERSION 0.8.1cvs ================ +2006-10-02 Vincent Richard + + * word, wordEncoder: fixed bug #1096610 which caused encoding of a + non-integral number of characters (and then, generation of + incorrectly-formed words) with multi-bytes charsets. + 2006-07-13 Vincent Richard * Fixed bugs in MHTML code: 'CID' prefix should not be case-sensitive; diff --git a/SConstruct b/SConstruct index c719c292..ed5a1b2e 100644 --- a/SConstruct +++ b/SConstruct @@ -140,6 +140,7 @@ libvmime_sources = [ 'textPart.hpp', 'types.hpp', 'word.cpp', 'word.hpp', + 'wordEncoder.cpp', 'wordEncoder.hpp', 'vmime.hpp', # ============================== Utility ============================= 'utility/childProcess.hpp', diff --git a/src/word.cpp b/src/word.cpp index 58133577..39059dc0 100644 --- a/src/word.cpp +++ b/src/word.cpp @@ -32,6 +32,8 @@ #include "vmime/encoderB64.hpp" #include "vmime/encoderQP.hpp" +#include "vmime/wordEncoder.hpp" + namespace vmime { @@ -481,12 +483,11 @@ void word::generate(utility::outputStream& os, const string::size_type maxLineLe ? maxLineLength : std::min(maxLineLength, static_cast (76)); - // Base64 if more than 60% non-ascii, quoted-printable else (default) - const string::size_type asciiPercent = (m_buffer.length() == 0 ? 100 : (100 * asciiCount) / m_buffer.length()); - const string::value_type encoding = (asciiPercent <= 40) ? 'B' : 'Q'; + wordEncoder wordEnc(m_buffer, m_charset); - string wordStart("=?" + m_charset.getName() + "?" + encoding + "?"); - string wordEnd("?="); + const string wordStart("=?" + m_charset.getName() + "?" + + (wordEnc.getEncoding() == wordEncoder::ENCODING_B64 ? 'B' : 'Q') + "?"); + const string wordEnd("?="); const string::size_type minWordLength = wordStart.length() + wordEnd.length(); const string::size_type maxLineLength2 = (maxLineLength3 < minWordLength + 1) @@ -520,125 +521,40 @@ void word::generate(utility::outputStream& os, const string::size_type maxLineLe } // Encode and fold input buffer - string::const_iterator pos = m_buffer.begin(); - string::size_type remaining = m_buffer.length(); - - encoder* theEncoder = NULL; - - if (encoding == 'B') theEncoder = new encoderB64; - else theEncoder = new encoderQP; - - string qpEncodedBuffer; - - if (encoding == 'Q') - { - theEncoder->getProperties()["rfc2047"] = true; - - // In the case of Quoted-Printable encoding, we cannot simply encode input - // buffer line by line. So, we encode the whole buffer and we will fold it - // in the next loop... - utility::inputStreamStringAdapter in(m_buffer); - utility::outputStreamStringAdapter out(qpEncodedBuffer); - - theEncoder->encode(in, out); - - pos = qpEncodedBuffer.begin(); - remaining = qpEncodedBuffer.length(); - } - -#if 1 if (curLineLength != 1 && !isFirstWord) { os << " "; // Separate from previous word ++curLineLength; } -#endif - for ( ; remaining ; ) + for (unsigned int i = 0 ; ; ++i) { - // Start a new encoded word - os << wordStart; - curLineLength += minWordLength; - // Compute the number of encoded chars that will fit on this line - const string::size_type fit = maxLineLength2 - curLineLength; + const string::size_type fit = maxLineLength2 - minWordLength + - (i == 0 ? curLineLength : NEW_LINE_SEQUENCE_LENGTH); - // Base-64 encoding - if (encoding == 'B') - { - // TODO: WARNING! "Any encoded word which encodes a non-integral - // number of characters or octets is incorrectly formed." + // Get the next encoded chunk + const string chunk = wordEnc.getNextChunk(fit); - // Here, we have a formula to compute the maximum number of source - // characters to encode knowing the maximum number of encoded chars - // (with Base64, 3 bytes of input provide 4 bytes of output). - string::size_type count = (fit > 1) ? ((fit - 1) * 3) / 4 : 1; - if (count > remaining) count = remaining; + if (chunk.empty()) + break; - utility::inputStreamStringAdapter in - (m_buffer, pos - m_buffer.begin(), pos - m_buffer.begin() + count); - - curLineLength += theEncoder->encode(in, os); - - pos += count; - remaining -= count; - } - // Quoted-Printable encoding - else - { - // TODO: WARNING! "Any encoded word which encodes a non-integral - // number of characters or octets is incorrectly formed." - - // All we have to do here is to take a certain number of character - // (that is less than or equal to "fit") from the QP encoded buffer, - // but we also make sure not to fold a "=XY" encoded char. - const string::const_iterator qpEnd = qpEncodedBuffer.end(); - string::const_iterator lastFoldPos = pos; - string::const_iterator p = pos; - string::size_type n = 0; - - while (n < fit && p != qpEnd) - { - if (*p == '=') - { - if (n + 3 >= fit) - { - lastFoldPos = p; - break; - } - - p += 3; - n += 3; - } - else - { - ++p; - ++n; - } - } - - if (lastFoldPos == pos) - lastFoldPos = p; - - os << string(pos, lastFoldPos); - - curLineLength += (lastFoldPos - pos) + 1; - - pos += n; - remaining -= n; - } - - // End of the encoded word - os << wordEnd; - - if (remaining) + // Start a new encoded word + if (i != 0) { os << NEW_LINE_SEQUENCE; curLineLength = NEW_LINE_SEQUENCE_LENGTH; } - } - delete (theEncoder); + os << wordStart; + curLineLength += minWordLength; + + os << chunk; + curLineLength += chunk.length(); + + // End of the encoded word + os << wordEnd; + } } if (newLinePos) diff --git a/src/wordEncoder.cpp b/src/wordEncoder.cpp new file mode 100644 index 00000000..e854eac6 --- /dev/null +++ b/src/wordEncoder.cpp @@ -0,0 +1,290 @@ +// +// VMime library (http://www.vmime.org) +// Copyright (C) 2002-2006 Vincent Richard +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this program; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Linking this library statically or dynamically with other modules is making +// a combined work based on this library. Thus, the terms and conditions of +// the GNU General Public License cover the whole combination. +// + +#include "vmime/wordEncoder.hpp" + +#include "vmime/exception.hpp" +#include "vmime/charsetConverter.hpp" +#include "vmime/encoderB64.hpp" +#include "vmime/encoderQP.hpp" + +#include "vmime/utility/stringUtils.hpp" + + +namespace vmime +{ + + +wordEncoder::wordEncoder(const string& buffer, const charset& charset, const Encoding encoding) + : m_buffer(buffer), m_pos(0), m_length(buffer.length()), m_charset(charset), m_encoding(encoding) +{ + try + { + string utf8Buffer; + + vmime::charset::convert + (buffer, utf8Buffer, charset, vmime::charset(charsets::UTF_8)); + + m_buffer = utf8Buffer; + m_length = utf8Buffer.length(); + + m_simple = false; + } + catch (exceptions::charset_conv_error&) + { + // Ignore exception. + // We will fall back on simple encoding. + m_simple = true; + } + + if (m_encoding == ENCODING_AUTO) + m_encoding = guessBestEncoding(buffer, charset); + + if (m_encoding == ENCODING_B64) + { + m_encoder = vmime::create (); + } + else // ENCODING_QP + { + m_encoder = vmime::create (); + m_encoder->getProperties()["rfc2047"] = true; + } +} + + +static const string::size_type getUTF8CharLength + (const string& buffer, const string::size_type pos, const string::size_type length) +{ + // Gives the number of extra bytes in a UTF8 char, given the leading char + static const unsigned char UTF8_EXTRA_BYTES[256] = + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 + }; + + const unsigned char c = buffer[pos]; + const unsigned char n = UTF8_EXTRA_BYTES[c]; + + if (n < length - pos) + return n + 1; + else + return 1; +} + + +const string wordEncoder::getNextChunk(const string::size_type maxLength) +{ + const string::size_type remaining = m_length - m_pos; + + if (remaining == 0) + return string(); + + vmime::string chunk; + vmime::utility::outputStreamStringAdapter chunkStream(chunk); + + // Simple encoding + if (m_simple) + { + // WARNING! Simple encoding can encode a non-integral number of + // characters and then may generate incorrectly-formed words! + + if (m_encoding == ENCODING_B64) + { + // Here, we have a formula to compute the maximum number of source + // bytes to encode knowing the maximum number of encoded chars. In + // Base64 encoding, 3 bytes of input provide 4 bytes of output. + const string::size_type inputCount = + std::min(remaining, (maxLength > 1) ? ((maxLength - 1) * 3) / 4 : 1); + + // Encode chunk + utility::inputStreamStringAdapter in(m_buffer, m_pos, m_pos + inputCount); + + m_encoder->encode(in, chunkStream); + m_pos += inputCount; + } + else // ENCODING_QP + { + // Compute exactly how much input bytes are needed to have an output + // string length of less than 'maxLength' bytes. In Quoted-Printable + // encoding, encoded bytes take 3 bytes. + string::size_type inputCount = 0; + string::size_type outputCount = 0; + + while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining)) + { + const unsigned char c = m_buffer[m_pos + inputCount]; + bool encoded = true; + + switch (c) + { + case ',': + case ';': + case ':': + case '_': + case '=': + + encoded = true; + break; + + default: + + if (c >= 33 && c <= 126 && c != 61) + encoded = false; + + break; + } + + inputCount++; + outputCount += (encoded ? 3 : 1); + } + + // Encode chunk + utility::inputStreamStringAdapter in(m_buffer, m_pos, m_pos + inputCount); + + m_encoder->encode(in, chunkStream); + m_pos += inputCount; + } + } + // Fully RFC-compliant encoding + else + { + charsetConverter conv(vmime::charset(charsets::UTF_8), m_charset); + + string::size_type inputCount = 0; + string::size_type outputCount = 0; + string encodeBuffer; + + while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining)) + { + // Get the next UTF8 character + const string::size_type inputCharLength = + getUTF8CharLength(m_buffer, m_pos + inputCount, m_length); + + const string inputChar(m_buffer.begin() + m_pos + inputCount, + m_buffer.begin() + m_pos + inputCount + inputCharLength); + + // Convert back to original encoding + string encodeBytes; + conv.convert(inputChar, encodeBytes); + + encodeBuffer += encodeBytes; + + // Compute number of output bytes + if (m_encoding == ENCODING_B64) + { + outputCount = std::max(static_cast (4), + (encodeBytes.length() * 4) / 3); + } + else // ENCODING_QP + { + for (string::size_type i = 0, n = encodeBytes.length() ; i < n ; ++i) + { + const unsigned char c = encodeBytes[i]; + bool encoded = true; + + switch (c) + { + case ',': + case ';': + case ':': + case '_': + case '=': + + encoded = true; + break; + + default: + + if (c >= 33 && c <= 126 && c != 61) + encoded = false; + + break; + } + + outputCount += (encoded ? 3 : 1); + } + } + + inputCount += inputCharLength; + } + + // Encode chunk + utility::inputStreamStringAdapter in(encodeBuffer); + + m_encoder->encode(in, chunkStream); + m_pos += inputCount; + } + + return chunk; +} + + +const wordEncoder::Encoding wordEncoder::getEncoding() const +{ + return m_encoding; +} + + +// static +const wordEncoder::Encoding wordEncoder::guessBestEncoding + (const string& buffer, const charset& charset) +{ + // If the charset is ISO-8859-x, set to QP encoding + const string cset = utility::stringUtils::toLower(charset.getName()); + + if (cset.find("iso-8859") != string::npos || + cset.find("iso8859") != string::npos) + { + return ENCODING_QP; + } + + // Use Base64 if more than 40% non-ASCII, or Quoted-Printable else (default) + const string::size_type asciiCount = + utility::stringUtils::countASCIIchars(buffer.begin(), buffer.end()); + + const string::size_type asciiPercent = + (buffer.length() == 0 ? 100 : (100 * asciiCount) / buffer.length()); + + if (asciiPercent < 60) + return ENCODING_B64; + else + return ENCODING_QP; +} + + +} // vmime + diff --git a/tests/parser/textTest.cpp b/tests/parser/textTest.cpp index 011ec294..adcd014a 100644 --- a/tests/parser/textTest.cpp +++ b/tests/parser/textTest.cpp @@ -41,9 +41,34 @@ VMIME_TEST_SUITE_BEGIN VMIME_TEST(testWordConstructors) VMIME_TEST(testWordParse) VMIME_TEST(testWordGenerate) + VMIME_TEST(testWordGenerateMultiBytes) VMIME_TEST_LIST_END + static const vmime::string getDisplayText(const vmime::text& t) + { + vmime::string res; + + for (int i = 0 ; i < t.getWordCount() ; ++i) + res += t.getWordAt(i)->getBuffer(); + + return res; + } + + static const vmime::string cleanGeneratedWords(const std::string& str) + { + std::istringstream iss(str); + + std::string res; + std::string x; + + while (std::getline(iss, x)) + res += vmime::utility::stringUtils::trim(x); + + return res; + } + + void testConstructors() { vmime::text t1; @@ -171,16 +196,6 @@ VMIME_TEST_SUITE_BEGIN // TODO } - static const vmime::string getDisplayText(const vmime::text& t) - { - vmime::string res; - - for (int i = 0 ; i < t.getWordCount() ; ++i) - res += t.getWordAt(i)->getBuffer(); - - return res; - } - void testDisplayForm() { #define DISPLAY_FORM(x) getDisplayText(*vmime::text::decodeAndUnfold(x)) @@ -254,5 +269,15 @@ VMIME_TEST_SUITE_BEGIN vmime::word("\xf1\xf2\xf3\xf4\xf5", vmime::charset("foo")).generate()); } + void testWordGenerateMultiBytes() + { + // Ensure we don't encode a non-integral number of characters + VASSERT_EQ("1", "=?utf-8?Q?aaa?==?utf-8?Q?=C3=A9?==?utf-8?Q?zzz?=", + cleanGeneratedWords(vmime::word("aaa\xc3\xa9zzz", vmime::charset("utf-8")).generate(16))); + + VASSERT_EQ("1", "=?utf-8?Q?aaa=C3=A9?==?utf-8?Q?zzz?=", + cleanGeneratedWords(vmime::word("aaa\xc3\xa9zzz", vmime::charset("utf-8")).generate(17))); + } + VMIME_TEST_SUITE_END diff --git a/vmime/wordEncoder.hpp b/vmime/wordEncoder.hpp new file mode 100644 index 00000000..584904ca --- /dev/null +++ b/vmime/wordEncoder.hpp @@ -0,0 +1,94 @@ +// +// VMime library (http://www.vmime.org) +// Copyright (C) 2002-2006 Vincent Richard +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this program; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Linking this library statically or dynamically with other modules is making +// a combined work based on this library. Thus, the terms and conditions of +// the GNU General Public License cover the whole combination. +// + +#ifndef VMIME_WORDENCODER_HPP_INCLUDED +#define VMIME_WORDENCODER_HPP_INCLUDED + + +#include "vmime/charset.hpp" + + +namespace vmime +{ + + +class encoder; + + +/** Encodes words following RFC-2047. + */ + +class wordEncoder +{ +public: + + /** Available encodings for RFC-2047. */ + enum Encoding + { + ENCODING_AUTO, + ENCODING_QP, + ENCODING_B64 + }; + + + wordEncoder(const string& buffer, const charset& charset, const Encoding encoding = ENCODING_AUTO); + + + /** Return the next chunk in the word. + * + * @param maxLength maximal length of the chunk + * @return next chunk, of maximal length 'maxLength' if possible + */ + const string getNextChunk(const string::size_type maxLength); + + /** Return the encoding used. + * + * @return encoding + */ + const Encoding getEncoding() const; + +private: + + static const Encoding guessBestEncoding(const string& buffer, const charset& charset); + + void guessBestEncoding(); + + + string m_buffer; + string::size_type m_pos; + string::size_type m_length; + + bool m_simple; + + charset m_charset; + Encoding m_encoding; + + ref m_encoder; +}; + + +} // vmime + + +#endif // VMIME_WORDENCODER_HPP_INCLUDED +