//
// VMime library (http://www.vmime.org)
// Copyright (C) 2002-2009 Vincent Richard <vincent@vincent-richard.net>
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 3 of
// the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Linking this library statically or dynamically with other modules is making
// a combined work based on this library. Thus, the terms and conditions of
// the GNU General Public License cover the whole combination.
//
|
|
|
|
|
|
|
|
#include "vmime/wordEncoder.hpp"
|
|
|
|
|
|
|
|
#include "vmime/exception.hpp"
|
|
|
|
#include "vmime/charsetConverter.hpp"
|
2008-10-12 13:59:09 +00:00
|
|
|
|
|
|
|
#include "vmime/utility/encoder/b64Encoder.hpp"
|
|
|
|
#include "vmime/utility/encoder/qpEncoder.hpp"
|
2006-10-02 13:44:00 +00:00
|
|
|
|
|
|
|
#include "vmime/utility/stringUtils.hpp"
|
|
|
|
|
|
|
|
|
|
|
|
namespace vmime
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
|
|
// Prepares an encoder for 'buffer', whose bytes are expressed in 'charset'.
//
// The text is first converted to UTF-8; on success the converted copy
// replaces m_buffer/m_length and m_simple is cleared. If the conversion
// raises charset_conv_error, the original bytes are kept and m_simple is
// set, so that getNextChunk() works on raw bytes instead.
//
// When 'encoding' is ENCODING_AUTO, the effective encoding is obtained from
// guessBestEncoding(), called with the original (unconverted) buffer.
// ENCODING_B64 gets a Base64 encoder; any other value gets a
// Quoted-Printable encoder with its "rfc2047" property set to true.
wordEncoder::wordEncoder(const string& buffer, const charset& charset, const Encoding encoding)
	: m_buffer(buffer), m_pos(0), m_length(buffer.length()), m_charset(charset), m_encoding(encoding)
{
	try
	{
		string converted;

		vmime::charset::convert
			(buffer, converted, charset, vmime::charset(charsets::UTF_8));

		// Conversion succeeded: from now on, work on the UTF-8 copy
		m_buffer = converted;
		m_length = converted.length();

		m_simple = false;
	}
	catch (exceptions::charset_conv_error&)
	{
		// Conversion is not possible: keep the original bytes and
		// fall back on simple (byte-oriented) encoding.
		m_simple = true;
	}

	// Resolve automatic encoding selection
	if (m_encoding == ENCODING_AUTO)
		m_encoding = guessBestEncoding(buffer, charset);

	// Create the encoder matching the selected encoding
	const bool useBase64 = (m_encoding == ENCODING_B64);

	if (!useBase64)
	{
		// ENCODING_QP
		m_encoder = vmime::create <utility::encoder::qpEncoder>();
		m_encoder->getProperties()["rfc2047"] = true;
	}
	else
	{
		m_encoder = vmime::create <utility::encoder::b64Encoder>();
	}
}
|
|
|
|
|
|
|
|
|
2008-10-12 10:05:28 +00:00
|
|
|
// Returns the number of bytes occupied by the UTF-8 character that starts
// at index 'pos' of 'buffer', 'length' being the number of usable bytes in
// 'buffer'.
//
// The size is deduced from the leading byte only; following bytes are not
// inspected. When the sequence announced by the leading byte does not fit
// in the bytes available from 'pos', 1 is returned so that the caller
// consumes a single byte.
static string::size_type getUTF8CharLength
	(const string& buffer, const string::size_type pos, const string::size_type length)
{
	const unsigned char lead = buffer[pos];

	// Number of bytes announced after the leading byte
	string::size_type extra;

	if (lead < 0xC0)        // 0x00..0xBF: single byte (or continuation byte)
		extra = 0;
	else if (lead < 0xE0)   // 0xC0..0xDF
		extra = 1;
	else if (lead < 0xF0)   // 0xE0..0xEF
		extra = 2;
	else if (lead < 0xF8)   // 0xF0..0xF7
		extra = 3;
	else if (lead < 0xFC)   // 0xF8..0xFB
		extra = 4;
	else                    // 0xFC..0xFF
		extra = 5;

	const string::size_type available = length - pos;

	return (extra < available) ? extra + 1 : 1;
}
|
|
|
|
|
|
|
|
|
|
|
|
// Tells whether a byte is counted as an encoded byte (three output
// characters) when estimating the size of Quoted-Printable output: this is
// the case for every byte outside the 33..126 range, and for ',', ';', ':',
// '_' and '='.
static bool isCountedAsEncodedInQP(const unsigned char c)
{
	switch (c)
	{
	case ',':
	case ';':
	case ':':
	case '_':
	case '=':

		return true;

	default:

		return !(c >= 33 && c <= 126);
	}
}


// Encodes and returns the next part of the text, advancing the read
// position past the input that was consumed.
//
// 'maxLength' is the wanted maximum number of encoded characters. It is a
// target rather than a hard limit: at least one input byte (simple mode) or
// one whole character (RFC-compliant mode) is always consumed while input
// remains, and the size of the output is only estimated, so the result can
// be longer than 'maxLength'.
//
// An empty string is returned only when all the input has been consumed.
const string wordEncoder::getNextChunk(const string::size_type maxLength)
{
	const string::size_type remaining = m_length - m_pos;

	if (remaining == 0)
		return string();

	vmime::string chunk;
	vmime::utility::outputStreamStringAdapter chunkStream(chunk);

	// Simple encoding
	if (m_simple)
	{
		// WARNING! Simple encoding can encode a non-integral number of
		// characters and then may generate incorrectly-formed words!

		string::size_type inputCount = 0;

		if (m_encoding == ENCODING_B64)
		{
			// Here, we have a formula to compute the maximum number of source
			// bytes to encode knowing the maximum number of encoded chars. In
			// Base64 encoding, 3 bytes of input provide 4 bytes of output.
			const string::size_type fitting =
				(maxLength > 1) ? ((maxLength - 1) * 3) / 4 : 1;

			// The formula gives 0 for maxLength == 2. Always take at least
			// one byte: otherwise an empty chunk would be returned although
			// input remains, and the position would never advance.
			inputCount = std::min(remaining,
				std::max(static_cast <string::size_type>(1), fitting));
		}
		else // ENCODING_QP
		{
			// Compute exactly how much input bytes are needed to have an output
			// string length of less than 'maxLength' bytes. In Quoted-Printable
			// encoding, encoded bytes take 3 bytes.
			string::size_type outputCount = 0;

			while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining))
			{
				const unsigned char c = m_buffer[m_pos + inputCount];

				inputCount++;
				outputCount += (isCountedAsEncodedInQP(c) ? 3 : 1);
			}
		}

		// Encode chunk
		utility::inputStreamStringAdapter in(m_buffer, m_pos, m_pos + inputCount);

		m_encoder->encode(in, chunkStream);
		m_pos += inputCount;
	}
	// Fully RFC-compliant encoding
	else
	{
		// m_buffer holds UTF-8 text: characters are taken one at a time and
		// converted back to the original charset before being encoded, so
		// that a chunk never ends in the middle of a character.
		charsetConverter conv(charsets::UTF_8, m_charset);

		string::size_type inputCount = 0;
		string::size_type outputCount = 0;
		string encodeBuffer;

		while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining))
		{
			// Get the next UTF8 character
			const string::size_type inputCharLength =
				getUTF8CharLength(m_buffer, m_pos + inputCount, m_length);

			const string inputChar(m_buffer.begin() + m_pos + inputCount,
				m_buffer.begin() + m_pos + inputCount + inputCharLength);

			// Convert back to original encoding
			string encodeBytes;
			conv.convert(inputChar, encodeBytes);

			encodeBuffer += encodeBytes;

			// Compute number of output bytes
			if (m_encoding == ENCODING_B64)
			{
				// Recomputed from the whole accumulated buffer
				outputCount = std::max(static_cast <string::size_type>(4),
					(encodeBuffer.length() * 4) / 3);
			}
			else // ENCODING_QP
			{
				// Accumulated byte by byte for the newly converted character
				for (string::size_type i = 0, n = encodeBytes.length() ; i < n ; ++i)
				{
					const unsigned char c = encodeBytes[i];

					outputCount += (isCountedAsEncodedInQP(c) ? 3 : 1);
				}
			}

			inputCount += inputCharLength;
		}

		// Encode chunk
		utility::inputStreamStringAdapter in(encodeBuffer);

		m_encoder->encode(in, chunkStream);
		m_pos += inputCount;
	}

	return chunk;
}
|
|
|
|
|
|
|
|
|
2008-10-12 10:05:28 +00:00
|
|
|
// Returns the encoding stored in this encoder. If ENCODING_AUTO was passed
// to the constructor, this is the value the constructor obtained from
// guessBestEncoding().
wordEncoder::Encoding wordEncoder::getEncoding() const
{
	return m_encoding;
}
|
|
|
|
|
|
|
|
|
2010-05-21 07:41:15 +00:00
|
|
|
// Explicitly force encoding for some charsets
|
|
|
|
struct CharsetEncodingEntry
|
|
|
|
{
|
|
|
|
CharsetEncodingEntry(const std::string& charset_, const wordEncoder::Encoding encoding_)
|
|
|
|
: charset(charset_), encoding(encoding_)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string charset;
|
|
|
|
wordEncoder::Encoding encoding;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Table of charsets for which the encoding is forced. Entries are matched,
// in order, as substrings of the lower-cased charset name.
//
// The table is only read from this file, so it is given internal linkage
// and made immutable.
static const CharsetEncodingEntry g_charsetEncodingMap[] =
{
	// Use QP encoding for ISO-8859-x charsets
	CharsetEncodingEntry("iso-8859", wordEncoder::ENCODING_QP),
	CharsetEncodingEntry("iso8859", wordEncoder::ENCODING_QP),

	// RFC-1468 states:
	//   " ISO-2022-JP may also be used in MIME Part 2 headers. The "B"
	//     encoding should be used with ISO-2022-JP text. "
	// Use Base64 encoding for all ISO-2022 charsets.
	CharsetEncodingEntry("iso-2022", wordEncoder::ENCODING_B64),
	CharsetEncodingEntry("iso2022", wordEncoder::ENCODING_B64),

	// Last entry is a terminator: loops over this table skip it
	CharsetEncodingEntry("", wordEncoder::ENCODING_AUTO)
};
|
|
|
|
|
|
|
|
|
|
|
|
// static
|
|
|
|
bool wordEncoder::isEncodingNeeded(const string& buffer, const charset& charset)
|
|
|
|
{
|
|
|
|
// Special treatment for some charsets
|
|
|
|
const string cset = utility::stringUtils::toLower(charset.getName());
|
|
|
|
|
|
|
|
for (unsigned int i = 0 ; i < (sizeof(g_charsetEncodingMap) / sizeof(g_charsetEncodingMap[0])) - 1 ; ++i)
|
|
|
|
{
|
|
|
|
if (cset.find(g_charsetEncodingMap[i].charset) != string::npos)
|
|
|
|
{
|
|
|
|
if (g_charsetEncodingMap[i].encoding != wordEncoder::ENCODING_AUTO)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// No encoding is needed if the buffer only contains ASCII chars
|
|
|
|
if (utility::stringUtils::findFirstNonASCIIchar(buffer.begin(), buffer.end()) != string::npos)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// Force encoding when there are only ASCII chars, but there is
|
|
|
|
// also at least one of '\n' or '\r' (header fields)
|
|
|
|
if (buffer.find_first_of("\n\r") != string::npos)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2006-10-02 13:44:00 +00:00
|
|
|
// static
|
2008-10-12 10:05:28 +00:00
|
|
|
wordEncoder::Encoding wordEncoder::guessBestEncoding
|
2006-10-02 13:44:00 +00:00
|
|
|
(const string& buffer, const charset& charset)
|
|
|
|
{
|
2010-05-21 07:41:15 +00:00
|
|
|
// Special treatment for some charsets
|
2006-10-02 13:44:00 +00:00
|
|
|
const string cset = utility::stringUtils::toLower(charset.getName());
|
|
|
|
|
2010-05-21 07:41:15 +00:00
|
|
|
for (unsigned int i = 0 ; i < (sizeof(g_charsetEncodingMap) / sizeof(g_charsetEncodingMap[0])) - 1 ; ++i)
|
2006-10-02 13:44:00 +00:00
|
|
|
{
|
2010-05-21 07:41:15 +00:00
|
|
|
if (cset.find(g_charsetEncodingMap[i].charset) != string::npos)
|
|
|
|
return g_charsetEncodingMap[i].encoding;
|
2006-10-02 13:44:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Use Base64 if more than 40% non-ASCII, or Quoted-Printable else (default)
|
|
|
|
const string::size_type asciiCount =
|
|
|
|
utility::stringUtils::countASCIIchars(buffer.begin(), buffer.end());
|
|
|
|
|
|
|
|
const string::size_type asciiPercent =
|
|
|
|
(buffer.length() == 0 ? 100 : (100 * asciiCount) / buffer.length());
|
|
|
|
|
|
|
|
if (asciiPercent < 60)
|
|
|
|
return ENCODING_B64;
|
|
|
|
else
|
|
|
|
return ENCODING_QP;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
} // vmime
|
|
|
|
|