1 files changed, 290 insertions, 0 deletions
diff --git a/src/wordEncoder.cpp b/src/wordEncoder.cpp
new file mode 100644
index 00000000..e854eac6
--- /dev/null
+++ b/src/wordEncoder.cpp
@@ -0,0 +1,290 @@
+//
+// VMime library (http://www.vmime.org)
+// Copyright (C) 2002-2006 Vincent Richard <[email protected]>
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+//
+// Linking this library statically or dynamically with other modules is making
+// a combined work based on this library.  Thus, the terms and conditions of
+// the GNU General Public License cover the whole combination.
+//
+
+#include "vmime/wordEncoder.hpp"
+
+#include "vmime/exception.hpp"
+#include "vmime/charsetConverter.hpp"
+#include "vmime/encoderB64.hpp"
+#include "vmime/encoderQP.hpp"
+
+#include "vmime/utility/stringUtils.hpp"
+
+
+namespace vmime
+{
+
+
+wordEncoder::wordEncoder(const string& buffer, const charset& charset, const Encoding encoding)
+	: m_buffer(buffer), m_pos(0), m_length(buffer.length()), m_charset(charset), m_encoding(encoding)
+{
+	try
+	{
+		string utf8Buffer;
+
+		vmime::charset::convert
+			(buffer, utf8Buffer, charset, vmime::charset(charsets::UTF_8));
+
+		m_buffer = utf8Buffer;
+		m_length = utf8Buffer.length();
+
+		m_simple = false;
+	}
+	catch (exceptions::charset_conv_error&)
+	{
+		// Ignore exception.
+		// We will fall back on simple encoding.
+		m_simple = true;
+	}
+
+	if (m_encoding == ENCODING_AUTO)
+		m_encoding = guessBestEncoding(buffer, charset);
+
+	if (m_encoding == ENCODING_B64)
+	{
+		m_encoder = vmime::create <encoderB64>();
+	}
+	else // ENCODING_QP
+	{
+		m_encoder = vmime::create <encoderQP>();
+		m_encoder->getProperties()["rfc2047"] = true;
+	}
+}
+
+
+static const string::size_type getUTF8CharLength
+	(const string& buffer, const string::size_type pos, const string::size_type length)
+{
+	// Gives the number of extra bytes in a UTF8 char, given the leading char
+	static const unsigned char UTF8_EXTRA_BYTES[256] =
+	{
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+		3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
+	};
+
+	const unsigned char c = buffer[pos];
+	const unsigned char n = UTF8_EXTRA_BYTES[c];
+
+	if (n < length - pos)
+		return n + 1;
+	else
+		return 1;
+}
+
+
+const string wordEncoder::getNextChunk(const string::size_type maxLength)
+{
+	const string::size_type remaining = m_length - m_pos;
+
+	if (remaining == 0)
+		return string();
+
+	vmime::string chunk;
+	vmime::utility::outputStreamStringAdapter chunkStream(chunk);
+
+	// Simple encoding
+	if (m_simple)
+	{
+		// WARNING! Simple encoding can encode a non-integral number of
+		// characters and then may generate incorrectly-formed words!
+
+		if (m_encoding == ENCODING_B64)
+		{
+			// Here, we have a formula to compute the maximum number of source
+			// bytes to encode knowing the maximum number of encoded chars. In
+			// Base64 encoding, 3 bytes of input provide 4 bytes of output.
+			const string::size_type inputCount =
+				std::min(remaining, (maxLength > 1) ? ((maxLength - 1) * 3) / 4 : 1);
+
+			// Encode chunk
+			utility::inputStreamStringAdapter in(m_buffer, m_pos, m_pos + inputCount);
+
+			m_encoder->encode(in, chunkStream);
+			m_pos += inputCount;
+		}
+		else // ENCODING_QP
+		{
+			// Compute exactly how much input bytes are needed to have an output
+			// string length of less than 'maxLength' bytes. In Quoted-Printable
+			// encoding, encoded bytes take 3 bytes.
+			string::size_type inputCount = 0;
+			string::size_type outputCount = 0;
+
+			while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining))
+			{
+				const unsigned char c = m_buffer[m_pos + inputCount];
+				bool encoded = true;
+
+				switch (c)
+				{
+				case ',':
+				case ';':
+				case ':':
+				case '_':
+				case '=':
+
+					encoded = true;
+					break;
+
+				default:
+
+					if (c >= 33 && c <= 126 && c != 61)
+						encoded = false;
+
+					break;
+				}
+
+				inputCount++;
+				outputCount += (encoded ? 3 : 1);
+			}
+
+			// Encode chunk
+			utility::inputStreamStringAdapter in(m_buffer, m_pos, m_pos + inputCount);
+
+			m_encoder->encode(in, chunkStream);
+			m_pos += inputCount;
+		}
+	}
+	// Fully RFC-compliant encoding
+	else
+	{
+		charsetConverter conv(vmime::charset(charsets::UTF_8), m_charset);
+
+		string::size_type inputCount = 0;
+		string::size_type outputCount = 0;
+		string encodeBuffer;
+
+		while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining))
+		{
+			// Get the next UTF8 character
+			const string::size_type inputCharLength =
+				getUTF8CharLength(m_buffer, m_pos + inputCount, m_length);
+
+			const string inputChar(m_buffer.begin() + m_pos + inputCount,
+				m_buffer.begin() + m_pos + inputCount + inputCharLength);
+
+			// Convert back to original encoding
+			string encodeBytes;
+			conv.convert(inputChar, encodeBytes);
+
+			encodeBuffer += encodeBytes;
+
+			// Compute number of output bytes
+			if (m_encoding == ENCODING_B64)
+			{
+				outputCount = std::max(static_cast <string::size_type>(4),
+					(encodeBytes.length() * 4) / 3);
+			}
+			else // ENCODING_QP
+			{
+				for (string::size_type i = 0, n = encodeBytes.length() ; i < n ; ++i)
+				{
+					const unsigned char c = encodeBytes[i];
+					bool encoded = true;
+
+					switch (c)
+					{
+					case ',':
+					case ';':
+					case ':':
+					case '_':
+					case '=':
+
+						encoded = true;
+						break;
+
+					default:
+
+						if (c >= 33 && c <= 126 && c != 61)
+							encoded = false;
+
+						break;
+					}
+
+					outputCount += (encoded ? 3 : 1);
+				}
+			}
+
+			inputCount += inputCharLength;
+		}
+
+		// Encode chunk
+		utility::inputStreamStringAdapter in(encodeBuffer);
+
+		m_encoder->encode(in, chunkStream);
+		m_pos += inputCount;
+	}
+
+	return chunk;
+}
+
+
+const wordEncoder::Encoding wordEncoder::getEncoding() const
+{
+	return m_encoding;
+}
+
+
+// static
+const wordEncoder::Encoding wordEncoder::guessBestEncoding
+	(const string& buffer, const charset& charset)
+{
+	// If the charset is ISO-8859-x, set to QP encoding
+	const string cset = utility::stringUtils::toLower(charset.getName());
+
+	if (cset.find("iso-8859") != string::npos ||
+	    cset.find("iso8859") != string::npos)
+	{
+		return ENCODING_QP;
+	}
+
+	// Use Base64 if more than 40% non-ASCII, or Quoted-Printable else (default)
+	const string::size_type asciiCount =
+		utility::stringUtils::countASCIIchars(buffer.begin(), buffer.end());
+
+	const string::size_type asciiPercent =
+		(buffer.length() == 0 ? 100 : (100 * asciiCount) / buffer.length());
+
+	if (asciiPercent < 60)
+		return ENCODING_B64;
+	else
+		return ENCODING_QP;
+}
+
+
+} // vmime
+