diff --git a/ChangeLog b/ChangeLog index f9a4240d..78cbf785 100644 --- a/ChangeLog +++ b/ChangeLog @@ -7,6 +7,9 @@ VERSION 0.7.2cvs * charsetConverter.{hpp|cpp}: new object 'charsetConverter' for converting between charsets (code moved from static functions in 'charset' class). + * Added 'charsetFilteredOutputStream': provide charset conversion while + writing to an output stream. + 2005-10-16 Vincent Richard * SConstruct: fixed compilation problems on FreeBSD (thanks to Xin LI). diff --git a/SConstruct b/SConstruct index 62e040cd..8848cf7a 100644 --- a/SConstruct +++ b/SConstruct @@ -333,6 +333,7 @@ libvmimetest_sources = [ # ============================== Parser ============================== 'tests/parser/attachmentHelperTest.cpp', 'tests/parser/bodyPartTest.cpp', + 'tests/parser/charsetTest.cpp', 'tests/parser/datetimeTest.cpp', 'tests/parser/dispositionTest.cpp', 'tests/parser/encoderTest.cpp', diff --git a/src/charsetConverter.cpp b/src/charsetConverter.cpp index ecfb1811..c6a77349 100644 --- a/src/charsetConverter.cpp +++ b/src/charsetConverter.cpp @@ -33,8 +33,8 @@ extern "C" #include // HACK: prototypes may differ depending on the compiler and/or system (the - // second parameter may or may not be 'const'). This redeclaration is a hack - // to have a common prototype "iconv_cast". + // second parameter may or may not be 'const'). This relies on the compiler + // for choosing the right type. 
class ICONV_HACK { public: @@ -81,6 +81,7 @@ charsetConverter::~charsetConverter() iconv_close(*static_cast (m_desc)); delete static_cast (m_desc); + m_desc = NULL; } } @@ -164,4 +165,169 @@ void charsetConverter::convert(const string& in, string& out) } + +// charsetFilteredOutputStream: output stream filter which converts the data written to it from a source charset to a destination charset (using iconv) and forwards the converted bytes to the next output stream + +namespace utility { + + +charsetFilteredOutputStream::charsetFilteredOutputStream + (const charset& source, const charset& dest, outputStream& os) + : m_desc(NULL), m_sourceCharset(source), m_destCharset(dest), + m_stream(os), m_unconvCount(0) +{ + // Get an iconv descriptor (if this fails, m_desc stays NULL and write() throws) + const iconv_t cd = iconv_open(dest.getName().c_str(), source.getName().c_str()); + + if (cd != reinterpret_cast (-1)) + { + iconv_t* p = new iconv_t; + *p= cd; + + m_desc = p; + } +} + + +charsetFilteredOutputStream::~charsetFilteredOutputStream() +{ + if (m_desc != NULL) + { + // Close the iconv handle, then free the holder allocated in the constructor + iconv_close(*static_cast (m_desc)); + + delete static_cast (m_desc); + m_desc = NULL; + } +} + + +outputStream& charsetFilteredOutputStream::getNextOutputStream() +{ + return m_stream; +} + + +void charsetFilteredOutputStream::write + (const value_type* const data, const size_type count) +{ + if (m_desc == NULL) + throw exceptions::charset_conv_error("Cannot initialize converter."); + + const iconv_t cd = *static_cast (m_desc); + + const value_type* curData = data; + size_type curDataLen = count; + + // If there are some unconverted bytes left, add more data from this + // chunk to see if they can now be converted. + while (m_unconvCount != 0 || curDataLen != 0) + { + if (m_unconvCount != 0) + { + // Check if an incomplete input sequence is larger than the + // input buffer size: should not happen except if something + // in the input sequence is invalid. If so, output a special + // character ('?') and skip one byte in the invalid sequence.
+ if (m_unconvCount >= sizeof(m_unconvBuffer)) + { + m_stream.write("?", 1); + + std::copy(m_unconvBuffer + 1, + m_unconvBuffer + m_unconvCount, m_unconvBuffer); + + m_unconvCount--; + } + + // Get more data: append as many bytes of the current chunk as fit in the unconverted buffer + const size_type remaining = + std::min(curDataLen, sizeof(m_unconvBuffer) - m_unconvCount); + + std::copy(curData, curData + remaining, m_unconvBuffer + m_unconvCount); + + m_unconvCount += remaining; + curDataLen -= remaining; + curData += remaining; + + if (remaining == 0) + return; // no more data: pending bytes stay buffered for the next call to write() -- NOTE(review): nothing visible here flushes them at end of stream, confirm + + // Try a conversion of the bytes accumulated in the unconverted buffer + const char* inPtr = m_unconvBuffer; + size_t inLength = m_unconvCount; + char* outPtr = m_outputBuffer; + size_t outLength = sizeof(m_outputBuffer); + + const size_t inLength0 = inLength; + + if (iconv(cd, ICONV_HACK(&inPtr), &inLength, &outPtr, &outLength) == static_cast (-1)) + { + const size_t inputConverted = inLength0 - inLength; + + // Write successfully converted bytes + m_stream.write(m_outputBuffer, sizeof(m_outputBuffer) - outLength); + + // Shift unconverted bytes to the beginning of the buffer + std::copy(m_unconvBuffer + inputConverted, + m_unconvBuffer + m_unconvCount, m_unconvBuffer); + + m_unconvCount -= inputConverted; + + continue; + } + + // Write successfully converted bytes + m_stream.write(m_outputBuffer, sizeof(m_outputBuffer) - outLength); + + // Empty the unconverted buffer + m_unconvCount = 0; + } + + if (curDataLen == 0) + return; // no more data + + // Now, convert the current data buffer (at most sizeof(m_outputBuffer) / MAX_CHARACTER_WIDTH input bytes per iteration) + const char* inPtr = curData; + size_t inLength = std::min(curDataLen, sizeof(m_outputBuffer) / MAX_CHARACTER_WIDTH); + char* outPtr = m_outputBuffer; + size_t outLength = sizeof(m_outputBuffer); + + const size_t inLength0 = inLength; + + if (iconv(cd, ICONV_HACK(&inPtr), &inLength, &outPtr, &outLength) == static_cast (-1)) + { + // Write successfully converted bytes + m_stream.write(m_outputBuffer, sizeof(m_outputBuffer) - outLength); + + const size_t inputConverted = inLength0 - inLength; + + curData += inputConverted; + curDataLen -= inputConverted; +
+ // Put one byte into the unconverted buffer so + // that the next iteration fills it + if (curDataLen != 0) + { + m_unconvCount = 1; + m_unconvBuffer[0] = *curData; + + curData++; + curDataLen--; + } + } + else + { + // Write successfully converted bytes + m_stream.write(m_outputBuffer, sizeof(m_outputBuffer) - outLength); + + curData += inLength0; + curDataLen -= inLength0; + } + } +} + + +} // utility + + } // vmime diff --git a/tests/parser/charsetTest.cpp b/tests/parser/charsetTest.cpp new file mode 100644 index 00000000..02360e8a --- /dev/null +++ b/tests/parser/charsetTest.cpp @@ -0,0 +1,236 @@ +// +// VMime library (http://www.vmime.org) +// Copyright (C) 2002-2005 Vincent Richard +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this program; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Linking this library statically or dynamically with other modules is making +// a combined work based on this library. Thus, the terms and conditions of +// the GNU General Public License cover the whole combination. 
+// + +#include "tests/testUtils.hpp" + + +#define VMIME_TEST_SUITE charsetTest +#define VMIME_TEST_SUITE_MODULE "Parser" + + +// Excerpt from http://www.gnu.org/ +static const vmime::charset inputCharset("gb2312"); +static const char inputBytes[] = + "\xbb\xb6\xd3\xad\xc0\xb4\xb5\xbd\x20\x47\x4e\x55\x20\xb9\xa4\xb3" + "\xcc\xb5\xc4\xcd\xf8\xd2\xb3\xcb\xc5\xb7\xfe\xd6\xf7\xbb\xfa\x20" + "\x77\x77\x77\x2e\x67\x6e\x75\x2e\x6f\x72\x67\x20\xa1\xa3\x20\x47" + "\x4e\x55\x20\xb9\xa4\xb3\xcc\x20\xbf\xaa\xca\xbc\xec\xb6\xd2\xbb" + "\xbe\xc5\xb0\xcb\xcb\xc4\xc4\xea\xa3\xac\xd6\xbc\xd4\xda\xb7\xa2" + "\xd5\xb9\xd2\xbb\xb8\xf6\xc0\xe0\xcb\xc6\x20\x55\x6e\x69\x78\x20" + "\xa3\xac\xc7\xd2\xce\xaa\x20\xd7\xd4\xd3\xc9\xc8\xed\xbc\xfe\x20" + "\xb5\xc4\xcd\xea\xd5\xfb\xb2\xd9\xd7\xf7\xcf\xb5\xcd\xb3\xa3\xba" + "\x20\x47\x4e\x55\x20\xcf\xb5\xcd\xb3\xa1\xa3\xa3\xa8\x47\x4e\x55" + "\x20\xca\xc7\xd3\xc9\xa1\xb0\x47\x4e\x55\x27\x73\x20\x4e\x6f\x74" + "\x20\x55\x6e\x69\x78\xa1\xb1\xcb\xf9\xb5\xdd\xbb\xd8\xb6\xa8\xd2" + "\xe5\xb3\xf6\xb5\xc4\xca\xd7\xd7\xd6\xc4\xb8\xcb\xf5\xd0\xb4\xd3" + "\xef\xa3\xbb\xcb\xfc\xb5\xc4\xb7\xa2\xd2\xf4\xce\xaa\xa1\xb0\x67" + "\x75\x68\x2d\x4e\x45\x57\xa1\xb1\xa3\xa9\xa1\xa3\xb8\xf7\xd6\xd6" + "\xca\xb9\xd3\xc3\x20\x4c\x69\x6e\x75\x78\x20\xd7\xf7\xce\xaa\xc4" + "\xda\xba\xcb\xb5\xc4\x20\x47\x4e\x55\x20\xb2\xd9\xd7\xf7\xcf\xb5" + "\xcd\xb3\xd5\xfd\xb1\xbb\xb9\xe3\xb7\xba\xb5\xd8\xca\xb9\xd3\xc3" + "\xd6\xf8\xa3\xbb\xcb\xe4\xc8\xbb\xd5\xe2\xd0\xa9\xcf\xb5\xcd\xb3" + "\xcd\xa8\xb3\xa3\xb1\xbb\xb3\xc6\xd7\xf7\xce\xaa\xa1\xb0\x4c\x69" + "\x6e\x75\x78\xa1\xb1\xa3\xac\xb5\xab\xca\xc7\xcb\xfc\xc3\xc7\xd3" + "\xa6\xb8\xc3\xb8\xfc\xbe\xab\xc8\xb7\xb5\xd8\xb1\xbb\xb3\xc6\xce" + "\xaa\x20\x47\x4e\x55\x2f\x4c\x69\x6e\x75\x78\x20\xcf\xb5\xcd\xb3" + "\x20\xa1\xa3\x0a"; + +static const vmime::charset outputCharset("utf-8"); +static const char outputBytes[] = + "\xe6\xac\xa2\xe8\xbf\x8e\xe6\x9d\xa5\xe5\x88\xb0\x20\x47\x4e\x55" + 
"\x20\xe5\xb7\xa5\xe7\xa8\x8b\xe7\x9a\x84\xe7\xbd\x91\xe9\xa1\xb5" + "\xe4\xbc\xba\xe6\x9c\x8d\xe4\xb8\xbb\xe6\x9c\xba\x20\x77\x77\x77" + "\x2e\x67\x6e\x75\x2e\x6f\x72\x67\x20\xe3\x80\x82\x20\x47\x4e\x55" + "\x20\xe5\xb7\xa5\xe7\xa8\x8b\x20\xe5\xbc\x80\xe5\xa7\x8b\xe6\x96" + "\xbc\xe4\xb8\x80\xe4\xb9\x9d\xe5\x85\xab\xe5\x9b\x9b\xe5\xb9\xb4" + "\xef\xbc\x8c\xe6\x97\xa8\xe5\x9c\xa8\xe5\x8f\x91\xe5\xb1\x95\xe4" + "\xb8\x80\xe4\xb8\xaa\xe7\xb1\xbb\xe4\xbc\xbc\x20\x55\x6e\x69\x78" + "\x20\xef\xbc\x8c\xe4\xb8\x94\xe4\xb8\xba\x20\xe8\x87\xaa\xe7\x94" + "\xb1\xe8\xbd\xaf\xe4\xbb\xb6\x20\xe7\x9a\x84\xe5\xae\x8c\xe6\x95" + "\xb4\xe6\x93\x8d\xe4\xbd\x9c\xe7\xb3\xbb\xe7\xbb\x9f\xef\xbc\x9a" + "\x20\x47\x4e\x55\x20\xe7\xb3\xbb\xe7\xbb\x9f\xe3\x80\x82\xef\xbc" + "\x88\x47\x4e\x55\x20\xe6\x98\xaf\xe7\x94\xb1\xe2\x80\x9c\x47\x4e" + "\x55\x27\x73\x20\x4e\x6f\x74\x20\x55\x6e\x69\x78\xe2\x80\x9d\xe6" + "\x89\x80\xe9\x80\x92\xe5\x9b\x9e\xe5\xae\x9a\xe4\xb9\x89\xe5\x87" + "\xba\xe7\x9a\x84\xe9\xa6\x96\xe5\xad\x97\xe6\xaf\x8d\xe7\xbc\xa9" + "\xe5\x86\x99\xe8\xaf\xad\xef\xbc\x9b\xe5\xae\x83\xe7\x9a\x84\xe5" + "\x8f\x91\xe9\x9f\xb3\xe4\xb8\xba\xe2\x80\x9c\x67\x75\x68\x2d\x4e" + "\x45\x57\xe2\x80\x9d\xef\xbc\x89\xe3\x80\x82\xe5\x90\x84\xe7\xa7" + "\x8d\xe4\xbd\xbf\xe7\x94\xa8\x20\x4c\x69\x6e\x75\x78\x20\xe4\xbd" + "\x9c\xe4\xb8\xba\xe5\x86\x85\xe6\xa0\xb8\xe7\x9a\x84\x20\x47\x4e" + "\x55\x20\xe6\x93\x8d\xe4\xbd\x9c\xe7\xb3\xbb\xe7\xbb\x9f\xe6\xad" + "\xa3\xe8\xa2\xab\xe5\xb9\xbf\xe6\xb3\x9b\xe5\x9c\xb0\xe4\xbd\xbf" + "\xe7\x94\xa8\xe8\x91\x97\xef\xbc\x9b\xe8\x99\xbd\xe7\x84\xb6\xe8" + "\xbf\x99\xe4\xba\x9b\xe7\xb3\xbb\xe7\xbb\x9f\xe9\x80\x9a\xe5\xb8" + "\xb8\xe8\xa2\xab\xe7\xa7\xb0\xe4\xbd\x9c\xe4\xb8\xba\xe2\x80\x9c" + "\x4c\x69\x6e\x75\x78\xe2\x80\x9d\xef\xbc\x8c\xe4\xbd\x86\xe6\x98" + "\xaf\xe5\xae\x83\xe4\xbb\xac\xe5\xba\x94\xe8\xaf\xa5\xe6\x9b\xb4" + "\xe7\xb2\xbe\xe7\xa1\xae\xe5\x9c\xb0\xe8\xa2\xab\xe7\xa7\xb0\xe4" + "\xb8\xba\x20\x47\x4e\x55\x2f\x4c\x69\x6e\x75\x78\x20\xe7\xb3\xbb" 
+ "\xe7\xbb\x9f\x20\xe3\x80\x82\x0a"; + + + +VMIME_TEST_SUITE_BEGIN + + VMIME_TEST_LIST_BEGIN + // Test valid input + VMIME_TEST(testConvertStringValid) + VMIME_TEST(testConvertStreamValid) + VMIME_TEST(testFilterValid1) + VMIME_TEST(testFilterValid2) + VMIME_TEST(testFilterValid3) + + // TODO: more tests + VMIME_TEST_LIST_END + + + void testConvertStringValid() + { + vmime::string in(inputBytes, sizeof(inputBytes) - 1); + vmime::string expectedOut(outputBytes, sizeof(outputBytes) - 1); + vmime::string actualOut; + + vmime::charset::convert + (in, actualOut, inputCharset, outputCharset); + + VASSERT_EQ("1", toHex(expectedOut), toHex(actualOut)); + } + + void testConvertStreamValid() + { + vmime::string in(inputBytes, sizeof(inputBytes) - 1); + vmime::string expectedOut(outputBytes, sizeof(outputBytes) - 1); + + vmime::string actualOut; + vmime::utility::outputStreamStringAdapter os(actualOut); + + vmime::utility::inputStreamStringAdapter is(in); + + vmime::charset::convert + (is, os, inputCharset, outputCharset); + + VASSERT_EQ("1", toHex(expectedOut), toHex(actualOut)); + } + + // Filtered output stream, fed with the whole input using 'bufferedStreamCopy' + void testFilterValid1() + { + vmime::string in(inputBytes, sizeof(inputBytes) - 1); + vmime::string expectedOut(outputBytes, sizeof(outputBytes) - 1); + + vmime::string actualOut; + vmime::utility::outputStreamStringAdapter osa(actualOut); + vmime::utility::charsetFilteredOutputStream os + (inputCharset, outputCharset, osa); + + vmime::utility::inputStreamStringAdapter is(in); + + vmime::utility::bufferedStreamCopy(is, os); + + VASSERT_EQ("1", toHex(expectedOut), toHex(actualOut)); + } + + // Filtered output stream, fed one byte at a time + void testFilterValid2() + { + vmime::string in(inputBytes, sizeof(inputBytes) - 1); + vmime::string expectedOut(outputBytes, sizeof(outputBytes) - 1); + + vmime::string actualOut; + vmime::utility::outputStreamStringAdapter osa(actualOut); + vmime::utility::charsetFilteredOutputStream os + (inputCharset, outputCharset, osa); + +
vmime::utility::inputStreamStringAdapter is(in); + + vmime::utility::stream::value_type buffer[16]; + + for (int i = 0 ; !is.eof() ; ++i) + os.write(buffer, is.read(buffer, 1)); + + VASSERT_EQ("1", toHex(expectedOut), toHex(actualOut)); + } + + // Filtered output stream, fed with variable-size chunks (1 to 5 bytes requested per read) + void testFilterValid3() + { + vmime::string in(inputBytes, sizeof(inputBytes) - 1); + vmime::string expectedOut(outputBytes, sizeof(outputBytes) - 1); + + vmime::string actualOut; + vmime::utility::outputStreamStringAdapter osa(actualOut); + vmime::utility::charsetFilteredOutputStream os + (inputCharset, outputCharset, osa); + + vmime::utility::inputStreamStringAdapter is(in); + + vmime::utility::stream::value_type buffer[16]; + + for (int i = 0 ; !is.eof() ; ++i) + os.write(buffer, is.read(buffer, (i % 5) + 1)); + + VASSERT_EQ("1", toHex(expectedOut), toHex(actualOut)); + } + + + // Conversion to a hexadecimal dump for easier debugging: one row per 16 input bytes, each row holding the hex values followed by the characters themselves ('.' for bytes outside the 32..127 range) + static const vmime::string toHex(const vmime::string str) + { + static const char hexChars[] = "0123456789abcdef"; + + vmime::string res = "\n"; + + for (unsigned int i = 0 ; i < str.length() ; i += 16) + { + unsigned int r = std::min + (static_cast (16), str.length() - i); + + vmime::string hex; + vmime::string chr; + + for (unsigned int j = 0 ; j < r ; ++j) + { + const unsigned char c = str[i + j]; + + hex += hexChars[c / 16]; + hex += hexChars[c % 16]; + hex += " "; + + if (c >= 32 && c <= 127) + chr += c; + else + chr += '.'; + } + + for (unsigned int j = r ; j < 16 ; ++j) + hex += " "; + + res += hex + " " + chr + "\n"; + } + + return res; + } + +VMIME_TEST_SUITE_END +