diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e0f8010..25ad376c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -735,12 +735,12 @@ INCLUDE(cmake/FindICU.cmake) FIND_PACKAGE(ICU QUIET) -IF(WIN32) - SET(VMIME_CHARSETCONV_LIB_DETECTED "win") -ELSEIF(ICU_LIBRARIES) +IF(ICU_LIBRARIES) SET(VMIME_CHARSETCONV_LIB_DETECTED "icu") ELSEIF(ICONV_FOUND) SET(VMIME_CHARSETCONV_LIB_DETECTED "iconv") +ELSEIF(WIN32) + SET(VMIME_CHARSETCONV_LIB_DETECTED "win") ENDIF() SET( @@ -803,6 +803,10 @@ ELSEIF(VMIME_CHARSETCONV_LIB STREQUAL "icu") ELSEIF(VMIME_CHARSETCONV_LIB STREQUAL "win") + MESSAGE(WARNING "*** ICU or iconv library should always be preferred" + " over MultiByteToWideChar/WideCharToMultiByte on Windows, as" + " error handling is very poor, and there is no streaming support.") + SET(VMIME_CHARSETCONV_LIB_IS_ICONV "OFF") SET(VMIME_CHARSETCONV_LIB_IS_ICU "OFF") SET(VMIME_CHARSETCONV_LIB_IS_WIN "ON") diff --git a/src/vmime/charset.cpp b/src/vmime/charset.cpp index 22bff301..1a291106 100644 --- a/src/vmime/charset.cpp +++ b/src/vmime/charset.cpp @@ -109,6 +109,43 @@ void charset::convert(const string& in, string& out, const charset& source, cons } +bool charset::isValidText + (const string& text, string::size_type* firstInvalidByte) const +{ + charsetConverterOptions opts; + opts.silentlyReplaceInvalidSequences = false; + + charsetConverter::status st; + + try + { + std::string out; + + // Try converting to UTF-8 + shared_ptr conv = charsetConverter::create(*this, vmime::charset("utf-8"), opts); + conv->convert(text, out, &st); + } + catch (exceptions::illegal_byte_sequence_for_charset& e) + { + // An illegal byte sequence was found in the input buffer + if (firstInvalidByte) + { + if (st.inputBytesRead < text.length()) + *firstInvalidByte = st.inputBytesRead; + else + *firstInvalidByte = text.length(); + } + + return false; + } + + if (firstInvalidByte) + *firstInvalidByte = text.length(); + + return true; +} + + const charset charset::getLocalCharset() { return 
(platform::getHandler()->getLocalCharset()); diff --git a/src/vmime/charset.hpp b/src/vmime/charset.hpp index 5bd50fdf..1ea6289d 100644 --- a/src/vmime/charset.hpp +++ b/src/vmime/charset.hpp @@ -95,8 +95,12 @@ public: * @param source input charset * @param dest output charset * @param opts conversion options - * @throws exceptions::charset_conv_error if an error occured during - * the conversion + * @throws exceptions::illegal_byte_sequence_for_charset if an illegal + * byte sequence was found in the input bytes, and the + * 'silentlyReplaceInvalidSequences' flag is set to false in + * the charsetConverterOptions + * @throws exceptions::charset_conv_error if an unexpected error occured + * during the conversion */ static void convert(const string& in, string& out, const charset& source, const charset& dest, @@ -110,13 +114,29 @@ public: * @param source input charset * @param dest output charset * @param opts conversion options - * @throws exceptions::charset_conv_error if an error occured during - * the conversion + * @throws exceptions::illegal_byte_sequence_for_charset if an illegal + * byte sequence was found in the input bytes, and the + * 'silentlyReplaceInvalidSequences' flag is set to false in + * the charsetConverterOptions + * @throws exceptions::charset_conv_error if an unexpected error occured + * during the conversion */ static void convert(utility::inputStream& in, utility::outputStream& out, const charset& source, const charset& dest, const charsetConverterOptions& opts = charsetConverterOptions()); + /** Checks whether the specified text is valid in this charset. + * + * @param text input text + * @param firstInvalidByte if the function returns false, will contain + * the index of the first invalid byte in the string. Can be NULL if + * not used. + * @return true if the text is perfectly valid in this charset, + * or false otherwise (eg. 
it contains illegal sequences) + */ + bool isValidText(const string& text, string::size_type* firstInvalidByte) const; + + shared_ptr clone() const; void copyFrom(const component& other); diff --git a/src/vmime/charsetConverter.cpp b/src/vmime/charsetConverter.cpp index 87886823..525a71ec 100644 --- a/src/vmime/charsetConverter.cpp +++ b/src/vmime/charsetConverter.cpp @@ -42,4 +42,11 @@ shared_ptr charsetConverter::create } +charsetConverter::status::status() + : inputBytesRead(0), outputBytesWritten(0) +{ + +} + + } // vmime diff --git a/src/vmime/charsetConverter.hpp b/src/vmime/charsetConverter.hpp index 07f38d8f..98c64406 100644 --- a/src/vmime/charsetConverter.hpp +++ b/src/vmime/charsetConverter.hpp @@ -44,8 +44,13 @@ namespace utility /** A filtered output stream which applies a charset conversion * to input bytes. * - * May throw a exceptions::charset_conv_error if an error + * May throw a exceptions::charset_conv_error if an unexpected error * occured when initializing convert, or during charset conversion. + * + * May also throw a exceptions::illegal_byte_sequence_for_charset + * if an illegal byte sequence was found in the input bytes, and the + * 'silentlyReplaceInvalidSequences' flag is set to false in + * the charsetConverterOptions. */ class VMIME_EXPORT charsetFilteredOutputStream : public filteredOutputStream @@ -63,6 +68,23 @@ class VMIME_EXPORT charsetConverter : public object { public: + /** Holds information about a conversion. + */ + struct status + { + status(); + + + /** Number of bytes read from input buffer and successfully converted. + */ + size_t inputBytesRead; + + /** Number of bytes written to output buffer. + */ + size_t outputBytesWritten; + }; + + /** Construct and initialize an iconv charset converter. 
* * @param source input charset @@ -81,29 +103,44 @@ public: * * @param in input buffer * @param out output buffer - * @throws exceptions::charset_conv_error if an error occured during - * the conversion + * @param st will receive some extra infos when conversion is finished + * or stopped by an error (can be NULL) + * @throws exceptions::illegal_byte_sequence_for_charset if an illegal + * byte sequence was found in the input bytes, and the + * 'silentlyReplaceInvalidSequences' flag is set to false in + * the charsetConverterOptions + * @throws exceptions::charset_conv_error if an unexpected error occured + * during the conversion */ - virtual void convert(const string& in, string& out) = 0; + virtual void convert(const string& in, string& out, status* st = NULL) = 0; /** Convert the contents of an input stream in a specified charset * to another charset and write the result to an output stream. * * @param in input stream to read data from * @param out output stream to write the converted data - * @throws exceptions::charset_conv_error if an error occured during - * the conversion + * @param st will receive some extra infos when conversion is finished + * or stopped by an error (can be NULL) + * @throws exceptions::illegal_byte_sequence_for_charset if an illegal + * byte sequence was found in the input bytes, and the + * 'silentlyReplaceInvalidSequences' flag is set to false in + * the charsetConverterOptions + * @throws exceptions::charset_conv_error if an unexpected error occured + * during the conversion */ - virtual void convert(utility::inputStream& in, utility::outputStream& out) = 0; + virtual void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL) = 0; /** Returns a filtered output stream which applies a charset * conversion to input bytes. Please note that it may not be * supported by the converter. 
* * @param os stream into which filtered data will be written + * @param opts conversion options * @return a filtered output stream, or NULL if not supported */ - virtual shared_ptr getFilteredOutputStream(utility::outputStream& os) = 0; + virtual shared_ptr getFilteredOutputStream + (utility::outputStream& os, + const charsetConverterOptions& opts = charsetConverterOptions()) = 0; private: diff --git a/src/vmime/charsetConverterOptions.cpp b/src/vmime/charsetConverterOptions.cpp index caeacd01..4b0814af 100644 --- a/src/vmime/charsetConverterOptions.cpp +++ b/src/vmime/charsetConverterOptions.cpp @@ -29,7 +29,8 @@ namespace vmime charsetConverterOptions::charsetConverterOptions() - : invalidSequence("?") + : silentlyReplaceInvalidSequences(true), + invalidSequence("?") { } diff --git a/src/vmime/charsetConverterOptions.hpp b/src/vmime/charsetConverterOptions.hpp index 07e7a138..e07d30b2 100644 --- a/src/vmime/charsetConverterOptions.hpp +++ b/src/vmime/charsetConverterOptions.hpp @@ -42,7 +42,15 @@ public: charsetConverterOptions(); - /** Replace invalid sequences with this string. */ + /** If true, invalid sequences will be silently replaced with + * a string when possible (see 'invalidSequence'). + * Default is true. + */ + bool silentlyReplaceInvalidSequences; + + /** Replace invalid sequences with this string. + * Default is '?'. 
+ */ string invalidSequence; }; diff --git a/src/vmime/charsetConverter_iconv.cpp b/src/vmime/charsetConverter_iconv.cpp index 75d7b170..eebc229b 100644 --- a/src/vmime/charsetConverter_iconv.cpp +++ b/src/vmime/charsetConverter_iconv.cpp @@ -147,8 +147,12 @@ charsetConverter_iconv::~charsetConverter_iconv() } -void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputStream& out) +void charsetConverter_iconv::convert + (utility::inputStream& in, utility::outputStream& out, status* st) { + if (st) + new (st) status(); + if (m_desc == NULL) throw exceptions::charset_conv_error("Cannot initialize converter."); @@ -175,6 +179,12 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt if (iconv(cd, ICONV_IN_TYPE(&inPtr), ptrLength, ICONV_OUT_TYPE(&outPtr), &outLength) == static_cast (-1)) { + if (st && inPtr) + { + st->inputBytesRead += (inPtr - inBuffer); + st->outputBytesWritten += (outPtr - outBuffer); + } + // Illegal input sequence or input sequence has no equivalent // sequence in the destination charset. 
if (prevIsInvalid) @@ -182,6 +192,9 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt // Write successfully converted bytes out.write(outBuffer, sizeof(outBuffer) - outLength); + if (!m_options.silentlyReplaceInvalidSequences) + throw exceptions::illegal_byte_sequence_for_charset(); + // Output a special character to indicate we don't known how to // convert the sequence at this position outputInvalidChar(out, cd, m_options); @@ -208,6 +221,12 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt // Write successfully converted bytes out.write(outBuffer, sizeof(outBuffer) - outLength); + if (st && inPtr) + { + st->inputBytesRead += (inPtr - inBuffer); + st->outputBytesWritten += (outPtr - outBuffer); + } + inPos = 0; prevIsInvalid = false; } @@ -222,29 +241,27 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt } -void charsetConverter_iconv::convert(const string& in, string& out) +void charsetConverter_iconv::convert(const string& in, string& out, status* st) { - if (m_source == m_dest) - { - // No conversion needed - out = in; - return; - } + if (st) + new (st) status(); out.clear(); utility::inputStreamStringAdapter is(in); utility::outputStreamStringAdapter os(out); - convert(is, os); + convert(is, os, st); os.flush(); } -shared_ptr charsetConverter_iconv::getFilteredOutputStream(utility::outputStream& os) +shared_ptr + charsetConverter_iconv::getFilteredOutputStream + (utility::outputStream& os, const charsetConverterOptions& opts) { - return make_shared (m_source, m_dest, &os); + return make_shared (m_source, m_dest, &os, opts); } @@ -255,9 +272,10 @@ namespace utility { charsetFilteredOutputStream_iconv::charsetFilteredOutputStream_iconv - (const charset& source, const charset& dest, outputStream* os) + (const charset& source, const charset& dest, outputStream* os, + const charsetConverterOptions& opts) : m_desc(NULL), m_sourceCharset(source), m_destCharset(dest), 
- m_stream(*os), m_unconvCount(0) + m_stream(*os), m_unconvCount(0), m_options(opts) { // Get an iconv descriptor const iconv_t cd = iconv_open(dest.getName().c_str(), source.getName().c_str()); @@ -314,6 +332,9 @@ void charsetFilteredOutputStream_iconv::writeImpl // character and skip one byte in the invalid sequence. if (m_unconvCount >= sizeof(m_unconvBuffer)) { + if (!m_options.silentlyReplaceInvalidSequences) + throw exceptions::illegal_byte_sequence_for_charset(); + outputInvalidChar(m_stream, cd); std::copy(m_unconvBuffer + 1, @@ -439,6 +460,9 @@ void charsetFilteredOutputStream_iconv::flush() // Skip a "blocking" character if (inputConverted == 0) { + if (!m_options.silentlyReplaceInvalidSequences) + throw exceptions::illegal_byte_sequence_for_charset(); + outputInvalidChar(m_stream, cd); offset++; diff --git a/src/vmime/charsetConverter_iconv.hpp b/src/vmime/charsetConverter_iconv.hpp index 4167dc4e..e9a0f28a 100644 --- a/src/vmime/charsetConverter_iconv.hpp +++ b/src/vmime/charsetConverter_iconv.hpp @@ -56,10 +56,12 @@ public: ~charsetConverter_iconv(); - void convert(const string& in, string& out); - void convert(utility::inputStream& in, utility::outputStream& out); + void convert(const string& in, string& out, status* st = NULL); + void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL); - shared_ptr getFilteredOutputStream(utility::outputStream& os); + shared_ptr getFilteredOutputStream + (utility::outputStream& os, + const charsetConverterOptions& opts = charsetConverterOptions()); private: @@ -84,9 +86,11 @@ public: * @param source input charset * @param dest output charset * @param os stream into which write filtered data + * @param opts conversion options */ charsetFilteredOutputStream_iconv - (const charset& source, const charset& dest, outputStream* os); + (const charset& source, const charset& dest, outputStream* os, + const charsetConverterOptions& opts = charsetConverterOptions()); 
~charsetFilteredOutputStream_iconv(); @@ -121,6 +125,8 @@ private: // Buffer used for conversion. Avoids declaring it in write(). // Should be at least MAX_CHARACTER_WIDTH * MAX_CHARACTER_WIDTH. byte_t m_outputBuffer[32768]; + + charsetConverterOptions m_options; }; diff --git a/src/vmime/charsetConverter_icu.cpp b/src/vmime/charsetConverter_icu.cpp index 86b25d89..fa965817 100644 --- a/src/vmime/charsetConverter_icu.cpp +++ b/src/vmime/charsetConverter_icu.cpp @@ -91,10 +91,17 @@ charsetConverter_icu::~charsetConverter_icu() } -void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStream& out) +void charsetConverter_icu::convert + (utility::inputStream& in, utility::outputStream& out, status* st) { UErrorCode err = U_ZERO_ERROR; + ucnv_reset(m_from); + ucnv_reset(m_to); + + if (st) + new (st) status(); + // From buffers byte_t cpInBuffer[16]; // stream data put here const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar); @@ -105,12 +112,31 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize; std::vector cpOutBuffer(cpOutBufferSz); - // Set replacement chars for when converting from Unicode to codepage - icu::UnicodeString substString(m_options.invalidSequence.c_str()); - ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err); + // Tell ICU what to do when encountering an illegal byte sequence + if (m_options.silentlyReplaceInvalidSequences) + { + // Set replacement chars for when converting from Unicode to codepage + icu::UnicodeString substString(m_options.invalidSequence.c_str()); + ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err); - if (U_FAILURE(err)) - throw exceptions::charset_conv_error("[ICU] Error setting replacement char."); + if (U_FAILURE(err)) + throw exceptions::charset_conv_error("[ICU] Error when setting substitution string."); + } + else + { + // Tell ICU top 
stop (and return an error) on illegal byte sequences + ucnv_setToUCallBack + (m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); + + if (U_FAILURE(err)) + throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback."); + + ucnv_setFromUCallBack + (m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); + + if (U_FAILURE(err)) + throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback."); + } // Input data available while (!in.eof()) @@ -137,8 +163,22 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre ucnv_toUnicode(m_from, &target, targetLimit, &source, sourceLimit, NULL, flush, &toErr); + if (st) + st->inputBytesRead += (source - reinterpret_cast (&cpInBuffer[0])); + if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr)) - throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName()); + { + if (toErr == U_INVALID_CHAR_FOUND || + toErr == U_TRUNCATED_CHAR_FOUND || + toErr == U_ILLEGAL_CHAR_FOUND) + { + // Error will be thrown later (*) + } + else + { + throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName()); + } + } // The Unicode source is the buffer just written and the limit // is where the previous conversion stopped (target is moved in the conversion) @@ -158,8 +198,40 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit, &uSource, uSourceLimit, NULL, flush, &fromErr); + if (st) + { + // Decrement input bytes count by the number of input bytes in error + char errBytes[16]; + int8_t errBytesLen = sizeof(errBytes); + UErrorCode errBytesErr = U_ZERO_ERROR; + + ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr); + + st->inputBytesRead -= errBytesLen; + st->outputBytesWritten += cpTarget - &cpOutBuffer[0]; + } + + // (*) If an error occured while converting from input 
charset, throw it now + if (toErr == U_INVALID_CHAR_FOUND || + toErr == U_TRUNCATED_CHAR_FOUND || + toErr == U_ILLEGAL_CHAR_FOUND) + { + throw exceptions::illegal_byte_sequence_for_charset(); + } + if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) - throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName()); + { + if (fromErr == U_INVALID_CHAR_FOUND || + fromErr == U_TRUNCATED_CHAR_FOUND || + fromErr == U_ILLEGAL_CHAR_FOUND) + { + throw exceptions::illegal_byte_sequence_for_charset(); + } + else + { + throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName()); + } + } // Write to destination stream out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0])); @@ -171,29 +243,27 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre } -void charsetConverter_icu::convert(const string& in, string& out) +void charsetConverter_icu::convert(const string& in, string& out, status* st) { - if (m_source == m_dest) - { - // No conversion needed - out = in; - return; - } + if (st) + new (st) status(); out.clear(); utility::inputStreamStringAdapter is(in); utility::outputStreamStringAdapter os(out); - convert(is, os); + convert(is, os, st); os.flush(); } -shared_ptr charsetConverter_icu::getFilteredOutputStream(utility::outputStream& os) +shared_ptr + charsetConverter_icu::getFilteredOutputStream + (utility::outputStream& os, const charsetConverterOptions& opts) { - return make_shared (m_source, m_dest, &os); + return make_shared (m_source, m_dest, &os, opts); } @@ -204,8 +274,10 @@ namespace utility { charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu - (const charset& source, const charset& dest, outputStream* os) - : m_from(NULL), m_to(NULL), m_sourceCharset(source), m_destCharset(dest), m_stream(*os) + (const charset& source, const charset& dest, outputStream* os, + const charsetConverterOptions& opts) + : m_from(NULL), m_to(NULL), 
m_sourceCharset(source), + m_destCharset(dest), m_stream(*os), m_options(opts) { UErrorCode err = U_ZERO_ERROR; m_from = ucnv_open(source.getName().c_str(), &err); @@ -224,12 +296,31 @@ charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu ("Cannot initialize ICU converter for destination charset '" + dest.getName() + "' (error code: " + u_errorName(err) + "."); } - // Set replacement chars for when converting from Unicode to codepage - icu::UnicodeString substString(vmime::charsetConverterOptions().invalidSequence.c_str()); - ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err); + // Tell ICU what to do when encountering an illegal byte sequence + if (m_options.silentlyReplaceInvalidSequences) + { + // Set replacement chars for when converting from Unicode to codepage + icu::UnicodeString substString(m_options.invalidSequence.c_str()); + ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err); - if (U_FAILURE(err)) - throw exceptions::charset_conv_error("[ICU] Error setting replacement char."); + if (U_FAILURE(err)) + throw exceptions::charset_conv_error("[ICU] Error when setting substitution string."); + } + else + { + // Tell ICU to stop (and return an error) on illegal byte sequences: ToU callback on the source converter (m_from), FromU callback on the destination converter (m_to) + ucnv_setToUCallBack + (m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); + + if (U_FAILURE(err)) + throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback."); + + ucnv_setFromUCallBack + (m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); + + if (U_FAILURE(err)) + throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback."); + } } @@ -275,8 +366,17 @@ void charsetFilteredOutputStream_icu::writeImpl if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR) { - throw exceptions::charset_conv_error - ("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'."); + if (toErr == U_INVALID_CHAR_FOUND || + toErr == 
U_TRUNCATED_CHAR_FOUND || + toErr == U_ILLEGAL_CHAR_FOUND) + { + throw exceptions::illegal_byte_sequence_for_charset(); + } + else + { + throw exceptions::charset_conv_error + ("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'."); + } } const size_t uniLength = uniTarget - &uniBuffer[0]; @@ -303,8 +403,17 @@ void charsetFilteredOutputStream_icu::writeImpl if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) { - throw exceptions::charset_conv_error - ("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'."); + if (fromErr == U_INVALID_CHAR_FOUND || + fromErr == U_TRUNCATED_CHAR_FOUND || + fromErr == U_ILLEGAL_CHAR_FOUND) + { + throw exceptions::illegal_byte_sequence_for_charset(); + } + else + { + throw exceptions::charset_conv_error + ("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'."); + } } const size_t cpLength = cpTarget - &cpBuffer[0]; diff --git a/src/vmime/charsetConverter_icu.hpp b/src/vmime/charsetConverter_icu.hpp index 5d054413..9b03f643 100644 --- a/src/vmime/charsetConverter_icu.hpp +++ b/src/vmime/charsetConverter_icu.hpp @@ -59,10 +59,12 @@ public: ~charsetConverter_icu(); - void convert(const string& in, string& out); - void convert(utility::inputStream& in, utility::outputStream& out); + void convert(const string& in, string& out, status* st = NULL); + void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL); - shared_ptr getFilteredOutputStream(utility::outputStream& os); + shared_ptr getFilteredOutputStream + (utility::outputStream& os, + const charsetConverterOptions& opts = charsetConverterOptions()); private: @@ -88,9 +90,11 @@ public: * @param source input charset * @param dest output charset * @param os stream into which write filtered data + * @param opts conversion options */ charsetFilteredOutputStream_icu - (const charset& source, const charset& dest, outputStream* os); + (const charset& source, const charset& dest, outputStream* 
os, + const charsetConverterOptions& opts = charsetConverterOptions()); ~charsetFilteredOutputStream_icu(); @@ -112,6 +116,8 @@ private: const charset m_destCharset; outputStream& m_stream; + + charsetConverterOptions m_options; }; diff --git a/src/vmime/charsetConverter_idna.cpp b/src/vmime/charsetConverter_idna.cpp index aea6eca7..9f1ed4c0 100644 --- a/src/vmime/charsetConverter_idna.cpp +++ b/src/vmime/charsetConverter_idna.cpp @@ -57,8 +57,11 @@ charsetConverter_idna::~charsetConverter_idna() } -void charsetConverter_idna::convert(utility::inputStream& in, utility::outputStream& out) +void charsetConverter_idna::convert(utility::inputStream& in, utility::outputStream& out, status* st) { + if (st) + new (st) status(); + // IDNA should be used for short strings, so it does not matter if we // do not work directly on the stream string inStr; @@ -66,20 +69,16 @@ void charsetConverter_idna::convert(utility::inputStream& in, utility::outputStr vmime::utility::bufferedStreamCopy(in, os); string outStr; - convert(inStr, outStr); + convert(inStr, outStr, st); out << outStr; } -void charsetConverter_idna::convert(const string& in, string& out) +void charsetConverter_idna::convert(const string& in, string& out, status* st) { - if (m_source == m_dest) - { - // No conversion needed - out = in; - return; - } + if (st) + new (st) status(); out.clear(); @@ -87,6 +86,12 @@ void charsetConverter_idna::convert(const string& in, string& out) { if (utility::stringUtils::is7bit(in)) { + if (st) + { + st->inputBytesRead = in.length(); + st->outputBytesWritten = in.length(); + } + // No need to encode as Punycode out = in; return; @@ -107,6 +112,9 @@ void charsetConverter_idna::convert(const string& in, string& out) unichars.push_back(uc); } + if (st) + st->inputBytesRead = in.length(); + std::vector output(inUTF8.length() * 2); punycode_uint outputLen = output.size(); @@ -116,6 +124,9 @@ void charsetConverter_idna::convert(const string& in, string& out) if (status == 
punycode_success) { out = string("xn--") + string(output.begin(), output.begin() + outputLen); + + if (st) + st->outputBytesWritten = out.length(); } else { @@ -126,6 +137,12 @@ void charsetConverter_idna::convert(const string& in, string& out) { if (in.length() < 5 || in.substr(0, 4) != "xn--") { + if (st) + { + st->inputBytesRead = in.length(); + st->outputBytesWritten = in.length(); + } + // Not an IDNA string out = in; return; @@ -137,6 +154,9 @@ void charsetConverter_idna::convert(const string& in, string& out) const punycode_status status = punycode_decode (in.length() - 4, &in[4], &outputLen, &output[0], /* case_flags */ NULL); + if (st) + st->inputBytesRead = in.length(); + if (status == punycode_success) { std::vector outUTF8Bytes(outputLen * 4); @@ -150,6 +170,9 @@ void charsetConverter_idna::convert(const string& in, string& out) string outUTF8(&outUTF8Bytes[0], p); charset::convert(outUTF8, out, vmime::charsets::UTF_8, m_dest); + + if (st) + st->outputBytesWritten = out.length(); } else { @@ -159,7 +182,9 @@ void charsetConverter_idna::convert(const string& in, string& out) } -shared_ptr charsetConverter_idna::getFilteredOutputStream(utility::outputStream& /* os */) +shared_ptr + charsetConverter_idna::getFilteredOutputStream + (utility::outputStream& /* os */, const charsetConverterOptions& /* opts */) { return null; } diff --git a/src/vmime/charsetConverter_idna.hpp b/src/vmime/charsetConverter_idna.hpp index 874d6bf1..06ffbcdd 100644 --- a/src/vmime/charsetConverter_idna.hpp +++ b/src/vmime/charsetConverter_idna.hpp @@ -50,10 +50,12 @@ public: ~charsetConverter_idna(); - void convert(const string& in, string& out); - void convert(utility::inputStream& in, utility::outputStream& out); + void convert(const string& in, string& out, status* st = NULL); + void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL); - shared_ptr getFilteredOutputStream(utility::outputStream& os); + shared_ptr getFilteredOutputStream + 
(utility::outputStream& os, + const charsetConverterOptions& opts = charsetConverterOptions()); private: diff --git a/src/vmime/charsetConverter_win.cpp b/src/vmime/charsetConverter_win.cpp index e7542584..2828fc66 100644 --- a/src/vmime/charsetConverter_win.cpp +++ b/src/vmime/charsetConverter_win.cpp @@ -69,8 +69,12 @@ charsetConverter_win::charsetConverter_win } -void charsetConverter_win::convert(utility::inputStream& in, utility::outputStream& out) +void charsetConverter_win::convert + (utility::inputStream& in, utility::outputStream& out, status* st) { + if (st) + new (st) status(); + byte_t buffer[32768]; string inStr, outStr; @@ -80,20 +84,16 @@ void charsetConverter_win::convert(utility::inputStream& in, utility::outputStre utility::stringUtils::appendBytesToString(inStr, buffer, len); } - convert(inStr, outStr); + convert(inStr, outStr, st); out.write(outStr.data(), outStr.length()); } -void charsetConverter_win::convert(const string& in, string& out) +void charsetConverter_win::convert(const string& in, string& out, status* st) { - if (m_source == m_dest) - { - // No conversion needed - out = in; - return; - } + if (st) + new (st) status(); const int sourceCodePage = getCodePage(m_source.getName().c_str()); const int destCodePage = getCodePage(m_dest.getName().c_str()); @@ -113,10 +113,27 @@ void charsetConverter_win::convert(const string& in, string& out) const size_t bufferSize = in.length() * 2; // in wide characters unicodeBuffer.resize(bufferSize); + DWORD flags = 0; + + if (!m_options.silentlyReplaceInvalidSequences) + flags |= MB_ERR_INVALID_CHARS; + unicodePtr = reinterpret_cast (&unicodeBuffer[0]); unicodeLen = MultiByteToWideChar - (sourceCodePage, 0, in.c_str(), static_cast (in.length()), + (sourceCodePage, flags, in.c_str(), static_cast (in.length()), reinterpret_cast (&unicodeBuffer[0]), static_cast (bufferSize)); + + if (unicodeLen == 0) + { + if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) + { + throw exceptions::illegal_byte_sequence_for_charset(); + } + else + { + throw 
exceptions::charset_conv_error("MultiByteToWideChar() failed when converting to Unicode from " + m_source.getName()); + } + } } // Convert from Unicode to destination charset @@ -135,6 +152,18 @@ void charsetConverter_win::convert(const string& in, string& out) (destCodePage, 0, unicodePtr, static_cast (unicodeLen), &buffer[0], static_cast (bufferSize), 0, NULL); + if (len == 0) + { + if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) + { + throw exceptions::illegal_byte_sequence_for_charset(); + } + else + { + throw exceptions::charset_conv_error("WideCharToMultiByte() failed when converting from Unicode to " + m_dest.getName()); + } + } + out.assign(&buffer[0], len); } } @@ -158,7 +187,8 @@ int charsetConverter_win::getCodePage(const char* name) shared_ptr - charsetConverter_win::getFilteredOutputStream(utility::outputStream& /* os */) + charsetConverter_win::getFilteredOutputStream + (utility::outputStream& /* os */, const charsetConverterOptions& /* opts */) { // TODO: implement me! return null; diff --git a/src/vmime/charsetConverter_win.hpp b/src/vmime/charsetConverter_win.hpp index a89fc021..572ce6e0 100644 --- a/src/vmime/charsetConverter_win.hpp +++ b/src/vmime/charsetConverter_win.hpp @@ -38,7 +38,16 @@ namespace vmime { -/** A generic charset converter which uses Windows MultiByteToWideChar. +/** A generic charset converter which uses Windows MultiByteToWideChar + * and WideCharToMultiByte API functions. + * + * ICU or iconv library should always be preferred over this one, even + * on Windows platform, as MultiByteToWideChar() and WideCharToMultiByte() + * functions cannot be used easily with streams (no context). Moreover, + * error handling is very poor, in particular when an invalid sequence + * is found... + * + * Also, "status" is not supported by this converter for the same reason. 
*/ class charsetConverter_win : public charsetConverter @@ -54,8 +63,8 @@ public: charsetConverter_win(const charset& source, const charset& dest, const charsetConverterOptions& opts = charsetConverterOptions()); - void convert(const string& in, string& out); - void convert(utility::inputStream& in, utility::outputStream& out); + void convert(const string& in, string& out, status* st = NULL); + void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL); - shared_ptr getFilteredOutputStream(utility::outputStream& os); + shared_ptr getFilteredOutputStream(utility::outputStream& os, const charsetConverterOptions& opts = charsetConverterOptions()); diff --git a/src/vmime/exception.cpp b/src/vmime/exception.cpp index ce57883c..b96b7c37 100644 --- a/src/vmime/exception.cpp +++ b/src/vmime/exception.cpp @@ -115,6 +115,20 @@ exception* charset_conv_error::clone() const { return new charset_conv_error(*th const char* charset_conv_error::name() const throw() { return "charset_conv_error"; } + +// +// illegal_byte_sequence_for_charset +// + +illegal_byte_sequence_for_charset::~illegal_byte_sequence_for_charset() throw() {} +illegal_byte_sequence_for_charset::illegal_byte_sequence_for_charset(const string& what, const exception& other) + : exception(what.empty() ? "Found illegal byte sequence for this charset." 
: what, other) {} + +exception* illegal_byte_sequence_for_charset::clone() const { return new illegal_byte_sequence_for_charset(*this); } +const char* illegal_byte_sequence_for_charset::name() const throw() { return "illegal_byte_sequence_for_charset"; } + + + // // no_encoder_available // diff --git a/src/vmime/exception.hpp b/src/vmime/exception.hpp index a279f305..9dcbf641 100644 --- a/src/vmime/exception.hpp +++ b/src/vmime/exception.hpp @@ -116,6 +116,20 @@ public: }; +/** An illegal byte sequence was found in the input of a charset conversion. + */ +class VMIME_EXPORT illegal_byte_sequence_for_charset : public vmime::exception +{ +public: + + illegal_byte_sequence_for_charset(const string& what = "", const exception& other = NO_EXCEPTION); + ~illegal_byte_sequence_for_charset() throw(); + + exception* clone() const; + const char* name() const throw(); +}; + + /** No encoder has been found for the specified encoding name. */ diff --git a/src/vmime/text.cpp b/src/vmime/text.cpp index 08fc9ba9..f506f660 100644 --- a/src/vmime/text.cpp +++ b/src/vmime/text.cpp @@ -400,7 +400,8 @@ text* text::decodeAndUnfold(const parsingContext& ctx, const string& in, text* g out->removeAllWords(); - const std::vector <shared_ptr <word> > words = word::parseMultiple(ctx, in, 0, in.length(), NULL); + std::vector <shared_ptr <word> > words = word::parseMultiple(ctx, in, 0, in.length(), NULL); + fixBrokenWords(words); copy_vector(words, out->m_words); @@ -408,6 +409,48 @@ text* text::decodeAndUnfold(const parsingContext& ctx, const string& in, text* g } +// static +void text::fixBrokenWords(std::vector <shared_ptr <word> >& words) +{ + if (words.size() < 2) + return; + + // Fix words which encode a non-integral number of characters. + // This is not RFC-compliant, but we should be able to recover from it.
+ for (size_t i = 0, n = words.size() - 1 ; i < n ; ++i) + { + shared_ptr <word> w1 = words[i]; + shared_ptr <word> w2 = words[i + 1]; + + // Check whether the word is valid + bool valid = w1->getCharset().isValidText(w1->getBuffer(), NULL); + + // If the current word is not valid, try to grab some bytes from the + // next word (only if it uses the same charset) to complete it. + if (!valid && w1->getCharset() == w2->getCharset()) + { + string buffer(w1->getBuffer()); + buffer += w2->getBuffer(); + + string::size_type firstInvalidByte; + valid = w1->getCharset().isValidText(buffer, &firstInvalidByte); + + // Keep the longest valid prefix of the merged bytes in the current + // word, and leave the remaining bytes (if any) to the next word. + w1->setBuffer(string(buffer.begin(), buffer.begin() + firstInvalidByte)); + w2->setBuffer(string(buffer.begin() + firstInvalidByte, buffer.end())); + + // If the next word is now empty, remove it + if (w2->getBuffer().empty()) + { + words.erase(words.begin() + i + 1); + --n; + } + } + } +} + + const std::vector <shared_ptr <component> > text::getChildComponents() { std::vector <shared_ptr <component> > list; diff --git a/src/vmime/text.hpp b/src/vmime/text.hpp index b7e25669..d8c4571a 100644 --- a/src/vmime/text.hpp +++ b/src/vmime/text.hpp @@ -251,6 +251,9 @@ public: protected: + static void fixBrokenWords(std::vector <shared_ptr <word> >& words); + + // Component parsing & assembling void parseImpl (const parsingContext& ctx, diff --git a/tests/parser/charsetTest.cpp b/tests/parser/charsetTest.cpp index 915b8560..e599c5b5 100644 --- a/tests/parser/charsetTest.cpp +++ b/tests/parser/charsetTest.cpp @@ -21,6 +21,8 @@ // the GNU General Public License cover the whole combination.
// +#include <algorithm> + #include "tests/testUtils.hpp" #include "charsetTestSuites.hpp" @@ -39,6 +41,14 @@ VMIME_TEST_SUITE_BEGIN(charsetTest) VMIME_TEST(testDecodeIDNA) VMIME_TEST(testUTF7Support) + + VMIME_TEST(testReplaceInvalidSequence) + VMIME_TEST(testStopOnInvalidSequence) + + VMIME_TEST(testStatus) + VMIME_TEST(testStatusWithInvalidSequence) + + VMIME_TEST(testIsValidText) VMIME_TEST_LIST_END @@ -106,10 +116,15 @@ VMIME_TEST_SUITE_BEGIN(charsetTest) } static const vmime::string convertHelper - (const vmime::string& in, const vmime::charset& csrc, const vmime::charset& cdest) + (const vmime::string& in, const vmime::charset& csrc, const vmime::charset& cdest, + const vmime::charsetConverterOptions& opts = vmime::charsetConverterOptions(), + vmime::charsetConverter::status* st = NULL) { + vmime::shared_ptr <vmime::charsetConverter> conv = + vmime::charsetConverter::create(csrc, cdest, opts); + vmime::string out; - vmime::charset::convert(in, out, csrc, cdest); + conv->convert(in, out, st); return out; } @@ -145,5 +160,91 @@ VMIME_TEST_SUITE_BEGIN(charsetTest) VASSERT_EQ("2", "f+APg-o", convertHelper("\x66\xc3\xb8\x6f", "utf-8", "utf-7")); } + void testReplaceInvalidSequence() + { + vmime::charsetConverterOptions opts; + opts.silentlyReplaceInvalidSequences = true; + opts.invalidSequence = "?"; + + vmime::string res = convertHelper + ("\x61\xf1\x80\x80\xe1\x80\xc2\x62\x80\x63\x80\xbf\x64", "utf-8", "iso-8859-1", opts); + + // Result should be in the form "a???b?c??d" or "a??????b?c??d"... + // Remove consecutive question marks for easier matching.
+ res.erase(std::unique(res.begin(), res.end()), res.end()); + + VASSERT_EQ( + "Illegal UTF-8 sequence", + "a?b?c?d", + res + ); + } + + void testStopOnInvalidSequence() + { + vmime::charsetConverterOptions opts; + opts.silentlyReplaceInvalidSequences = false; + + VASSERT_THROW( + "Illegal UTF-8 sequence", + convertHelper("\x61\xf1\x80\x80\xe1\x80\xc2\x62\x80\x63\x80\xbf\x64", "utf-8", "iso-8859-1", opts), + vmime::exceptions::illegal_byte_sequence_for_charset + ); + } + + void testStatus() + { + vmime::charsetConverterOptions opts; + opts.silentlyReplaceInvalidSequences = false; + + vmime::charsetConverter::status st; + + // 012345 6 7 + convertHelper("Gwena\xc3\xabl", "utf-8", "iso-8859-1", opts, &st); + + VASSERT_EQ("inputBytesRead", 8, st.inputBytesRead); + VASSERT_EQ("outputBytesWritten", 7, st.outputBytesWritten); + } + + void testStatusWithInvalidSequence() + { + vmime::charsetConverterOptions opts; + opts.silentlyReplaceInvalidSequences = false; + + vmime::charsetConverter::status st; + + try + { + // 01234 5 6789 0 1 + convertHelper("Fran\xc3\xa7ois\xf1\x80\x65", "utf-8", "iso-8859-1", opts, &st); + } + catch (vmime::exceptions::illegal_byte_sequence_for_charset& e) + { // expected for this input: the partial status is checked below + } + catch (...) 
+ { + throw; + } + + VASSERT_EQ("inputBytesRead", 9, st.inputBytesRead); + VASSERT_EQ("outputBytesWritten", 8, st.outputBytesWritten); + } + + void testIsValidText() + { + // Invalid text (bytes 0-8 are well-formed UTF-8; the sequence starting at byte 9 is not) + const vmime::string invalidText("Fran\xc3\xa7ois\xf1\x80\x65"); + vmime::string::size_type firstInvalidByte; + + VASSERT_EQ("invalid.isValidText", false, vmime::charset("utf-8").isValidText(invalidText, &firstInvalidByte)); + VASSERT_EQ("invalid.firstInvalidByte", 9, firstInvalidByte); + + // Valid text (8 bytes; firstInvalidByte is expected to equal the text length) + const vmime::string validText("Gwena\xc3\xabl"); + + VASSERT_EQ("valid.isValidText", true, vmime::charset("utf-8").isValidText(validText, &firstInvalidByte)); + VASSERT_EQ("valid.firstInvalidByte", 8, firstInvalidByte); + } + VMIME_TEST_SUITE_END diff --git a/tests/parser/textTest.cpp b/tests/parser/textTest.cpp index 588dc194..978d9145 100644 --- a/tests/parser/textTest.cpp +++ b/tests/parser/textTest.cpp @@ -61,6 +61,7 @@ VMIME_TEST_SUITE_BEGIN(textTest) VMIME_TEST(testInternationalizedEmail_folding) VMIME_TEST(testWronglyPaddedB64Words) + VMIME_TEST(testFixBrokenWords) VMIME_TEST_LIST_END @@ -617,5 +618,50 @@ VMIME_TEST_SUITE_BEGIN(textTest) outText.getConvertedText(vmime::charset("utf-8"))); } + // Ensure that words which encode a non-integral number of characters + // are correctly decoded.
+ void testFixBrokenWords() + { + vmime::text outText; + + vmime::charsetConverterOptions opts; + opts.silentlyReplaceInvalidSequences = false; // just to be sure that broken words are actually fixed + + // Test case 1 (2-byte character C3 AB split across two Q-encoded words) + vmime::text::decodeAndUnfold + ("=?utf-8?Q?Gwena=C3?=" + "=?utf-8?Q?=ABl?=", &outText); + + VASSERT_EQ("1", "Gwena\xebl", + outText.getConvertedText(vmime::charset("iso-8859-1"), opts)); + + // Test case 2 (3-byte character E7 AC AC split across two Base64-encoded words) + vmime::text::decodeAndUnfold + ("=?utf-8?B?5Lit6Yu85qmf5qKw6JGj5LqL5pyDMTAz5bm056ysMDXlsYbn?=" + "=?utf-8?B?rKwwN+asoeitsOeoiw==?=", &outText); + + VASSERT_EQ("2", "\xe4\xb8\xad\xe9\x8b\xbc\xe6\xa9\x9f\xe6\xa2\xb0" + "\xe8\x91\xa3\xe4\xba\x8b\xe6\x9c\x83\x31\x30\x33\xe5\xb9\xb4" + "\xe7\xac\xac\x30\x35\xe5\xb1\x86\xe7\xac\xac\x30\x37\xe6\xac" + "\xa1\xe8\xad\xb0\xe7\xa8\x8b", + outText.getConvertedText(vmime::charset("utf-8"))); + + // Test case 3 (a character spanning over 3 words: 'を' = E3 82 92) + vmime::text::decodeAndUnfold + ("=?utf-8?Q?abc=E3?=" + "=?utf-8?Q?=82?=" + "=?utf-8?Q?=92xyz?=", &outText); + + std::string out; // convert to UTF-16, then re-encode to UTF-8 for easier comparison + vmime::charset::convert( + outText.getConvertedText(vmime::charset("utf-16"), opts), + out, + vmime::charset("utf-16"), + vmime::charset("utf-8") + ); + + VASSERT_EQ("3", "abc\xe3\x82\x92xyz", out); + } + VMIME_TEST_SUITE_END