Issue #103: fix badly encoded words.

This commit is contained in:
Vincent Richard 2015-02-16 18:43:03 +01:00
parent f51cb846a9
commit c5c66f9fdc
21 changed files with 640 additions and 96 deletions

View File

@ -735,12 +735,12 @@ INCLUDE(cmake/FindICU.cmake)
FIND_PACKAGE(ICU QUIET) FIND_PACKAGE(ICU QUIET)
IF(WIN32) IF(ICU_LIBRARIES)
SET(VMIME_CHARSETCONV_LIB_DETECTED "win")
ELSEIF(ICU_LIBRARIES)
SET(VMIME_CHARSETCONV_LIB_DETECTED "icu") SET(VMIME_CHARSETCONV_LIB_DETECTED "icu")
ELSEIF(ICONV_FOUND) ELSEIF(ICONV_FOUND)
SET(VMIME_CHARSETCONV_LIB_DETECTED "iconv") SET(VMIME_CHARSETCONV_LIB_DETECTED "iconv")
ELSEIF(WIN32)
SET(VMIME_CHARSETCONV_LIB_DETECTED "win")
ENDIF() ENDIF()
SET( SET(
@ -803,6 +803,10 @@ ELSEIF(VMIME_CHARSETCONV_LIB STREQUAL "icu")
ELSEIF(VMIME_CHARSETCONV_LIB STREQUAL "win") ELSEIF(VMIME_CHARSETCONV_LIB STREQUAL "win")
MESSAGE(WARNING "*** ICU or iconv library should always be preferred"
" over MultiByteToWideChar/WideCharToMultiByte on Windows, as"
" error handling is very poor, and there is no streaming support.")
SET(VMIME_CHARSETCONV_LIB_IS_ICONV "OFF") SET(VMIME_CHARSETCONV_LIB_IS_ICONV "OFF")
SET(VMIME_CHARSETCONV_LIB_IS_ICU "OFF") SET(VMIME_CHARSETCONV_LIB_IS_ICU "OFF")
SET(VMIME_CHARSETCONV_LIB_IS_WIN "ON") SET(VMIME_CHARSETCONV_LIB_IS_WIN "ON")

View File

@ -109,6 +109,43 @@ void charset::convert(const string& in, string& out, const charset& source, cons
} }
bool charset::isValidText
(const string& text, string::size_type* firstInvalidByte) const
{
charsetConverterOptions opts;
opts.silentlyReplaceInvalidSequences = false;
charsetConverter::status st;
try
{
std::string out;
// Try converting to UTF-8
shared_ptr <charsetConverter> conv = charsetConverter::create(*this, vmime::charset("utf-8"), opts);
conv->convert(text, out, &st);
}
catch (exceptions::illegal_byte_sequence_for_charset& e)
{
// An illegal byte sequence was found in the input buffer
if (firstInvalidByte)
{
if (st.inputBytesRead < text.length())
*firstInvalidByte = st.inputBytesRead;
else
*firstInvalidByte = text.length();
}
return false;
}
if (firstInvalidByte)
*firstInvalidByte = text.length();
return true;
}
const charset charset::getLocalCharset() const charset charset::getLocalCharset()
{ {
return (platform::getHandler()->getLocalCharset()); return (platform::getHandler()->getLocalCharset());

View File

@ -95,8 +95,12 @@ public:
* @param source input charset * @param source input charset
* @param dest output charset * @param dest output charset
* @param opts conversion options * @param opts conversion options
* @throws exceptions::charset_conv_error if an error occured during * @throws exceptions::illegal_byte_sequence_for_charset if an illegal
* the conversion * byte sequence was found in the input bytes, and the
* 'silentlyReplaceInvalidSequences' flag is set to false in
* the charsetConverterOptions
* @throws exceptions::charset_conv_error if an unexpected error occurred
* during the conversion
*/ */
static void convert(const string& in, string& out, static void convert(const string& in, string& out,
const charset& source, const charset& dest, const charset& source, const charset& dest,
@ -110,13 +114,29 @@ public:
* @param source input charset * @param source input charset
* @param dest output charset * @param dest output charset
* @param opts conversion options * @param opts conversion options
* @throws exceptions::charset_conv_error if an error occured during * @throws exceptions::illegal_byte_sequence_for_charset if an illegal
* the conversion * byte sequence was found in the input bytes, and the
* 'silentlyReplaceInvalidSequences' flag is set to false in
* the charsetConverterOptions
* @throws exceptions::charset_conv_error if an unexpected error occurred
* during the conversion
*/ */
static void convert(utility::inputStream& in, utility::outputStream& out, static void convert(utility::inputStream& in, utility::outputStream& out,
const charset& source, const charset& dest, const charset& source, const charset& dest,
const charsetConverterOptions& opts = charsetConverterOptions()); const charsetConverterOptions& opts = charsetConverterOptions());
/** Checks whether the specified text is valid in this charset.
  *
  * @param text input text
  * @param firstInvalidByte if the function returns false, will contain
  * the index of the first invalid byte in the string. Can be NULL (the
  * default) if not used.
  * @return true if the text is perfectly valid in this charset,
  * or false otherwise (e.g. it contains illegal sequences)
  */
bool isValidText(const string& text, string::size_type* firstInvalidByte = NULL) const;
shared_ptr <component> clone() const; shared_ptr <component> clone() const;
void copyFrom(const component& other); void copyFrom(const component& other);

View File

@ -42,4 +42,11 @@ shared_ptr <charsetConverter> charsetConverter::create
} }
// Builds a conversion status with both byte counters set to zero.
charsetConverter::status::status()
	: inputBytesRead(0),
	  outputBytesWritten(0)
{
}
} // vmime } // vmime

View File

@ -44,8 +44,13 @@ namespace utility
/** A filtered output stream which applies a charset conversion /** A filtered output stream which applies a charset conversion
* to input bytes. * to input bytes.
* *
* May throw a exceptions::charset_conv_error if an error * May throw a exceptions::charset_conv_error if an unexpected error
* occured when initializing convert, or during charset conversion. * occured when initializing convert, or during charset conversion.
*
* May also throw a exceptions::illegal_byte_sequence_for_charset
* if an illegal byte sequence was found in the input bytes, and the
* 'silentlyReplaceInvalidSequences' flag is set to false in
* the charsetConverterOptions.
*/ */
class VMIME_EXPORT charsetFilteredOutputStream : public filteredOutputStream class VMIME_EXPORT charsetFilteredOutputStream : public filteredOutputStream
@ -63,6 +68,23 @@ class VMIME_EXPORT charsetConverter : public object
{ {
public: public:
/** Byte counters describing the outcome of a conversion.
  *
  * A pointer to such an object can be passed to the convert() functions,
  * which fill it in while converting.
  */
struct status
{
	/** Creates a status object; the counters are set by the constructor
	  * defined in the implementation file.
	  */
	status();


	/** Count of input bytes that were read and successfully converted. */
	size_t inputBytesRead;

	/** Count of bytes that were written to the output. */
	size_t outputBytesWritten;
};
/** Construct and initialize an iconv charset converter. /** Construct and initialize an iconv charset converter.
* *
* @param source input charset * @param source input charset
@ -81,29 +103,44 @@ public:
* *
* @param in input buffer * @param in input buffer
* @param out output buffer * @param out output buffer
* @throws exceptions::charset_conv_error if an error occured during * @param st will receive some extra infos when conversion is finished
* the conversion * or stopped by an error (can be NULL)
* @throws exceptions::illegal_byte_sequence_for_charset if an illegal
* byte sequence was found in the input bytes, and the
* 'silentlyReplaceInvalidSequences' flag is set to false in
* the charsetConverterOptions
* @throws exceptions::charset_conv_error if an unexpected error occurred
* during the conversion
*/ */
virtual void convert(const string& in, string& out) = 0; virtual void convert(const string& in, string& out, status* st = NULL) = 0;
/** Convert the contents of an input stream in a specified charset /** Convert the contents of an input stream in a specified charset
* to another charset and write the result to an output stream. * to another charset and write the result to an output stream.
* *
* @param in input stream to read data from * @param in input stream to read data from
* @param out output stream to write the converted data * @param out output stream to write the converted data
* @throws exceptions::charset_conv_error if an error occured during * @param st will receive some extra infos when conversion is finished
* the conversion * or stopped by an error (can be NULL)
* @throws exceptions::illegal_byte_sequence_for_charset if an illegal
* byte sequence was found in the input bytes, and the
* 'silentlyReplaceInvalidSequences' flag is set to false in
* the charsetConverterOptions
* @throws exceptions::charset_conv_error if an unexpected error occurred
* during the conversion
*/ */
virtual void convert(utility::inputStream& in, utility::outputStream& out) = 0; virtual void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL) = 0;
/** Returns a filtered output stream which applies a charset /** Returns a filtered output stream which applies a charset
* conversion to input bytes. Please note that it may not be * conversion to input bytes. Please note that it may not be
* supported by the converter. * supported by the converter.
* *
* @param os stream into which filtered data will be written * @param os stream into which filtered data will be written
* @param opts conversion options
* @return a filtered output stream, or NULL if not supported * @return a filtered output stream, or NULL if not supported
*/ */
virtual shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream(utility::outputStream& os) = 0; virtual shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream
(utility::outputStream& os,
const charsetConverterOptions& opts = charsetConverterOptions()) = 0;
private: private:

View File

@ -29,7 +29,8 @@ namespace vmime
charsetConverterOptions::charsetConverterOptions() charsetConverterOptions::charsetConverterOptions()
: invalidSequence("?") : silentlyReplaceInvalidSequences(true),
invalidSequence("?")
{ {
} }

View File

@ -42,7 +42,15 @@ public:
charsetConverterOptions(); charsetConverterOptions();
/** Replace invalid sequences with this string. */ /** If true, invalid sequences will be silently replaced with
* a string when possible (see 'invalidSequence').
* Default is true.
*/
bool silentlyReplaceInvalidSequences;
/** Replace invalid sequences with this string.
* Default is '?'.
*/
string invalidSequence; string invalidSequence;
}; };

View File

@ -147,8 +147,12 @@ charsetConverter_iconv::~charsetConverter_iconv()
} }
void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputStream& out) void charsetConverter_iconv::convert
(utility::inputStream& in, utility::outputStream& out, status* st)
{ {
if (st)
new (st) status();
if (m_desc == NULL) if (m_desc == NULL)
throw exceptions::charset_conv_error("Cannot initialize converter."); throw exceptions::charset_conv_error("Cannot initialize converter.");
@ -175,6 +179,12 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt
if (iconv(cd, ICONV_IN_TYPE(&inPtr), ptrLength, if (iconv(cd, ICONV_IN_TYPE(&inPtr), ptrLength,
ICONV_OUT_TYPE(&outPtr), &outLength) == static_cast <size_t>(-1)) ICONV_OUT_TYPE(&outPtr), &outLength) == static_cast <size_t>(-1))
{ {
if (st && inPtr)
{
st->inputBytesRead += (inPtr - inBuffer);
st->outputBytesWritten += (outPtr - outBuffer);
}
// Illegal input sequence or input sequence has no equivalent // Illegal input sequence or input sequence has no equivalent
// sequence in the destination charset. // sequence in the destination charset.
if (prevIsInvalid) if (prevIsInvalid)
@ -182,6 +192,9 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt
// Write successfully converted bytes // Write successfully converted bytes
out.write(outBuffer, sizeof(outBuffer) - outLength); out.write(outBuffer, sizeof(outBuffer) - outLength);
if (!m_options.silentlyReplaceInvalidSequences)
throw exceptions::illegal_byte_sequence_for_charset();
// Output a special character to indicate we don't known how to // Output a special character to indicate we don't known how to
// convert the sequence at this position // convert the sequence at this position
outputInvalidChar(out, cd, m_options); outputInvalidChar(out, cd, m_options);
@ -208,6 +221,12 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt
// Write successfully converted bytes // Write successfully converted bytes
out.write(outBuffer, sizeof(outBuffer) - outLength); out.write(outBuffer, sizeof(outBuffer) - outLength);
if (st && inPtr)
{
st->inputBytesRead += (inPtr - inBuffer);
st->outputBytesWritten += (outPtr - outBuffer);
}
inPos = 0; inPos = 0;
prevIsInvalid = false; prevIsInvalid = false;
} }
@ -222,29 +241,27 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt
} }
void charsetConverter_iconv::convert(const string& in, string& out) void charsetConverter_iconv::convert(const string& in, string& out, status* st)
{ {
if (m_source == m_dest) if (st)
{ new (st) status();
// No conversion needed
out = in;
return;
}
out.clear(); out.clear();
utility::inputStreamStringAdapter is(in); utility::inputStreamStringAdapter is(in);
utility::outputStreamStringAdapter os(out); utility::outputStreamStringAdapter os(out);
convert(is, os); convert(is, os, st);
os.flush(); os.flush();
} }
shared_ptr <utility::charsetFilteredOutputStream> charsetConverter_iconv::getFilteredOutputStream(utility::outputStream& os) shared_ptr <utility::charsetFilteredOutputStream>
charsetConverter_iconv::getFilteredOutputStream
(utility::outputStream& os, const charsetConverterOptions& opts)
{ {
return make_shared <utility::charsetFilteredOutputStream_iconv>(m_source, m_dest, &os); return make_shared <utility::charsetFilteredOutputStream_iconv>(m_source, m_dest, &os, opts);
} }
@ -255,9 +272,10 @@ namespace utility {
charsetFilteredOutputStream_iconv::charsetFilteredOutputStream_iconv charsetFilteredOutputStream_iconv::charsetFilteredOutputStream_iconv
(const charset& source, const charset& dest, outputStream* os) (const charset& source, const charset& dest, outputStream* os,
const charsetConverterOptions& opts)
: m_desc(NULL), m_sourceCharset(source), m_destCharset(dest), : m_desc(NULL), m_sourceCharset(source), m_destCharset(dest),
m_stream(*os), m_unconvCount(0) m_stream(*os), m_unconvCount(0), m_options(opts)
{ {
// Get an iconv descriptor // Get an iconv descriptor
const iconv_t cd = iconv_open(dest.getName().c_str(), source.getName().c_str()); const iconv_t cd = iconv_open(dest.getName().c_str(), source.getName().c_str());
@ -314,6 +332,9 @@ void charsetFilteredOutputStream_iconv::writeImpl
// character and skip one byte in the invalid sequence. // character and skip one byte in the invalid sequence.
if (m_unconvCount >= sizeof(m_unconvBuffer)) if (m_unconvCount >= sizeof(m_unconvBuffer))
{ {
if (!m_options.silentlyReplaceInvalidSequences)
throw exceptions::illegal_byte_sequence_for_charset();
outputInvalidChar(m_stream, cd); outputInvalidChar(m_stream, cd);
std::copy(m_unconvBuffer + 1, std::copy(m_unconvBuffer + 1,
@ -439,6 +460,9 @@ void charsetFilteredOutputStream_iconv::flush()
// Skip a "blocking" character // Skip a "blocking" character
if (inputConverted == 0) if (inputConverted == 0)
{ {
if (!m_options.silentlyReplaceInvalidSequences)
throw exceptions::illegal_byte_sequence_for_charset();
outputInvalidChar(m_stream, cd); outputInvalidChar(m_stream, cd);
offset++; offset++;

View File

@ -56,10 +56,12 @@ public:
~charsetConverter_iconv(); ~charsetConverter_iconv();
void convert(const string& in, string& out); void convert(const string& in, string& out, status* st = NULL);
void convert(utility::inputStream& in, utility::outputStream& out); void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL);
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream(utility::outputStream& os); shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream
(utility::outputStream& os,
const charsetConverterOptions& opts = charsetConverterOptions());
private: private:
@ -84,9 +86,11 @@ public:
* @param source input charset * @param source input charset
* @param dest output charset * @param dest output charset
* @param os stream into which write filtered data * @param os stream into which write filtered data
* @param opts conversion options
*/ */
charsetFilteredOutputStream_iconv charsetFilteredOutputStream_iconv
(const charset& source, const charset& dest, outputStream* os); (const charset& source, const charset& dest, outputStream* os,
const charsetConverterOptions& opts = charsetConverterOptions());
~charsetFilteredOutputStream_iconv(); ~charsetFilteredOutputStream_iconv();
@ -121,6 +125,8 @@ private:
// Buffer used for conversion. Avoids declaring it in write(). // Buffer used for conversion. Avoids declaring it in write().
// Should be at least MAX_CHARACTER_WIDTH * MAX_CHARACTER_WIDTH. // Should be at least MAX_CHARACTER_WIDTH * MAX_CHARACTER_WIDTH.
byte_t m_outputBuffer[32768]; byte_t m_outputBuffer[32768];
charsetConverterOptions m_options;
}; };

View File

@ -91,10 +91,17 @@ charsetConverter_icu::~charsetConverter_icu()
} }
void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStream& out) void charsetConverter_icu::convert
(utility::inputStream& in, utility::outputStream& out, status* st)
{ {
UErrorCode err = U_ZERO_ERROR; UErrorCode err = U_ZERO_ERROR;
ucnv_reset(m_from);
ucnv_reset(m_to);
if (st)
new (st) status();
// From buffers // From buffers
byte_t cpInBuffer[16]; // stream data put here byte_t cpInBuffer[16]; // stream data put here
const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar); const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar);
@ -105,12 +112,31 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre
const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize; const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize;
std::vector <char> cpOutBuffer(cpOutBufferSz); std::vector <char> cpOutBuffer(cpOutBufferSz);
// Tell ICU what to do when encountering an illegal byte sequence
if (m_options.silentlyReplaceInvalidSequences)
{
// Set replacement chars for when converting from Unicode to codepage // Set replacement chars for when converting from Unicode to codepage
icu::UnicodeString substString(m_options.invalidSequence.c_str()); icu::UnicodeString substString(m_options.invalidSequence.c_str());
ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err); ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
if (U_FAILURE(err)) if (U_FAILURE(err))
throw exceptions::charset_conv_error("[ICU] Error setting replacement char."); throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
}
else
{
// Tell ICU to stop (and return an error) on illegal byte sequences
ucnv_setToUCallBack
(m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
if (U_FAILURE(err))
throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");
ucnv_setFromUCallBack
(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
if (U_FAILURE(err))
throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
}
// Input data available // Input data available
while (!in.eof()) while (!in.eof())
@ -137,8 +163,22 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre
ucnv_toUnicode(m_from, &target, targetLimit, ucnv_toUnicode(m_from, &target, targetLimit,
&source, sourceLimit, NULL, flush, &toErr); &source, sourceLimit, NULL, flush, &toErr);
if (st)
st->inputBytesRead += (source - reinterpret_cast <const char*>(&cpInBuffer[0]));
if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr)) if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr))
{
if (toErr == U_INVALID_CHAR_FOUND ||
toErr == U_TRUNCATED_CHAR_FOUND ||
toErr == U_ILLEGAL_CHAR_FOUND)
{
// Error will be thrown later (*)
}
else
{
throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName()); throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName());
}
}
// The Unicode source is the buffer just written and the limit // The Unicode source is the buffer just written and the limit
// is where the previous conversion stopped (target is moved in the conversion) // is where the previous conversion stopped (target is moved in the conversion)
@ -158,8 +198,40 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre
ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit, ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
&uSource, uSourceLimit, NULL, flush, &fromErr); &uSource, uSourceLimit, NULL, flush, &fromErr);
if (st)
{
// Decrement input bytes count by the number of input bytes in error
char errBytes[16];
int8_t errBytesLen = sizeof(errBytes);
UErrorCode errBytesErr = U_ZERO_ERROR;
ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr);
st->inputBytesRead -= errBytesLen;
st->outputBytesWritten += cpTarget - &cpOutBuffer[0];
}
// (*) If an error occurred while converting from input charset, throw it now
if (toErr == U_INVALID_CHAR_FOUND ||
toErr == U_TRUNCATED_CHAR_FOUND ||
toErr == U_ILLEGAL_CHAR_FOUND)
{
throw exceptions::illegal_byte_sequence_for_charset();
}
if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
{
if (fromErr == U_INVALID_CHAR_FOUND ||
fromErr == U_TRUNCATED_CHAR_FOUND ||
fromErr == U_ILLEGAL_CHAR_FOUND)
{
throw exceptions::illegal_byte_sequence_for_charset();
}
else
{
throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName()); throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName());
}
}
// Write to destination stream // Write to destination stream
out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0])); out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0]));
@ -171,29 +243,27 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre
} }
void charsetConverter_icu::convert(const string& in, string& out) void charsetConverter_icu::convert(const string& in, string& out, status* st)
{ {
if (m_source == m_dest) if (st)
{ new (st) status();
// No conversion needed
out = in;
return;
}
out.clear(); out.clear();
utility::inputStreamStringAdapter is(in); utility::inputStreamStringAdapter is(in);
utility::outputStreamStringAdapter os(out); utility::outputStreamStringAdapter os(out);
convert(is, os); convert(is, os, st);
os.flush(); os.flush();
} }
shared_ptr <utility::charsetFilteredOutputStream> charsetConverter_icu::getFilteredOutputStream(utility::outputStream& os) shared_ptr <utility::charsetFilteredOutputStream>
charsetConverter_icu::getFilteredOutputStream
(utility::outputStream& os, const charsetConverterOptions& opts)
{ {
return make_shared <utility::charsetFilteredOutputStream_icu>(m_source, m_dest, &os); return make_shared <utility::charsetFilteredOutputStream_icu>(m_source, m_dest, &os, opts);
} }
@ -204,8 +274,10 @@ namespace utility {
charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu
(const charset& source, const charset& dest, outputStream* os) (const charset& source, const charset& dest, outputStream* os,
: m_from(NULL), m_to(NULL), m_sourceCharset(source), m_destCharset(dest), m_stream(*os) const charsetConverterOptions& opts)
: m_from(NULL), m_to(NULL), m_sourceCharset(source),
m_destCharset(dest), m_stream(*os), m_options(opts)
{ {
UErrorCode err = U_ZERO_ERROR; UErrorCode err = U_ZERO_ERROR;
m_from = ucnv_open(source.getName().c_str(), &err); m_from = ucnv_open(source.getName().c_str(), &err);
@ -224,12 +296,31 @@ charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu
("Cannot initialize ICU converter for destination charset '" + dest.getName() + "' (error code: " + u_errorName(err) + "."); ("Cannot initialize ICU converter for destination charset '" + dest.getName() + "' (error code: " + u_errorName(err) + ".");
} }
// Tell ICU what to do when encountering an illegal byte sequence
if (m_options.silentlyReplaceInvalidSequences)
{
// Set replacement chars for when converting from Unicode to codepage // Set replacement chars for when converting from Unicode to codepage
icu::UnicodeString substString(vmime::charsetConverterOptions().invalidSequence.c_str()); icu::UnicodeString substString(m_options.invalidSequence.c_str());
ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err); ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
if (U_FAILURE(err)) if (U_FAILURE(err))
throw exceptions::charset_conv_error("[ICU] Error setting replacement char."); throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
}
else
{
	// Tell ICU to stop (and return an error) on illegal byte sequences.
	//
	// The "to Unicode" callback has to be installed on the converter that
	// decodes the input, i.e. m_from (opened on the source charset), as is
	// done in charsetConverter_icu::convert(). Installing it on m_to left
	// the decoding side with its default behaviour, so illegal input bytes
	// would not stop the conversion.
	ucnv_setToUCallBack
		(m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

	if (U_FAILURE(err))
		throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");

	// The "from Unicode" callback applies to the encoding side (m_to)
	ucnv_setFromUCallBack
		(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

	if (U_FAILURE(err))
		throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
}
} }
@ -274,10 +365,19 @@ void charsetFilteredOutputStream_icu::writeImpl
&uniSource, uniSourceLimit, NULL, /* flush */ FALSE, &toErr); &uniSource, uniSourceLimit, NULL, /* flush */ FALSE, &toErr);
if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR) if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR)
{
if (toErr == U_INVALID_CHAR_FOUND ||
toErr == U_TRUNCATED_CHAR_FOUND ||
toErr == U_ILLEGAL_CHAR_FOUND)
{
throw exceptions::illegal_byte_sequence_for_charset();
}
else
{ {
throw exceptions::charset_conv_error throw exceptions::charset_conv_error
("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'."); ("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'.");
} }
}
const size_t uniLength = uniTarget - &uniBuffer[0]; const size_t uniLength = uniTarget - &uniBuffer[0];
@ -302,10 +402,19 @@ void charsetFilteredOutputStream_icu::writeImpl
&cpSource, cpSourceLimit, NULL, /* flush */ FALSE, &fromErr); &cpSource, cpSourceLimit, NULL, /* flush */ FALSE, &fromErr);
if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
{
if (fromErr == U_INVALID_CHAR_FOUND ||
fromErr == U_TRUNCATED_CHAR_FOUND ||
fromErr == U_ILLEGAL_CHAR_FOUND)
{
throw exceptions::illegal_byte_sequence_for_charset();
}
else
{ {
throw exceptions::charset_conv_error throw exceptions::charset_conv_error
("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'."); ("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'.");
} }
}
const size_t cpLength = cpTarget - &cpBuffer[0]; const size_t cpLength = cpTarget - &cpBuffer[0];

View File

@ -59,10 +59,12 @@ public:
~charsetConverter_icu(); ~charsetConverter_icu();
void convert(const string& in, string& out); void convert(const string& in, string& out, status* st = NULL);
void convert(utility::inputStream& in, utility::outputStream& out); void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL);
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream(utility::outputStream& os); shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream
(utility::outputStream& os,
const charsetConverterOptions& opts = charsetConverterOptions());
private: private:
@ -88,9 +90,11 @@ public:
* @param source input charset * @param source input charset
* @param dest output charset * @param dest output charset
* @param os stream into which write filtered data * @param os stream into which write filtered data
* @param opts conversion options
*/ */
charsetFilteredOutputStream_icu charsetFilteredOutputStream_icu
(const charset& source, const charset& dest, outputStream* os); (const charset& source, const charset& dest, outputStream* os,
const charsetConverterOptions& opts = charsetConverterOptions());
~charsetFilteredOutputStream_icu(); ~charsetFilteredOutputStream_icu();
@ -112,6 +116,8 @@ private:
const charset m_destCharset; const charset m_destCharset;
outputStream& m_stream; outputStream& m_stream;
charsetConverterOptions m_options;
}; };

View File

@ -57,8 +57,11 @@ charsetConverter_idna::~charsetConverter_idna()
} }
void charsetConverter_idna::convert(utility::inputStream& in, utility::outputStream& out) void charsetConverter_idna::convert(utility::inputStream& in, utility::outputStream& out, status* st)
{ {
if (st)
new (st) status();
// IDNA should be used for short strings, so it does not matter if we // IDNA should be used for short strings, so it does not matter if we
// do not work directly on the stream // do not work directly on the stream
string inStr; string inStr;
@ -66,20 +69,16 @@ void charsetConverter_idna::convert(utility::inputStream& in, utility::outputStr
vmime::utility::bufferedStreamCopy(in, os); vmime::utility::bufferedStreamCopy(in, os);
string outStr; string outStr;
convert(inStr, outStr); convert(inStr, outStr, st);
out << outStr; out << outStr;
} }
void charsetConverter_idna::convert(const string& in, string& out) void charsetConverter_idna::convert(const string& in, string& out, status* st)
{ {
if (m_source == m_dest) if (st)
{ new (st) status();
// No conversion needed
out = in;
return;
}
out.clear(); out.clear();
@ -87,6 +86,12 @@ void charsetConverter_idna::convert(const string& in, string& out)
{ {
if (utility::stringUtils::is7bit(in)) if (utility::stringUtils::is7bit(in))
{ {
if (st)
{
st->inputBytesRead = in.length();
st->outputBytesWritten = in.length();
}
// No need to encode as Punycode // No need to encode as Punycode
out = in; out = in;
return; return;
@ -107,6 +112,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
unichars.push_back(uc); unichars.push_back(uc);
} }
if (st)
st->inputBytesRead = in.length();
std::vector <char> output(inUTF8.length() * 2); std::vector <char> output(inUTF8.length() * 2);
punycode_uint outputLen = output.size(); punycode_uint outputLen = output.size();
@ -116,6 +124,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
if (status == punycode_success) if (status == punycode_success)
{ {
out = string("xn--") + string(output.begin(), output.begin() + outputLen); out = string("xn--") + string(output.begin(), output.begin() + outputLen);
if (st)
st->outputBytesWritten = out.length();
} }
else else
{ {
@ -126,6 +137,12 @@ void charsetConverter_idna::convert(const string& in, string& out)
{ {
if (in.length() < 5 || in.substr(0, 4) != "xn--") if (in.length() < 5 || in.substr(0, 4) != "xn--")
{ {
if (st)
{
st->inputBytesRead = in.length();
st->outputBytesWritten = in.length();
}
// Not an IDNA string // Not an IDNA string
out = in; out = in;
return; return;
@ -137,6 +154,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
const punycode_status status = punycode_decode const punycode_status status = punycode_decode
(in.length() - 4, &in[4], &outputLen, &output[0], /* case_flags */ NULL); (in.length() - 4, &in[4], &outputLen, &output[0], /* case_flags */ NULL);
if (st)
st->inputBytesRead = in.length();
if (status == punycode_success) if (status == punycode_success)
{ {
std::vector <char> outUTF8Bytes(outputLen * 4); std::vector <char> outUTF8Bytes(outputLen * 4);
@ -150,6 +170,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
string outUTF8(&outUTF8Bytes[0], p); string outUTF8(&outUTF8Bytes[0], p);
charset::convert(outUTF8, out, vmime::charsets::UTF_8, m_dest); charset::convert(outUTF8, out, vmime::charsets::UTF_8, m_dest);
if (st)
st->outputBytesWritten = out.length();
} }
else else
{ {
@ -159,7 +182,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
} }
shared_ptr <utility::charsetFilteredOutputStream> charsetConverter_idna::getFilteredOutputStream(utility::outputStream& /* os */) shared_ptr <utility::charsetFilteredOutputStream>
charsetConverter_idna::getFilteredOutputStream
(utility::outputStream& /* os */, const charsetConverterOptions& /* opts */)
{ {
return null; return null;
} }

View File

@ -50,10 +50,12 @@ public:
~charsetConverter_idna(); ~charsetConverter_idna();
void convert(const string& in, string& out); void convert(const string& in, string& out, status* st = NULL);
void convert(utility::inputStream& in, utility::outputStream& out); void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL);
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream(utility::outputStream& os); shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream
(utility::outputStream& os,
const charsetConverterOptions& opts = charsetConverterOptions());
private: private:

View File

@ -69,8 +69,12 @@ charsetConverter_win::charsetConverter_win
} }
void charsetConverter_win::convert(utility::inputStream& in, utility::outputStream& out) void charsetConverter_win::convert
(utility::inputStream& in, utility::outputStream& out, status* st)
{ {
if (st)
new (st) status();
byte_t buffer[32768]; byte_t buffer[32768];
string inStr, outStr; string inStr, outStr;
@ -80,20 +84,16 @@ void charsetConverter_win::convert(utility::inputStream& in, utility::outputStre
utility::stringUtils::appendBytesToString(inStr, buffer, len); utility::stringUtils::appendBytesToString(inStr, buffer, len);
} }
convert(inStr, outStr); convert(inStr, outStr, st);
out.write(outStr.data(), outStr.length()); out.write(outStr.data(), outStr.length());
} }
void charsetConverter_win::convert(const string& in, string& out) void charsetConverter_win::convert(const string& in, string& out, status* st)
{ {
if (m_source == m_dest) if (st)
{ new (st) status();
// No conversion needed
out = in;
return;
}
const int sourceCodePage = getCodePage(m_source.getName().c_str()); const int sourceCodePage = getCodePage(m_source.getName().c_str());
const int destCodePage = getCodePage(m_dest.getName().c_str()); const int destCodePage = getCodePage(m_dest.getName().c_str());
@ -113,10 +113,27 @@ void charsetConverter_win::convert(const string& in, string& out)
const size_t bufferSize = in.length() * 2; // in wide characters const size_t bufferSize = in.length() * 2; // in wide characters
unicodeBuffer.resize(bufferSize); unicodeBuffer.resize(bufferSize);
DWORD flags = 0;
if (!m_options.silentlyReplaceInvalidSequences)
flags |= MB_ERR_INVALID_CHARS;
unicodePtr = reinterpret_cast <const WCHAR*>(&unicodeBuffer[0]); unicodePtr = reinterpret_cast <const WCHAR*>(&unicodeBuffer[0]);
unicodeLen = MultiByteToWideChar unicodeLen = MultiByteToWideChar
(sourceCodePage, 0, in.c_str(), static_cast <int>(in.length()), (sourceCodePage, 0, in.c_str(), static_cast <int>(in.length()),
reinterpret_cast <WCHAR*>(&unicodeBuffer[0]), static_cast <int>(bufferSize)); reinterpret_cast <WCHAR*>(&unicodeBuffer[0]), static_cast <int>(bufferSize));
if (unicodeLen == 0)
{
if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
{
throw exceptions::illegal_byte_sequence_in_charset();
}
else
{
throw exceptions::charset_conv_error("MultiByteToWideChar() failed when converting to Unicode from " + m_source.getName());
}
}
} }
// Convert from Unicode to destination charset // Convert from Unicode to destination charset
@ -135,6 +152,18 @@ void charsetConverter_win::convert(const string& in, string& out)
(destCodePage, 0, unicodePtr, static_cast <int>(unicodeLen), (destCodePage, 0, unicodePtr, static_cast <int>(unicodeLen),
&buffer[0], static_cast <int>(bufferSize), 0, NULL); &buffer[0], static_cast <int>(bufferSize), 0, NULL);
if (len == 0)
{
if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
{
throw exceptions::illegal_byte_sequence_in_charset();
}
else
{
throw exceptions::charset_conv_error("WideCharToMultiByte() failed when converting from Unicode to " + m_source.getName());
}
}
out.assign(&buffer[0], len); out.assign(&buffer[0], len);
} }
} }
@ -158,7 +187,8 @@ int charsetConverter_win::getCodePage(const char* name)
shared_ptr <utility::charsetFilteredOutputStream> shared_ptr <utility::charsetFilteredOutputStream>
charsetConverter_win::getFilteredOutputStream(utility::outputStream& /* os */) charsetConverter_win::getFilteredOutputStream
(utility::outputStream& /* os */, const charsetConverterOptions& /* opts */)
{ {
// TODO: implement me! // TODO: implement me!
return null; return null;

View File

@ -38,7 +38,16 @@ namespace vmime
{ {
/** A generic charset converter which uses Windows MultiByteToWideChar. /** A generic charset converter which uses Windows MultiByteToWideChar
* and WideCharToMultiByte API functions.
*
* ICU or iconv library should always be preferred over this one, even
* on Windows platform, as MultiByteToWideChar() and WideCharToMultiByte()
* functions cannot be used easily with streams (no context). Moreover,
* error handling is very poor, in particular when an invalid sequence
* is found...
*
* Also, "status" is not supported by this converter for the same reason.
*/ */
class charsetConverter_win : public charsetConverter class charsetConverter_win : public charsetConverter
@ -54,8 +63,8 @@ public:
charsetConverter_win(const charset& source, const charset& dest, charsetConverter_win(const charset& source, const charset& dest,
const charsetConverterOptions& opts = charsetConverterOptions()); const charsetConverterOptions& opts = charsetConverterOptions());
void convert(const string& in, string& out); void convert(const string& in, string& out, status* st);
void convert(utility::inputStream& in, utility::outputStream& out); void convert(utility::inputStream& in, utility::outputStream& out, status* st);
shared_ptr <utility::charsetFilteredOutputStream> shared_ptr <utility::charsetFilteredOutputStream>
getFilteredOutputStream(utility::outputStream& os); getFilteredOutputStream(utility::outputStream& os);

View File

@ -115,6 +115,20 @@ exception* charset_conv_error::clone() const { return new charset_conv_error(*th
const char* charset_conv_error::name() const throw() { return "charset_conv_error"; } const char* charset_conv_error::name() const throw() { return "charset_conv_error"; }
//
// illegal_byte_sequence_for_charset
//

// Falls back to a generic message when the caller supplies an empty one.
illegal_byte_sequence_for_charset::illegal_byte_sequence_for_charset
	(const string& what, const exception& other)
	: exception(what.empty() ? "Found illegal byte sequence for this charset." : what, other)
{
}

illegal_byte_sequence_for_charset::~illegal_byte_sequence_for_charset() throw()
{
}

// Returns a heap-allocated copy of this exception.
exception* illegal_byte_sequence_for_charset::clone() const
{
	return new illegal_byte_sequence_for_charset(*this);
}

// Returns the name of this exception class.
const char* illegal_byte_sequence_for_charset::name() const throw()
{
	return "illegal_byte_sequence_for_charset";
}
// //
// no_encoder_available // no_encoder_available
// //

View File

@ -116,6 +116,18 @@ public:
}; };
/** An illegal byte sequence has been found for a charset.
  */
class VMIME_EXPORT illegal_byte_sequence_for_charset : public vmime::exception
{
public:
// 'what' and 'other' are both optional (defaults: "" and NO_EXCEPTION)
illegal_byte_sequence_for_charset(const string& what = "", const exception& other = NO_EXCEPTION);
~illegal_byte_sequence_for_charset() throw();
exception* clone() const;
const char* name() const throw();
};
/** No encoder has been found for the specified encoding name. /** No encoder has been found for the specified encoding name.
*/ */

View File

@ -400,7 +400,8 @@ text* text::decodeAndUnfold(const parsingContext& ctx, const string& in, text* g
out->removeAllWords(); out->removeAllWords();
const std::vector <shared_ptr <word> > words = word::parseMultiple(ctx, in, 0, in.length(), NULL); std::vector <shared_ptr <word> > words = word::parseMultiple(ctx, in, 0, in.length(), NULL);
fixBrokenWords(words);
copy_vector(words, out->m_words); copy_vector(words, out->m_words);
@ -408,6 +409,48 @@ text* text::decodeAndUnfold(const parsingContext& ctx, const string& in, text* g
} }
// static
/** Try to repair consecutive words whose buffers split a multi-byte
  * character between them.
  *
  * Each word (except the last) is checked against its own charset. When it
  * is not valid text, the buffer of the following word is appended and the
  * result is checked again: everything before the first invalid byte stays
  * in the current word, the remainder is handed to the next word. A next
  * word left with an empty buffer is removed from the list.
  *
  * Bytes are only borrowed from a word which uses the same charset, since
  * bytes encoded in another charset cannot complete the sequence.
  *
  * @param words list of parsed words; buffers are adjusted in place and
  * emptied words are erased
  */
void text::fixBrokenWords(std::vector <shared_ptr <word> >& words)
{
	if (words.size() < 2)
		return;

	// Fix words which encode a non-integral number of characters.
	// This is not RFC-compliant, but we should be able to recover from it.
	for (size_t i = 0, n = words.size() - 1 ; i < n ; ++i)
	{
		shared_ptr <word> w1 = words[i];
		shared_ptr <word> w2 = words[i + 1];

		// Nothing to fix when the current word is already valid
		if (w1->getCharset().isValidText(w1->getBuffer(), NULL))
			continue;

		// Bytes from a word in a different charset cannot complete
		// the broken sequence: leave both words untouched.
		if (w1->getCharset() != w2->getCharset())
			continue;

		// The current word is not valid: grab the bytes of the next
		// word, to see whether it becomes valid.
		string buffer(w1->getBuffer());
		buffer += w2->getBuffer();

		// Start from the original boundary between the two words, so that
		// the split below is well-defined (and a no-op) in case
		// isValidText() does not report a position.
		string::size_type firstInvalidByte = w1->getBuffer().length();
		w1->getCharset().isValidText(buffer, &firstInvalidByte);

		// Keep the valid prefix in the current word and hand the
		// remaining bytes over to the next word.
		w1->setBuffer(string(buffer.begin(), buffer.begin() + firstInvalidByte));
		w2->setBuffer(string(buffer.begin() + firstInvalidByte, buffer.end()));

		// If the next word is now empty, remove it
		if (w2->getBuffer().empty())
		{
			words.erase(words.begin() + i + 1);
			--n;
		}
	}
}
const std::vector <shared_ptr <component> > text::getChildComponents() const std::vector <shared_ptr <component> > text::getChildComponents()
{ {
std::vector <shared_ptr <component> > list; std::vector <shared_ptr <component> > list;

View File

@ -251,6 +251,9 @@ public:
protected: protected:
static void fixBrokenWords(std::vector <shared_ptr <word> >& words);
// Component parsing & assembling // Component parsing & assembling
void parseImpl void parseImpl
(const parsingContext& ctx, (const parsingContext& ctx,

View File

@ -21,6 +21,8 @@
// the GNU General Public License cover the whole combination. // the GNU General Public License cover the whole combination.
// //
#include <algorithm>
#include "tests/testUtils.hpp" #include "tests/testUtils.hpp"
#include "charsetTestSuites.hpp" #include "charsetTestSuites.hpp"
@ -39,6 +41,14 @@ VMIME_TEST_SUITE_BEGIN(charsetTest)
VMIME_TEST(testDecodeIDNA) VMIME_TEST(testDecodeIDNA)
VMIME_TEST(testUTF7Support) VMIME_TEST(testUTF7Support)
VMIME_TEST(testReplaceInvalidSequence)
VMIME_TEST(testStopOnInvalidSequence)
VMIME_TEST(testStatus)
VMIME_TEST(testStatusWithInvalidSequence)
VMIME_TEST(testIsValidText)
VMIME_TEST_LIST_END VMIME_TEST_LIST_END
@ -106,10 +116,15 @@ VMIME_TEST_SUITE_BEGIN(charsetTest)
} }
static const vmime::string convertHelper static const vmime::string convertHelper
(const vmime::string& in, const vmime::charset& csrc, const vmime::charset& cdest) (const vmime::string& in, const vmime::charset& csrc, const vmime::charset& cdest,
const vmime::charsetConverterOptions& opts = vmime::charsetConverterOptions(),
vmime::charsetConverter::status* st = NULL)
{ {
vmime::shared_ptr <vmime::charsetConverter> conv =
vmime::charsetConverter::create(csrc, cdest, opts);
vmime::string out; vmime::string out;
vmime::charset::convert(in, out, csrc, cdest); conv->convert(in, out, st);
return out; return out;
} }
@ -145,5 +160,91 @@ VMIME_TEST_SUITE_BEGIN(charsetTest)
VASSERT_EQ("2", "f+APg-o", convertHelper("\x66\xc3\xb8\x6f", "utf-8", "utf-7")); VASSERT_EQ("2", "f+APg-o", convertHelper("\x66\xc3\xb8\x6f", "utf-8", "utf-7"));
} }
void testReplaceInvalidSequence()
{
vmime::charsetConverterOptions opts;
opts.silentlyReplaceInvalidSequences = true;
opts.invalidSequence = "?";
vmime::string res = convertHelper
("\x61\xf1\x80\x80\xe1\x80\xc2\x62\x80\x63\x80\xbf\x64", "utf-8", "iso-8859-1", opts);
// Result should be in the form "a???b?c??d" or "a??????b?c??d"...
// Remove consecutive question marks for easier matching.
res.erase(std::unique(res.begin(), res.end()), res.end());
VASSERT_EQ(
"Illegal UTF-8 sequence",
"a?b?c?d",
res
);
}
void testStopOnInvalidSequence()
{
vmime::charsetConverterOptions opts;
opts.silentlyReplaceInvalidSequences = false;
VASSERT_THROW(
"Illegal UTF-8 sequence",
convertHelper("\x61\xf1\x80\x80\xe1\x80\xc2\x62\x80\x63\x80\xbf\x64", "utf-8", "iso-8859-1", opts),
vmime::exceptions::illegal_byte_sequence_for_charset
);
}
void testStatus()
{
vmime::charsetConverterOptions opts;
opts.silentlyReplaceInvalidSequences = false;
vmime::charsetConverter::status st;
// 012345 6 7
convertHelper("Gwena\xc3\xabl", "utf-8", "iso-8859-1", opts, &st);
VASSERT_EQ("inputBytesRead", 8, st.inputBytesRead);
VASSERT_EQ("outputBytesWritten", 7, st.outputBytesWritten);
}
void testStatusWithInvalidSequence()
{
vmime::charsetConverterOptions opts;
opts.silentlyReplaceInvalidSequences = false;
vmime::charsetConverter::status st;
try
{
// 01234 5 6789 0 1
convertHelper("Fran\xc3\xa7ois\xf1\x80\x65", "utf-8", "iso-8859-1", opts, &st);
}
catch (vmime::exceptions::illegal_byte_sequence_for_charset& e)
{
}
catch (...)
{
throw;
}
VASSERT_EQ("inputBytesRead", 9, st.inputBytesRead);
VASSERT_EQ("outputBytesWritten", 8, st.outputBytesWritten);
}
void testIsValidText()
{
// Invalid text
const vmime::string invalidText("Fran\xc3\xa7ois\xf1\x80\x65");
vmime::string::size_type firstInvalidByte;
VASSERT_EQ("invalid.isValidText", false, vmime::charset("utf-8").isValidText(invalidText, &firstInvalidByte));
VASSERT_EQ("invalid.firstInvalidByte", 9, firstInvalidByte);
// Valid text
const vmime::string validText("Gwena\xc3\xabl");
VASSERT_EQ("valid.isValidText", true, vmime::charset("utf-8").isValidText(validText, &firstInvalidByte));
VASSERT_EQ("valid.firstInvalidByte", 8, firstInvalidByte);
}
VMIME_TEST_SUITE_END VMIME_TEST_SUITE_END

View File

@ -61,6 +61,7 @@ VMIME_TEST_SUITE_BEGIN(textTest)
VMIME_TEST(testInternationalizedEmail_folding) VMIME_TEST(testInternationalizedEmail_folding)
VMIME_TEST(testWronglyPaddedB64Words) VMIME_TEST(testWronglyPaddedB64Words)
VMIME_TEST(testFixBrokenWords)
VMIME_TEST_LIST_END VMIME_TEST_LIST_END
@ -617,5 +618,50 @@ VMIME_TEST_SUITE_BEGIN(textTest)
outText.getConvertedText(vmime::charset("utf-8"))); outText.getConvertedText(vmime::charset("utf-8")));
} }
// Ensure that words which encode a non-integral number of characters
// are correctly decoded.
void testFixBrokenWords()
{
vmime::text outText;
vmime::charsetConverterOptions opts;
opts.silentlyReplaceInvalidSequences = false; // just to be sure that broken words are actually fixed
// Test case 1
vmime::text::decodeAndUnfold
("=?utf-8?Q?Gwena=C3?="
"=?utf-8?Q?=ABl?=", &outText);
VASSERT_EQ("1", "Gwena\xebl",
outText.getConvertedText(vmime::charset("iso-8859-1"), opts));
// Test case 2
vmime::text::decodeAndUnfold
("=?utf-8?B?5Lit6Yu85qmf5qKw6JGj5LqL5pyDMTAz5bm056ysMDXlsYbn?="
"=?utf-8?B?rKwwN+asoeitsOeoiw==?=", &outText);
VASSERT_EQ("2", "\xe4\xb8\xad\xe9\x8b\xbc\xe6\xa9\x9f\xe6\xa2\xb0"
"\xe8\x91\xa3\xe4\xba\x8b\xe6\x9c\x83\x31\x30\x33\xe5\xb9\xb4"
"\xe7\xac\xac\x30\x35\xe5\xb1\x86\xe7\xac\xac\x30\x37\xe6\xac"
"\xa1\xe8\xad\xb0\xe7\xa8\x8b",
outText.getConvertedText(vmime::charset("utf-8")));
// Test case 3 (a character spanning over 3 words: 'を' = E3 82 92)
vmime::text::decodeAndUnfold
("=?utf-8?Q?abc=E3?="
"=?utf-8?Q?=82?="
"=?utf-8?Q?=92xyz?=", &outText);
std::string out; // decode as UTF-16 then rencode to UTF-8 for easier comparison
vmime::charset::convert(
outText.getConvertedText(vmime::charset("utf-16"), opts),
out,
vmime::charset("utf-16"),
vmime::charset("utf-8")
);
VASSERT_EQ("3", "abc\xe3\x82\x92xyz", out);
}
VMIME_TEST_SUITE_END VMIME_TEST_SUITE_END