Issue #103: fix badly encoded words.

This commit is contained in:
Vincent Richard 2015-02-16 18:43:03 +01:00
parent f51cb846a9
commit c5c66f9fdc
21 changed files with 640 additions and 96 deletions

View File

@ -735,12 +735,12 @@ INCLUDE(cmake/FindICU.cmake)
FIND_PACKAGE(ICU QUIET)
IF(WIN32)
SET(VMIME_CHARSETCONV_LIB_DETECTED "win")
ELSEIF(ICU_LIBRARIES)
IF(ICU_LIBRARIES)
SET(VMIME_CHARSETCONV_LIB_DETECTED "icu")
ELSEIF(ICONV_FOUND)
SET(VMIME_CHARSETCONV_LIB_DETECTED "iconv")
ELSEIF(WIN32)
SET(VMIME_CHARSETCONV_LIB_DETECTED "win")
ENDIF()
SET(
@ -803,6 +803,10 @@ ELSEIF(VMIME_CHARSETCONV_LIB STREQUAL "icu")
ELSEIF(VMIME_CHARSETCONV_LIB STREQUAL "win")
MESSAGE(WARNING "*** ICU or iconv library should always be preferred"
" over MultiByteToWideChar/WideCharToMultiByte on Windows, as"
" error handling is very poor, and there is no streaming support.")
SET(VMIME_CHARSETCONV_LIB_IS_ICONV "OFF")
SET(VMIME_CHARSETCONV_LIB_IS_ICU "OFF")
SET(VMIME_CHARSETCONV_LIB_IS_WIN "ON")

View File

@ -109,6 +109,43 @@ void charset::convert(const string& in, string& out, const charset& source, cons
}
bool charset::isValidText
(const string& text, string::size_type* firstInvalidByte) const
{
charsetConverterOptions opts;
opts.silentlyReplaceInvalidSequences = false;
charsetConverter::status st;
try
{
std::string out;
// Try converting to UTF-8
shared_ptr <charsetConverter> conv = charsetConverter::create(*this, vmime::charset("utf-8"), opts);
conv->convert(text, out, &st);
}
catch (exceptions::illegal_byte_sequence_for_charset& e)
{
// An illegal byte sequence was found in the input buffer
if (firstInvalidByte)
{
if (st.inputBytesRead < text.length())
*firstInvalidByte = st.inputBytesRead;
else
*firstInvalidByte = text.length();
}
return false;
}
if (firstInvalidByte)
*firstInvalidByte = text.length();
return true;
}
const charset charset::getLocalCharset()
{
return (platform::getHandler()->getLocalCharset());

View File

@ -95,8 +95,12 @@ public:
* @param source input charset
* @param dest output charset
* @param opts conversion options
* @throws exceptions::charset_conv_error if an error occured during
* the conversion
* @throws exceptions::illegal_byte_sequence_for_charset if an illegal
* byte sequence was found in the input bytes, and the
* 'silentlyReplaceInvalidSequences' flag is set to false in
* the charsetConverterOptions
* @throws exceptions::charset_conv_error if an unexpected error occurred
* during the conversion
*/
static void convert(const string& in, string& out,
const charset& source, const charset& dest,
@ -110,13 +114,29 @@ public:
* @param source input charset
* @param dest output charset
* @param opts conversion options
* @throws exceptions::charset_conv_error if an error occured during
* the conversion
* @throws exceptions::illegal_byte_sequence_for_charset if an illegal
* byte sequence was found in the input bytes, and the
* 'silentlyReplaceInvalidSequences' flag is set to false in
* the charsetConverterOptions
* @throws exceptions::charset_conv_error if an unexpected error occurred
* during the conversion
*/
static void convert(utility::inputStream& in, utility::outputStream& out,
const charset& source, const charset& dest,
const charsetConverterOptions& opts = charsetConverterOptions());
/** Checks whether the specified text is valid in this charset.
*
* @param text input text
* @param firstInvalidByte if the function returns false, will contain
* the index of the first invalid byte in the string. Can be NULL if
* not used.
* @return true if the text is perfectly valid in this charset,
* or false otherwise (eg. it contains illegal sequences)
*/
bool isValidText(const string& text, string::size_type* firstInvalidByte) const;
shared_ptr <component> clone() const;
void copyFrom(const component& other);

View File

@ -42,4 +42,11 @@ shared_ptr <charsetConverter> charsetConverter::create
}
// Creates a status object with both byte counters set to zero.
charsetConverter::status::status()
: inputBytesRead(0), outputBytesWritten(0)
{
}
} // vmime

View File

@ -44,8 +44,13 @@ namespace utility
/** A filtered output stream which applies a charset conversion
* to input bytes.
*
* May throw a exceptions::charset_conv_error if an error
* May throw an exceptions::charset_conv_error if an unexpected error
* occurred when initializing the converter, or during charset conversion.
*
* May also throw a exceptions::illegal_byte_sequence_for_charset
* if an illegal byte sequence was found in the input bytes, and the
* 'silentlyReplaceInvalidSequences' flag is set to false in
* the charsetConverterOptions.
*/
class VMIME_EXPORT charsetFilteredOutputStream : public filteredOutputStream
@ -63,6 +68,23 @@ class VMIME_EXPORT charsetConverter : public object
{
public:
/** Holds information about a conversion: how many bytes were consumed
* from the input and how many were produced on the output. Both
* counters start at zero.
*/
struct status
{
status();
/** Number of bytes read from input buffer and successfully converted.
* When a conversion is stopped by an illegal byte sequence, this is
* intended to be the offset of that sequence in the input.
*/
size_t inputBytesRead;
/** Number of bytes written to output buffer.
*/
size_t outputBytesWritten;
};
/** Construct and initialize an iconv charset converter.
*
* @param source input charset
@ -81,29 +103,44 @@ public:
*
* @param in input buffer
* @param out output buffer
* @throws exceptions::charset_conv_error if an error occured during
* the conversion
* @param st will receive extra information when the conversion has
* finished or was stopped by an error (can be NULL)
* @throws exceptions::illegal_byte_sequence_for_charset if an illegal
* byte sequence was found in the input bytes, and the
* 'silentlyReplaceInvalidSequences' flag is set to false in
* the charsetConverterOptions
* @throws exceptions::charset_conv_error if an unexpected error occurred
* during the conversion
*/
virtual void convert(const string& in, string& out) = 0;
virtual void convert(const string& in, string& out, status* st = NULL) = 0;
/** Convert the contents of an input stream in a specified charset
* to another charset and write the result to an output stream.
*
* @param in input stream to read data from
* @param out output stream to write the converted data
* @throws exceptions::charset_conv_error if an error occured during
* the conversion
* @param st will receive extra information when the conversion has
* finished or was stopped by an error (can be NULL)
* @throws exceptions::illegal_byte_sequence_for_charset if an illegal
* byte sequence was found in the input bytes, and the
* 'silentlyReplaceInvalidSequences' flag is set to false in
* the charsetConverterOptions
* @throws exceptions::charset_conv_error if an unexpected error occurred
* during the conversion
*/
virtual void convert(utility::inputStream& in, utility::outputStream& out) = 0;
virtual void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL) = 0;
/** Returns a filtered output stream which applies a charset
* conversion to input bytes. Please note that it may not be
* supported by the converter.
*
* @param os stream into which filtered data will be written
* @param opts conversion options
* @return a filtered output stream, or NULL if not supported
*/
virtual shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream(utility::outputStream& os) = 0;
virtual shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream
(utility::outputStream& os,
const charsetConverterOptions& opts = charsetConverterOptions()) = 0;
private:

View File

@ -29,7 +29,8 @@ namespace vmime
charsetConverterOptions::charsetConverterOptions()
: invalidSequence("?")
: silentlyReplaceInvalidSequences(true),
invalidSequence("?")
{
}

View File

@ -42,7 +42,15 @@ public:
charsetConverterOptions();
/** Replace invalid sequences with this string. */
/** If true, invalid sequences will be silently replaced with
* a string when possible (see 'invalidSequence').
* Default is true.
*/
bool silentlyReplaceInvalidSequences;
/** Replace invalid sequences with this string.
* Default is '?'.
*/
string invalidSequence;
};

View File

@ -147,8 +147,12 @@ charsetConverter_iconv::~charsetConverter_iconv()
}
void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputStream& out)
void charsetConverter_iconv::convert
(utility::inputStream& in, utility::outputStream& out, status* st)
{
if (st)
new (st) status();
if (m_desc == NULL)
throw exceptions::charset_conv_error("Cannot initialize converter.");
@ -175,6 +179,12 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt
if (iconv(cd, ICONV_IN_TYPE(&inPtr), ptrLength,
ICONV_OUT_TYPE(&outPtr), &outLength) == static_cast <size_t>(-1))
{
if (st && inPtr)
{
st->inputBytesRead += (inPtr - inBuffer);
st->outputBytesWritten += (outPtr - outBuffer);
}
// Illegal input sequence or input sequence has no equivalent
// sequence in the destination charset.
if (prevIsInvalid)
@ -182,6 +192,9 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt
// Write successfully converted bytes
out.write(outBuffer, sizeof(outBuffer) - outLength);
if (!m_options.silentlyReplaceInvalidSequences)
throw exceptions::illegal_byte_sequence_for_charset();
// Output a special character to indicate we don't know how to
// convert the sequence at this position
outputInvalidChar(out, cd, m_options);
@ -208,6 +221,12 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt
// Write successfully converted bytes
out.write(outBuffer, sizeof(outBuffer) - outLength);
if (st && inPtr)
{
st->inputBytesRead += (inPtr - inBuffer);
st->outputBytesWritten += (outPtr - outBuffer);
}
inPos = 0;
prevIsInvalid = false;
}
@ -222,29 +241,27 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt
}
void charsetConverter_iconv::convert(const string& in, string& out)
void charsetConverter_iconv::convert(const string& in, string& out, status* st)
{
if (m_source == m_dest)
{
// No conversion needed
out = in;
return;
}
if (st)
new (st) status();
out.clear();
utility::inputStreamStringAdapter is(in);
utility::outputStreamStringAdapter os(out);
convert(is, os);
convert(is, os, st);
os.flush();
}
shared_ptr <utility::charsetFilteredOutputStream> charsetConverter_iconv::getFilteredOutputStream(utility::outputStream& os)
shared_ptr <utility::charsetFilteredOutputStream>
charsetConverter_iconv::getFilteredOutputStream
(utility::outputStream& os, const charsetConverterOptions& opts)
{
return make_shared <utility::charsetFilteredOutputStream_iconv>(m_source, m_dest, &os);
return make_shared <utility::charsetFilteredOutputStream_iconv>(m_source, m_dest, &os, opts);
}
@ -255,9 +272,10 @@ namespace utility {
charsetFilteredOutputStream_iconv::charsetFilteredOutputStream_iconv
(const charset& source, const charset& dest, outputStream* os)
(const charset& source, const charset& dest, outputStream* os,
const charsetConverterOptions& opts)
: m_desc(NULL), m_sourceCharset(source), m_destCharset(dest),
m_stream(*os), m_unconvCount(0)
m_stream(*os), m_unconvCount(0), m_options(opts)
{
// Get an iconv descriptor
const iconv_t cd = iconv_open(dest.getName().c_str(), source.getName().c_str());
@ -314,6 +332,9 @@ void charsetFilteredOutputStream_iconv::writeImpl
// character and skip one byte in the invalid sequence.
if (m_unconvCount >= sizeof(m_unconvBuffer))
{
if (!m_options.silentlyReplaceInvalidSequences)
throw exceptions::illegal_byte_sequence_for_charset();
outputInvalidChar(m_stream, cd);
std::copy(m_unconvBuffer + 1,
@ -439,6 +460,9 @@ void charsetFilteredOutputStream_iconv::flush()
// Skip a "blocking" character
if (inputConverted == 0)
{
if (!m_options.silentlyReplaceInvalidSequences)
throw exceptions::illegal_byte_sequence_for_charset();
outputInvalidChar(m_stream, cd);
offset++;

View File

@ -56,10 +56,12 @@ public:
~charsetConverter_iconv();
void convert(const string& in, string& out);
void convert(utility::inputStream& in, utility::outputStream& out);
void convert(const string& in, string& out, status* st = NULL);
void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL);
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream(utility::outputStream& os);
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream
(utility::outputStream& os,
const charsetConverterOptions& opts = charsetConverterOptions());
private:
@ -84,9 +86,11 @@ public:
* @param source input charset
* @param dest output charset
* @param os stream into which filtered data is written
* @param opts conversion options
*/
charsetFilteredOutputStream_iconv
(const charset& source, const charset& dest, outputStream* os);
(const charset& source, const charset& dest, outputStream* os,
const charsetConverterOptions& opts = charsetConverterOptions());
~charsetFilteredOutputStream_iconv();
@ -121,6 +125,8 @@ private:
// Buffer used for conversion. Avoids declaring it in write().
// Should be at least MAX_CHARACTER_WIDTH * MAX_CHARACTER_WIDTH.
byte_t m_outputBuffer[32768];
charsetConverterOptions m_options;
};

View File

@ -91,10 +91,17 @@ charsetConverter_icu::~charsetConverter_icu()
}
void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStream& out)
void charsetConverter_icu::convert
(utility::inputStream& in, utility::outputStream& out, status* st)
{
UErrorCode err = U_ZERO_ERROR;
ucnv_reset(m_from);
ucnv_reset(m_to);
if (st)
new (st) status();
// From buffers
byte_t cpInBuffer[16]; // stream data put here
const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar);
@ -105,12 +112,31 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre
const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize;
std::vector <char> cpOutBuffer(cpOutBufferSz);
// Set replacement chars for when converting from Unicode to codepage
icu::UnicodeString substString(m_options.invalidSequence.c_str());
ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
// Tell ICU what to do when encountering an illegal byte sequence
if (m_options.silentlyReplaceInvalidSequences)
{
// Set replacement chars for when converting from Unicode to codepage
icu::UnicodeString substString(m_options.invalidSequence.c_str());
ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
if (U_FAILURE(err))
throw exceptions::charset_conv_error("[ICU] Error setting replacement char.");
if (U_FAILURE(err))
throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
}
else
{
// Tell ICU to stop (and return an error) on illegal byte sequences
ucnv_setToUCallBack
(m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
if (U_FAILURE(err))
throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");
ucnv_setFromUCallBack
(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
if (U_FAILURE(err))
throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
}
// Input data available
while (!in.eof())
@ -137,8 +163,22 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre
ucnv_toUnicode(m_from, &target, targetLimit,
&source, sourceLimit, NULL, flush, &toErr);
if (st)
st->inputBytesRead += (source - reinterpret_cast <const char*>(&cpInBuffer[0]));
if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr))
throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName());
{
if (toErr == U_INVALID_CHAR_FOUND ||
toErr == U_TRUNCATED_CHAR_FOUND ||
toErr == U_ILLEGAL_CHAR_FOUND)
{
// Error will be thrown later (*)
}
else
{
throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName());
}
}
// The Unicode source is the buffer just written and the limit
// is where the previous conversion stopped (target is moved in the conversion)
@ -158,8 +198,40 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre
ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
&uSource, uSourceLimit, NULL, flush, &fromErr);
if (st)
{
// Decrement input bytes count by the number of input bytes in error
char errBytes[16];
int8_t errBytesLen = sizeof(errBytes);
UErrorCode errBytesErr = U_ZERO_ERROR;
ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr);
st->inputBytesRead -= errBytesLen;
st->outputBytesWritten += cpTarget - &cpOutBuffer[0];
}
// (*) If an error occurred while converting from input charset, throw it now
if (toErr == U_INVALID_CHAR_FOUND ||
toErr == U_TRUNCATED_CHAR_FOUND ||
toErr == U_ILLEGAL_CHAR_FOUND)
{
throw exceptions::illegal_byte_sequence_for_charset();
}
if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName());
{
if (fromErr == U_INVALID_CHAR_FOUND ||
fromErr == U_TRUNCATED_CHAR_FOUND ||
fromErr == U_ILLEGAL_CHAR_FOUND)
{
throw exceptions::illegal_byte_sequence_for_charset();
}
else
{
throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName());
}
}
// Write to destination stream
out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0]));
@ -171,29 +243,27 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre
}
void charsetConverter_icu::convert(const string& in, string& out)
void charsetConverter_icu::convert(const string& in, string& out, status* st)
{
if (m_source == m_dest)
{
// No conversion needed
out = in;
return;
}
if (st)
new (st) status();
out.clear();
utility::inputStreamStringAdapter is(in);
utility::outputStreamStringAdapter os(out);
convert(is, os);
convert(is, os, st);
os.flush();
}
shared_ptr <utility::charsetFilteredOutputStream> charsetConverter_icu::getFilteredOutputStream(utility::outputStream& os)
shared_ptr <utility::charsetFilteredOutputStream>
charsetConverter_icu::getFilteredOutputStream
(utility::outputStream& os, const charsetConverterOptions& opts)
{
return make_shared <utility::charsetFilteredOutputStream_icu>(m_source, m_dest, &os);
return make_shared <utility::charsetFilteredOutputStream_icu>(m_source, m_dest, &os, opts);
}
@ -204,8 +274,10 @@ namespace utility {
charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu
(const charset& source, const charset& dest, outputStream* os)
: m_from(NULL), m_to(NULL), m_sourceCharset(source), m_destCharset(dest), m_stream(*os)
(const charset& source, const charset& dest, outputStream* os,
const charsetConverterOptions& opts)
: m_from(NULL), m_to(NULL), m_sourceCharset(source),
m_destCharset(dest), m_stream(*os), m_options(opts)
{
UErrorCode err = U_ZERO_ERROR;
m_from = ucnv_open(source.getName().c_str(), &err);
@ -224,12 +296,31 @@ charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu
("Cannot initialize ICU converter for destination charset '" + dest.getName() + "' (error code: " + u_errorName(err) + ".");
}
// Set replacement chars for when converting from Unicode to codepage
icu::UnicodeString substString(vmime::charsetConverterOptions().invalidSequence.c_str());
ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
// Tell ICU what to do when encountering an illegal byte sequence
if (m_options.silentlyReplaceInvalidSequences)
{
// Set replacement chars for when converting from Unicode to codepage
icu::UnicodeString substString(m_options.invalidSequence.c_str());
ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
if (U_FAILURE(err))
throw exceptions::charset_conv_error("[ICU] Error setting replacement char.");
if (U_FAILURE(err))
throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
}
else
{
// Tell ICU to stop (and return an error) on illegal byte sequences.
// The "to Unicode" callback is triggered while decoding the input bytes,
// so it has to be installed on the source converter (m_from), exactly as
// charsetConverter_icu::convert() does. Installing it on m_to has no
// effect on decoding.
ucnv_setToUCallBack
	(m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

if (U_FAILURE(err))
	throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");
ucnv_setFromUCallBack
(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
if (U_FAILURE(err))
throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
}
}
@ -275,8 +366,17 @@ void charsetFilteredOutputStream_icu::writeImpl
if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR)
{
throw exceptions::charset_conv_error
("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'.");
if (toErr == U_INVALID_CHAR_FOUND ||
toErr == U_TRUNCATED_CHAR_FOUND ||
toErr == U_ILLEGAL_CHAR_FOUND)
{
throw exceptions::illegal_byte_sequence_for_charset();
}
else
{
throw exceptions::charset_conv_error
("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'.");
}
}
const size_t uniLength = uniTarget - &uniBuffer[0];
@ -303,8 +403,17 @@ void charsetFilteredOutputStream_icu::writeImpl
if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
{
throw exceptions::charset_conv_error
("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'.");
if (fromErr == U_INVALID_CHAR_FOUND ||
fromErr == U_TRUNCATED_CHAR_FOUND ||
fromErr == U_ILLEGAL_CHAR_FOUND)
{
throw exceptions::illegal_byte_sequence_for_charset();
}
else
{
throw exceptions::charset_conv_error
("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'.");
}
}
const size_t cpLength = cpTarget - &cpBuffer[0];

View File

@ -59,10 +59,12 @@ public:
~charsetConverter_icu();
void convert(const string& in, string& out);
void convert(utility::inputStream& in, utility::outputStream& out);
void convert(const string& in, string& out, status* st = NULL);
void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL);
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream(utility::outputStream& os);
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream
(utility::outputStream& os,
const charsetConverterOptions& opts = charsetConverterOptions());
private:
@ -88,9 +90,11 @@ public:
* @param source input charset
* @param dest output charset
* @param os stream into which filtered data is written
* @param opts conversion options
*/
charsetFilteredOutputStream_icu
(const charset& source, const charset& dest, outputStream* os);
(const charset& source, const charset& dest, outputStream* os,
const charsetConverterOptions& opts = charsetConverterOptions());
~charsetFilteredOutputStream_icu();
@ -112,6 +116,8 @@ private:
const charset m_destCharset;
outputStream& m_stream;
charsetConverterOptions m_options;
};

View File

@ -57,8 +57,11 @@ charsetConverter_idna::~charsetConverter_idna()
}
void charsetConverter_idna::convert(utility::inputStream& in, utility::outputStream& out)
void charsetConverter_idna::convert(utility::inputStream& in, utility::outputStream& out, status* st)
{
if (st)
new (st) status();
// IDNA should be used for short strings, so it does not matter if we
// do not work directly on the stream
string inStr;
@ -66,20 +69,16 @@ void charsetConverter_idna::convert(utility::inputStream& in, utility::outputStr
vmime::utility::bufferedStreamCopy(in, os);
string outStr;
convert(inStr, outStr);
convert(inStr, outStr, st);
out << outStr;
}
void charsetConverter_idna::convert(const string& in, string& out)
void charsetConverter_idna::convert(const string& in, string& out, status* st)
{
if (m_source == m_dest)
{
// No conversion needed
out = in;
return;
}
if (st)
new (st) status();
out.clear();
@ -87,6 +86,12 @@ void charsetConverter_idna::convert(const string& in, string& out)
{
if (utility::stringUtils::is7bit(in))
{
if (st)
{
st->inputBytesRead = in.length();
st->outputBytesWritten = in.length();
}
// No need to encode as Punycode
out = in;
return;
@ -107,6 +112,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
unichars.push_back(uc);
}
if (st)
st->inputBytesRead = in.length();
std::vector <char> output(inUTF8.length() * 2);
punycode_uint outputLen = output.size();
@ -116,6 +124,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
if (status == punycode_success)
{
out = string("xn--") + string(output.begin(), output.begin() + outputLen);
if (st)
st->outputBytesWritten = out.length();
}
else
{
@ -126,6 +137,12 @@ void charsetConverter_idna::convert(const string& in, string& out)
{
if (in.length() < 5 || in.substr(0, 4) != "xn--")
{
if (st)
{
st->inputBytesRead = in.length();
st->outputBytesWritten = in.length();
}
// Not an IDNA string
out = in;
return;
@ -137,6 +154,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
const punycode_status status = punycode_decode
(in.length() - 4, &in[4], &outputLen, &output[0], /* case_flags */ NULL);
if (st)
st->inputBytesRead = in.length();
if (status == punycode_success)
{
std::vector <char> outUTF8Bytes(outputLen * 4);
@ -150,6 +170,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
string outUTF8(&outUTF8Bytes[0], p);
charset::convert(outUTF8, out, vmime::charsets::UTF_8, m_dest);
if (st)
st->outputBytesWritten = out.length();
}
else
{
@ -159,7 +182,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
}
shared_ptr <utility::charsetFilteredOutputStream> charsetConverter_idna::getFilteredOutputStream(utility::outputStream& /* os */)
shared_ptr <utility::charsetFilteredOutputStream>
charsetConverter_idna::getFilteredOutputStream
(utility::outputStream& /* os */, const charsetConverterOptions& /* opts */)
{
return null;
}

View File

@ -50,10 +50,12 @@ public:
~charsetConverter_idna();
void convert(const string& in, string& out);
void convert(utility::inputStream& in, utility::outputStream& out);
void convert(const string& in, string& out, status* st = NULL);
void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL);
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream(utility::outputStream& os);
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream
(utility::outputStream& os,
const charsetConverterOptions& opts = charsetConverterOptions());
private:

View File

@ -69,8 +69,12 @@ charsetConverter_win::charsetConverter_win
}
void charsetConverter_win::convert(utility::inputStream& in, utility::outputStream& out)
void charsetConverter_win::convert
(utility::inputStream& in, utility::outputStream& out, status* st)
{
if (st)
new (st) status();
byte_t buffer[32768];
string inStr, outStr;
@ -80,20 +84,16 @@ void charsetConverter_win::convert(utility::inputStream& in, utility::outputStre
utility::stringUtils::appendBytesToString(inStr, buffer, len);
}
convert(inStr, outStr);
convert(inStr, outStr, st);
out.write(outStr.data(), outStr.length());
}
void charsetConverter_win::convert(const string& in, string& out)
void charsetConverter_win::convert(const string& in, string& out, status* st)
{
if (m_source == m_dest)
{
// No conversion needed
out = in;
return;
}
if (st)
new (st) status();
const int sourceCodePage = getCodePage(m_source.getName().c_str());
const int destCodePage = getCodePage(m_dest.getName().c_str());
@ -113,10 +113,27 @@ void charsetConverter_win::convert(const string& in, string& out)
const size_t bufferSize = in.length() * 2; // in wide characters
unicodeBuffer.resize(bufferSize);
DWORD flags = 0;

// In strict mode, ask the API to fail on invalid input bytes instead of
// silently substituting a default character.
if (!m_options.silentlyReplaceInvalidSequences)
	flags |= MB_ERR_INVALID_CHARS;

unicodePtr = reinterpret_cast <const WCHAR*>(&unicodeBuffer[0]);
unicodeLen = MultiByteToWideChar
	(sourceCodePage, flags, in.c_str(), static_cast <int>(in.length()),
	 reinterpret_cast <WCHAR*>(&unicodeBuffer[0]), static_cast <int>(bufferSize));

if (unicodeLen == 0 && flags != 0 && GetLastError() == ERROR_INVALID_FLAGS)
{
	// Some code pages (eg. UTF-7, ISO-2022 variants) only accept a zero
	// 'dwFlags' value: fall back to the lenient behaviour for them
	// rather than failing the whole conversion.
	unicodeLen = MultiByteToWideChar
		(sourceCodePage, 0, in.c_str(), static_cast <int>(in.length()),
		 reinterpret_cast <WCHAR*>(&unicodeBuffer[0]), static_cast <int>(bufferSize));
}

// A zero result is only an error if there was something to convert
// (the API also returns 0 when given an empty input).
if (unicodeLen == 0 && !in.empty())
{
	if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
	{
		throw exceptions::illegal_byte_sequence_for_charset();
	}
	else
	{
		throw exceptions::charset_conv_error("MultiByteToWideChar() failed when converting to Unicode from " + m_source.getName());
	}
}
}
// Convert from Unicode to destination charset
@ -135,6 +152,18 @@ void charsetConverter_win::convert(const string& in, string& out)
(destCodePage, 0, unicodePtr, static_cast <int>(unicodeLen),
&buffer[0], static_cast <int>(bufferSize), 0, NULL);
// A zero result is only an error if there was something to convert
// (the API also returns 0 when given an empty input).
if (len == 0 && unicodeLen != 0)
{
	if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
	{
		throw exceptions::illegal_byte_sequence_for_charset();
	}
	else
	{
		// This step targets the destination charset, so report that one
		throw exceptions::charset_conv_error("WideCharToMultiByte() failed when converting from Unicode to " + m_dest.getName());
	}
}
out.assign(&buffer[0], len);
}
}
@ -158,7 +187,8 @@ int charsetConverter_win::getCodePage(const char* name)
shared_ptr <utility::charsetFilteredOutputStream>
charsetConverter_win::getFilteredOutputStream(utility::outputStream& /* os */)
charsetConverter_win::getFilteredOutputStream
(utility::outputStream& /* os */, const charsetConverterOptions& /* opts */)
{
// TODO: implement me!
return null;

View File

@ -38,7 +38,16 @@ namespace vmime
{
/** A generic charset converter which uses Windows MultiByteToWideChar.
/** A generic charset converter which uses Windows MultiByteToWideChar
* and WideCharToMultiByte API functions.
*
* ICU or iconv library should always be preferred over this one, even
* on Windows platform, as MultiByteToWideChar() and WideCharToMultiByte()
* functions cannot be used easily with streams (no context). Moreover,
* error handling is very poor, in particular when an invalid sequence
* is found...
*
* Also, "status" is not supported by this converter for the same reason.
*/
class charsetConverter_win : public charsetConverter
@ -54,8 +63,8 @@ public:
charsetConverter_win(const charset& source, const charset& dest,
const charsetConverterOptions& opts = charsetConverterOptions());
void convert(const string& in, string& out);
void convert(utility::inputStream& in, utility::outputStream& out);
void convert(const string& in, string& out, status* st);
void convert(utility::inputStream& in, utility::outputStream& out, status* st);
shared_ptr <utility::charsetFilteredOutputStream>
getFilteredOutputStream(utility::outputStream& os);

View File

@ -115,6 +115,20 @@ exception* charset_conv_error::clone() const { return new charset_conv_error(*th
const char* charset_conv_error::name() const throw() { return "charset_conv_error"; }
//
// illegal_byte_sequence_for_charset
//
illegal_byte_sequence_for_charset::~illegal_byte_sequence_for_charset() throw() {}
// When 'what' is empty, a generic default message is used instead.
illegal_byte_sequence_for_charset::illegal_byte_sequence_for_charset(const string& what, const exception& other)
: exception(what.empty() ? "Found illegal byte sequence for this charset." : what, other) {}
// Returns a newly allocated copy of this exception.
exception* illegal_byte_sequence_for_charset::clone() const { return new illegal_byte_sequence_for_charset(*this); }
const char* illegal_byte_sequence_for_charset::name() const throw() { return "illegal_byte_sequence_for_charset"; }
//
// no_encoder_available
//

View File

@ -116,6 +116,18 @@ public:
};
/** An illegal byte sequence was found in the input bytes during a
* charset conversion, and the 'silentlyReplaceInvalidSequences' flag
* is set to false in the charsetConverterOptions.
*/
class VMIME_EXPORT illegal_byte_sequence_for_charset : public vmime::exception
{
public:
illegal_byte_sequence_for_charset(const string& what = "", const exception& other = NO_EXCEPTION);
~illegal_byte_sequence_for_charset() throw();
exception* clone() const;
const char* name() const throw();
};
/** No encoder has been found for the specified encoding name.
*/

View File

@ -400,7 +400,8 @@ text* text::decodeAndUnfold(const parsingContext& ctx, const string& in, text* g
out->removeAllWords();
const std::vector <shared_ptr <word> > words = word::parseMultiple(ctx, in, 0, in.length(), NULL);
std::vector <shared_ptr <word> > words = word::parseMultiple(ctx, in, 0, in.length(), NULL);
fixBrokenWords(words);
copy_vector(words, out->m_words);
@ -408,6 +409,48 @@ text* text::decodeAndUnfold(const parsingContext& ctx, const string& in, text* g
}
// static
/** Repairs adjacent words whose boundary falls in the middle of a
  * multi-byte character.
  *
  * Each word that is not valid in its own charset is concatenated with
  * the word that follows it, and the boundary between the two is moved
  * to the first invalid byte of the concatenation (or to its end if the
  * whole concatenation is valid). A following word left empty by this
  * is removed from the list.
  *
  * Bytes are only borrowed from a following word that uses the same
  * charset: bytes of another charset cannot complete a sequence of
  * this one.
  *
  * @param words list of parsed words, modified in place
  */
void text::fixBrokenWords(std::vector <shared_ptr <word> >& words)
{
	if (words.size() < 2)
		return;

	// Fix words which encode a non-integral number of characters.
	// This is not RFC-compliant, but we should be able to recover from it.
	for (size_t i = 0, n = words.size() - 1 ; i < n ; ++i)
	{
		shared_ptr <word> w1 = words[i];
		shared_ptr <word> w2 = words[i + 1];

		// Nothing to do if the current word is valid on its own
		if (w1->getCharset().isValidText(w1->getBuffer(), NULL))
			continue;

		// Do not mix bytes coming from two different charsets
		if (!(w1->getCharset() == w2->getCharset()))
			continue;

		// Try to grab some bytes from the next word, to see whether
		// the current word becomes valid.
		string buffer(w1->getBuffer());
		buffer += w2->getBuffer();

		string::size_type firstInvalidByte = buffer.length();
		w1->getCharset().isValidText(buffer, &firstInvalidByte);

		// Move the boundary between the two words to the first invalid
		// byte (end of buffer if everything is now valid).
		w1->setBuffer(string(buffer.begin(), buffer.begin() + firstInvalidByte));
		w2->setBuffer(string(buffer.begin() + firstInvalidByte, buffer.end()));

		// If the next word is now empty, remove it
		if (w2->getBuffer().empty())
		{
			words.erase(words.begin() + i + 1);
			--n;
		}
	}
}
const std::vector <shared_ptr <component> > text::getChildComponents()
{
std::vector <shared_ptr <component> > list;

View File

@ -251,6 +251,9 @@ public:
protected:
/** Fix consecutive words which encode a non-integral number of
  * characters: when a word is not valid text in its charset, bytes
  * are moved between it and the following word. A following word
  * left with an empty buffer is removed from the list.
  *
  * @param words list of words to fix (modified in place)
  */
static void fixBrokenWords(std::vector <shared_ptr <word> >& words);
// Component parsing & assembling
void parseImpl
(const parsingContext& ctx,

View File

@ -21,6 +21,8 @@
// the GNU General Public License cover the whole combination.
//
#include <algorithm>
#include "tests/testUtils.hpp"
#include "charsetTestSuites.hpp"
@ -39,6 +41,14 @@ VMIME_TEST_SUITE_BEGIN(charsetTest)
VMIME_TEST(testDecodeIDNA)
VMIME_TEST(testUTF7Support)
VMIME_TEST(testReplaceInvalidSequence)
VMIME_TEST(testStopOnInvalidSequence)
VMIME_TEST(testStatus)
VMIME_TEST(testStatusWithInvalidSequence)
VMIME_TEST(testIsValidText)
VMIME_TEST_LIST_END
@ -106,10 +116,15 @@ VMIME_TEST_SUITE_BEGIN(charsetTest)
}
/** Convert a string from one charset to another, using a converter
  * created with the specified options.
  *
  * @param in input buffer, expressed in charset 'csrc'
  * @param csrc source charset
  * @param cdest destination charset
  * @param opts conversion options (default-constructed options if omitted)
  * @param st if not NULL, passed to the converter so that it can report
  * the conversion status
  * @return the converted buffer
  */
static const vmime::string convertHelper
	(const vmime::string& in, const vmime::charset& csrc, const vmime::charset& cdest,
	 const vmime::charsetConverterOptions& opts = vmime::charsetConverterOptions(),
	 vmime::charsetConverter::status* st = NULL)
{
	vmime::shared_ptr <vmime::charsetConverter> conv =
		vmime::charsetConverter::create(csrc, cdest, opts);

	vmime::string out;
	conv->convert(in, out, st);

	return out;
}
@ -145,5 +160,91 @@ VMIME_TEST_SUITE_BEGIN(charsetTest)
VASSERT_EQ("2", "f+APg-o", convertHelper("\x66\xc3\xb8\x6f", "utf-8", "utf-7"));
}
void testReplaceInvalidSequence()
{
vmime::charsetConverterOptions opts;
opts.silentlyReplaceInvalidSequences = true;
opts.invalidSequence = "?";
vmime::string res = convertHelper
("\x61\xf1\x80\x80\xe1\x80\xc2\x62\x80\x63\x80\xbf\x64", "utf-8", "iso-8859-1", opts);
// Result should be in the form "a???b?c??d" or "a??????b?c??d"...
// Remove consecutive question marks for easier matching.
res.erase(std::unique(res.begin(), res.end()), res.end());
VASSERT_EQ(
"Illegal UTF-8 sequence",
"a?b?c?d",
res
);
}
void testStopOnInvalidSequence()
{
vmime::charsetConverterOptions opts;
opts.silentlyReplaceInvalidSequences = false;
VASSERT_THROW(
"Illegal UTF-8 sequence",
convertHelper("\x61\xf1\x80\x80\xe1\x80\xc2\x62\x80\x63\x80\xbf\x64", "utf-8", "iso-8859-1", opts),
vmime::exceptions::illegal_byte_sequence_for_charset
);
}
void testStatus()
{
vmime::charsetConverterOptions opts;
opts.silentlyReplaceInvalidSequences = false;
vmime::charsetConverter::status st;
// 012345 6 7
convertHelper("Gwena\xc3\xabl", "utf-8", "iso-8859-1", opts, &st);
VASSERT_EQ("inputBytesRead", 8, st.inputBytesRead);
VASSERT_EQ("outputBytesWritten", 7, st.outputBytesWritten);
}
void testStatusWithInvalidSequence()
{
vmime::charsetConverterOptions opts;
opts.silentlyReplaceInvalidSequences = false;
vmime::charsetConverter::status st;
try
{
// 01234 5 6789 0 1
convertHelper("Fran\xc3\xa7ois\xf1\x80\x65", "utf-8", "iso-8859-1", opts, &st);
}
catch (vmime::exceptions::illegal_byte_sequence_for_charset& e)
{
}
catch (...)
{
throw;
}
VASSERT_EQ("inputBytesRead", 9, st.inputBytesRead);
VASSERT_EQ("outputBytesWritten", 8, st.outputBytesWritten);
}
void testIsValidText()
{
// Invalid text
const vmime::string invalidText("Fran\xc3\xa7ois\xf1\x80\x65");
vmime::string::size_type firstInvalidByte;
VASSERT_EQ("invalid.isValidText", false, vmime::charset("utf-8").isValidText(invalidText, &firstInvalidByte));
VASSERT_EQ("invalid.firstInvalidByte", 9, firstInvalidByte);
// Valid text
const vmime::string validText("Gwena\xc3\xabl");
VASSERT_EQ("valid.isValidText", true, vmime::charset("utf-8").isValidText(validText, &firstInvalidByte));
VASSERT_EQ("valid.firstInvalidByte", 8, firstInvalidByte);
}
VMIME_TEST_SUITE_END

View File

@ -61,6 +61,7 @@ VMIME_TEST_SUITE_BEGIN(textTest)
VMIME_TEST(testInternationalizedEmail_folding)
VMIME_TEST(testWronglyPaddedB64Words)
VMIME_TEST(testFixBrokenWords)
VMIME_TEST_LIST_END
@ -617,5 +618,50 @@ VMIME_TEST_SUITE_BEGIN(textTest)
outText.getConvertedText(vmime::charset("utf-8")));
}
// Ensure that words which encode a non-integral number of characters
// are correctly decoded.
void testFixBrokenWords()
{
vmime::text outText;
vmime::charsetConverterOptions opts;
opts.silentlyReplaceInvalidSequences = false; // just to be sure that broken words are actually fixed
// Test case 1
vmime::text::decodeAndUnfold
("=?utf-8?Q?Gwena=C3?="
"=?utf-8?Q?=ABl?=", &outText);
VASSERT_EQ("1", "Gwena\xebl",
outText.getConvertedText(vmime::charset("iso-8859-1"), opts));
// Test case 2
vmime::text::decodeAndUnfold
("=?utf-8?B?5Lit6Yu85qmf5qKw6JGj5LqL5pyDMTAz5bm056ysMDXlsYbn?="
"=?utf-8?B?rKwwN+asoeitsOeoiw==?=", &outText);
VASSERT_EQ("2", "\xe4\xb8\xad\xe9\x8b\xbc\xe6\xa9\x9f\xe6\xa2\xb0"
"\xe8\x91\xa3\xe4\xba\x8b\xe6\x9c\x83\x31\x30\x33\xe5\xb9\xb4"
"\xe7\xac\xac\x30\x35\xe5\xb1\x86\xe7\xac\xac\x30\x37\xe6\xac"
"\xa1\xe8\xad\xb0\xe7\xa8\x8b",
outText.getConvertedText(vmime::charset("utf-8")));
// Test case 3 (a character spanning over 3 words: 'を' = E3 82 92)
vmime::text::decodeAndUnfold
("=?utf-8?Q?abc=E3?="
"=?utf-8?Q?=82?="
"=?utf-8?Q?=92xyz?=", &outText);
std::string out; // decode as UTF-16 then rencode to UTF-8 for easier comparison
vmime::charset::convert(
outText.getConvertedText(vmime::charset("utf-16"), opts),
out,
vmime::charset("utf-16"),
vmime::charset("utf-8")
);
VASSERT_EQ("3", "abc\xe3\x82\x92xyz", out);
}
VMIME_TEST_SUITE_END