Issue #103: fix badly encoded words.
This commit is contained in:
parent
f51cb846a9
commit
c5c66f9fdc
@ -735,12 +735,12 @@ INCLUDE(cmake/FindICU.cmake)
|
||||
|
||||
FIND_PACKAGE(ICU QUIET)
|
||||
|
||||
IF(WIN32)
|
||||
SET(VMIME_CHARSETCONV_LIB_DETECTED "win")
|
||||
ELSEIF(ICU_LIBRARIES)
|
||||
IF(ICU_LIBRARIES)
|
||||
SET(VMIME_CHARSETCONV_LIB_DETECTED "icu")
|
||||
ELSEIF(ICONV_FOUND)
|
||||
SET(VMIME_CHARSETCONV_LIB_DETECTED "iconv")
|
||||
ELSEIF(WIN32)
|
||||
SET(VMIME_CHARSETCONV_LIB_DETECTED "win")
|
||||
ENDIF()
|
||||
|
||||
SET(
|
||||
@ -803,6 +803,10 @@ ELSEIF(VMIME_CHARSETCONV_LIB STREQUAL "icu")
|
||||
|
||||
ELSEIF(VMIME_CHARSETCONV_LIB STREQUAL "win")
|
||||
|
||||
MESSAGE(WARNING "*** ICU or iconv library should always be preferred"
|
||||
" over MultiByteToWideChar/WideCharToMultiByte on Windows, as"
|
||||
" error handling is very poor, and there is no streaming support.")
|
||||
|
||||
SET(VMIME_CHARSETCONV_LIB_IS_ICONV "OFF")
|
||||
SET(VMIME_CHARSETCONV_LIB_IS_ICU "OFF")
|
||||
SET(VMIME_CHARSETCONV_LIB_IS_WIN "ON")
|
||||
|
@ -109,6 +109,43 @@ void charset::convert(const string& in, string& out, const charset& source, cons
|
||||
}
|
||||
|
||||
|
||||
bool charset::isValidText
|
||||
(const string& text, string::size_type* firstInvalidByte) const
|
||||
{
|
||||
charsetConverterOptions opts;
|
||||
opts.silentlyReplaceInvalidSequences = false;
|
||||
|
||||
charsetConverter::status st;
|
||||
|
||||
try
|
||||
{
|
||||
std::string out;
|
||||
|
||||
// Try converting to UTF-8
|
||||
shared_ptr <charsetConverter> conv = charsetConverter::create(*this, vmime::charset("utf-8"), opts);
|
||||
conv->convert(text, out, &st);
|
||||
}
|
||||
catch (exceptions::illegal_byte_sequence_for_charset& e)
|
||||
{
|
||||
// An illegal byte sequence was found in the input buffer
|
||||
if (firstInvalidByte)
|
||||
{
|
||||
if (st.inputBytesRead < text.length())
|
||||
*firstInvalidByte = st.inputBytesRead;
|
||||
else
|
||||
*firstInvalidByte = text.length();
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
if (firstInvalidByte)
|
||||
*firstInvalidByte = text.length();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
const charset charset::getLocalCharset()
|
||||
{
|
||||
return (platform::getHandler()->getLocalCharset());
|
||||
|
@ -95,8 +95,12 @@ public:
|
||||
* @param source input charset
|
||||
* @param dest output charset
|
||||
* @param opts conversion options
|
||||
* @throws exceptions::charset_conv_error if an error occured during
|
||||
* the conversion
|
||||
* @throws exceptions::illegal_byte_sequence_for_charset if an illegal
|
||||
* byte sequence was found in the input bytes, and the
|
||||
* 'silentlyReplaceInvalidSequences' flag is set to false in
|
||||
* the charsetConverterOptions
|
||||
* @throws exceptions::charset_conv_error if an unexpected error occurred
|
||||
* during the conversion
|
||||
*/
|
||||
static void convert(const string& in, string& out,
|
||||
const charset& source, const charset& dest,
|
||||
@ -110,13 +114,29 @@ public:
|
||||
* @param source input charset
|
||||
* @param dest output charset
|
||||
* @param opts conversion options
|
||||
* @throws exceptions::charset_conv_error if an error occured during
|
||||
* the conversion
|
||||
* @throws exceptions::illegal_byte_sequence_for_charset if an illegal
|
||||
* byte sequence was found in the input bytes, and the
|
||||
* 'silentlyReplaceInvalidSequences' flag is set to false in
|
||||
* the charsetConverterOptions
|
||||
* @throws exceptions::charset_conv_error if an unexpected error occurred
|
||||
* during the conversion
|
||||
*/
|
||||
static void convert(utility::inputStream& in, utility::outputStream& out,
|
||||
const charset& source, const charset& dest,
|
||||
const charsetConverterOptions& opts = charsetConverterOptions());
|
||||
|
||||
/** Checks whether the specified text is valid in this charset.
|
||||
*
|
||||
* @param text input text
|
||||
* @param firstInvalidByte if the function returns false, will contain
|
||||
* the index of the first invalid byte in the string. Can be NULL if
|
||||
* not used.
|
||||
* @return true if the text is perfectly valid in this charset,
|
||||
* or false otherwise (eg. it contains illegal sequences)
|
||||
*/
|
||||
bool isValidText(const string& text, string::size_type* firstInvalidByte) const;
|
||||
|
||||
|
||||
shared_ptr <component> clone() const;
|
||||
void copyFrom(const component& other);
|
||||
|
||||
|
@ -42,4 +42,11 @@ shared_ptr <charsetConverter> charsetConverter::create
|
||||
}
|
||||
|
||||
|
||||
charsetConverter::status::status()
|
||||
: inputBytesRead(0), outputBytesWritten(0)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
|
||||
} // vmime
|
||||
|
@ -44,8 +44,13 @@ namespace utility
|
||||
/** A filtered output stream which applies a charset conversion
|
||||
* to input bytes.
|
||||
*
|
||||
* May throw a exceptions::charset_conv_error if an error
|
||||
* May throw a exceptions::charset_conv_error if an unexpected error
|
||||
* occurred when initializing convert, or during charset conversion.
|
||||
*
|
||||
* May also throw a exceptions::illegal_byte_sequence_for_charset
|
||||
* if an illegal byte sequence was found in the input bytes, and the
|
||||
* 'silentlyReplaceInvalidSequences' flag is set to false in
|
||||
* the charsetConverterOptions.
|
||||
*/
|
||||
|
||||
class VMIME_EXPORT charsetFilteredOutputStream : public filteredOutputStream
|
||||
@ -63,6 +68,23 @@ class VMIME_EXPORT charsetConverter : public object
|
||||
{
|
||||
public:
|
||||
|
||||
/** Holds information about a conversion.
|
||||
*/
|
||||
struct status
|
||||
{
|
||||
status();
|
||||
|
||||
|
||||
/** Number of bytes read from input buffer and successfully converted.
|
||||
*/
|
||||
size_t inputBytesRead;
|
||||
|
||||
/** Number of bytes written to output buffer.
|
||||
*/
|
||||
size_t outputBytesWritten;
|
||||
};
|
||||
|
||||
|
||||
/** Construct and initialize an iconv charset converter.
|
||||
*
|
||||
* @param source input charset
|
||||
@ -81,29 +103,44 @@ public:
|
||||
*
|
||||
* @param in input buffer
|
||||
* @param out output buffer
|
||||
* @throws exceptions::charset_conv_error if an error occured during
|
||||
* the conversion
|
||||
* @param st will receive some extra infos when conversion is finished
|
||||
* or stopped by an error (can be NULL)
|
||||
* @throws exceptions::illegal_byte_sequence_for_charset if an illegal
|
||||
* byte sequence was found in the input bytes, and the
|
||||
* 'silentlyReplaceInvalidSequences' flag is set to false in
|
||||
* the charsetConverterOptions
|
||||
* @throws exceptions::charset_conv_error if an unexpected error occurred
|
||||
* during the conversion
|
||||
*/
|
||||
virtual void convert(const string& in, string& out) = 0;
|
||||
virtual void convert(const string& in, string& out, status* st = NULL) = 0;
|
||||
|
||||
/** Convert the contents of an input stream in a specified charset
|
||||
* to another charset and write the result to an output stream.
|
||||
*
|
||||
* @param in input stream to read data from
|
||||
* @param out output stream to write the converted data
|
||||
* @throws exceptions::charset_conv_error if an error occured during
|
||||
* the conversion
|
||||
* @param st will receive some extra infos when conversion is finished
|
||||
* or stopped by an error (can be NULL)
|
||||
* @throws exceptions::illegal_byte_sequence_for_charset if an illegal
|
||||
* byte sequence was found in the input bytes, and the
|
||||
* 'silentlyReplaceInvalidSequences' flag is set to false in
|
||||
* the charsetConverterOptions
|
||||
* @throws exceptions::charset_conv_error if an unexpected error occurred
|
||||
* during the conversion
|
||||
*/
|
||||
virtual void convert(utility::inputStream& in, utility::outputStream& out) = 0;
|
||||
virtual void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL) = 0;
|
||||
|
||||
/** Returns a filtered output stream which applies a charset
|
||||
* conversion to input bytes. Please note that it may not be
|
||||
* supported by the converter.
|
||||
*
|
||||
* @param os stream into which filtered data will be written
|
||||
* @param opts conversion options
|
||||
* @return a filtered output stream, or NULL if not supported
|
||||
*/
|
||||
virtual shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream(utility::outputStream& os) = 0;
|
||||
virtual shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream
|
||||
(utility::outputStream& os,
|
||||
const charsetConverterOptions& opts = charsetConverterOptions()) = 0;
|
||||
|
||||
private:
|
||||
|
||||
|
@ -29,7 +29,8 @@ namespace vmime
|
||||
|
||||
|
||||
charsetConverterOptions::charsetConverterOptions()
|
||||
: invalidSequence("?")
|
||||
: silentlyReplaceInvalidSequences(true),
|
||||
invalidSequence("?")
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -42,7 +42,15 @@ public:
|
||||
charsetConverterOptions();
|
||||
|
||||
|
||||
/** Replace invalid sequences with this string. */
|
||||
/** If true, invalid sequences will be silently replaced with
|
||||
* a string when possible (see 'invalidSequence').
|
||||
* Default is true.
|
||||
*/
|
||||
bool silentlyReplaceInvalidSequences;
|
||||
|
||||
/** Replace invalid sequences with this string.
|
||||
* Default is '?'.
|
||||
*/
|
||||
string invalidSequence;
|
||||
};
|
||||
|
||||
|
@ -147,8 +147,12 @@ charsetConverter_iconv::~charsetConverter_iconv()
|
||||
}
|
||||
|
||||
|
||||
void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputStream& out)
|
||||
void charsetConverter_iconv::convert
|
||||
(utility::inputStream& in, utility::outputStream& out, status* st)
|
||||
{
|
||||
if (st)
|
||||
new (st) status();
|
||||
|
||||
if (m_desc == NULL)
|
||||
throw exceptions::charset_conv_error("Cannot initialize converter.");
|
||||
|
||||
@ -175,6 +179,12 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt
|
||||
if (iconv(cd, ICONV_IN_TYPE(&inPtr), ptrLength,
|
||||
ICONV_OUT_TYPE(&outPtr), &outLength) == static_cast <size_t>(-1))
|
||||
{
|
||||
if (st && inPtr)
|
||||
{
|
||||
st->inputBytesRead += (inPtr - inBuffer);
|
||||
st->outputBytesWritten += (outPtr - outBuffer);
|
||||
}
|
||||
|
||||
// Illegal input sequence or input sequence has no equivalent
|
||||
// sequence in the destination charset.
|
||||
if (prevIsInvalid)
|
||||
@ -182,6 +192,9 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt
|
||||
// Write successfully converted bytes
|
||||
out.write(outBuffer, sizeof(outBuffer) - outLength);
|
||||
|
||||
if (!m_options.silentlyReplaceInvalidSequences)
|
||||
throw exceptions::illegal_byte_sequence_for_charset();
|
||||
|
||||
// Output a special character to indicate we don't know how to
|
||||
// convert the sequence at this position
|
||||
outputInvalidChar(out, cd, m_options);
|
||||
@ -208,6 +221,12 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt
|
||||
// Write successfully converted bytes
|
||||
out.write(outBuffer, sizeof(outBuffer) - outLength);
|
||||
|
||||
if (st && inPtr)
|
||||
{
|
||||
st->inputBytesRead += (inPtr - inBuffer);
|
||||
st->outputBytesWritten += (outPtr - outBuffer);
|
||||
}
|
||||
|
||||
inPos = 0;
|
||||
prevIsInvalid = false;
|
||||
}
|
||||
@ -222,29 +241,27 @@ void charsetConverter_iconv::convert(utility::inputStream& in, utility::outputSt
|
||||
}
|
||||
|
||||
|
||||
void charsetConverter_iconv::convert(const string& in, string& out)
|
||||
void charsetConverter_iconv::convert(const string& in, string& out, status* st)
|
||||
{
|
||||
if (m_source == m_dest)
|
||||
{
|
||||
// No conversion needed
|
||||
out = in;
|
||||
return;
|
||||
}
|
||||
if (st)
|
||||
new (st) status();
|
||||
|
||||
out.clear();
|
||||
|
||||
utility::inputStreamStringAdapter is(in);
|
||||
utility::outputStreamStringAdapter os(out);
|
||||
|
||||
convert(is, os);
|
||||
convert(is, os, st);
|
||||
|
||||
os.flush();
|
||||
}
|
||||
|
||||
|
||||
shared_ptr <utility::charsetFilteredOutputStream> charsetConverter_iconv::getFilteredOutputStream(utility::outputStream& os)
|
||||
shared_ptr <utility::charsetFilteredOutputStream>
|
||||
charsetConverter_iconv::getFilteredOutputStream
|
||||
(utility::outputStream& os, const charsetConverterOptions& opts)
|
||||
{
|
||||
return make_shared <utility::charsetFilteredOutputStream_iconv>(m_source, m_dest, &os);
|
||||
return make_shared <utility::charsetFilteredOutputStream_iconv>(m_source, m_dest, &os, opts);
|
||||
}
|
||||
|
||||
|
||||
@ -255,9 +272,10 @@ namespace utility {
|
||||
|
||||
|
||||
charsetFilteredOutputStream_iconv::charsetFilteredOutputStream_iconv
|
||||
(const charset& source, const charset& dest, outputStream* os)
|
||||
(const charset& source, const charset& dest, outputStream* os,
|
||||
const charsetConverterOptions& opts)
|
||||
: m_desc(NULL), m_sourceCharset(source), m_destCharset(dest),
|
||||
m_stream(*os), m_unconvCount(0)
|
||||
m_stream(*os), m_unconvCount(0), m_options(opts)
|
||||
{
|
||||
// Get an iconv descriptor
|
||||
const iconv_t cd = iconv_open(dest.getName().c_str(), source.getName().c_str());
|
||||
@ -314,6 +332,9 @@ void charsetFilteredOutputStream_iconv::writeImpl
|
||||
// character and skip one byte in the invalid sequence.
|
||||
if (m_unconvCount >= sizeof(m_unconvBuffer))
|
||||
{
|
||||
if (!m_options.silentlyReplaceInvalidSequences)
|
||||
throw exceptions::illegal_byte_sequence_for_charset();
|
||||
|
||||
outputInvalidChar(m_stream, cd);
|
||||
|
||||
std::copy(m_unconvBuffer + 1,
|
||||
@ -439,6 +460,9 @@ void charsetFilteredOutputStream_iconv::flush()
|
||||
// Skip a "blocking" character
|
||||
if (inputConverted == 0)
|
||||
{
|
||||
if (!m_options.silentlyReplaceInvalidSequences)
|
||||
throw exceptions::illegal_byte_sequence_for_charset();
|
||||
|
||||
outputInvalidChar(m_stream, cd);
|
||||
|
||||
offset++;
|
||||
|
@ -56,10 +56,12 @@ public:
|
||||
|
||||
~charsetConverter_iconv();
|
||||
|
||||
void convert(const string& in, string& out);
|
||||
void convert(utility::inputStream& in, utility::outputStream& out);
|
||||
void convert(const string& in, string& out, status* st = NULL);
|
||||
void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL);
|
||||
|
||||
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream(utility::outputStream& os);
|
||||
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream
|
||||
(utility::outputStream& os,
|
||||
const charsetConverterOptions& opts = charsetConverterOptions());
|
||||
|
||||
private:
|
||||
|
||||
@ -84,9 +86,11 @@ public:
|
||||
* @param source input charset
|
||||
* @param dest output charset
|
||||
* @param os stream into which write filtered data
|
||||
* @param opts conversion options
|
||||
*/
|
||||
charsetFilteredOutputStream_iconv
|
||||
(const charset& source, const charset& dest, outputStream* os);
|
||||
(const charset& source, const charset& dest, outputStream* os,
|
||||
const charsetConverterOptions& opts = charsetConverterOptions());
|
||||
|
||||
~charsetFilteredOutputStream_iconv();
|
||||
|
||||
@ -121,6 +125,8 @@ private:
|
||||
// Buffer used for conversion. Avoids declaring it in write().
|
||||
// Should be at least MAX_CHARACTER_WIDTH * MAX_CHARACTER_WIDTH.
|
||||
byte_t m_outputBuffer[32768];
|
||||
|
||||
charsetConverterOptions m_options;
|
||||
};
|
||||
|
||||
|
||||
|
@ -91,10 +91,17 @@ charsetConverter_icu::~charsetConverter_icu()
|
||||
}
|
||||
|
||||
|
||||
void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStream& out)
|
||||
void charsetConverter_icu::convert
|
||||
(utility::inputStream& in, utility::outputStream& out, status* st)
|
||||
{
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
|
||||
ucnv_reset(m_from);
|
||||
ucnv_reset(m_to);
|
||||
|
||||
if (st)
|
||||
new (st) status();
|
||||
|
||||
// From buffers
|
||||
byte_t cpInBuffer[16]; // stream data put here
|
||||
const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar);
|
||||
@ -105,12 +112,31 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre
|
||||
const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize;
|
||||
std::vector <char> cpOutBuffer(cpOutBufferSz);
|
||||
|
||||
// Set replacement chars for when converting from Unicode to codepage
|
||||
icu::UnicodeString substString(m_options.invalidSequence.c_str());
|
||||
ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
|
||||
// Tell ICU what to do when encountering an illegal byte sequence
|
||||
if (m_options.silentlyReplaceInvalidSequences)
|
||||
{
|
||||
// Set replacement chars for when converting from Unicode to codepage
|
||||
icu::UnicodeString substString(m_options.invalidSequence.c_str());
|
||||
ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
|
||||
|
||||
if (U_FAILURE(err))
|
||||
throw exceptions::charset_conv_error("[ICU] Error setting replacement char.");
|
||||
if (U_FAILURE(err))
|
||||
throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
|
||||
}
|
||||
else
|
||||
{
|
||||
// Tell ICU to stop (and return an error) on illegal byte sequences
|
||||
ucnv_setToUCallBack
|
||||
(m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
|
||||
|
||||
if (U_FAILURE(err))
|
||||
throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");
|
||||
|
||||
ucnv_setFromUCallBack
|
||||
(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
|
||||
|
||||
if (U_FAILURE(err))
|
||||
throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
|
||||
}
|
||||
|
||||
// Input data available
|
||||
while (!in.eof())
|
||||
@ -137,8 +163,22 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre
|
||||
ucnv_toUnicode(m_from, &target, targetLimit,
|
||||
&source, sourceLimit, NULL, flush, &toErr);
|
||||
|
||||
if (st)
|
||||
st->inputBytesRead += (source - reinterpret_cast <const char*>(&cpInBuffer[0]));
|
||||
|
||||
if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr))
|
||||
throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName());
|
||||
{
|
||||
if (toErr == U_INVALID_CHAR_FOUND ||
|
||||
toErr == U_TRUNCATED_CHAR_FOUND ||
|
||||
toErr == U_ILLEGAL_CHAR_FOUND)
|
||||
{
|
||||
// Error will be thrown later (*)
|
||||
}
|
||||
else
|
||||
{
|
||||
throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName());
|
||||
}
|
||||
}
|
||||
|
||||
// The Unicode source is the buffer just written and the limit
|
||||
// is where the previous conversion stopped (target is moved in the conversion)
|
||||
@ -158,8 +198,40 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre
|
||||
ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
|
||||
&uSource, uSourceLimit, NULL, flush, &fromErr);
|
||||
|
||||
if (st)
|
||||
{
|
||||
// Decrement input bytes count by the number of input bytes in error
|
||||
char errBytes[16];
|
||||
int8_t errBytesLen = sizeof(errBytes);
|
||||
UErrorCode errBytesErr = U_ZERO_ERROR;
|
||||
|
||||
ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr);
|
||||
|
||||
st->inputBytesRead -= errBytesLen;
|
||||
st->outputBytesWritten += cpTarget - &cpOutBuffer[0];
|
||||
}
|
||||
|
||||
// (*) If an error occurred while converting from input charset, throw it now
|
||||
if (toErr == U_INVALID_CHAR_FOUND ||
|
||||
toErr == U_TRUNCATED_CHAR_FOUND ||
|
||||
toErr == U_ILLEGAL_CHAR_FOUND)
|
||||
{
|
||||
throw exceptions::illegal_byte_sequence_for_charset();
|
||||
}
|
||||
|
||||
if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
|
||||
throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName());
|
||||
{
|
||||
if (fromErr == U_INVALID_CHAR_FOUND ||
|
||||
fromErr == U_TRUNCATED_CHAR_FOUND ||
|
||||
fromErr == U_ILLEGAL_CHAR_FOUND)
|
||||
{
|
||||
throw exceptions::illegal_byte_sequence_for_charset();
|
||||
}
|
||||
else
|
||||
{
|
||||
throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName());
|
||||
}
|
||||
}
|
||||
|
||||
// Write to destination stream
|
||||
out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0]));
|
||||
@ -171,29 +243,27 @@ void charsetConverter_icu::convert(utility::inputStream& in, utility::outputStre
|
||||
}
|
||||
|
||||
|
||||
void charsetConverter_icu::convert(const string& in, string& out)
|
||||
void charsetConverter_icu::convert(const string& in, string& out, status* st)
|
||||
{
|
||||
if (m_source == m_dest)
|
||||
{
|
||||
// No conversion needed
|
||||
out = in;
|
||||
return;
|
||||
}
|
||||
if (st)
|
||||
new (st) status();
|
||||
|
||||
out.clear();
|
||||
|
||||
utility::inputStreamStringAdapter is(in);
|
||||
utility::outputStreamStringAdapter os(out);
|
||||
|
||||
convert(is, os);
|
||||
convert(is, os, st);
|
||||
|
||||
os.flush();
|
||||
}
|
||||
|
||||
|
||||
shared_ptr <utility::charsetFilteredOutputStream> charsetConverter_icu::getFilteredOutputStream(utility::outputStream& os)
|
||||
shared_ptr <utility::charsetFilteredOutputStream>
|
||||
charsetConverter_icu::getFilteredOutputStream
|
||||
(utility::outputStream& os, const charsetConverterOptions& opts)
|
||||
{
|
||||
return make_shared <utility::charsetFilteredOutputStream_icu>(m_source, m_dest, &os);
|
||||
return make_shared <utility::charsetFilteredOutputStream_icu>(m_source, m_dest, &os, opts);
|
||||
}
|
||||
|
||||
|
||||
@ -204,8 +274,10 @@ namespace utility {
|
||||
|
||||
|
||||
charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu
|
||||
(const charset& source, const charset& dest, outputStream* os)
|
||||
: m_from(NULL), m_to(NULL), m_sourceCharset(source), m_destCharset(dest), m_stream(*os)
|
||||
(const charset& source, const charset& dest, outputStream* os,
|
||||
const charsetConverterOptions& opts)
|
||||
: m_from(NULL), m_to(NULL), m_sourceCharset(source),
|
||||
m_destCharset(dest), m_stream(*os), m_options(opts)
|
||||
{
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
m_from = ucnv_open(source.getName().c_str(), &err);
|
||||
@ -224,12 +296,31 @@ charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu
|
||||
("Cannot initialize ICU converter for destination charset '" + dest.getName() + "' (error code: " + u_errorName(err) + ".");
|
||||
}
|
||||
|
||||
// Set replacement chars for when converting from Unicode to codepage
|
||||
icu::UnicodeString substString(vmime::charsetConverterOptions().invalidSequence.c_str());
|
||||
ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
|
||||
// Tell ICU what to do when encountering an illegal byte sequence
|
||||
if (m_options.silentlyReplaceInvalidSequences)
|
||||
{
|
||||
// Set replacement chars for when converting from Unicode to codepage
|
||||
icu::UnicodeString substString(m_options.invalidSequence.c_str());
|
||||
ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
|
||||
|
||||
if (U_FAILURE(err))
|
||||
throw exceptions::charset_conv_error("[ICU] Error setting replacement char.");
|
||||
if (U_FAILURE(err))
|
||||
throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
|
||||
}
|
||||
else
{
	// Tell ICU to stop (and return an error) on illegal byte sequences.
	//
	// The "to Unicode" callback has to be installed on the source
	// converter (m_from, opened from the source charset above), as
	// charsetConverter_icu::convert() does. Installing it on m_to
	// would leave the default behaviour in place for input bytes.
	ucnv_setToUCallBack
		(m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

	if (U_FAILURE(err))
		throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");

	// The "from Unicode" callback applies to the destination converter
	ucnv_setFromUCallBack
		(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

	if (U_FAILURE(err))
		throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
}
|
||||
}
|
||||
|
||||
|
||||
@ -275,8 +366,17 @@ void charsetFilteredOutputStream_icu::writeImpl
|
||||
|
||||
if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR)
|
||||
{
|
||||
throw exceptions::charset_conv_error
|
||||
("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'.");
|
||||
if (toErr == U_INVALID_CHAR_FOUND ||
|
||||
toErr == U_TRUNCATED_CHAR_FOUND ||
|
||||
toErr == U_ILLEGAL_CHAR_FOUND)
|
||||
{
|
||||
throw exceptions::illegal_byte_sequence_for_charset();
|
||||
}
|
||||
else
|
||||
{
|
||||
throw exceptions::charset_conv_error
|
||||
("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'.");
|
||||
}
|
||||
}
|
||||
|
||||
const size_t uniLength = uniTarget - &uniBuffer[0];
|
||||
@ -303,8 +403,17 @@ void charsetFilteredOutputStream_icu::writeImpl
|
||||
|
||||
if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
|
||||
{
|
||||
throw exceptions::charset_conv_error
|
||||
("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'.");
|
||||
if (fromErr == U_INVALID_CHAR_FOUND ||
|
||||
fromErr == U_TRUNCATED_CHAR_FOUND ||
|
||||
fromErr == U_ILLEGAL_CHAR_FOUND)
|
||||
{
|
||||
throw exceptions::illegal_byte_sequence_for_charset();
|
||||
}
|
||||
else
|
||||
{
|
||||
throw exceptions::charset_conv_error
|
||||
("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'.");
|
||||
}
|
||||
}
|
||||
|
||||
const size_t cpLength = cpTarget - &cpBuffer[0];
|
||||
|
@ -59,10 +59,12 @@ public:
|
||||
|
||||
~charsetConverter_icu();
|
||||
|
||||
void convert(const string& in, string& out);
|
||||
void convert(utility::inputStream& in, utility::outputStream& out);
|
||||
void convert(const string& in, string& out, status* st = NULL);
|
||||
void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL);
|
||||
|
||||
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream(utility::outputStream& os);
|
||||
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream
|
||||
(utility::outputStream& os,
|
||||
const charsetConverterOptions& opts = charsetConverterOptions());
|
||||
|
||||
private:
|
||||
|
||||
@ -88,9 +90,11 @@ public:
|
||||
* @param source input charset
|
||||
* @param dest output charset
|
||||
* @param os stream into which write filtered data
|
||||
* @param opts conversion options
|
||||
*/
|
||||
charsetFilteredOutputStream_icu
|
||||
(const charset& source, const charset& dest, outputStream* os);
|
||||
(const charset& source, const charset& dest, outputStream* os,
|
||||
const charsetConverterOptions& opts = charsetConverterOptions());
|
||||
|
||||
~charsetFilteredOutputStream_icu();
|
||||
|
||||
@ -112,6 +116,8 @@ private:
|
||||
const charset m_destCharset;
|
||||
|
||||
outputStream& m_stream;
|
||||
|
||||
charsetConverterOptions m_options;
|
||||
};
|
||||
|
||||
|
||||
|
@ -57,8 +57,11 @@ charsetConverter_idna::~charsetConverter_idna()
|
||||
}
|
||||
|
||||
|
||||
void charsetConverter_idna::convert(utility::inputStream& in, utility::outputStream& out)
|
||||
void charsetConverter_idna::convert(utility::inputStream& in, utility::outputStream& out, status* st)
|
||||
{
|
||||
if (st)
|
||||
new (st) status();
|
||||
|
||||
// IDNA should be used for short strings, so it does not matter if we
|
||||
// do not work directly on the stream
|
||||
string inStr;
|
||||
@ -66,20 +69,16 @@ void charsetConverter_idna::convert(utility::inputStream& in, utility::outputStr
|
||||
vmime::utility::bufferedStreamCopy(in, os);
|
||||
|
||||
string outStr;
|
||||
convert(inStr, outStr);
|
||||
convert(inStr, outStr, st);
|
||||
|
||||
out << outStr;
|
||||
}
|
||||
|
||||
|
||||
void charsetConverter_idna::convert(const string& in, string& out)
|
||||
void charsetConverter_idna::convert(const string& in, string& out, status* st)
|
||||
{
|
||||
if (m_source == m_dest)
|
||||
{
|
||||
// No conversion needed
|
||||
out = in;
|
||||
return;
|
||||
}
|
||||
if (st)
|
||||
new (st) status();
|
||||
|
||||
out.clear();
|
||||
|
||||
@ -87,6 +86,12 @@ void charsetConverter_idna::convert(const string& in, string& out)
|
||||
{
|
||||
if (utility::stringUtils::is7bit(in))
|
||||
{
|
||||
if (st)
|
||||
{
|
||||
st->inputBytesRead = in.length();
|
||||
st->outputBytesWritten = in.length();
|
||||
}
|
||||
|
||||
// No need to encode as Punycode
|
||||
out = in;
|
||||
return;
|
||||
@ -107,6 +112,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
|
||||
unichars.push_back(uc);
|
||||
}
|
||||
|
||||
if (st)
|
||||
st->inputBytesRead = in.length();
|
||||
|
||||
std::vector <char> output(inUTF8.length() * 2);
|
||||
punycode_uint outputLen = output.size();
|
||||
|
||||
@ -116,6 +124,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
|
||||
if (status == punycode_success)
|
||||
{
|
||||
out = string("xn--") + string(output.begin(), output.begin() + outputLen);
|
||||
|
||||
if (st)
|
||||
st->outputBytesWritten = out.length();
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -126,6 +137,12 @@ void charsetConverter_idna::convert(const string& in, string& out)
|
||||
{
|
||||
if (in.length() < 5 || in.substr(0, 4) != "xn--")
|
||||
{
|
||||
if (st)
|
||||
{
|
||||
st->inputBytesRead = in.length();
|
||||
st->outputBytesWritten = in.length();
|
||||
}
|
||||
|
||||
// Not an IDNA string
|
||||
out = in;
|
||||
return;
|
||||
@ -137,6 +154,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
|
||||
const punycode_status status = punycode_decode
|
||||
(in.length() - 4, &in[4], &outputLen, &output[0], /* case_flags */ NULL);
|
||||
|
||||
if (st)
|
||||
st->inputBytesRead = in.length();
|
||||
|
||||
if (status == punycode_success)
|
||||
{
|
||||
std::vector <char> outUTF8Bytes(outputLen * 4);
|
||||
@ -150,6 +170,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
|
||||
|
||||
string outUTF8(&outUTF8Bytes[0], p);
|
||||
charset::convert(outUTF8, out, vmime::charsets::UTF_8, m_dest);
|
||||
|
||||
if (st)
|
||||
st->outputBytesWritten = out.length();
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -159,7 +182,9 @@ void charsetConverter_idna::convert(const string& in, string& out)
|
||||
}
|
||||
|
||||
|
||||
shared_ptr <utility::charsetFilteredOutputStream> charsetConverter_idna::getFilteredOutputStream(utility::outputStream& /* os */)
|
||||
shared_ptr <utility::charsetFilteredOutputStream>
|
||||
charsetConverter_idna::getFilteredOutputStream
|
||||
(utility::outputStream& /* os */, const charsetConverterOptions& /* opts */)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
@ -50,10 +50,12 @@ public:
|
||||
|
||||
~charsetConverter_idna();
|
||||
|
||||
void convert(const string& in, string& out);
|
||||
void convert(utility::inputStream& in, utility::outputStream& out);
|
||||
void convert(const string& in, string& out, status* st = NULL);
|
||||
void convert(utility::inputStream& in, utility::outputStream& out, status* st = NULL);
|
||||
|
||||
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream(utility::outputStream& os);
|
||||
shared_ptr <utility::charsetFilteredOutputStream> getFilteredOutputStream
|
||||
(utility::outputStream& os,
|
||||
const charsetConverterOptions& opts = charsetConverterOptions());
|
||||
|
||||
private:
|
||||
|
||||
|
@ -69,8 +69,12 @@ charsetConverter_win::charsetConverter_win
|
||||
}
|
||||
|
||||
|
||||
void charsetConverter_win::convert(utility::inputStream& in, utility::outputStream& out)
|
||||
void charsetConverter_win::convert
|
||||
(utility::inputStream& in, utility::outputStream& out, status* st)
|
||||
{
|
||||
if (st)
|
||||
new (st) status();
|
||||
|
||||
byte_t buffer[32768];
|
||||
string inStr, outStr;
|
||||
|
||||
@ -80,20 +84,16 @@ void charsetConverter_win::convert(utility::inputStream& in, utility::outputStre
|
||||
utility::stringUtils::appendBytesToString(inStr, buffer, len);
|
||||
}
|
||||
|
||||
convert(inStr, outStr);
|
||||
convert(inStr, outStr, st);
|
||||
|
||||
out.write(outStr.data(), outStr.length());
|
||||
}
|
||||
|
||||
|
||||
void charsetConverter_win::convert(const string& in, string& out)
|
||||
void charsetConverter_win::convert(const string& in, string& out, status* st)
|
||||
{
|
||||
if (m_source == m_dest)
|
||||
{
|
||||
// No conversion needed
|
||||
out = in;
|
||||
return;
|
||||
}
|
||||
if (st)
|
||||
new (st) status();
|
||||
|
||||
const int sourceCodePage = getCodePage(m_source.getName().c_str());
|
||||
const int destCodePage = getCodePage(m_dest.getName().c_str());
|
||||
@ -113,10 +113,27 @@ void charsetConverter_win::convert(const string& in, string& out)
|
||||
const size_t bufferSize = in.length() * 2; // in wide characters
|
||||
unicodeBuffer.resize(bufferSize);
|
||||
|
||||
DWORD flags = 0;
|
||||
|
||||
if (!m_options.silentlyReplaceInvalidSequences)
|
||||
flags |= MB_ERR_INVALID_CHARS;
|
||||
|
||||
unicodePtr = reinterpret_cast <const WCHAR*>(&unicodeBuffer[0]);
|
||||
unicodeLen = MultiByteToWideChar
|
||||
(sourceCodePage, 0, in.c_str(), static_cast <int>(in.length()),
|
||||
reinterpret_cast <WCHAR*>(&unicodeBuffer[0]), static_cast <int>(bufferSize));
|
||||
|
||||
if (unicodeLen == 0)
|
||||
{
|
||||
if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
|
||||
{
|
||||
throw exceptions::illegal_byte_sequence_in_charset();
|
||||
}
|
||||
else
|
||||
{
|
||||
throw exceptions::charset_conv_error("MultiByteToWideChar() failed when converting to Unicode from " + m_source.getName());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Convert from Unicode to destination charset
|
||||
@ -135,6 +152,18 @@ void charsetConverter_win::convert(const string& in, string& out)
|
||||
(destCodePage, 0, unicodePtr, static_cast <int>(unicodeLen),
|
||||
&buffer[0], static_cast <int>(bufferSize), 0, NULL);
|
||||
|
||||
// A zero length signals that WideCharToMultiByte() failed
if (len == 0)
{
	if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
	{
		// Invalid Unicode was found in the intermediate buffer
		throw exceptions::illegal_byte_sequence_for_charset();
	}
	else
	{
		// This step converts *to* the destination charset, so that is
		// the name reported in the message
		throw exceptions::charset_conv_error("WideCharToMultiByte() failed when converting from Unicode to " + m_dest.getName());
	}
}
|
||||
|
||||
out.assign(&buffer[0], len);
|
||||
}
|
||||
}
|
||||
@ -158,7 +187,8 @@ int charsetConverter_win::getCodePage(const char* name)
|
||||
|
||||
|
||||
shared_ptr <utility::charsetFilteredOutputStream>
|
||||
charsetConverter_win::getFilteredOutputStream(utility::outputStream& /* os */)
|
||||
charsetConverter_win::getFilteredOutputStream
|
||||
(utility::outputStream& /* os */, const charsetConverterOptions& /* opts */)
|
||||
{
|
||||
// TODO: implement me!
|
||||
return null;
|
||||
|
@ -38,7 +38,16 @@ namespace vmime
|
||||
{
|
||||
|
||||
|
||||
/** A generic charset converter which uses Windows MultiByteToWideChar.
|
||||
/** A generic charset converter which uses Windows MultiByteToWideChar
|
||||
* and WideCharToMultiByte API functions.
|
||||
*
|
||||
* ICU or iconv library should always be preferred over this one, even
|
||||
* on Windows platform, as MultiByteToWideChar() and WideCharToMultiByte()
|
||||
* functions cannot be used easily with streams (no context). Moreover,
|
||||
* error handling is very poor, in particular when an invalid sequence
|
||||
* is found...
|
||||
*
|
||||
* Also, "status" is not supported by this converter for the same reason.
|
||||
*/
|
||||
|
||||
class charsetConverter_win : public charsetConverter
|
||||
@ -54,8 +63,8 @@ public:
|
||||
charsetConverter_win(const charset& source, const charset& dest,
|
||||
const charsetConverterOptions& opts = charsetConverterOptions());
|
||||
|
||||
void convert(const string& in, string& out);
|
||||
void convert(utility::inputStream& in, utility::outputStream& out);
|
||||
void convert(const string& in, string& out, status* st);
|
||||
void convert(utility::inputStream& in, utility::outputStream& out, status* st);
|
||||
|
||||
// Declared with the same two-parameter form as the out-of-line
// definition; 'opts' is defaulted so that existing single-argument
// calls keep compiling.
shared_ptr <utility::charsetFilteredOutputStream>
	getFilteredOutputStream
		(utility::outputStream& os,
		 const charsetConverterOptions& opts = charsetConverterOptions());
|
||||
|
@ -115,6 +115,20 @@ exception* charset_conv_error::clone() const { return new charset_conv_error(*th
|
||||
const char* charset_conv_error::name() const throw() { return "charset_conv_error"; }
|
||||
|
||||
|
||||
|
||||
//
|
||||
// illegal_byte_sequence_for_charset
|
||||
//
|
||||
|
||||
illegal_byte_sequence_for_charset::~illegal_byte_sequence_for_charset() throw() {}
|
||||
illegal_byte_sequence_for_charset::illegal_byte_sequence_for_charset(const string& what, const exception& other)
|
||||
: exception(what.empty() ? "Found illegal byte sequence for this charset." : what, other) {}
|
||||
|
||||
exception* illegal_byte_sequence_for_charset::clone() const { return new illegal_byte_sequence_for_charset(*this); }
|
||||
const char* illegal_byte_sequence_for_charset::name() const throw() { return "illegal_byte_sequence_for_charset"; }
|
||||
|
||||
|
||||
|
||||
//
|
||||
// no_encoder_available
|
||||
//
|
||||
|
@ -116,6 +116,18 @@ public:
|
||||
};
|
||||
|
||||
|
||||
/** An illegal byte sequence was found for a charset.
  */
class VMIME_EXPORT illegal_byte_sequence_for_charset : public vmime::exception
{
public:

	illegal_byte_sequence_for_charset
		(const string& what = "",
		 const exception& other = NO_EXCEPTION);

	~illegal_byte_sequence_for_charset() throw();

	// Copy and identification of the exception object
	exception* clone() const;
	const char* name() const throw();
};
|
||||
|
||||
|
||||
/** No encoder has been found for the specified encoding name.
|
||||
*/
|
||||
|
||||
|
@ -400,7 +400,8 @@ text* text::decodeAndUnfold(const parsingContext& ctx, const string& in, text* g
|
||||
|
||||
out->removeAllWords();
|
||||
|
||||
const std::vector <shared_ptr <word> > words = word::parseMultiple(ctx, in, 0, in.length(), NULL);
|
||||
std::vector <shared_ptr <word> > words = word::parseMultiple(ctx, in, 0, in.length(), NULL);
|
||||
fixBrokenWords(words);
|
||||
|
||||
copy_vector(words, out->m_words);
|
||||
|
||||
@ -408,6 +409,48 @@ text* text::decodeAndUnfold(const parsingContext& ctx, const string& in, text* g
|
||||
}
|
||||
|
||||
|
||||
// static
|
||||
// Repairs adjacent words where a multi-byte character was split across
// the word boundary: the bytes of an invalid word and of the word that
// follows it are pooled, the leading part that validates in the word's
// charset stays in the first word, and the remainder goes to the second
// one. A second word left empty by this is removed from 'words'.
void text::fixBrokenWords(std::vector <shared_ptr <word> >& words)
{
	if (words.size() < 2)
		return;

	// Fix words which encode a non-integral number of characters.
	// This is not RFC-compliant, but we should be able to recover from it.
	for (size_t i = 0, n = words.size() - 1 ; i < n ; ++i)
	{
		shared_ptr <word> w1 = words[i];
		shared_ptr <word> w2 = words[i + 1];

		// Nothing to do if the word is already valid on its own
		if (w1->getCharset().isValidText(w1->getBuffer(), NULL))
			continue;

		// Bytes may only be borrowed from a word in the same charset:
		// moving bytes between words of different charsets would have
		// them decoded with the wrong charset afterwards.
		if (!(w1->getCharset() == w2->getCharset()))
			continue;

		// Try to grab some bytes from the next word
		string buffer(w1->getBuffer());
		buffer += w2->getBuffer();

		// Default to "everything is valid" so that the split point is
		// well-defined even if isValidText() leaves it untouched
		string::size_type firstInvalidByte = buffer.length();
		w1->getCharset().isValidText(buffer, &firstInvalidByte);

		// Split at the first invalid byte, whether or not the whole
		// buffer validated: the valid prefix stays in the current word,
		// the remainder is handed to the next word (and examined again
		// on the next iteration).
		w1->setBuffer(string(buffer.begin(), buffer.begin() + firstInvalidByte));
		w2->setBuffer(string(buffer.begin() + firstInvalidByte, buffer.end()));

		// If the next word is now empty, remove it
		if (w2->getBuffer().empty())
		{
			words.erase(words.begin() + i + 1);
			--n;
		}
	}
}
|
||||
|
||||
|
||||
const std::vector <shared_ptr <component> > text::getChildComponents()
|
||||
{
|
||||
std::vector <shared_ptr <component> > list;
|
||||
|
@ -251,6 +251,9 @@ public:
|
||||
|
||||
protected:
|
||||
|
||||
static void fixBrokenWords(std::vector <shared_ptr <word> >& words);
|
||||
|
||||
|
||||
// Component parsing & assembling
|
||||
void parseImpl
|
||||
(const parsingContext& ctx,
|
||||
|
@ -21,6 +21,8 @@
|
||||
// the GNU General Public License cover the whole combination.
|
||||
//
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "tests/testUtils.hpp"
|
||||
|
||||
#include "charsetTestSuites.hpp"
|
||||
@ -39,6 +41,14 @@ VMIME_TEST_SUITE_BEGIN(charsetTest)
|
||||
VMIME_TEST(testDecodeIDNA)
|
||||
|
||||
VMIME_TEST(testUTF7Support)
|
||||
|
||||
VMIME_TEST(testReplaceInvalidSequence)
|
||||
VMIME_TEST(testStopOnInvalidSequence)
|
||||
|
||||
VMIME_TEST(testStatus)
|
||||
VMIME_TEST(testStatusWithInvalidSequence)
|
||||
|
||||
VMIME_TEST(testIsValidText)
|
||||
VMIME_TEST_LIST_END
|
||||
|
||||
|
||||
@ -106,10 +116,15 @@ VMIME_TEST_SUITE_BEGIN(charsetTest)
|
||||
}
|
||||
|
||||
// Converts 'in' from charset 'csrc' to charset 'cdest' through a
// converter created with 'opts', and returns the converted text.
// 'st' is forwarded as-is to the converter's convert() call (it may
// be NULL).
static const vmime::string convertHelper
	(const vmime::string& in, const vmime::charset& csrc, const vmime::charset& cdest,
	 const vmime::charsetConverterOptions& opts = vmime::charsetConverterOptions(),
	 vmime::charsetConverter::status* st = NULL)
{
	vmime::shared_ptr <vmime::charsetConverter> conv =
		vmime::charsetConverter::create(csrc, cdest, opts);

	vmime::string out;
	conv->convert(in, out, st);

	return out;
}
|
||||
@ -145,5 +160,91 @@ VMIME_TEST_SUITE_BEGIN(charsetTest)
|
||||
VASSERT_EQ("2", "f+APg-o", convertHelper("\x66\xc3\xb8\x6f", "utf-8", "utf-7"));
|
||||
}
|
||||
|
||||
void testReplaceInvalidSequence()
|
||||
{
|
||||
vmime::charsetConverterOptions opts;
|
||||
opts.silentlyReplaceInvalidSequences = true;
|
||||
opts.invalidSequence = "?";
|
||||
|
||||
vmime::string res = convertHelper
|
||||
("\x61\xf1\x80\x80\xe1\x80\xc2\x62\x80\x63\x80\xbf\x64", "utf-8", "iso-8859-1", opts);
|
||||
|
||||
// Result should be in the form "a???b?c??d" or "a??????b?c??d"...
|
||||
// Remove consecutive question marks for easier matching.
|
||||
res.erase(std::unique(res.begin(), res.end()), res.end());
|
||||
|
||||
VASSERT_EQ(
|
||||
"Illegal UTF-8 sequence",
|
||||
"a?b?c?d",
|
||||
res
|
||||
);
|
||||
}
|
||||
|
||||
void testStopOnInvalidSequence()
|
||||
{
|
||||
vmime::charsetConverterOptions opts;
|
||||
opts.silentlyReplaceInvalidSequences = false;
|
||||
|
||||
VASSERT_THROW(
|
||||
"Illegal UTF-8 sequence",
|
||||
convertHelper("\x61\xf1\x80\x80\xe1\x80\xc2\x62\x80\x63\x80\xbf\x64", "utf-8", "iso-8859-1", opts),
|
||||
vmime::exceptions::illegal_byte_sequence_for_charset
|
||||
);
|
||||
}
|
||||
|
||||
void testStatus()
|
||||
{
|
||||
vmime::charsetConverterOptions opts;
|
||||
opts.silentlyReplaceInvalidSequences = false;
|
||||
|
||||
vmime::charsetConverter::status st;
|
||||
|
||||
// 012345 6 7
|
||||
convertHelper("Gwena\xc3\xabl", "utf-8", "iso-8859-1", opts, &st);
|
||||
|
||||
VASSERT_EQ("inputBytesRead", 8, st.inputBytesRead);
|
||||
VASSERT_EQ("outputBytesWritten", 7, st.outputBytesWritten);
|
||||
}
|
||||
|
||||
void testStatusWithInvalidSequence()
|
||||
{
|
||||
vmime::charsetConverterOptions opts;
|
||||
opts.silentlyReplaceInvalidSequences = false;
|
||||
|
||||
vmime::charsetConverter::status st;
|
||||
|
||||
try
|
||||
{
|
||||
// 01234 5 6789 0 1
|
||||
convertHelper("Fran\xc3\xa7ois\xf1\x80\x65", "utf-8", "iso-8859-1", opts, &st);
|
||||
}
|
||||
catch (vmime::exceptions::illegal_byte_sequence_for_charset& e)
|
||||
{
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
|
||||
VASSERT_EQ("inputBytesRead", 9, st.inputBytesRead);
|
||||
VASSERT_EQ("outputBytesWritten", 8, st.outputBytesWritten);
|
||||
}
|
||||
|
||||
void testIsValidText()
|
||||
{
|
||||
// Invalid text
|
||||
const vmime::string invalidText("Fran\xc3\xa7ois\xf1\x80\x65");
|
||||
vmime::string::size_type firstInvalidByte;
|
||||
|
||||
VASSERT_EQ("invalid.isValidText", false, vmime::charset("utf-8").isValidText(invalidText, &firstInvalidByte));
|
||||
VASSERT_EQ("invalid.firstInvalidByte", 9, firstInvalidByte);
|
||||
|
||||
// Valid text
|
||||
const vmime::string validText("Gwena\xc3\xabl");
|
||||
|
||||
VASSERT_EQ("valid.isValidText", true, vmime::charset("utf-8").isValidText(validText, &firstInvalidByte));
|
||||
VASSERT_EQ("valid.firstInvalidByte", 8, firstInvalidByte);
|
||||
}
|
||||
|
||||
VMIME_TEST_SUITE_END
|
||||
|
||||
|
@ -61,6 +61,7 @@ VMIME_TEST_SUITE_BEGIN(textTest)
|
||||
VMIME_TEST(testInternationalizedEmail_folding)
|
||||
|
||||
VMIME_TEST(testWronglyPaddedB64Words)
|
||||
VMIME_TEST(testFixBrokenWords)
|
||||
VMIME_TEST_LIST_END
|
||||
|
||||
|
||||
@ -617,5 +618,50 @@ VMIME_TEST_SUITE_BEGIN(textTest)
|
||||
outText.getConvertedText(vmime::charset("utf-8")));
|
||||
}
|
||||
|
||||
// Ensure that words which encode a non-integral number of characters
|
||||
// are correctly decoded.
|
||||
void testFixBrokenWords()
|
||||
{
|
||||
vmime::text outText;
|
||||
|
||||
vmime::charsetConverterOptions opts;
|
||||
opts.silentlyReplaceInvalidSequences = false; // just to be sure that broken words are actually fixed
|
||||
|
||||
// Test case 1
|
||||
vmime::text::decodeAndUnfold
|
||||
("=?utf-8?Q?Gwena=C3?="
|
||||
"=?utf-8?Q?=ABl?=", &outText);
|
||||
|
||||
VASSERT_EQ("1", "Gwena\xebl",
|
||||
outText.getConvertedText(vmime::charset("iso-8859-1"), opts));
|
||||
|
||||
// Test case 2
|
||||
vmime::text::decodeAndUnfold
|
||||
("=?utf-8?B?5Lit6Yu85qmf5qKw6JGj5LqL5pyDMTAz5bm056ysMDXlsYbn?="
|
||||
"=?utf-8?B?rKwwN+asoeitsOeoiw==?=", &outText);
|
||||
|
||||
VASSERT_EQ("2", "\xe4\xb8\xad\xe9\x8b\xbc\xe6\xa9\x9f\xe6\xa2\xb0"
|
||||
"\xe8\x91\xa3\xe4\xba\x8b\xe6\x9c\x83\x31\x30\x33\xe5\xb9\xb4"
|
||||
"\xe7\xac\xac\x30\x35\xe5\xb1\x86\xe7\xac\xac\x30\x37\xe6\xac"
|
||||
"\xa1\xe8\xad\xb0\xe7\xa8\x8b",
|
||||
outText.getConvertedText(vmime::charset("utf-8")));
|
||||
|
||||
// Test case 3 (a character spanning over 3 words: 'を' = E3 82 92)
|
||||
vmime::text::decodeAndUnfold
|
||||
("=?utf-8?Q?abc=E3?="
|
||||
"=?utf-8?Q?=82?="
|
||||
"=?utf-8?Q?=92xyz?=", &outText);
|
||||
|
||||
std::string out; // decode as UTF-16 then rencode to UTF-8 for easier comparison
|
||||
vmime::charset::convert(
|
||||
outText.getConvertedText(vmime::charset("utf-16"), opts),
|
||||
out,
|
||||
vmime::charset("utf-16"),
|
||||
vmime::charset("utf-8")
|
||||
);
|
||||
|
||||
VASSERT_EQ("3", "abc\xe3\x82\x92xyz", out);
|
||||
}
|
||||
|
||||
VMIME_TEST_SUITE_END
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user