vmime/src/vmime/wordEncoder.cpp
Jan Engelhardt a2636bd4ae
asciiPercent computation: another potential multiplication overflow (#307)
* build: resolve a -Wconversion compiler warning

wordEncoder.cpp:312:91: warning: conversion from
‘std::__cxx11::basic_string<char>::size_type’ {aka ‘long unsigned
int’} to ‘double’ may change value [-Wconversion]
  312 | buffer.length() == 0 ? 1 : static_cast<double>(asciiCount) / buffer.length();
      |                                                              ~~~~~~~~~~~~~^~

* wordEncoder: replace value 100 for asciiPercent

asciiPercent is a ratio, and not counting in units of hundredths
anymore. The maximum value therefore should be 1 not 100.

* vmime: avoid integer multiply wraparound in text::createFromString

The change from commit v0.9.2-194-gb447adbe needs to be applied to
one more function that replicates the same code.

(If the input string is 42949673 characters long or larger, there will
be integer overflow on 32-bit platforms when multiplying by 100.
Switch that one computation to floating point.)
2024-05-21 20:48:08 +02:00

323 lines
8.1 KiB
C++

//
// VMime library (http://www.vmime.org)
// Copyright (C) 2002 Vincent Richard <vincent@vmime.org>
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 3 of
// the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Linking this library statically or dynamically with other modules is making
// a combined work based on this library. Thus, the terms and conditions of
// the GNU General Public License cover the whole combination.
//
#include "vmime/wordEncoder.hpp"
#include "vmime/exception.hpp"
#include "vmime/charsetConverter.hpp"
#include "vmime/encoding.hpp"
#include "vmime/utility/encoder/b64Encoder.hpp"
#include "vmime/utility/encoder/qpEncoder.hpp"
#include "vmime/utility/stringUtils.hpp"
#include "vmime/utility/outputStreamStringAdapter.hpp"
#include "vmime/utility/inputStreamStringAdapter.hpp"
namespace vmime {
// Constructs an encoder for 'buffer', whose bytes are expressed in 'charset'.
//
// The text is converted to UTF-8 for internal use. When that conversion
// fails, the original bytes are kept and the encoder switches to the
// "simple" mode. If 'encoding' is ENCODING_AUTO, the effective encoding is
// chosen by guessBestEncoding().
wordEncoder::wordEncoder(
    const string& buffer,
    const charset& charset,
    const Encoding encoding
)
    : m_buffer(buffer),
      m_pos(0),
      m_length(buffer.length()),
      m_charset(charset),
      m_encoding(encoding) {

    // The 'charset' parameter hides the vmime::charset type in this scope,
    // hence the explicit qualification below.
    try {

        string converted;

        vmime::charset::convert(
            buffer, converted, charset, vmime::charset(charsets::UTF_8)
        );

        m_buffer = converted;
        m_length = converted.length();
        m_simple = false;

    } catch (exceptions::charset_conv_error&) {

        // Conversion failed: keep the original bytes and fall back on
        // simple encoding.
        m_simple = true;
    }

    // The guess is made on the original (unconverted) text
    if (m_encoding == ENCODING_AUTO) {
        m_encoding = guessBestEncoding(buffer, charset);
    }

    if (m_encoding == ENCODING_B64) {
        m_encoder = make_shared <utility::encoder::b64Encoder>();
        return;
    }

    // Anything else is handled as ENCODING_QP
    m_encoder = make_shared <utility::encoder::qpEncoder>();
    m_encoder->getProperties()["rfc2047"] = true;
}
// Returns the number of bytes occupied by the UTF-8 character that starts
// at 'pos' in 'buffer', as announced by its leading byte.
//
// 'length' is the total number of usable bytes in 'buffer'. When the
// announced sequence would not fit in the bytes left after 'pos', the
// character is treated as a single byte.
static size_t getUTF8CharLength(
    const string& buffer,
    const size_t pos,
    const size_t length
) {

    const unsigned char lead = static_cast <unsigned char>(buffer[pos]);

    // Number of extra bytes announced by the leading byte. ASCII bytes and
    // continuation bytes (below 0xC0) announce none.
    size_t extraBytes = 0;

    if (lead >= 0xFC) {
        extraBytes = 5;
    } else if (lead >= 0xF8) {
        extraBytes = 4;
    } else if (lead >= 0xF0) {
        extraBytes = 3;
    } else if (lead >= 0xE0) {
        extraBytes = 2;
    } else if (lead >= 0xC0) {
        extraBytes = 1;
    }

    // Only trust the announced length if all extra bytes are available
    return (extraBytes < length - pos) ? extraBytes + 1 : 1;
}
// Encodes the next part of the buffer and returns the encoded text.
//
// 'maxLength' is the number of encoded characters the caller would like
// the chunk to fit in. Every call made while input remains consumes at
// least one byte (simple mode) or one character (RFC-compliant mode), so
// the returned chunk can be longer than 'maxLength'. In the loop-driven
// paths the limit is tested before each unit is added, so the last unit
// added may push the output past it.
//
// Returns an empty string once the whole buffer has been consumed.
const string wordEncoder::getNextChunk(const size_t maxLength) {

    const size_t remaining = m_length - m_pos;

    if (remaining == 0) {
        return string();
    }

    vmime::string chunk;
    vmime::utility::outputStreamStringAdapter chunkStream(chunk);

    // Simple encoding
    if (m_simple) {

        // WARNING! Simple encoding can encode a non-integral number of
        // characters and then may generate incorrectly-formed words!

        if (m_encoding == ENCODING_B64) {

            // Here, we have a formula to compute the maximum number of source
            // bytes to encode knowing the maximum number of encoded chars. In
            // Base64 encoding, 3 bytes of input provide 4 bytes of output.
            const size_t fitting =
                (maxLength > 1) ? ((maxLength - 1) * 3) / 4 : 0;

            // The formula yields 0 for very small limits (maxLength <= 2).
            // Always take at least one byte: otherwise this call would
            // return an empty chunk without advancing, although input
            // remains.
            const size_t inputCount =
                std::min(remaining, std::max(static_cast <size_t>(1), fitting));

            // Encode chunk
            utility::inputStreamStringAdapter in(m_buffer, m_pos, m_pos + inputCount);

            m_encoder->encode(in, chunkStream);
            m_pos += inputCount;

        } else {  // ENCODING_QP

            // Compute exactly how much input bytes are needed to have an output
            // string length of less than 'maxLength' bytes. In Quoted-Printable
            // encoding, encoded bytes take 3 bytes.
            size_t inputCount = 0;
            size_t outputCount = 0;

            // 'inputCount == 0' forces the first byte in, whatever the limit
            while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining)) {

                const unsigned char c = m_buffer[m_pos + inputCount];

                inputCount++;
                outputCount += utility::encoder::qpEncoder::RFC2047_getEncodedLength(c);
            }

            // Encode chunk
            utility::inputStreamStringAdapter in(m_buffer, m_pos, m_pos + inputCount);

            m_encoder->encode(in, chunkStream);
            m_pos += inputCount;
        }

    // Fully RFC-compliant encoding
    } else {

        // m_buffer holds UTF-8 here; each character is converted back to
        // the original charset before being encoded.
        shared_ptr <charsetConverter> conv = charsetConverter::create(charsets::UTF_8, m_charset);

        size_t inputCount = 0;
        size_t outputCount = 0;
        string encodeBuffer;

        // 'inputCount == 0' forces the first character in, whatever the limit
        while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining)) {

            // Get the next UTF8 character
            const size_t inputCharLength =
                getUTF8CharLength(m_buffer, m_pos + inputCount, m_length);

            const string inputChar(
                m_buffer.begin() + m_pos + inputCount,
                m_buffer.begin() + m_pos + inputCount + inputCharLength
            );

            // Convert back to original encoding
            string encodeBytes;
            conv->convert(inputChar, encodeBytes);

            encodeBuffer += encodeBytes;

            // Compute number of output bytes
            if (m_encoding == ENCODING_B64) {

                // Recomputed from the whole accumulated buffer; estimated
                // as 4/3 of the input size, with a floor of 4
                outputCount = std::max(
                    static_cast <size_t>(4),
                    (encodeBuffer.length() * 4) / 3
                );

            } else {  // ENCODING_QP

                // Accumulated byte by byte for the character just added
                for (size_t i = 0, n = encodeBytes.length() ; i < n ; ++i) {

                    const unsigned char c = encodeBytes[i];
                    outputCount += utility::encoder::qpEncoder::RFC2047_getEncodedLength(c);
                }
            }

            inputCount += inputCharLength;
        }

        // Encode chunk
        utility::inputStreamStringAdapter in(encodeBuffer);

        m_encoder->encode(in, chunkStream);
        m_pos += inputCount;
    }

    return chunk;
}
// Returns the encoding stored by the constructor. When ENCODING_AUTO was
// requested, this is the encoding the constructor selected in its place.
wordEncoder::Encoding wordEncoder::getEncoding() const {

    return m_encoding;
}
// static
bool wordEncoder::isEncodingNeeded(
const generationContext& ctx,
const string& buffer,
const charset& charset,
const string& lang
) {
if (!ctx.getInternationalizedEmailSupport()) {
// Charset-specific encoding
encoding recEncoding;
if (charset.getRecommendedEncoding(recEncoding)) {
return true;
}
// No encoding is needed if the buffer only contains ASCII chars
if (utility::stringUtils::findFirstNonASCIIchar(buffer.begin(), buffer.end()) != string::npos) {
return true;
}
}
// Force encoding when there are only ASCII chars, but there is
// also at least one of '\n' or '\r' (header fields)
if (buffer.find_first_of("\n\r") != string::npos) {
return true;
}
// If any RFC-2047 sequence is found in the buffer, encode it
if (buffer.find("=?") != string::npos || buffer.find("?=") != string::npos) {
return true;
}
// If a language is specified, force encoding
if (!lang.empty()) {
return true;
}
return false;
}
// static
//
// Chooses between Base64 and Quoted-Printable for 'buffer'.
//
// If 'charset' has a recommended encoding, that one decides: QP when it is
// Quoted-Printable, Base64 otherwise. Without a recommendation, the choice
// depends on the share of ASCII characters in 'buffer'.
wordEncoder::Encoding wordEncoder::guessBestEncoding(
    const string& buffer,
    const charset& charset
) {

    // Charset-specific encoding
    encoding recommended;

    if (charset.getRecommendedEncoding(recommended)) {

        return (recommended == encoding(encodingTypes::QUOTED_PRINTABLE))
            ? ENCODING_QP
            : ENCODING_B64;
    }

    // Base64 would be more space-efficient when the ASCII content is
    // below 83.33%, but QP has a legibility argument going for it, so the
    // threshold was set to 60%.
    const size_t totalCount = buffer.length();
    const size_t asciiCount =
        utility::stringUtils::countASCIIchars(buffer.begin(), buffer.end());

    // Ratio in [0, 1], computed in floating point; an empty buffer counts
    // as fully ASCII.
    double asciiRatio = 1;

    if (totalCount != 0) {
        asciiRatio = static_cast<double>(asciiCount) / static_cast<double>(totalCount);
    }

    return (asciiRatio < 0.60) ? ENCODING_B64 : ENCODING_QP;
}
} // vmime