diff options
Diffstat (limited to 'src/base.cpp')
-rw-r--r-- | src/base.cpp | 877 |
1 files changed, 877 insertions, 0 deletions
diff --git a/src/base.cpp b/src/base.cpp new file mode 100644 index 00000000..84fdc9fc --- /dev/null +++ b/src/base.cpp @@ -0,0 +1,877 @@ +// +// VMime library (http://vmime.sourceforge.net) +// Copyright (C) 2002-2004 Vincent Richard <[email protected]> +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +// + +#include "config.hpp" + +#include "charset.hpp" +#include "base.hpp" + +#include "encoder.hpp" +#include "encoderB64.hpp" +#include "encoderQP.hpp" + +#include "text.hpp" + +#include "parserHelpers.hpp" + +// For initializing +#include "encoderFactory.hpp" +#include "headerFieldFactory.hpp" +#include "parameterFactory.hpp" +#include "textPartFactory.hpp" +#include "options.hpp" + +#if VMIME_HAVE_MESSAGING_FEATURES + #include "messaging/serviceFactory.hpp" +#endif + + +namespace vmime +{ + + +/** "Null" (empty) string. + */ +const string NULL_STRING; + +#if VMIME_WIDE_CHAR_SUPPORT + /** "Null" (empty) wide-char string. + */ + const wstring NULL_WSTRING; +#endif + +/** "Null" (empty) text. + */ +const text NULL_TEXT; + + +/** Return the library name (eg. "libvmime"). + * + * @return library name + */ +const string libname() { return (VMIME_PACKAGE); } + +/** Return the library version (eg. "0.5.2"). + * + * @return library version + */ +const string libversion() { return (VMIME_VERSION " (" __DATE__ " " __TIME__ ")"); } + + +// New line sequence to be used when folding header fields. +const string NEW_LINE_SEQUENCE("\r\n "); +const string::size_type NEW_LINE_SEQUENCE_LENGTH(1); // space + +/** The CR-LF sequence. + */ +const string CRLF("\r\n"); + + +/** The current MIME version supported by VMime. + */ +const string MIME_VERSION("1.0"); + + +// Line length limits +namespace lineLengthLimits +{ + const string::size_type infinite = std::numeric_limits <string::size_type>::max(); +} + + + +/** Test two strings for equality (case insensitive). + * WARNING: use this with ASCII-only strings. + * + * @param s1 first string + * @param s2 second string (must be in lower-case!) + * @param n length of the second string + * @return true if the two strings compare equally, false otherwise + */ + +bool isStringEqualNoCase(const string& s1, const char* s2, const string::size_type n) +{ + // 'n' is the number of characters to compare + // 's2' must be in lowercase letters only + if (s1.length() < n) + return (false); + + bool equal = true; + + for (string::size_type i = 0 ; equal && i < n ; ++i) + equal = (std::tolower(s1[i], std::locale()) == s2[i]); + + return (equal); +} + + +/** Test two strings for equality (case insensitive). + * WARNING: use this with ASCII-only strings. + * + * @param s1 first string + * @param s2 second string + * @return true if the two strings compare equally, false otherwise + */ + +bool isStringEqualNoCase(const string& s1, const string& s2) +{ + if (s1.length() != s2.length()) + return (false); + + bool equal = true; + const string::const_iterator end = s1.end(); + + for (string::const_iterator i = s1.begin(), j = s2.begin(); i != end ; ++i, ++j) + equal = (std::tolower(*i, std::locale()) == std::tolower(*j, std::locale())); + + return (equal); +} + + +/** Test two strings for equality (case insensitive). + * WARNING: use this with ASCII-only strings. + * + * @param begin start position of the first string + * @param end end position of the first string + * @param s second string (must be in lower-case!) + * @param n length of the second string + * @return true if the two strings compare equally, false otherwise + */ + +bool isStringEqualNoCase(const string::const_iterator begin, const string::const_iterator end, + const char* s, const string::size_type n) +{ + if ((string::size_type)(end - begin) < n) + return (false); + + bool equal = true; + char* c = const_cast<char*>(s); + string::size_type r = n; + + for (string::const_iterator i = begin ; equal && r && *c ; ++i, ++c, --r) + equal = (std::tolower(*i, std::locale()) == *c); + + return (r == 0 && equal); +} + + +/** Transform all the characters in a string to lower-case. + * WARNING: use this with ASCII-only strings. + * + * @param str the string to transform + * @return a new string in lower-case + */ + +const string toLower(const string& str) +{ + string out(str); + const string::iterator end = out.end(); + + for (string::iterator i = out.begin() ; i != end ; ++i) + *i = std::tolower(*i, std::locale()); + + return (out); +} + + +/** Strip the space characters (SPC, TAB, CR, LF) at the beginning + * and at the end of the specified string. + * + * @param str string in which to strip spaces + * @return a new string with space characters removed + */ + +const string trim(const string& str) +{ + string::const_iterator b = str.begin(); + string::const_iterator e = str.end(); + + if (b != e) + { + for ( ; b != e && isspace(*b) ; ++b); + for ( ; e != b && isspace(*(e - 1)) ; --e); + } + + return (string(b, e)); +} + + +/** Return the number of 7-bit US-ASCII characters in a string. + * + * @param begin start position + * @param end end position + * @return number of ASCII characters + */ + +string::size_type countASCIIchars + (const string::const_iterator begin, const string::const_iterator end) +{ + string::size_type count = 0; + + for (string::const_iterator i = begin ; i != end ; ++i) + { + if (isascii(*i)) + { + if (*i != '=' || *(i + 1) != '?') // To avoid bad behaviour... + ++count; + } + } + + return (count); +} + + +/** Encode and fold text in respect to RFC-2047. + * + * @param os output stream + * @param in input text + * @param maxLineLength maximum line length for output + * @param firstLineOffset the first line length (may be useful if the current output line is not empty) + * @param lastLineLength will receive the length of the last line written + * @param flags encoding flags (see encodeAndFoldFlags) + */ + +void encodeAndFoldText(utility::outputStream& os, const text& in, const string::size_type maxLineLength, + const string::size_type firstLineOffset, string::size_type* lastLineLength, const int flags) +{ + string::size_type curLineLength = firstLineOffset; + + for (text::const_iterator wi = in.begin() ; wi != in.end() ; ++wi) + { + const word& w = *wi; + const string& buffer = w.buffer(); + + // Calculate the number of ASCII chars to check whether encoding is needed + // and _which_ encoding to use. + const string::size_type asciiCount = countASCIIchars(buffer.begin(), buffer.end()); + + bool noEncoding = (flags & encodeAndFoldFlags::forceNoEncoding) || + (!(flags & encodeAndFoldFlags::forceEncoding) && asciiCount == buffer.length()); + + if (noEncoding) + { + // We will fold lines without encoding them. + + string::const_iterator lastWSpos = buffer.end(); // last white-space position + string::const_iterator curLineStart = buffer.begin(); // current line start + + string::const_iterator p = buffer.begin(); + const string::const_iterator end = buffer.end(); + + bool finished = false; + bool newLine = false; + + while (!finished) + { + for ( ; p != end ; ++p, ++curLineLength) + { + // Exceeded maximum line length, but we have found a white-space + // where we can cut the line... + if (curLineLength >= maxLineLength && lastWSpos != end) + break; + + if (*p == ' ' || *p == '\t') + { + // Remember the position of this white-space character + lastWSpos = p; + } + } + + if (p != end) + ++curLineLength; + + //if (p == end || curLineLength >= maxLineLength) + { + if (p == end || lastWSpos == end) + { + // If we are here, it means that we have found no whitespace + // before the first "maxLineLength" characters. In this case, + // we write the full line no matter of the max line length... + + if (!newLine && p != end && lastWSpos == end && + wi != in.begin() && curLineStart == buffer.begin()) + { + // Here, we are continuing on the line of previous encoded + // word, but there is not even enough space to put the + // first word of this line, so we start a new line. + if (flags & encodeAndFoldFlags::noNewLineSequence) + { + os << CRLF; + curLineLength = 0; + } + else + { + os << NEW_LINE_SEQUENCE; + curLineLength = NEW_LINE_SEQUENCE_LENGTH; + } + + p = curLineStart; + lastWSpos = end; + newLine = true; + } + else + { + os << string(curLineStart, p); + + if (p == end) + { + finished = true; + } + else + { + if (flags & encodeAndFoldFlags::noNewLineSequence) + { + os << CRLF; + curLineLength = 0; + } + else + { + os << NEW_LINE_SEQUENCE; + curLineLength = NEW_LINE_SEQUENCE_LENGTH; + } + + curLineStart = p; + lastWSpos = end; + newLine = true; + } + } + } + else + { + // In this case, there will not be enough space on the line for all the + // characters _after_ the last white-space; so we cut the line at this + // last white-space. + +#if 1 + if (curLineLength != 1 && wi != in.begin()) + os << " "; // Separate from previous word +#endif + + os << string(curLineStart, lastWSpos); + + if (flags & encodeAndFoldFlags::noNewLineSequence) + { + os << CRLF; + curLineLength = 0; + } + else + { + os << NEW_LINE_SEQUENCE; + curLineLength = NEW_LINE_SEQUENCE_LENGTH; + } + + curLineStart = lastWSpos + 1; + + p = lastWSpos + 1; + lastWSpos = end; + newLine = true; + } + } + } + } + /* + RFC #2047: + 4. Encodings + + Initially, the legal values for "encoding" are "Q" and "B". These + encodings are described below. The "Q" encoding is recommended for + use when most of the characters to be encoded are in the ASCII + character set; otherwise, the "B" encoding should be used. + Nevertheless, a mail reader which claims to recognize 'encoded-word's + MUST be able to accept either encoding for any character set which it + supports. + */ + else + { + // We will encode _AND_ fold lines + + /* + RFC #2047: + 2. Syntax of encoded-words + + " While there is no limit to the length of a multiple-line header + field, each line of a header field that contains one or more + 'encoded-word's is limited to 76 characters. " + */ + + const string::size_type maxLineLength3 = + (maxLineLength == lineLengthLimits::infinite) + ? maxLineLength + : std::min(maxLineLength, (const string::size_type) 76); + + // Base64 if more than 60% non-ascii, quoted-printable else (default) + const string::size_type asciiPercent = (100 * asciiCount) / buffer.length(); + const string::value_type encoding = (asciiPercent <= 40) ? 'B' : 'Q'; + + string wordStart("=?" + w.charset().name() + "?" + encoding + "?"); + string wordEnd("?="); + + const string::size_type minWordLength = wordStart.length() + wordEnd.length(); + const string::size_type maxLineLength2 = (maxLineLength3 < minWordLength + 1) + ? maxLineLength3 + minWordLength + 1 : maxLineLength3; + + // Checks whether remaining space on this line is usable. If too few + // characters can be encoded, start a new line. + bool startNewLine = true; + + if (curLineLength + 2 < maxLineLength2) + { + const string::size_type remainingSpaceOnLine = maxLineLength2 - curLineLength - 2; + + if (remainingSpaceOnLine < minWordLength + 10) + { + // Space for no more than 10 encoded chars! + // It is not worth while to continue on this line... + startNewLine = true; + } + else + { + // OK, there is enough usable space on the current line. + startNewLine = false; + } + } + + if (startNewLine) + { + os << NEW_LINE_SEQUENCE; + curLineLength = NEW_LINE_SEQUENCE_LENGTH; + } + + // Encode and fold input buffer + string::const_iterator pos = buffer.begin(); + string::size_type remaining = buffer.length(); + + encoder* theEncoder = ((encoding == 'B') + ? ((encoder*) new encoderB64) + : ((encoder*) new encoderQP)); + + string qpEncodedBuffer; + + if (encoding == 'Q') + { + theEncoder->properties()["rfc2047"] = true; + + // In the case of Quoted-Printable encoding, we cannot simply encode input + // buffer line by line. So, we encode the whole buffer and we will fold it + // in the next loop... + utility::inputStreamStringAdapter in(buffer); + utility::outputStreamStringAdapter out(qpEncodedBuffer); + + theEncoder->encode(in, out); + + pos = qpEncodedBuffer.begin(); + remaining = qpEncodedBuffer.length(); + } + +#if 1 + if (curLineLength != 1 && wi != in.begin()) + { + os << " "; // Separate from previous word + ++curLineLength; + } +#endif + + for ( ; remaining ; ) + { + // Start a new encoded word + os << wordStart; + curLineLength += minWordLength; + + // Compute the number of encoded chars that will fit on this line + const string::size_type fit = maxLineLength2 - curLineLength; + + // Base-64 encoding + if (encoding == 'B') + { + // TODO: WARNING! "Any encoded word which encodes a non-integral + // number of characters or octets is incorrectly formed." + + // Here, we have a formula to compute the maximum number of source + // characters to encode knowing the maximum number of encoded chars + // (with Base64, 3 bytes of input provide 4 bytes of output). + string::size_type count = (fit > 1) ? ((fit - 1) * 3) / 4 : 1; + if (count > remaining) count = remaining; + + utility::inputStreamStringAdapter in + (buffer, pos - buffer.begin(), pos - buffer.begin() + count); + + curLineLength += theEncoder->encode(in, os); + + pos += count; + remaining -= count; + } + // Quoted-Printable encoding + else + { + // TODO: WARNING! "Any encoded word which encodes a non-integral + // number of characters or octets is incorrectly formed." + + // All we have to do here is to take a certain number of character + // (that is less than or equal to "fit") from the QP encoded buffer, + // but we also make sure not to fold a "=XY" encoded char. + const string::const_iterator qpEnd = qpEncodedBuffer.end(); + string::const_iterator lastFoldPos = pos; + string::const_iterator p = pos; + string::size_type n = 0; + + while (n < fit && p != qpEnd) + { + if (*p == '=') + { + if (n + 3 >= fit) + { + lastFoldPos = p; + break; + } + + p += 3; + n += 3; + } + else + { + ++p; + ++n; + } + } + + if (lastFoldPos == pos) + lastFoldPos = p; + + os << string(pos, lastFoldPos); + + curLineLength += (lastFoldPos - pos) + 1; + + pos += n; + remaining -= n; + } + + // End of the encoded word + os << wordEnd; + + if (remaining) + { + os << NEW_LINE_SEQUENCE; + curLineLength = NEW_LINE_SEQUENCE_LENGTH; + } + } + + delete (theEncoder); + } + } + + if (lastLineLength) + *lastLineLength = curLineLength; +} + + +void decodeAndUnfoldText(const string::const_iterator& inStart, const string::const_iterator& inEnd, text& out) +{ + // NOTE: See RFC-2047, Pages 11-12 for knowing about handling + // of white-spaces between encoded words. + + out.clear(); + + string::const_iterator p = inStart; + const string::const_iterator end = inEnd; + + const charset defaultCharset(charsets::US_ASCII); + charset prevWordCharset(defaultCharset); + + bool prevIsEncoded = false; + + string::const_iterator prevPos = p; + + for ( ; ; ) + { + if (p == end || *p == '\n') + { + string::const_iterator textEnd = p; + + if (textEnd != inStart && *(textEnd - 1) == '\r') + --textEnd; + + if (textEnd != prevPos) + { + if (out.size() && prevWordCharset == defaultCharset) + { + out.back().buffer() += string(prevPos, textEnd); + } + else + { + prevWordCharset = defaultCharset; + out.append(word(string(prevPos, textEnd), defaultCharset)); + prevIsEncoded = false; + } + } + + if (p == end) + { + // Finished + break; + } + + // Skip the new-line character + prevPos = ++p; + } + else if (*p == '=' && (p + 1) != end && *(p + 1) == '?') + { + string::const_iterator wordPos = p; + p += 2; // skip '=?' + + if (p != end) + { + const string::const_iterator charsetPos = p; + + for ( ; p != end && *p != '?' ; ++p); + + if (p != end) // a charset is specified + { + const string::const_iterator charsetEnd = p; + const string::const_iterator encPos = ++p; // skip '?' + + for ( ; p != end && *p != '?' ; ++p); + + if (p != end) // an encoding is specified + { + //const string::const_iterator encEnd = p; + const string::const_iterator dataPos = ++p; // skip '?' + + for ( ; p != end && !(*p == '?' && *(p + 1) == '=') ; ++p); + + if (p != end) // some data is specified + { + const string::const_iterator dataEnd = p; + p += 2; // skip '?=' + + encoder* theEncoder = NULL; + + // Base-64 encoding + if (*encPos == 'B' || *encPos == 'b') + { + theEncoder = new encoderB64; + } + // Quoted-Printable encoding + else if (*encPos == 'Q' || *encPos == 'q') + { + theEncoder = new encoderQP; + theEncoder->properties()["rfc2047"] = true; + } + + if (theEncoder) + { + // Decode text + string decodedBuffer; + + utility::inputStreamStringAdapter ein(string(dataPos, dataEnd)); + utility::outputStreamStringAdapter eout(decodedBuffer); + + theEncoder->decode(ein, eout); + delete (theEncoder); + + // Append all the unencoded text before this word + if (prevPos != wordPos) + { + string::const_iterator p = prevPos; + + if (prevIsEncoded) + { + // Check whether there are only white-spaces between + // the two encoded words + for ( ; (p != wordPos) && isspace(*p) ; ++p); + } + + if (p != wordPos) // if not empty + { + if (out.size() && prevWordCharset == defaultCharset) + { + out.back().buffer() += string(prevPos, wordPos); + } + else + { + out.append(word(string(prevPos, wordPos), defaultCharset)); + prevWordCharset = defaultCharset; + } + } + } + + // Append this fresh decoded word to output text + charset thisCharset(string(charsetPos, charsetEnd)); + + if (out.size() && prevWordCharset == thisCharset) + { + out.back().buffer() += decodedBuffer; + } + else + { + prevWordCharset = thisCharset; + out.append(word(decodedBuffer, thisCharset)); + } + + // This word has been decoded: we can advance in the input buffer + prevPos = p; + prevIsEncoded = true; + } + else + { + // Unknown encoding: can't decode this word, we will + // treat this word as ordinary text (RFC-2047, Page 9). + } + } + } + } + } + } + else + { + ++p; + } + + for ( ; p != end && *p != '=' && *p != '\n' ; ++p); + } +} + + +void decodeAndUnfoldText(const string& in, text& out) +{ + decodeAndUnfoldText(in.begin(), in.end(), out); +} + + +/** This function can be used to make several encoded words from a text. + * All the characters in the text must be in the same specified charset. + * + * <p>Eg: giving:</p> + * <pre> <iso-8859-1> "Linux dans un t'el'ephone mobile" + * ("=?iso-8859-1?Q?Linux_dans_un_t=E9l=E9phone_mobile?=") + * </pre><p>it will return:</p> + * <pre> <:us-ascii> "Linux dans un " + * <iso-8859-1> "t'el'ephone " + * <us-ascii> "mobile" + * ("Linux dans un =?iso-8859-1?Q?t=E9l=E9phone_?= mobile") + * </pre> + * + * @param in input string + * @param ch input charset + * @param out output text + */ + +void makeWordsFromText(const string& in, const charset& ch, text& out) +{ + const string::const_iterator end = in.end(); + string::const_iterator p = in.begin(); + string::const_iterator start = in.begin(); + + bool is8bit = false; // is the current word 8-bit? + bool prevIs8bit = false; // is previous word 8-bit? + unsigned int count = 0; // total number of words + + out.clear(); + + for ( ; ; ) + { + if (p == end || isspace(*p)) + { + if (p != end) + ++p; + + if (is8bit) + { + if (prevIs8bit) + { + // No need to create a new encoded word, just append + // the current word to the previous one. + out.back().buffer() += string(start, p); + } + else + { + out.append(word(string(start, p), ch)); + prevIs8bit = true; + ++count; + } + } + else + { + if (count && !prevIs8bit) + { + out.back().buffer() += string(start, p); + } + else + { + out.append(word(string(start, p), charset(charsets::US_ASCII))); + prevIs8bit = false; + ++count; + } + } + + if (p == end) + break; + + is8bit = false; + start = p; + } + else if (!isascii(*p)) + { + is8bit = true; + ++p; + } + else + { + ++p; + } + } +} + + +// +// V-Mime Initializer +// ==================== +// +// Force instanciation of singletons. This is to prevent problems that might +// happen in multithreaded applications... +// +// WARNING: we put the initializer at the end of this compilation unit. This +// ensures this object is initialized _after_ all other global variables in +// the same compilation unit (in particular "lineLengthLimits::infinite", +// which is used by the generate() function (called from "textPartFactory" +// constructor, for example). +// + +class initializer +{ +public: + + initializer() + { + options::getInstance(); + + encoderFactory::getInstance(); + headerFieldFactory::getInstance(); + parameterFactory::getInstance(); + textPartFactory::getInstance(); + + #if VMIME_HAVE_MESSAGING_FEATURES + messaging::serviceFactory::getInstance(); + #endif + } +}; + +initializer theInitializer; + + +} // vmime |