From 4f33877820edee1b47d1b6f4fc800eaad273adaa Mon Sep 17 00:00:00 2001 From: Vincent Richard Date: Mon, 16 Apr 2012 22:32:33 +0200 Subject: Added ability to parse directly from an input stream (eg. file). This allows very big messages to be parsed without loading the whole message data into memory. --- src/body.cpp | 220 ++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 156 insertions(+), 64 deletions(-) (limited to 'src/body.cpp') diff --git a/src/body.cpp b/src/body.cpp index 9d7d57f9..732fa8b5 100644 --- a/src/body.cpp +++ b/src/body.cpp @@ -31,10 +31,13 @@ #include "vmime/utility/random.hpp" +#include "vmime/utility/seekableInputStreamRegionAdapter.hpp" + #include "vmime/parserHelpers.hpp" #include "vmime/emptyContentHandler.hpp" #include "vmime/stringContentHandler.hpp" +#include "vmime/streamContentHandler.hpp" namespace vmime @@ -52,11 +55,28 @@ body::~body() } -void body::parse(const string& buffer, const string::size_type position, - const string::size_type end, string::size_type* newPosition) +void body::parseImpl + (ref parser, + const utility::stream::size_type position, + const utility::stream::size_type end, + utility::stream::size_type* newPosition) { removeAllParts(); + m_prologText.clear(); + m_epilogText.clear(); + + if (end == position) + { + + setParsedBounds(position, end); + + if (newPosition) + *newPosition = end; + + return; + } + // Check whether the body is a MIME-multipart bool isMultipart = false; string boundary; @@ -80,37 +100,61 @@ void body::parse(const string& buffer, const string::size_type position, { // No "boundary" parameter specified: we can try to // guess it by scanning the body contents... - string::size_type pos = buffer.find("\n--", position); + utility::stream::size_type pos = position; + + parser->seek(pos); + + if (pos + 2 < end && parser->matchBytes("--", 2)) + { + pos += 2; + } + else + { + pos = parser->findNext("\n--", position); - if ((pos != string::npos) && (pos < end)) + if ((pos != utility::stream::npos) && (pos + 3 < end)) + pos += 3; // skip \n-- + } + + if ((pos != utility::stream::npos) && (pos < end)) { - pos += 3; + parser->seek(pos); - const string::size_type start = pos; + // Read some bytes after boundary separator + utility::stream::value_type buffer[256]; + const utility::stream::size_type bufferLen = + parser->read(buffer, std::min(end - pos, sizeof(buffer) / sizeof(buffer[0]))); - char_t c = buffer[pos]; - string::size_type length = 0; + buffer[sizeof(buffer) / sizeof(buffer[0]) - 1] = '\0'; + // Extract boundary from buffer (stop at first CR or LF). // We have to stop after a reasonnably long boundary length (100) // not to take the whole body contents for a boundary... - while (pos < end && length < 100 && !(c == '\r' || c == '\n')) + string::value_type boundaryBytes[100]; + string::size_type boundaryLen = 0; + + for (string::value_type c = buffer[0] ; + boundaryLen < bufferLen && boundaryLen < 100 && !(c == '\r' || c == '\n') ; + c = buffer[++boundaryLen]) { - ++length; - c = buffer[pos++]; + boundaryBytes[boundaryLen] = buffer[boundaryLen]; } - if (pos < end && length < 100) + if (boundaryLen >= 1 && boundaryLen < 100) { // RFC #1521, Page 31: // "...the boundary parameter, which consists of 1 to 70 // characters from a set of characters known to be very // robust through email gateways, and NOT ending with // white space..." - while (pos != start && parserHelpers::isSpace(buffer[pos - 1])) - --pos; - - boundary = string(buffer.begin() + start, - buffer.begin() + pos); + while (boundaryLen != 0 && + parserHelpers::isSpace(boundaryBytes[boundaryLen - 1])) + { + boundaryLen--; + } + + if (boundaryLen >= 1) + boundary = string(boundaryBytes, boundaryBytes + boundaryLen); } } } @@ -126,51 +170,79 @@ void body::parse(const string& buffer, const string::size_type position, { const string boundarySep("--" + boundary); - string::size_type partStart = position; - string::size_type pos = position; + utility::stream::size_type partStart = position; + utility::stream::size_type pos = position; bool lastPart = false; - while (pos != string::npos && pos < end) + while (pos != utility::stream::npos && pos < end) { - pos = buffer.find(boundarySep, pos); - - if (pos == string::npos || - ((pos == 0 || buffer[pos - 1] == '\n') && - (buffer[pos + boundarySep.length()] == '\r' || - buffer[pos + boundarySep.length()] == '\n' || - buffer[pos + boundarySep.length()] == '-' - ) - ) - ) + pos = parser->findNext(boundarySep, pos); + + if (pos == utility::stream::npos) + break; // not found + + if (pos != 0) { - break; + parser->seek(pos - 1); + + if (parser->peekByte() != '\n') + { + // Boundary is not at a beginning of a line + pos++; + continue; + } + + parser->skip(1 + boundarySep.length()); + } + else + { + parser->seek(pos + boundarySep.length()); } - // boundary not a beginning of line, or just a prefix of another, continue the search. + const utility::stream::value_type next = parser->peekByte(); + + if (next == '\r' || next == '\n' || next == '-') + break; + + // Boundary is a prefix of another, continue the search pos++; } - if (pos != string::npos && pos < end) + if (pos != utility::stream::npos && pos < end) { vmime::text text; - text.parse(buffer, position, pos); + text.parse(parser, position, pos); m_prologText = text.getWholeBuffer(); } - for (int index = 0 ; !lastPart && (pos != string::npos) && (pos < end) ; ++index) + for (int index = 0 ; !lastPart && (pos != utility::stream::npos) && (pos < end) ; ++index) { - string::size_type partEnd = pos; + utility::stream::size_type partEnd = pos; // Get rid of the [CR]LF just before the boundary string - if (pos >= (position + 1) && buffer[pos - 1] == '\n') --partEnd; - if (pos >= (position + 2) && buffer[pos - 2] == '\r') --partEnd; + if (pos >= (position + 1)) + { + parser->seek(pos - 1); + + if (parser->peekByte() == '\n') + --partEnd; + } + + if (pos >= (position + 2)) + { + parser->seek(pos - 2); + + if (parser->peekByte() == '\r') + --partEnd; + } // Check whether it is the last part (boundary terminated by "--") pos += boundarySep.length(); + parser->seek(pos); - if (pos + 1 < end && buffer[pos] == '-' && buffer[pos + 1] == '-') + if (pos + 1 < end && parser->matchBytes("--", 2)) { lastPart = true; pos += 2; @@ -180,15 +252,15 @@ void body::parse(const string& buffer, const string::size_type position, // "...(If a boundary appears to end with white space, the // white space must be presumed to have been added by a // gateway, and must be deleted.)..." - while (pos < end && (buffer[pos] == ' ' || buffer[pos] == '\t')) - ++pos; + parser->seek(pos); + pos += parser->skipIf(parserHelpers::isSpaceOrTab, end); // End of boundary line - if (pos + 1 < end && buffer[pos] == '\r' && buffer[pos + 1] =='\n') + if (pos + 1 < end && parser->matchBytes("\r\n", 2)) { pos += 2; } - else if (pos < end && buffer[pos] == '\n') + else if (pos < end && parser->peekByte() == '\n') { ++pos; } @@ -202,7 +274,7 @@ void body::parse(const string& buffer, const string::size_type position, if (partEnd < partStart) std::swap(partStart, partEnd); - part->parse(buffer, partStart, partEnd, NULL); + part->parse(parser, partStart, partEnd, NULL); part->m_parent = m_part; m_parts.push_back(part); @@ -210,23 +282,37 @@ void body::parse(const string& buffer, const string::size_type position, partStart = pos; - while (pos != string::npos && pos < end) + while (pos != utility::stream::npos && pos < end) { - pos = buffer.find(boundarySep, pos); - - if (pos == string::npos || - ((pos == 0 || buffer[pos - 1] == '\n') && - (buffer[pos + boundarySep.length()] == '\r' || - buffer[pos + boundarySep.length()] == '\n' || - buffer[pos + boundarySep.length()] == '-' - ) - ) - ) + pos = parser->findNext(boundarySep, pos); + + if (pos == utility::stream::npos) + break; // not found + + if (pos != 0) { - break; + parser->seek(pos - 1); + + if (parser->peekByte() != '\n') + { + // Boundary is not at a beginning of a line + pos++; + continue; + } + + parser->skip(1 + boundarySep.length()); + } + else + { + parser->seek(pos + boundarySep.length()); } - // boundary not a beginning of line, or just a prefix of another, continue the search. + const utility::stream::value_type next = parser->peekByte(); + + if (next == '\r' || next == '\n' || next == '-') + break; + + // Boundary is a prefix of another, continue the search pos++; } } @@ -234,13 +320,13 @@ void body::parse(const string& buffer, const string::size_type position, m_contents = vmime::create (); // Last part was not found: recover from missing boundary - if (!lastPart && pos == string::npos) + if (!lastPart && pos == utility::stream::npos) { ref part = vmime::create (); try { - part->parse(buffer, partStart, end); + part->parse(parser, partStart, end); } catch (std::exception&) { @@ -255,7 +341,7 @@ void body::parse(const string& buffer, const string::size_type position, else if (partStart < end) { vmime::text text; - text.parse(buffer, partStart, end); + text.parse(parser, partStart, end); m_epilogText = text.getWholeBuffer(); } @@ -282,7 +368,13 @@ void body::parse(const string& buffer, const string::size_type position, } // Extract the (encoded) contents - m_contents = vmime::create (buffer, position, end, enc); + const utility::stream::size_type length = end - position; + + ref contentStream = + vmime::create + (parser->getUnderlyingStream(), position, length); + + m_contents = vmime::create (contentStream, length, enc); } setParsedBounds(position, end); @@ -292,7 +384,7 @@ void body::parse(const string& buffer, const string::size_type position, } -void body::generate(utility::outputStream& os, const string::size_type maxLineLength, +void body::generateImpl(utility::outputStream& os, const string::size_type maxLineLength, const string::size_type /* curLinePos */, string::size_type* newLinePos) const { // MIME-Multipart @@ -862,9 +954,9 @@ const std::vector > body::getPartList() } -const std::vector > body::getChildComponents() const +const std::vector > body::getChildComponents() { - std::vector > list; + std::vector > list; copy_vector(m_parts, list); -- cgit v1.2.3