From e88b8eeac27ec3a829a4622c861a9018b9baa329 Mon Sep 17 00:00:00 2001 From: Vincent Richard Date: Sun, 3 May 2015 19:17:00 +0200 Subject: [PATCH] Fixed parsing of UTF8 email addresses (RFC-2047 local part + IDNA domain name). --- src/vmime/emailAddress.cpp | 206 +++++++++++++++++++++++++----- tests/parser/emailAddressTest.cpp | 16 +++ 2 files changed, 192 insertions(+), 30 deletions(-) diff --git a/src/vmime/emailAddress.cpp b/src/vmime/emailAddress.cpp index e185b17b..a898e1ce 100644 --- a/src/vmime/emailAddress.cpp +++ b/src/vmime/emailAddress.cpp @@ -35,6 +35,98 @@ namespace vmime { +/** Decode an IDNA-encoded domain name ("xn--5rtw95l.xn--wgv71a") + * to a fully decoded domain name in UTF-8 ("黒川.日本"). + * + * @param idnaDomain domain name encoded with IDNA + * @return decoded domain name in UTF-8 + */ +static const string domainNameFromIDNA(const string& idnaDomain) +{ + std::ostringstream domainName; + size_t p = 0; + + for (size_t n = idnaDomain.find('.', p) ; + (n = idnaDomain.find('.', p)) != string::npos ; p = n + 1) + { + const string encodedPart(idnaDomain.begin() + p, idnaDomain.begin() + n); + + if (encodedPart.length() > 4 && + encodedPart[0] == 'x' && encodedPart[1] == 'n' && + encodedPart[2] == '-' && encodedPart[3] == '-') + { + string decodedPart; + charset::convert(encodedPart, decodedPart, + vmime::charsets::IDNA, vmime::charsets::UTF_8); + + domainName << decodedPart << '.'; + } + else + { + domainName << encodedPart << '.'; // not encoded + } + } + + if (p < idnaDomain.length()) + { + const string encodedPart(idnaDomain.begin() + p, idnaDomain.end()); + + if (encodedPart.length() > 4 && + encodedPart[0] == 'x' && encodedPart[1] == 'n' && + encodedPart[2] == '-' && encodedPart[3] == '-') + { + string decodedPart; + charset::convert(encodedPart, decodedPart, + vmime::charsets::IDNA, vmime::charsets::UTF_8); + + domainName << decodedPart; + } + else + { + domainName << encodedPart; // not encoded + } + } + + return domainName.str(); +} + + +/** 
Encode a UTF-8 domain name ("黒川.日本") to an IDNA-encoded + * domain name ("xn--5rtw95l.xn--wgv71a"). + * + * @param domainName domain name in UTF-8 + * @return domain name encoded with IDNA + */ +static const string domainNameToIDNA(const string& domainName) +{ + std::ostringstream idnaDomain; + size_t p = 0; + + for (size_t n = domainName.find('.', p) ; + (n = domainName.find('.', p)) != string::npos ; p = n + 1) + { + string idnaPart; + charset::convert(string(domainName.begin() + p, domainName.begin() + n), + idnaPart, vmime::charsets::UTF_8, vmime::charsets::IDNA); + + idnaDomain << idnaPart << '.'; + } + + if (p < domainName.length()) + { + string idnaPart; + charset::convert(string(domainName.begin() + p, domainName.end()), + idnaPart, vmime::charsets::UTF_8, vmime::charsets::IDNA); + + idnaDomain << idnaPart; + } + + return idnaDomain.str(); +} + + + + emailAddress::emailAddress() { } @@ -86,6 +178,10 @@ void emailAddress::parseImpl State_LocalPartMiddle, State_LocalPartComment, State_LocalPartQuoted, + State_LocalPartRFC2047Start, + State_LocalPartRFC2047Middle, + State_LocalPartRFC2047MiddleQM, + State_LocalPartRFC2047End, State_DomainPartStart, State_DomainPartMiddle, State_DomainPartComment, @@ -101,6 +197,7 @@ void emailAddress::parseImpl bool atFound = false; bool stop = false; int commentLevel = 0; + bool localPartIsRFC2047 = false; while (p < pend && !stop) { @@ -128,6 +225,11 @@ void emailAddress::parseImpl state = State_LocalPartQuoted; ++p; } + else if (c == '=') + { + state = State_LocalPartRFC2047Start; + ++p; + } else if (c == '(') { state = State_LocalPartComment; @@ -214,6 +316,25 @@ void emailAddress::parseImpl break; + case State_LocalPartRFC2047Start: + + if (c == '?') + { + state = State_LocalPartRFC2047Middle; + localPart << "=?"; + localPartIsRFC2047 = true; + ++p; + } + else + { + state = State_LocalPartMiddle; + localPart << '='; + localPart << c; + ++p; + } + + break; + case State_LocalPartMiddle: if (c == '.') @@ -256,6 +377,55 @@
void emailAddress::parseImpl break; + case State_LocalPartRFC2047Middle: + + if (c == '?') + { + state = State_LocalPartRFC2047MiddleQM; + ++p; + } + else + { + localPart << c; + ++p; + } + + break; + + case State_LocalPartRFC2047MiddleQM: + + if (c == '=') + { + // End of RFC-2047 encoded word + state = State_LocalPartRFC2047End; + localPart << "?="; + ++p; + } + else + { + state = State_LocalPartRFC2047Middle; + localPart << '?'; + localPart << c; + ++p; + } + + break; + + case State_LocalPartRFC2047End: + + if (c == '@') + { + atFound = true; + state = State_DomainPartStart; + ++p; + } + else + { + state = State_End; + } + + break; + case State_DomainPartStart: if (c == '(') @@ -357,8 +527,12 @@ void emailAddress::parseImpl if (domainPart.str().empty() && !atFound) domainPart << platform::getHandler()->getHostName(); - m_localName = word(localPart.str(), vmime::charsets::UTF_8); - m_domainName = word(domainPart.str(), vmime::charsets::UTF_8); + if (localPartIsRFC2047) + m_localName.parse(localPart.str()); + else + m_localName = word(localPart.str(), vmime::charsets::UTF_8); + + m_domainName = word(domainNameFromIDNA(domainPart.str()), vmime::charsets::UTF_8); } setParsedBounds(position, p - pend); @@ -368,34 +542,6 @@ void emailAddress::parseImpl } -static const string domainNameToIDNA(const string& domainName) -{ - std::ostringstream idnaDomain; - size_t p = 0; - - for (size_t n = domainName.find('.', p) ; - (n = domainName.find('.', p)) != string::npos ; p = n + 1) - { - string idnaPart; - charset::convert(string(domainName.begin() + p, domainName.begin() + n), - idnaPart, vmime::charsets::UTF_8, vmime::charsets::IDNA); - - idnaDomain << idnaPart << '.'; - } - - if (p < domainName.length()) - { - string idnaPart; - charset::convert(string(domainName.begin() + p, domainName.end()), - idnaPart, vmime::charsets::UTF_8, vmime::charsets::IDNA); - - idnaDomain << idnaPart; - } - - return idnaDomain.str(); -} - - void emailAddress::generateImpl (const 
generationContext& ctx, utility::outputStream& os, const size_t curLinePos, size_t* newLinePos) const diff --git a/tests/parser/emailAddressTest.cpp b/tests/parser/emailAddressTest.cpp index 4916aaa9..2de401a9 100644 --- a/tests/parser/emailAddressTest.cpp +++ b/tests/parser/emailAddressTest.cpp @@ -40,6 +40,7 @@ VMIME_TEST_SUITE_BEGIN(emailAddressTest) VMIME_TEST(testParseSpecialChars) VMIME_TEST(testParseCommentInLocalPart) VMIME_TEST(testParseCommentInDomainPart) + VMIME_TEST(testParseRFC2047EncodedLocalPart) VMIME_TEST(testGenerateSpecialChars) VMIME_TEST_LIST_END @@ -198,6 +199,21 @@ VMIME_TEST_SUITE_BEGIN(emailAddressTest) VASSERT_EQ("4/domain", "example.com", eml4.getDomainName()); } + void testParseRFC2047EncodedLocalPart() + { + vmime::emailAddress eml1("=?utf-8?Q?Pel=C3=A9?=@example.com"); + VASSERT_EQ("1/local", "Pelé", eml1.getLocalName()); + VASSERT_EQ("1/domain", "example.com", eml1.getDomainName()); + + vmime::emailAddress eml2("=?utf-8?B?55Sy5paQ?=@xn--5rtw95l.xn--wgv71a"); + VASSERT_EQ("2/local", "甲斐", eml2.getLocalName()); + VASSERT_EQ("2/domain", "黒川.日本", eml2.getDomainName()); + + vmime::emailAddress eml3("=?utf-8?B?55Sy5paQ?=@xn--5rtw95l.com"); + VASSERT_EQ("3/local", "甲斐", eml3.getLocalName()); + VASSERT_EQ("3/domain", "黒川.com", eml3.getDomainName()); + } + void testGenerateASCII() { VASSERT_EQ("email 1", "local@domain", vmime::emailAddress("local", "domain").generate());