From 4fb6dfb731807a489f4a4d847968c5d52378ad8d Mon Sep 17 00:00:00 2001 From: "achristensen@apple.com" Date: Tue, 20 Sep 2016 21:50:30 +0000 Subject: [PATCH] Reduce allocations in URLParser https://bugs.webkit.org/show_bug.cgi?id=162241 Reviewed by Chris Dumez. Source/WebCore: Use Vectors instead of StringBuilders. This allows us to use the inline capacity on the stack for short URLs (<2KB) and also allows us to skip branches because we know whether the contained type is UChar or LChar at compile time. It also allows us to use uncheckedAppend. Added new API tests for parts that were less tested, but there is no change in behavior except for a performance improvement. * platform/URLParser.cpp: (WebCore::appendCodePoint): (WebCore::encodeQuery): (WebCore::URLParser::failure): (WebCore::URLParser::parse): (WebCore::percentDecode): (WebCore::domainToASCII): (WebCore::hasInvalidDomainCharacter): (WebCore::URLParser::parseHost): (WebCore::formURLDecode): (WebCore::isC0Control): Deleted. * platform/URLParser.h: Tools: * TestWebKitAPI/Tests/WebCore/URLParser.cpp: (TestWebKitAPI::TEST_F): (TestWebKitAPI::checkURL): git-svn-id: https://svn.webkit.org/repository/webkit/trunk@206177 268f45cc-cd09-0410-ab3c-d52691b4dbfc --- Source/WebCore/ChangeLog | 27 ++++++ Source/WebCore/platform/URLParser.cpp | 111 +++++++++++++----------- Source/WebCore/platform/URLParser.h | 2 +- Tools/ChangeLog | 11 +++ Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp | 29 +++++++ 5 files changed, 127 insertions(+), 53 deletions(-) diff --git a/Source/WebCore/ChangeLog b/Source/WebCore/ChangeLog index bdab29e..ab9aff8 100644 --- a/Source/WebCore/ChangeLog +++ b/Source/WebCore/ChangeLog @@ -1,3 +1,30 @@ +2016-09-20 Alex Christensen + + Reduce allocations in URLParser + https://bugs.webkit.org/show_bug.cgi?id=162241 + + Reviewed by Chris Dumez. + + Use Vectors instead of StringBuilders. This allows us to use the inline capacity on the stack + for short URLs (<2KB) and also allows us to skip branches because we know whether the + contained type is UChar or LChar at compile time. It also allows us to use uncheckedAppend. + + Added new API tests for parts that were less tested, but there is + no change in behavior except for a performance improvement. + + * platform/URLParser.cpp: + (WebCore::appendCodePoint): + (WebCore::encodeQuery): + (WebCore::URLParser::failure): + (WebCore::URLParser::parse): + (WebCore::percentDecode): + (WebCore::domainToASCII): + (WebCore::hasInvalidDomainCharacter): + (WebCore::URLParser::parseHost): + (WebCore::formURLDecode): + (WebCore::isC0Control): Deleted. + * platform/URLParser.h: + 2016-09-20 Nan Wang AX: voiceover does not read contents of input role="spinbutton" diff --git a/Source/WebCore/platform/URLParser.cpp b/Source/WebCore/platform/URLParser.cpp index eb0f79b..30bb22e 100644 --- a/Source/WebCore/platform/URLParser.cpp +++ b/Source/WebCore/platform/URLParser.cpp @@ -30,10 +30,6 @@ #include #include #include -#include -#include -#include -#include namespace WebCore { @@ -115,6 +111,17 @@ auto CodePointIterator::operator++() -> CodePointIterator& m_begin += i; return *this; } + +static void appendCodePoint(Vector& destination, UChar32 codePoint) +{ + if (U_IS_BMP(codePoint)) { + destination.append(static_cast(codePoint)); + return; + } + destination.reserveCapacity(destination.size() + 2); + destination.uncheckedAppend(U16_LEAD(codePoint)); + destination.uncheckedAppend(U16_TRAIL(codePoint)); +} enum URLCharacterClass { UserInfo = 0x1, @@ -504,10 +511,10 @@ inline static void utf8PercentEncodeQuery(UChar32 codePoint, Vector& dest } } -inline static void encodeQuery(const StringBuilder& source, Vector& destination, const TextEncoding& encoding) +inline static void encodeQuery(const Vector& source, Vector& destination, const TextEncoding& encoding) { // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested. - CString encoded = encoding.encode(source.toStringPreserveCapacity(), URLEncodedEntitiesForUnencodables); + CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables); const char* data = encoded.data(); size_t length = encoded.length(); for (size_t i = 0; i < length; ++i) { @@ -912,7 +919,7 @@ URL URLParser::parseSerializedURL(const String& input) return parse(input.characters8(), input.length(), { }, UTF8Encoding()); return parse(input.characters16(), input.length(), { }, UTF8Encoding()); } - + template URL URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding) { @@ -923,7 +930,7 @@ URL URLParser::parse(const CharacterType* input, const unsigned length, const UR m_asciiBuffer.reserveCapacity(length); bool isUTF8Encoding = encoding == UTF8Encoding(); - StringBuilder queryBuffer; + Vector queryBuffer; unsigned endIndex = length; while (endIndex && isC0ControlOrSpace(input[endIndex - 1])) @@ -1408,7 +1415,7 @@ URL URLParser::parse(const CharacterType* input, const unsigned length, const UR if (isUTF8Encoding) utf8PercentEncodeQuery(*c, m_asciiBuffer); else - queryBuffer.append(*c); + appendCodePoint(queryBuffer, *c); ++c; break; case State::Fragment: @@ -1416,7 +1423,7 @@ URL URLParser::parse(const CharacterType* input, const unsigned length, const UR if (m_unicodeFragmentBuffer.isEmpty() && isASCII(*c)) m_asciiBuffer.append(*c); else - m_unicodeFragmentBuffer.append(*c); + appendCodePoint(m_unicodeFragmentBuffer, *c); ++c; break; } @@ -1926,25 +1933,27 @@ inline static Optional> parseIPv6Host(CodePointIterator< return address; } -// FIXME: This should return a CString. -inline static String percentDecode(const LChar* input, size_t length) +const size_t defaultInlineBufferSize = 2048; + +inline static Vector percentDecode(const LChar* input, size_t length) { - StringBuilder output; + Vector output; + output.reserveInitialCapacity(length); for (size_t i = 0; i < length; ++i) { uint8_t byte = input[i]; if (byte != '%') - output.append(byte); + output.uncheckedAppend(byte); else if (i < length - 2) { if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) { - output.append(toASCIIHexValue(input[i + 1], input[i + 2])); + output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2])); i += 2; } else - output.append(byte); + output.uncheckedAppend(byte); } else - output.append(byte); + output.uncheckedAppend(byte); } - return output.toStringPreserveCapacity(); + return output; } inline static bool containsOnlyASCII(const String& string) @@ -1954,22 +1963,26 @@ inline static bool containsOnlyASCII(const String& string) return charactersAreAllASCII(string.characters16(), string.length()); } -inline static Optional domainToASCII(const String& domain) +inline static Optional> domainToASCII(const String& domain) { - const unsigned hostnameBufferLength = 2048; - + Vector ascii; if (containsOnlyASCII(domain)) { - if (domain.is8Bit()) - return domain.convertToASCIILowercase(); - Vector buffer; size_t length = domain.length(); - buffer.reserveInitialCapacity(length); - for (size_t i = 0; i < length; ++i) - buffer.append(toASCIILower(domain[i])); - return String(buffer.data(), length); + if (domain.is8Bit()) { + const LChar* characters = domain.characters8(); + ascii.reserveInitialCapacity(length); + for (size_t i = 0; i < length; ++i) + ascii.uncheckedAppend(toASCIILower(characters[i])); + } else { + const UChar* characters = domain.characters16(); + ascii.reserveInitialCapacity(length); + for (size_t i = 0; i < length; ++i) + ascii.uncheckedAppend(toASCIILower(characters[i])); + } + return ascii; } - UChar hostnameBuffer[hostnameBufferLength]; + UChar hostnameBuffer[defaultInlineBufferSize]; UErrorCode error = U_ZERO_ERROR; #if COMPILER(GCC) || COMPILER(CLANG) @@ -1977,30 +1990,29 @@ inline static Optional domainToASCII(const String& domain) #pragma GCC diagnostic ignored "-Wdeprecated-declarations" #endif // FIXME: This should use uidna_openUTS46 / uidna_close instead - int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, nullptr, &error); + int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error); #if COMPILER(GCC) || COMPILER(CLANG) #pragma GCC diagnostic pop #endif + ASSERT(numCharactersConverted <= static_cast(defaultInlineBufferSize)); if (error == U_ZERO_ERROR) { - LChar buffer[hostnameBufferLength]; for (int32_t i = 0; i < numCharactersConverted; ++i) { ASSERT(isASCII(hostnameBuffer[i])); - buffer[i] = hostnameBuffer[i]; + ASSERT(!isASCIIUpper(hostnameBuffer[i])); } - return String(buffer, numCharactersConverted); + ascii.append(hostnameBuffer, numCharactersConverted); + return ascii; } // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer. return Nullopt; } -inline static bool hasInvalidDomainCharacter(const String& asciiDomain) +inline static bool hasInvalidDomainCharacter(const Vector& asciiDomain) { - RELEASE_ASSERT(asciiDomain.is8Bit()); - const LChar* characters = asciiDomain.characters8(); - for (size_t i = 0; i < asciiDomain.length(); ++i) { - if (isInvalidDomainCharacter(characters[i])) + for (size_t i = 0; i < asciiDomain.size(); ++i) { + if (isInvalidDomainCharacter(asciiDomain[i])) return true; } return false; @@ -2095,9 +2107,8 @@ bool URLParser::parseHostAndPort(CodePointIterator iterator) m_url.m_portEnd = m_asciiBuffer.size(); return true; } - - // FIXME: We probably don't need to make so many buffers and String copies. - StringBuilder utf8Encoded; + + Vector utf8Encoded; for (; !iterator.atEnd(); ++iterator) { if (!serialized && isTabOrNewline(*iterator)) continue; @@ -2111,18 +2122,15 @@ bool URLParser::parseHostAndPort(CodePointIterator iterator) // FIXME: Check error. utf8Encoded.append(buffer, offset); } - RELEASE_ASSERT(utf8Encoded.is8Bit()); - String percentDecoded = percentDecode(utf8Encoded.characters8(), utf8Encoded.length()); - RELEASE_ASSERT(percentDecoded.is8Bit()); - String domain = String::fromUTF8(percentDecoded.characters8(), percentDecoded.length()); + Vector percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size()); + String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size()); auto asciiDomain = domainToASCII(domain); if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value())) return false; - String& asciiDomainValue = asciiDomain.value(); - RELEASE_ASSERT(asciiDomainValue.is8Bit()); - const LChar* asciiDomainCharacters = asciiDomainValue.characters8(); + Vector& asciiDomainValue = asciiDomain.value(); + const LChar* asciiDomainCharacters = asciiDomainValue.data(); - if (auto address = parseIPv4Host(CodePointIterator(asciiDomainCharacters, asciiDomainCharacters + asciiDomainValue.length()))) { + if (auto address = parseIPv4Host(CodePointIterator(asciiDomainValue.begin(), asciiDomainValue.end()))) { serializeIPv4(address.value(), m_asciiBuffer); m_url.m_hostEnd = m_asciiBuffer.size(); if (iterator.atEnd()) { @@ -2133,7 +2141,7 @@ bool URLParser::parseHostAndPort(CodePointIterator iterator) return parsePort(iterator); } - m_asciiBuffer.append(asciiDomainCharacters, asciiDomainValue.length()); + m_asciiBuffer.append(asciiDomainCharacters, asciiDomainValue.size()); m_url.m_hostEnd = m_asciiBuffer.size(); if (!iterator.atEnd()) { ASSERT(*iterator == ':'); @@ -2150,8 +2158,7 @@ inline static Optional formURLDecode(StringView input) if (utf8.isNull()) return Nullopt; auto percentDecoded = percentDecode(reinterpret_cast(utf8.data()), utf8.length()); - RELEASE_ASSERT(percentDecoded.is8Bit()); - return String::fromUTF8(percentDecoded.characters8(), percentDecoded.length()); + return String::fromUTF8(percentDecoded.data(), percentDecoded.size()); } auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm diff --git a/Source/WebCore/platform/URLParser.h b/Source/WebCore/platform/URLParser.h index e2f5047..692bc63 100644 --- a/Source/WebCore/platform/URLParser.h +++ b/Source/WebCore/platform/URLParser.h @@ -52,7 +52,7 @@ public: private: URL m_url; Vector m_asciiBuffer; - Vector m_unicodeFragmentBuffer; + Vector m_unicodeFragmentBuffer; bool m_urlIsSpecial { false }; bool m_hostHasPercentOrNonASCII { false }; diff --git a/Tools/ChangeLog b/Tools/ChangeLog index bf42aaa..2b1db95 100644 --- a/Tools/ChangeLog +++ b/Tools/ChangeLog @@ -1,5 +1,16 @@ 2016-09-20 Alex Christensen + Reduce allocations in URLParser + https://bugs.webkit.org/show_bug.cgi?id=162241 + + Reviewed by Chris Dumez. + + * TestWebKitAPI/Tests/WebCore/URLParser.cpp: + (TestWebKitAPI::TEST_F): + (TestWebKitAPI::checkURL): + +2016-09-20 Alex Christensen + Align URLParser with web platform tests when parsing non-special relative URLs ending in AuthorityOrHost state https://bugs.webkit.org/show_bug.cgi?id=162251 diff --git a/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp b/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp index 859e5fc..0ca8efb 100644 --- a/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp +++ b/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp @@ -26,6 +26,7 @@ #include "config.h" #include #include +#include using namespace WebCore; @@ -558,6 +559,9 @@ TEST_F(URLParserTest, ParserDifferences) checkRelativeURLDifferences("foo://", "http://example.org/foo/bar", {"foo", "", "", "", 0, "/", "", "", "foo:///"}, {"foo", "", "", "", 0, "//", "", "", "foo://"}); + checkURLDifferences(wideString(L"http://host?ß😍#ß😍"), + {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", wideString(L"ß😍"), wideString(L"http://host/?%C3%9F%F0%9F%98%8D#ß😍")}, + {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", "%C3%9F%F0%9F%98%8D", "http://host/?%C3%9F%F0%9F%98%8D#%C3%9F%F0%9F%98%8D"}); // This matches the spec and web platform tests, but not Chrome, Firefox, or URL::parse. checkRelativeURLDifferences("notspecial:/", "about:blank", @@ -649,6 +653,10 @@ TEST_F(URLParserTest, DefaultPort) checkURLDifferences("unknown://host:81", {"unknown", "", "", "host", 81, "/", "", "", "unknown://host:81/"}, {"unknown", "", "", "host", 81, "", "", "", "unknown://host:81"}); + checkURLDifferences("http://%48OsT", + {"http", "", "", "host", 0, "/", "", "", "http://host/"}, + {"http", "", "", "%48ost", 0, "/", "", "", "http://%48ost/"}); + } static void shouldFail(const String& urlString) @@ -713,4 +721,25 @@ TEST_F(URLParserTest, AdditionalTests) checkRelativeURL("notspecial:", "http://example.org/foo/bar", {"notspecial", "", "", "", 0, "", "", "", "notspecial:"}); } +static void checkURL(const String& urlString, const TextEncoding& encoding, const ExpectedParts& parts) +{ + URLParser parser; + auto url = parser.parse(urlString, { }, encoding); + EXPECT_TRUE(eq(parts.protocol, url.protocol())); + EXPECT_TRUE(eq(parts.user, url.user())); + EXPECT_TRUE(eq(parts.password, url.pass())); + EXPECT_TRUE(eq(parts.host, url.host())); + EXPECT_EQ(parts.port, url.port()); + EXPECT_TRUE(eq(parts.path, url.path())); + EXPECT_TRUE(eq(parts.query, url.query())); + EXPECT_TRUE(eq(parts.fragment, url.fragmentIdentifier())); + EXPECT_TRUE(eq(parts.string, url.string())); +} + +TEST_F(URLParserTest, QueryEncoding) +{ + checkURL(wideString(L"http://host?ß😍#ß😍"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", wideString(L"ß😍"), wideString(L"http://host/?%C3%9F%F0%9F%98%8D#ß😍")}); + // FIXME: Add tests with other encodings. +} + } // namespace TestWebKitAPI -- 1.8.3.1