Reduce allocations in URLParser
authorachristensen@apple.com <achristensen@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Tue, 20 Sep 2016 21:50:30 +0000 (21:50 +0000)
committerachristensen@apple.com <achristensen@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Tue, 20 Sep 2016 21:50:30 +0000 (21:50 +0000)
https://bugs.webkit.org/show_bug.cgi?id=162241

Reviewed by Chris Dumez.

Source/WebCore:

Use Vectors instead of StringBuilders.  This allows us to use the inline capacity on the stack
for short URLs (<2KB) and also allows us to skip branches because we know whether the
contained type is UChar or LChar at compile time.  It also allows us to use uncheckedAppend.

Added new API tests for parts that were less tested, but there is
no change in behavior except for a performance improvement.

* platform/URLParser.cpp:
(WebCore::appendCodePoint):
(WebCore::encodeQuery):
(WebCore::URLParser::failure):
(WebCore::URLParser::parse):
(WebCore::percentDecode):
(WebCore::domainToASCII):
(WebCore::hasInvalidDomainCharacter):
(WebCore::URLParser::parseHost):
(WebCore::formURLDecode):
(WebCore::isC0Control): Deleted.
* platform/URLParser.h:

Tools:

* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::TEST_F):
(TestWebKitAPI::checkURL):

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@206177 268f45cc-cd09-0410-ab3c-d52691b4dbfc

Source/WebCore/ChangeLog
Source/WebCore/platform/URLParser.cpp
Source/WebCore/platform/URLParser.h
Tools/ChangeLog
Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp

index bdab29e..ab9aff8 100644 (file)
@@ -1,3 +1,30 @@
+2016-09-20  Alex Christensen  <achristensen@webkit.org>
+
+        Reduce allocations in URLParser
+        https://bugs.webkit.org/show_bug.cgi?id=162241
+
+        Reviewed by Chris Dumez.
+
+        Use Vectors instead of StringBuilders.  This allows us to use the inline capacity on the stack
+        for short URLs (<2KB) and also allows us to skip branches because we know whether the
+        contained type is UChar or LChar at compile time.  It also allows us to use uncheckedAppend.
+
+        Added new API tests for parts that were less tested, but there is
+        no change in behavior except for a performance improvement.
+
+        * platform/URLParser.cpp:
+        (WebCore::appendCodePoint):
+        (WebCore::encodeQuery):
+        (WebCore::URLParser::failure):
+        (WebCore::URLParser::parse):
+        (WebCore::percentDecode):
+        (WebCore::domainToASCII):
+        (WebCore::hasInvalidDomainCharacter):
+        (WebCore::URLParser::parseHost):
+        (WebCore::formURLDecode):
+        (WebCore::isC0Control): Deleted.
+        * platform/URLParser.h:
+
 2016-09-20  Nan Wang  <n_wang@apple.com>
 
         AX: voiceover does not read contents of input role="spinbutton"
index eb0f79b..30bb22e 100644 (file)
 #include <array>
 #include <unicode/uidna.h>
 #include <unicode/utypes.h>
-#include <wtf/HashMap.h>
-#include <wtf/NeverDestroyed.h>
-#include <wtf/text/StringBuilder.h>
-#include <wtf/text/StringHash.h>
 
 namespace WebCore {
 
@@ -115,6 +111,17 @@ auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
     m_begin += i;
     return *this;
 }
+    
+static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
+{
+    if (U_IS_BMP(codePoint)) {
+        destination.append(static_cast<UChar>(codePoint));
+        return;
+    }
+    destination.reserveCapacity(destination.size() + 2);
+    destination.uncheckedAppend(U16_LEAD(codePoint));
+    destination.uncheckedAppend(U16_TRAIL(codePoint));
+}
 
 enum URLCharacterClass {
     UserInfo = 0x1,
@@ -504,10 +511,10 @@ inline static void utf8PercentEncodeQuery(UChar32 codePoint, Vector<LChar>& dest
     }
 }
     
-inline static void encodeQuery(const StringBuilder& source, Vector<LChar>& destination, const TextEncoding& encoding)
+inline static void encodeQuery(const Vector<UChar>& source, Vector<LChar>& destination, const TextEncoding& encoding)
 {
     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
-    CString encoded = encoding.encode(source.toStringPreserveCapacity(), URLEncodedEntitiesForUnencodables);
+    CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
     const char* data = encoded.data();
     size_t length = encoded.length();
     for (size_t i = 0; i < length; ++i) {
@@ -912,7 +919,7 @@ URL URLParser::parseSerializedURL(const String& input)
         return parse<serialized>(input.characters8(), input.length(), { }, UTF8Encoding());
     return parse<serialized>(input.characters16(), input.length(), { }, UTF8Encoding());
 }
-    
+
 template<bool serialized, typename CharacterType>
 URL URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
 {
@@ -923,7 +930,7 @@ URL URLParser::parse(const CharacterType* input, const unsigned length, const UR
     m_asciiBuffer.reserveCapacity(length);
     
     bool isUTF8Encoding = encoding == UTF8Encoding();
-    StringBuilder queryBuffer;
+    Vector<UChar> queryBuffer;
 
     unsigned endIndex = length;
     while (endIndex && isC0ControlOrSpace(input[endIndex - 1]))
@@ -1408,7 +1415,7 @@ URL URLParser::parse(const CharacterType* input, const unsigned length, const UR
             if (isUTF8Encoding)
                 utf8PercentEncodeQuery<serialized>(*c, m_asciiBuffer);
             else
-                queryBuffer.append(*c);
+                appendCodePoint(queryBuffer, *c);
             ++c;
             break;
         case State::Fragment:
@@ -1416,7 +1423,7 @@ URL URLParser::parse(const CharacterType* input, const unsigned length, const UR
             if (m_unicodeFragmentBuffer.isEmpty() && isASCII(*c))
                 m_asciiBuffer.append(*c);
             else
-                m_unicodeFragmentBuffer.append(*c);
+                appendCodePoint(m_unicodeFragmentBuffer, *c);
             ++c;
             break;
         }
@@ -1926,25 +1933,27 @@ inline static Optional<std::array<uint16_t, 8>> parseIPv6Host(CodePointIterator<
     return address;
 }
 
-// FIXME: This should return a CString.
-inline static String percentDecode(const LChar* input, size_t length)
+const size_t defaultInlineBufferSize = 2048;
+
+inline static Vector<LChar, defaultInlineBufferSize> percentDecode(const LChar* input, size_t length)
 {
-    StringBuilder output;
+    Vector<LChar, defaultInlineBufferSize> output;
+    output.reserveInitialCapacity(length);
     
     for (size_t i = 0; i < length; ++i) {
         uint8_t byte = input[i];
         if (byte != '%')
-            output.append(byte);
+            output.uncheckedAppend(byte);
         else if (i < length - 2) {
             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
-                output.append(toASCIIHexValue(input[i + 1], input[i + 2]));
+                output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
                 i += 2;
             } else
-                output.append(byte);
+                output.uncheckedAppend(byte);
         } else
-            output.append(byte);
+            output.uncheckedAppend(byte);
     }
-    return output.toStringPreserveCapacity();
+    return output;
 }
 
 inline static bool containsOnlyASCII(const String& string)
@@ -1954,22 +1963,26 @@ inline static bool containsOnlyASCII(const String& string)
     return charactersAreAllASCII(string.characters16(), string.length());
 }
 
-inline static Optional<String> domainToASCII(const String& domain)
+inline static Optional<Vector<LChar, defaultInlineBufferSize>> domainToASCII(const String& domain)
 {
-    const unsigned hostnameBufferLength = 2048;
-
+    Vector<LChar, defaultInlineBufferSize> ascii;
     if (containsOnlyASCII(domain)) {
-        if (domain.is8Bit())
-            return domain.convertToASCIILowercase();
-        Vector<LChar, hostnameBufferLength> buffer;
         size_t length = domain.length();
-        buffer.reserveInitialCapacity(length);
-        for (size_t i = 0; i < length; ++i)
-            buffer.append(toASCIILower(domain[i]));
-        return String(buffer.data(), length);
+        if (domain.is8Bit()) {
+            const LChar* characters = domain.characters8();
+            ascii.reserveInitialCapacity(length);
+            for (size_t i = 0; i < length; ++i)
+                ascii.uncheckedAppend(toASCIILower(characters[i]));
+        } else {
+            const UChar* characters = domain.characters16();
+            ascii.reserveInitialCapacity(length);
+            for (size_t i = 0; i < length; ++i)
+                ascii.uncheckedAppend(toASCIILower(characters[i]));
+        }
+        return ascii;
     }
     
-    UChar hostnameBuffer[hostnameBufferLength];
+    UChar hostnameBuffer[defaultInlineBufferSize];
     UErrorCode error = U_ZERO_ERROR;
 
 #if COMPILER(GCC) || COMPILER(CLANG)
@@ -1977,30 +1990,29 @@ inline static Optional<String> domainToASCII(const String& domain)
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
     // FIXME: This should use uidna_openUTS46 / uidna_close instead
-    int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
+    int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
 #if COMPILER(GCC) || COMPILER(CLANG)
 #pragma GCC diagnostic pop
 #endif
+    ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
 
     if (error == U_ZERO_ERROR) {
-        LChar buffer[hostnameBufferLength];
         for (int32_t i = 0; i < numCharactersConverted; ++i) {
             ASSERT(isASCII(hostnameBuffer[i]));
-            buffer[i] = hostnameBuffer[i];
+            ASSERT(!isASCIIUpper(hostnameBuffer[i]));
         }
-        return String(buffer, numCharactersConverted);
+        ascii.append(hostnameBuffer, numCharactersConverted);
+        return ascii;
     }
 
     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
     return Nullopt;
 }
 
-inline static bool hasInvalidDomainCharacter(const String& asciiDomain)
+inline static bool hasInvalidDomainCharacter(const Vector<LChar, defaultInlineBufferSize>& asciiDomain)
 {
-    RELEASE_ASSERT(asciiDomain.is8Bit());
-    const LChar* characters = asciiDomain.characters8();
-    for (size_t i = 0; i < asciiDomain.length(); ++i) {
-        if (isInvalidDomainCharacter(characters[i]))
+    for (size_t i = 0; i < asciiDomain.size(); ++i) {
+        if (isInvalidDomainCharacter(asciiDomain[i]))
             return true;
     }
     return false;
@@ -2095,9 +2107,8 @@ bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
         m_url.m_portEnd = m_asciiBuffer.size();
         return true;
     }
-
-    // FIXME: We probably don't need to make so many buffers and String copies.
-    StringBuilder utf8Encoded;
+    
+    Vector<LChar, defaultInlineBufferSize> utf8Encoded;
     for (; !iterator.atEnd(); ++iterator) {
         if (!serialized && isTabOrNewline(*iterator))
             continue;
@@ -2111,18 +2122,15 @@ bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
         // FIXME: Check error.
         utf8Encoded.append(buffer, offset);
     }
-    RELEASE_ASSERT(utf8Encoded.is8Bit());
-    String percentDecoded = percentDecode(utf8Encoded.characters8(), utf8Encoded.length());
-    RELEASE_ASSERT(percentDecoded.is8Bit());
-    String domain = String::fromUTF8(percentDecoded.characters8(), percentDecoded.length());
+    Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size());
+    String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
     auto asciiDomain = domainToASCII(domain);
     if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
         return false;
-    String& asciiDomainValue = asciiDomain.value();
-    RELEASE_ASSERT(asciiDomainValue.is8Bit());
-    const LChar* asciiDomainCharacters = asciiDomainValue.characters8();
+    Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
+    const LChar* asciiDomainCharacters = asciiDomainValue.data();
 
-    if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainCharacters, asciiDomainCharacters + asciiDomainValue.length()))) {
+    if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
         serializeIPv4(address.value(), m_asciiBuffer);
         m_url.m_hostEnd = m_asciiBuffer.size();
         if (iterator.atEnd()) {
@@ -2133,7 +2141,7 @@ bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
         return parsePort<serialized>(iterator);
     }
 
-    m_asciiBuffer.append(asciiDomainCharacters, asciiDomainValue.length());
+    m_asciiBuffer.append(asciiDomainCharacters, asciiDomainValue.size());
     m_url.m_hostEnd = m_asciiBuffer.size();
     if (!iterator.atEnd()) {
         ASSERT(*iterator == ':');
@@ -2150,8 +2158,7 @@ inline static Optional<String> formURLDecode(StringView input)
     if (utf8.isNull())
         return Nullopt;
     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
-    RELEASE_ASSERT(percentDecoded.is8Bit());
-    return String::fromUTF8(percentDecoded.characters8(), percentDecoded.length());
+    return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
 }
 
 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm
index e2f5047..692bc63 100644 (file)
@@ -52,7 +52,7 @@ public:
 private:
     URL m_url;
     Vector<LChar> m_asciiBuffer;
-    Vector<UChar32> m_unicodeFragmentBuffer;
+    Vector<UChar> m_unicodeFragmentBuffer;
     bool m_urlIsSpecial { false };
     bool m_hostHasPercentOrNonASCII { false };
 
index bf42aaa..2b1db95 100644 (file)
@@ -1,5 +1,16 @@
 2016-09-20  Alex Christensen  <achristensen@webkit.org>
 
+        Reduce allocations in URLParser
+        https://bugs.webkit.org/show_bug.cgi?id=162241
+
+        Reviewed by Chris Dumez.
+
+        * TestWebKitAPI/Tests/WebCore/URLParser.cpp:
+        (TestWebKitAPI::TEST_F):
+        (TestWebKitAPI::checkURL):
+
+2016-09-20  Alex Christensen  <achristensen@webkit.org>
+
         Align URLParser with web platform tests when parsing non-special relative URLs ending in AuthorityOrHost state
         https://bugs.webkit.org/show_bug.cgi?id=162251
 
index 859e5fc..0ca8efb 100644 (file)
@@ -26,6 +26,7 @@
 #include "config.h"
 #include <WebCore/URLParser.h>
 #include <wtf/MainThread.h>
+#include <wtf/text/StringBuilder.h>
 
 using namespace WebCore;
 
@@ -558,6 +559,9 @@ TEST_F(URLParserTest, ParserDifferences)
     checkRelativeURLDifferences("foo://", "http://example.org/foo/bar",
         {"foo", "", "", "", 0, "/", "", "", "foo:///"},
         {"foo", "", "", "", 0, "//", "", "", "foo://"});
+    checkURLDifferences(wideString(L"http://host?ß😍#ß😍"),
+        {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", wideString(L"ß😍"), wideString(L"http://host/?%C3%9F%F0%9F%98%8D#ß😍")},
+        {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", "%C3%9F%F0%9F%98%8D", "http://host/?%C3%9F%F0%9F%98%8D#%C3%9F%F0%9F%98%8D"});
 
     // This matches the spec and web platform tests, but not Chrome, Firefox, or URL::parse.
     checkRelativeURLDifferences("notspecial:/", "about:blank",
@@ -649,6 +653,10 @@ TEST_F(URLParserTest, DefaultPort)
     checkURLDifferences("unknown://host:81",
         {"unknown", "", "", "host", 81, "/", "", "", "unknown://host:81/"},
         {"unknown", "", "", "host", 81, "", "", "", "unknown://host:81"});
+    checkURLDifferences("http://%48OsT",
+        {"http", "", "", "host", 0, "/", "", "", "http://host/"},
+        {"http", "", "", "%48ost", 0, "/", "", "", "http://%48ost/"});
+
 }
     
 static void shouldFail(const String& urlString)
@@ -713,4 +721,25 @@ TEST_F(URLParserTest, AdditionalTests)
     checkRelativeURL("notspecial:", "http://example.org/foo/bar", {"notspecial", "", "", "", 0, "", "", "", "notspecial:"});
 }
 
+static void checkURL(const String& urlString, const TextEncoding& encoding, const ExpectedParts& parts)
+{
+    URLParser parser;
+    auto url = parser.parse(urlString, { }, encoding);
+    EXPECT_TRUE(eq(parts.protocol, url.protocol()));
+    EXPECT_TRUE(eq(parts.user, url.user()));
+    EXPECT_TRUE(eq(parts.password, url.pass()));
+    EXPECT_TRUE(eq(parts.host, url.host()));
+    EXPECT_EQ(parts.port, url.port());
+    EXPECT_TRUE(eq(parts.path, url.path()));
+    EXPECT_TRUE(eq(parts.query, url.query()));
+    EXPECT_TRUE(eq(parts.fragment, url.fragmentIdentifier()));
+    EXPECT_TRUE(eq(parts.string, url.string()));
+}
+
+TEST_F(URLParserTest, QueryEncoding)
+{
+    checkURL(wideString(L"http://host?ß😍#ß😍"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", wideString(L"ß😍"), wideString(L"http://host/?%C3%9F%F0%9F%98%8D#ß😍")});
+    // FIXME: Add tests with other encodings.
+}
+
 } // namespace TestWebKitAPI