Reduce allocations in URLParser
[WebKit-https.git] / Source / WebCore / platform / URLParser.cpp
index eb0f79b..30bb22e 100644 (file)
 #include <array>
 #include <unicode/uidna.h>
 #include <unicode/utypes.h>
-#include <wtf/HashMap.h>
-#include <wtf/NeverDestroyed.h>
-#include <wtf/text/StringBuilder.h>
-#include <wtf/text/StringHash.h>
 
 namespace WebCore {
 
@@ -115,6 +111,17 @@ auto CodePointIterator<UChar>::operator++() -> CodePointIterator&
     m_begin += i;
     return *this;
 }
+    
+static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
+{
+    if (!U_IS_BMP(codePoint)) { // Outside the BMP: store as a UTF-16 lead/trail surrogate pair.
+        destination.reserveCapacity(destination.size() + 2);
+        destination.uncheckedAppend(U16_LEAD(codePoint));
+        destination.uncheckedAppend(U16_TRAIL(codePoint));
+        return;
+    }
+    destination.append(static_cast<UChar>(codePoint)); // BMP code points fit in a single UChar.
+}
 
 enum URLCharacterClass {
     UserInfo = 0x1,
@@ -504,10 +511,10 @@ inline static void utf8PercentEncodeQuery(UChar32 codePoint, Vector<LChar>& dest
     }
 }
     
-inline static void encodeQuery(const StringBuilder& source, Vector<LChar>& destination, const TextEncoding& encoding)
+inline static void encodeQuery(const Vector<UChar>& source, Vector<LChar>& destination, const TextEncoding& encoding)
 {
     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
-    CString encoded = encoding.encode(source.toStringPreserveCapacity(), URLEncodedEntitiesForUnencodables);
+    CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
     const char* data = encoded.data();
     size_t length = encoded.length();
     for (size_t i = 0; i < length; ++i) {
@@ -912,7 +919,7 @@ URL URLParser::parseSerializedURL(const String& input)
         return parse<serialized>(input.characters8(), input.length(), { }, UTF8Encoding());
     return parse<serialized>(input.characters16(), input.length(), { }, UTF8Encoding());
 }
-    
+
 template<bool serialized, typename CharacterType>
 URL URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
 {
@@ -923,7 +930,7 @@ URL URLParser::parse(const CharacterType* input, const unsigned length, const UR
     m_asciiBuffer.reserveCapacity(length);
     
     bool isUTF8Encoding = encoding == UTF8Encoding();
-    StringBuilder queryBuffer;
+    Vector<UChar> queryBuffer;
 
     unsigned endIndex = length;
     while (endIndex && isC0ControlOrSpace(input[endIndex - 1]))
@@ -1408,7 +1415,7 @@ URL URLParser::parse(const CharacterType* input, const unsigned length, const UR
             if (isUTF8Encoding)
                 utf8PercentEncodeQuery<serialized>(*c, m_asciiBuffer);
             else
-                queryBuffer.append(*c);
+                appendCodePoint(queryBuffer, *c);
             ++c;
             break;
         case State::Fragment:
@@ -1416,7 +1423,7 @@ URL URLParser::parse(const CharacterType* input, const unsigned length, const UR
             if (m_unicodeFragmentBuffer.isEmpty() && isASCII(*c))
                 m_asciiBuffer.append(*c);
             else
-                m_unicodeFragmentBuffer.append(*c);
+                appendCodePoint(m_unicodeFragmentBuffer, *c);
             ++c;
             break;
         }
@@ -1926,25 +1933,27 @@ inline static Optional<std::array<uint16_t, 8>> parseIPv6Host(CodePointIterator<
     return address;
 }
 
-// FIXME: This should return a CString.
-inline static String percentDecode(const LChar* input, size_t length)
+const size_t defaultInlineBufferSize = 2048;
+
+inline static Vector<LChar, defaultInlineBufferSize> percentDecode(const LChar* input, size_t length)
 {
-    StringBuilder output;
+    Vector<LChar, defaultInlineBufferSize> output;
+    output.reserveInitialCapacity(length); // Exactly one byte is emitted per loop iteration, so length bounds every uncheckedAppend below.
     
     for (size_t i = 0; i < length; ++i) {
         uint8_t byte = input[i];
         if (byte != '%')
-            output.append(byte);
+            output.uncheckedAppend(byte);
-        else if (i < length - 2) {
+        else if (length - i > 2) { // i < length here, so this cannot wrap; "length - 2" wraps when length < 2 and would read past the input.
             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
-                output.append(toASCIIHexValue(input[i + 1], input[i + 2]));
+                output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
                 i += 2;
             } else
-                output.append(byte);
+                output.uncheckedAppend(byte);
         } else
-            output.append(byte);
+            output.uncheckedAppend(byte);
     }
-    return output.toStringPreserveCapacity();
+    return output;
 }
 
 inline static bool containsOnlyASCII(const String& string)
@@ -1954,22 +1963,26 @@ inline static bool containsOnlyASCII(const String& string)
     return charactersAreAllASCII(string.characters16(), string.length());
 }
 
-inline static Optional<String> domainToASCII(const String& domain)
+inline static Optional<Vector<LChar, defaultInlineBufferSize>> domainToASCII(const String& domain)
 {
-    const unsigned hostnameBufferLength = 2048;
-
+    Vector<LChar, defaultInlineBufferSize> ascii;
     if (containsOnlyASCII(domain)) {
-        if (domain.is8Bit())
-            return domain.convertToASCIILowercase();
-        Vector<LChar, hostnameBufferLength> buffer;
         size_t length = domain.length();
-        buffer.reserveInitialCapacity(length);
-        for (size_t i = 0; i < length; ++i)
-            buffer.append(toASCIILower(domain[i]));
-        return String(buffer.data(), length);
+        if (domain.is8Bit()) {
+            const LChar* characters = domain.characters8();
+            ascii.reserveInitialCapacity(length);
+            for (size_t i = 0; i < length; ++i)
+                ascii.uncheckedAppend(toASCIILower(characters[i]));
+        } else {
+            const UChar* characters = domain.characters16();
+            ascii.reserveInitialCapacity(length);
+            for (size_t i = 0; i < length; ++i)
+                ascii.uncheckedAppend(toASCIILower(characters[i]));
+        }
+        return ascii;
     }
     
-    UChar hostnameBuffer[hostnameBufferLength];
+    UChar hostnameBuffer[defaultInlineBufferSize];
     UErrorCode error = U_ZERO_ERROR;
 
 #if COMPILER(GCC) || COMPILER(CLANG)
@@ -1977,30 +1990,29 @@ inline static Optional<String> domainToASCII(const String& domain)
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
     // FIXME: This should use uidna_openUTS46 / uidna_close instead
-    int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
+    int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
 #if COMPILER(GCC) || COMPILER(CLANG)
 #pragma GCC diagnostic pop
 #endif
 
     if (error == U_ZERO_ERROR) {
-        LChar buffer[hostnameBufferLength];
+        ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize)); // Only holds on success: on buffer overflow ICU returns the required length, which exceeds the buffer.
         for (int32_t i = 0; i < numCharactersConverted; ++i) {
             ASSERT(isASCII(hostnameBuffer[i]));
-            buffer[i] = hostnameBuffer[i];
+            ASSERT(!isASCIIUpper(hostnameBuffer[i]));
         }
-        return String(buffer, numCharactersConverted);
+        ascii.append(hostnameBuffer, numCharactersConverted);
+        return ascii;
     }
 
     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
     return Nullopt;
 }
 
-inline static bool hasInvalidDomainCharacter(const String& asciiDomain)
+inline static bool hasInvalidDomainCharacter(const Vector<LChar, defaultInlineBufferSize>& asciiDomain)
 {
-    RELEASE_ASSERT(asciiDomain.is8Bit());
-    const LChar* characters = asciiDomain.characters8();
-    for (size_t i = 0; i < asciiDomain.length(); ++i) {
-        if (isInvalidDomainCharacter(characters[i]))
+    for (size_t i = 0; i < asciiDomain.size(); ++i) {
+        if (isInvalidDomainCharacter(asciiDomain[i]))
             return true;
     }
     return false;
@@ -2095,9 +2107,8 @@ bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
         m_url.m_portEnd = m_asciiBuffer.size();
         return true;
     }
-
-    // FIXME: We probably don't need to make so many buffers and String copies.
-    StringBuilder utf8Encoded;
+    
+    Vector<LChar, defaultInlineBufferSize> utf8Encoded;
     for (; !iterator.atEnd(); ++iterator) {
         if (!serialized && isTabOrNewline(*iterator))
             continue;
@@ -2111,18 +2122,15 @@ bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
         // FIXME: Check error.
         utf8Encoded.append(buffer, offset);
     }
-    RELEASE_ASSERT(utf8Encoded.is8Bit());
-    String percentDecoded = percentDecode(utf8Encoded.characters8(), utf8Encoded.length());
-    RELEASE_ASSERT(percentDecoded.is8Bit());
-    String domain = String::fromUTF8(percentDecoded.characters8(), percentDecoded.length());
+    Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size());
+    String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
     auto asciiDomain = domainToASCII(domain);
     if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
         return false;
-    String& asciiDomainValue = asciiDomain.value();
-    RELEASE_ASSERT(asciiDomainValue.is8Bit());
-    const LChar* asciiDomainCharacters = asciiDomainValue.characters8();
+    Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
+    const LChar* asciiDomainCharacters = asciiDomainValue.data();
 
-    if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainCharacters, asciiDomainCharacters + asciiDomainValue.length()))) {
+    if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
         serializeIPv4(address.value(), m_asciiBuffer);
         m_url.m_hostEnd = m_asciiBuffer.size();
         if (iterator.atEnd()) {
@@ -2133,7 +2141,7 @@ bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator)
         return parsePort<serialized>(iterator);
     }
 
-    m_asciiBuffer.append(asciiDomainCharacters, asciiDomainValue.length());
+    m_asciiBuffer.append(asciiDomainCharacters, asciiDomainValue.size());
     m_url.m_hostEnd = m_asciiBuffer.size();
     if (!iterator.atEnd()) {
         ASSERT(*iterator == ':');
@@ -2150,8 +2158,7 @@ inline static Optional<String> formURLDecode(StringView input)
     if (utf8.isNull())
         return Nullopt;
     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
-    RELEASE_ASSERT(percentDecoded.is8Bit());
-    return String::fromUTF8(percentDecoded.characters8(), percentDecoded.length());
+    return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
 }
 
 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm