Convert AppCache manifest parser over to using StringParsingBuffer
authorweinig@apple.com <weinig@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Tue, 30 Jun 2020 03:52:16 +0000 (03:52 +0000)
committerweinig@apple.com <weinig@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Tue, 30 Jun 2020 03:52:16 +0000 (03:52 +0000)
https://bugs.webkit.org/show_bug.cgi?id=213680

Reviewed by Darin Adler.

- Renames parseManifest to parseApplicationCacheManifest to differentiate between the manifest
  for the application cache and the "application manifest", which is a different thing entirely.
  Also renames the container struct from being called Manifest to ApplicationCacheManifest.
  (The file should be renamed as well, but will do that in a seperate pass).
- Update parser to return an Optional<ApplicationCacheManifest> rather than using bool + out
  parameter.
- Adopt readCharactersForParsing to replace unnecessary call to StringView::upconvertedCharacters().
- Adopt StringParsingBuffer and ParsingUtilities along with some refinements to the code
  to make the intent more clear.

* html/parser/ParsingUtilities.h:
(WebCore::skipUntil):
Fix formatting, putting the whole signature on one line.

* loader/appcache/ApplicationCacheGroup.cpp:
(WebCore::ApplicationCacheGroup::didFinishLoadingManifest):
Update for new parser function name and Optional return type.

* loader/appcache/ManifestParser.cpp:
(WebCore::isManifestWhitespace):
(WebCore::isManifestNewline):
(WebCore::isManifestWhitespaceOrNewline):
(WebCore::makeManifestURL):
(WebCore::parseApplicationCacheManifest):
* loader/appcache/ManifestParser.h:
Update parsing logic to use readCharactersForParsing (to avoid upconvesion) and rework
using StringParsingBuffer/ParsingUtilities to make things more clear.

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@263722 268f45cc-cd09-0410-ab3c-d52691b4dbfc

Source/WebCore/ChangeLog
Source/WebCore/html/parser/ParsingUtilities.h
Source/WebCore/loader/appcache/ApplicationCacheGroup.cpp
Source/WebCore/loader/appcache/ManifestParser.cpp
Source/WebCore/loader/appcache/ManifestParser.h

index 0fecb6e..dc54f1f 100644 (file)
@@ -1,3 +1,39 @@
+2020-06-29  Sam Weinig  <weinig@apple.com>
+
+        Convert AppCache manifest parser over to using StringParsingBuffer
+        https://bugs.webkit.org/show_bug.cgi?id=213680
+
+        Reviewed by Darin Adler.
+
+        - Renames parseManifest to parseApplicationCacheManifest to differentiate between the manifest
+          for the application cache and the "application manifest", which is a different thing entirely.
+          Also renames the container struct from being called Manifest to ApplicationCacheManifest.
+          (The file should be renamed as well, but will do that in a seperate pass). 
+        - Update parser to return an Optional<ApplicationCacheManifest> rather than using bool + out
+          parameter.
+        - Adopt readCharactersForParsing to replace unnecessary call to StringView::upconvertedCharacters().
+        - Adopt StringParsingBuffer and ParsingUtilities along with some refinements to the code
+          to make the intent more clear.
+
+
+        * html/parser/ParsingUtilities.h:
+        (WebCore::skipUntil):
+        Fix formatting, putting the whole signature on one line.
+
+        * loader/appcache/ApplicationCacheGroup.cpp:
+        (WebCore::ApplicationCacheGroup::didFinishLoadingManifest):
+        Update for new parser function name and Optional return type.
+
+        * loader/appcache/ManifestParser.cpp:
+        (WebCore::isManifestWhitespace):
+        (WebCore::isManifestNewline):
+        (WebCore::isManifestWhitespaceOrNewline):
+        (WebCore::makeManifestURL):
+        (WebCore::parseApplicationCacheManifest):
+        * loader/appcache/ManifestParser.h:
+        Update parsing logic to use readCharactersForParsing (to avoid upconvesion) and rework
+        using StringParsingBuffer/ParsingUtilities to make things more clear.
+
 2020-06-29  Zalan Bujtas  <zalan@apple.com>
 
         [LFC][TFC] Adjust baseline when the table cell itself establishes an IFC
index e5e2b19..dd0c288 100644 (file)
@@ -81,8 +81,7 @@ template<typename CharacterType, typename DelimiterType> void skipUntil(const Ch
         ++position;
 }
 
-template<typename CharacterType, typename DelimiterType>
-void skipUntil(StringParsingBuffer<CharacterType>& buffer, DelimiterType delimiter)
+template<typename CharacterType, typename DelimiterType> void skipUntil(StringParsingBuffer<CharacterType>& buffer, DelimiterType delimiter)
 {
     while (buffer.hasCharactersRemaining() && *buffer != delimiter)
         ++buffer;
index b863e17..11bc37a 100644 (file)
@@ -605,8 +605,8 @@ void ApplicationCacheGroup::didFinishLoadingManifest()
         }
     }
     
-    Manifest manifest;
-    if (!parseManifest(m_manifestURL, m_manifestResource->response().mimeType(), m_manifestResource->data().data(), m_manifestResource->data().size(), manifest)) {
+    auto manifest = parseApplicationCacheManifest(m_manifestURL, m_manifestResource->response().mimeType(), m_manifestResource->data().data(), m_manifestResource->data().size());
+    if (!manifest) {
         // At the time of this writing, lack of "CACHE MANIFEST" signature is the only reason for parseManifest to fail.
         m_frame->document()->addConsoleMessage(MessageSource::AppCache, MessageLevel::Error, "Application Cache manifest could not be parsed. Does it start with CACHE MANIFEST?"_s);
         cacheUpdateFailed();
@@ -635,15 +635,15 @@ void ApplicationCacheGroup::didFinishLoadingManifest()
         }
     }
     
-    for (const auto& explicitURL : manifest.explicitURLs)
+    for (const auto& explicitURL : manifest->explicitURLs)
         addEntry(explicitURL, ApplicationCacheResource::Explicit);
 
-    for (auto& fallbackURL : manifest.fallbackURLs)
+    for (auto& fallbackURL : manifest->fallbackURLs)
         addEntry(fallbackURL.second.string(), ApplicationCacheResource::Fallback);
     
-    m_cacheBeingUpdated->setOnlineAllowlist(manifest.onlineAllowedURLs);
-    m_cacheBeingUpdated->setFallbackURLs(manifest.fallbackURLs);
-    m_cacheBeingUpdated->setAllowsAllNetworkRequests(manifest.allowAllNetworkRequests);
+    m_cacheBeingUpdated->setOnlineAllowlist(manifest->onlineAllowedURLs);
+    m_cacheBeingUpdated->setFallbackURLs(manifest->fallbackURLs);
+    m_cacheBeingUpdated->setAllowsAllNetworkRequests(manifest->allowAllNetworkRequests);
 
     m_progressTotal = m_pendingEntries.size();
     m_progressDone = 0;
index d2f1858..aa54462 100644 (file)
 #include "config.h"
 #include "ManifestParser.h"
 
+#include "ParsingUtilities.h"
 #include "TextResourceDecoder.h"
 #include <wtf/URL.h>
+#include <wtf/text/StringParsingBuffer.h>
 #include <wtf/text/StringView.h>
-#include <wtf/unicode/CharacterNames.h>
 
 namespace WebCore {
 
-enum Mode { Explicit, Fallback, OnlineAllowlist, Unknown };
+enum class ApplicationCacheParserMode { Explicit, Fallback, OnlineAllowlist, Unknown };
 
 static StringView manifestPath(const URL& manifestURL)
 {
@@ -44,160 +45,192 @@ static StringView manifestPath(const URL& manifestURL)
     return manifestPath;
 }
 
-bool parseManifest(const URL& manifestURL, const String& manifestMIMEType, const char* data, int length, Manifest& manifest)
+template<typename CharacterType> static constexpr bool isManifestWhitespace(CharacterType character)
 {
-    ASSERT(manifest.explicitURLs.isEmpty());
-    ASSERT(manifest.onlineAllowedURLs.isEmpty());
-    ASSERT(manifest.fallbackURLs.isEmpty());
-    manifest.allowAllNetworkRequests = false;
+    return character == ' ' || character == '\t';
+}
 
-    auto manifestPath = WebCore::manifestPath(manifestURL);
+template<typename CharacterType> static constexpr bool isManifestNewline(CharacterType character)
+{
+    return character == '\n' || character == '\r';
+}
+
+template<typename CharacterType> static constexpr bool isManifestWhitespaceOrNewline(CharacterType character)
+{
+    return isManifestWhitespace(character) || isManifestNewline(character);
+}
+
+template<typename CharacterType> static URL makeManifestURL(const URL& manifestURL, const CharacterType* start, const CharacterType* end)
+{
+    URL url(manifestURL, String(start, end - start));
+    url.removeFragmentIdentifier();
+    return url;
+}
+
+template<typename CharacterType> static constexpr CharacterType cacheManifestIdentifier[] = { 'C', 'A', 'C', 'H', 'E', ' ', 'M', 'A', 'N', 'I', 'F', 'E', 'S', 'T' };
+template<typename CharacterType> static constexpr CharacterType cacheModeIdentifier[] = { 'C', 'A', 'C', 'H', 'E' };
+template<typename CharacterType> static constexpr CharacterType fallbackModeIdentifier[] = { 'F', 'A', 'L', 'L', 'B', 'A', 'C', 'K' };
+template<typename CharacterType> static constexpr CharacterType networkModeIdentifier[] = { 'N', 'E', 'T', 'W', 'O', 'R', 'K' };
 
-    const char cacheManifestMIMEType[] = "text/cache-manifest";
+Optional<ApplicationCacheManifest> parseApplicationCacheManifest(const URL& manifestURL, const String& manifestMIMEType, const char* data, int length)
+{
+    static constexpr const char cacheManifestMIMEType[] = "text/cache-manifest";
     bool allowFallbackNamespaceOutsideManfestPath = equalLettersIgnoringASCIICase(manifestMIMEType, cacheManifestMIMEType);
+    auto manifestPath = WebCore::manifestPath(manifestURL);
 
-    Mode mode = Explicit;
+    auto manifestString = TextResourceDecoder::create(ASCIILiteral::fromLiteralUnsafe(cacheManifestMIMEType), "UTF-8")->decodeAndFlush(data, length);
 
-    String manifestString = TextResourceDecoder::create(ASCIILiteral::fromLiteralUnsafe(cacheManifestMIMEType), "UTF-8")->decodeAndFlush(data, length);
+    return readCharactersForParsing(manifestString, [&](auto buffer) -> Optional<ApplicationCacheManifest> {
+        using CharacterType = typename decltype(buffer)::CharacterType;
     
-    // Look for the magic signature: "^\xFEFF?CACHE MANIFEST[ \t]?" (the BOM is removed by TextResourceDecoder).
-    // Example: "CACHE MANIFEST #comment" is a valid signature.
-    // Example: "CACHE MANIFEST;V2" is not.
-    const char manifestSignature[] = "CACHE MANIFEST";
-    if (!manifestString.startsWith(manifestSignature))
-        return false;
+        ApplicationCacheManifest manifest;
+        auto mode = ApplicationCacheParserMode::Explicit;
+
+        // Look for the magic signature: "^\xFEFF?CACHE MANIFEST[ \t]?" (the BOM is removed by TextResourceDecoder).
+        // Example: "CACHE MANIFEST #comment" is a valid signature.
+        // Example: "CACHE MANIFEST;V2" is not.
+        if (!skipCharactersExactly(buffer, cacheManifestIdentifier<CharacterType>))
+            return WTF::nullopt;
     
-    StringView manifestAfterSignature = StringView(manifestString).substring(sizeof(manifestSignature) - 1);
-    auto upconvertedCharacters = manifestAfterSignature.upconvertedCharacters();
-    const UChar* p = upconvertedCharacters;
-    const UChar* end = p + manifestAfterSignature.length();
-
-    if (p < end && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r')
-        return false;
-
-    // Skip to the end of the line.
-    while (p < end && *p != '\r' && *p != '\n')
-        p++;
-
-    while (1) {
-        // Skip whitespace
-        while (p < end && (*p == '\n' || *p == '\r' || *p == ' ' || *p == '\t'))
-            p++;
-        
-        if (p == end)
-            break;
-        
-        const UChar* lineStart = p;
-        
-        // Find the end of the line
-        while (p < end && *p != '\r' && *p != '\n')
-            p++;
-        
-        // Check if we have a comment
-        if (*lineStart == '#')
-            continue;
-        
-        // Get rid of trailing whitespace
-        const UChar* tmp = p - 1;
-        while (tmp > lineStart && (*tmp == ' ' || *tmp == '\t'))
-            tmp--;
-        
-        String line(lineStart, tmp - lineStart + 1);
-
-        if (line == "CACHE:") 
-            mode = Explicit;
-        else if (line == "FALLBACK:")
-            mode = Fallback;
-        else if (line == "NETWORK:")
-            mode = OnlineAllowlist;
-        else if (line.endsWith(':'))
-            mode = Unknown;
-        else if (mode == Unknown)
-            continue;
-        else if (mode == Explicit || mode == OnlineAllowlist) {
-            auto upconvertedLineCharacters = StringView(line).upconvertedCharacters();
-            const UChar* p = upconvertedLineCharacters;
-            const UChar* lineEnd = p + line.length();
-            
-            // Look for whitespace separating the URL from subsequent ignored tokens.
-            while (p < lineEnd && *p != '\t' && *p != ' ') 
-                p++;
+        if (buffer.hasCharactersRemaining() && !isManifestWhitespaceOrNewline(*buffer))
+            return WTF::nullopt;
 
-            if (mode == OnlineAllowlist && p - upconvertedLineCharacters == 1 && line[0] == '*') {
-                // Wildcard was found.
-                manifest.allowAllNetworkRequests = true;
-                continue;
-            }
+        // Skip to the end of the line.
+        skipUntil<CharacterType, isManifestNewline>(buffer);
 
-            URL url(manifestURL, line.substring(0, p - upconvertedLineCharacters));
+        while (1) {
+            // Skip whitespace
+            skipWhile<CharacterType, isManifestWhitespaceOrNewline>(buffer);
             
-            if (!url.isValid())
-                continue;
-
-            url.removeFragmentIdentifier();
+            if (buffer.atEnd())
+                break;
+            
+            auto lineStart = buffer.position();
             
-            if (!equalIgnoringASCIICase(url.protocol(), manifestURL.protocol()))
+            // Find the end of the line
+            skipUntil<CharacterType, isManifestNewline>(buffer);
+            
+            // Line is a comment, skip to the next line.
+            if (*lineStart == '#')
                 continue;
             
-            if (mode == Explicit && manifestURL.protocolIs("https") && !protocolHostAndPortAreEqual(manifestURL, url))
+            // Get rid of trailing whitespace
+            auto lineEnd = buffer.position() - 1;
+            while (lineEnd > lineStart && isManifestWhitespace(*lineEnd))
+                --lineEnd;
+
+            auto lineBuffer = StringParsingBuffer { lineStart, lineEnd + 1 };
+
+            if (lineBuffer[lineBuffer.lengthRemaining() - 1] == ':') {
+                if (skipCharactersExactly(lineBuffer, cacheModeIdentifier<CharacterType>) && lineBuffer.lengthRemaining() == 1) {
+                    mode = ApplicationCacheParserMode::Explicit;
+                    continue;
+                }
+                if (skipCharactersExactly(lineBuffer, fallbackModeIdentifier<CharacterType>) && lineBuffer.lengthRemaining() == 1) {
+                    mode = ApplicationCacheParserMode::Fallback;
+                    continue;
+                }
+                if (skipCharactersExactly(lineBuffer, networkModeIdentifier<CharacterType>) && lineBuffer.lengthRemaining() == 1) {
+                    mode = ApplicationCacheParserMode::OnlineAllowlist;
+                    continue;
+                }
+
+                // If the line (excluding the trailing whitespace) ends with a ':' and isn't one of the known mode
+                // headers, transition to the 'Unknown' mode.
+                mode = ApplicationCacheParserMode::Unknown;
+                continue;
+            }
+    
+            switch (mode) {
+            case ApplicationCacheParserMode::Unknown:
                 continue;
             
-            if (mode == Explicit)
+            case ApplicationCacheParserMode::Explicit: {
+                // Look for whitespace separating the URL from subsequent ignored tokens.
+                skipUntil<CharacterType, isManifestWhitespace>(lineBuffer);
+
+                auto url = makeManifestURL(manifestURL, lineStart, lineBuffer.position());
+                if (!url.isValid())
+                    continue;
+                
+                if (!equalIgnoringASCIICase(url.protocol(), manifestURL.protocol()))
+                    continue;
+                
+                if (manifestURL.protocolIs("https") && !protocolHostAndPortAreEqual(manifestURL, url))
+                    continue;
+                
                 manifest.explicitURLs.add(url.string());
-            else
-                manifest.onlineAllowedURLs.append(url);
-            
-        } else if (mode == Fallback) {
-            auto upconvertedLineCharacters = StringView(line).upconvertedCharacters();
-            const UChar* p = upconvertedLineCharacters;
-            const UChar* lineEnd = p + line.length();
-            
-            // Look for whitespace separating the two URLs
-            while (p < lineEnd && *p != '\t' && *p != ' ') 
-                p++;
+                continue;
+            }
+
+            case ApplicationCacheParserMode::OnlineAllowlist: {
+                // Look for whitespace separating the URL from subsequent ignored tokens.
+                skipUntil<CharacterType, isManifestWhitespace>(lineBuffer);
+
+                if (lineBuffer.position() - lineStart == 1 && *lineStart == '*') {
+                    // Wildcard was found.
+                    manifest.allowAllNetworkRequests = true;
+                    continue;
+                }
+                
+                auto url = makeManifestURL(manifestURL, lineStart, lineBuffer.position());
+                if (!url.isValid())
+                    continue;
+                
+                if (!equalIgnoringASCIICase(url.protocol(), manifestURL.protocol()))
+                    continue;
 
-            if (p == lineEnd) {
-                // There was no whitespace separating the URLs.
+                manifest.onlineAllowedURLs.append(url);
                 continue;
             }
             
-            URL namespaceURL(manifestURL, line.substring(0, p - upconvertedLineCharacters));
-            if (!namespaceURL.isValid())
-                continue;
-            namespaceURL.removeFragmentIdentifier();
+            case ApplicationCacheParserMode::Fallback: {
+                // Look for whitespace separating the two URLs
+                skipUntil<CharacterType, isManifestWhitespace>(lineBuffer);
 
-            if (!protocolHostAndPortAreEqual(manifestURL, namespaceURL))
-                continue;
+                if (lineBuffer.atEnd()) {
+                    // There was no whitespace separating the URLs.
+                    continue;
+                }
 
-            // Although <https://html.spec.whatwg.org/multipage/offline.html#parsing-cache-manifests> (07/06/2017) saids
-            // that we should always prefix match the manifest path we only do so if the manifest was served with a non-
-            // standard HTTP Content-Type header for web compatibility.
-            if (!allowFallbackNamespaceOutsideManfestPath && !namespaceURL.path().startsWith(manifestPath))
-                continue;
+                auto namespaceURL = makeManifestURL(manifestURL, lineStart, lineBuffer.position());
+                if (!namespaceURL.isValid())
+                    continue;
 
-            // Skip whitespace separating fallback namespace from URL.
-            while (p < lineEnd && (*p == '\t' || *p == ' '))
-                p++;
+                if (!protocolHostAndPortAreEqual(manifestURL, namespaceURL))
+                    continue;
 
-            // Look for whitespace separating the URL from subsequent ignored tokens.
-            const UChar* fallbackStart = p;
-            while (p < lineEnd && *p != '\t' && *p != ' ') 
-                p++;
+                // Although <https://html.spec.whatwg.org/multipage/offline.html#parsing-cache-manifests> (07/06/2017) saids
+                // that we should always prefix match the manifest path we only do so if the manifest was served with a non-
+                // standard HTTP Content-Type header for web compatibility.
+                if (!allowFallbackNamespaceOutsideManfestPath && !namespaceURL.path().startsWith(manifestPath))
+                    continue;
 
-            URL fallbackURL(manifestURL, String(fallbackStart, p - fallbackStart));
-            if (!fallbackURL.isValid())
-                continue;
-            fallbackURL.removeFragmentIdentifier();
+                // Skip whitespace separating fallback namespace from URL.
+                skipWhile<CharacterType, isManifestWhitespace>(lineBuffer);
 
-            if (!protocolHostAndPortAreEqual(manifestURL, fallbackURL))
-                continue;
+                auto fallbackStart = lineBuffer.position();
+
+                // Look for whitespace separating the URL from subsequent ignored tokens.
+                skipUntil<CharacterType, isManifestWhitespace>(lineBuffer);
+
+                auto fallbackURL = makeManifestURL(manifestURL, fallbackStart, lineBuffer.position());
+                if (!fallbackURL.isValid())
+                    continue;
 
-            manifest.fallbackURLs.append(std::make_pair(namespaceURL, fallbackURL));            
-        } else 
+                if (!protocolHostAndPortAreEqual(manifestURL, fallbackURL))
+                    continue;
+
+                manifest.fallbackURLs.append(std::make_pair(namespaceURL, fallbackURL));
+                continue;
+            }
+            }
+            
             ASSERT_NOT_REACHED();
-    }
+        }
 
-    return true;
+        return manifest;
+    });
 }
 
 }
index ab4a6dd..14f19fd 100644 (file)
 
 namespace WebCore {
 
-struct Manifest {
+struct ApplicationCacheManifest {
     Vector<URL> onlineAllowedURLs;
     HashSet<String> explicitURLs;
     FallbackURLVector fallbackURLs;
-    bool allowAllNetworkRequests; // Wildcard found in NETWORK section.
+    bool allowAllNetworkRequests { false }; // Wildcard found in NETWORK section.
 };
 
-bool parseManifest(const URL& manifestURL, const String& manifestMIMEType, const char* data, int length, Manifest&);
+Optional<ApplicationCacheManifest> parseApplicationCacheManifest(const URL& manifestURL, const String& manifestMIMEType, const char* data, int length);
 
 } // namespace WebCore