When building the NFA of the global disjunction, share the prefix subgraph of existin...
authorbenjamin@webkit.org <benjamin@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 15 Jan 2015 22:19:40 +0000 (22:19 +0000)
committerbenjamin@webkit.org <benjamin@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 15 Jan 2015 22:19:40 +0000 (22:19 +0000)
https://bugs.webkit.org/show_bug.cgi?id=140465

Reviewed by Andreas Kling.

This patch updates the parser to produce smaller graphs when multiple patterns
of the rule list share a common prefix.

Previously, GraphBuilder would generate subgraph in place of each parsed
atom. We now only create subgraph if an atom does not appear in the prefix tree.

We accumulate the parsing information into small uint16_t named TrivialAtom.
When generating the subgraph for an new atom, we first check if the prefix tree already
has a corresponding subgraph for that atom. If it does, we do not generate anything and we extend the existing
graph. If there is no existing prefix, we create the subgraph and extend the prefix tree.

Sharing prefix subtrees slows down the subtree generation a bit but the resulting graph is much
simpler for many kind of inputs.

* WebCore.xcodeproj/project.pbxproj:
* contentextensions/ContentExtensionsBackend.cpp:
(WebCore::ContentExtensions::ContentExtensionsBackend::setRuleList):
The URLFilterParser now maintains states (the prefix tree) between patterns.

* contentextensions/DFA.cpp:
(WebCore::ContentExtensions::DFA::debugPrintDot):
* contentextensions/DFANode.h:
Fix a typo :)

* contentextensions/NFA.cpp:
(WebCore::ContentExtensions::NFA::createNode):
(WebCore::ContentExtensions::NFA::setFinal):
(WebCore::ContentExtensions::NFA::restoreToGraphSize):
(WebCore::ContentExtensions::NFA::addRuleId):
(WebCore::ContentExtensions::NFA::debugPrintDot):
* contentextensions/NFA.h:
(WebCore::ContentExtensions::NFA::addRuleId):
* contentextensions/NFANode.cpp: Removed.
* contentextensions/NFANode.h:
NFA nodes from two patterns are now "merged" by construction, thus we need
to keep track of multiple rules per node.

* contentextensions/NFAToDFA.cpp:
(WebCore::ContentExtensions::NodeIdSetToUniqueNodeIdSetTranslator::translate):
* contentextensions/URLFilterParser.cpp:
(WebCore::ContentExtensions::trivialAtomFromAsciiCharacter):
(WebCore::ContentExtensions::quantifyTrivialAtom):
(WebCore::ContentExtensions::trivialAtomForNewlineClassIDBuiltin):
(WebCore::ContentExtensions::GraphBuilder::GraphBuilder):
(WebCore::ContentExtensions::GraphBuilder::m_LastPrefixTreeEntry):
(WebCore::ContentExtensions::GraphBuilder::finalize):
(WebCore::ContentExtensions::GraphBuilder::atomPatternCharacter):
(WebCore::ContentExtensions::GraphBuilder::atomBuiltInCharacterClass):
(WebCore::ContentExtensions::GraphBuilder::quantifyAtom):
(WebCore::ContentExtensions::GraphBuilder::fail):
(WebCore::ContentExtensions::GraphBuilder::generateTransition):
(WebCore::ContentExtensions::GraphBuilder::sinkTrivialAtom):
(WebCore::ContentExtensions::GraphBuilder::sinkPendingAtomIfNecessary):
(WebCore::ContentExtensions::URLFilterParser::URLFilterParser):
(WebCore::ContentExtensions::URLFilterParser::addPattern):
(WebCore::ContentExtensions::GraphBuilder::m_lastAtom): Deleted.
(WebCore::ContentExtensions::URLFilterParser::parse): Deleted.
* contentextensions/URLFilterParser.h:
(WebCore::ContentExtensions::URLFilterParser::hasError): Deleted.
(WebCore::ContentExtensions::URLFilterParser::errorMessage): Deleted.

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@178529 268f45cc-cd09-0410-ab3c-d52691b4dbfc

12 files changed:
Source/WebCore/ChangeLog
Source/WebCore/WebCore.xcodeproj/project.pbxproj
Source/WebCore/contentextensions/ContentExtensionsBackend.cpp
Source/WebCore/contentextensions/DFA.cpp
Source/WebCore/contentextensions/DFANode.h
Source/WebCore/contentextensions/NFA.cpp
Source/WebCore/contentextensions/NFA.h
Source/WebCore/contentextensions/NFANode.cpp [deleted file]
Source/WebCore/contentextensions/NFANode.h
Source/WebCore/contentextensions/NFAToDFA.cpp
Source/WebCore/contentextensions/URLFilterParser.cpp
Source/WebCore/contentextensions/URLFilterParser.h

index 34a7151..77c4c32 100644 (file)
@@ -1,3 +1,71 @@
+2015-01-15  Benjamin Poulain  <benjamin@webkit.org>
+
+        When building the NFA of the global disjunction, share the prefix subgraph of existing subpatterns
+        https://bugs.webkit.org/show_bug.cgi?id=140465
+
+        Reviewed by Andreas Kling.
+
+        This patch updates the parser to produce smaller graphs when multiple patterns
+        of the rule list share a common prefix.
+
+        Previously, GraphBuilder would generate subgraph in place of each parsed
+        atom. We now only create subgraph if an atom does not appear in the prefix tree.
+
+        We accumulate the parsing information into small uint16_t named TrivialAtom.
+        When generating the subgraph for an new atom, we first check if the prefix tree already
+        has a corresponding subgraph for that atom. If it does, we do not generate anything and we extend the existing
+        graph. If there is no existing prefix, we create the subgraph and extend the prefix tree.
+
+        Sharing prefix subtrees slows down the subtree generation a bit but the resulting graph is much
+        simpler for many kind of inputs.
+
+        * WebCore.xcodeproj/project.pbxproj:
+        * contentextensions/ContentExtensionsBackend.cpp:
+        (WebCore::ContentExtensions::ContentExtensionsBackend::setRuleList):
+        The URLFilterParser now maintains states (the prefix tree) between patterns.
+
+        * contentextensions/DFA.cpp:
+        (WebCore::ContentExtensions::DFA::debugPrintDot):
+        * contentextensions/DFANode.h:
+        Fix a typo :)
+
+        * contentextensions/NFA.cpp:
+        (WebCore::ContentExtensions::NFA::createNode):
+        (WebCore::ContentExtensions::NFA::setFinal):
+        (WebCore::ContentExtensions::NFA::restoreToGraphSize):
+        (WebCore::ContentExtensions::NFA::addRuleId):
+        (WebCore::ContentExtensions::NFA::debugPrintDot):
+        * contentextensions/NFA.h:
+        (WebCore::ContentExtensions::NFA::addRuleId):
+        * contentextensions/NFANode.cpp: Removed.
+        * contentextensions/NFANode.h:
+        NFA nodes from two patterns are now "merged" by construction, thus we need
+        to keep track of multiple rules per node.
+
+        * contentextensions/NFAToDFA.cpp:
+        (WebCore::ContentExtensions::NodeIdSetToUniqueNodeIdSetTranslator::translate):
+        * contentextensions/URLFilterParser.cpp:
+        (WebCore::ContentExtensions::trivialAtomFromAsciiCharacter):
+        (WebCore::ContentExtensions::quantifyTrivialAtom):
+        (WebCore::ContentExtensions::trivialAtomForNewlineClassIDBuiltin):
+        (WebCore::ContentExtensions::GraphBuilder::GraphBuilder):
+        (WebCore::ContentExtensions::GraphBuilder::m_LastPrefixTreeEntry):
+        (WebCore::ContentExtensions::GraphBuilder::finalize):
+        (WebCore::ContentExtensions::GraphBuilder::atomPatternCharacter):
+        (WebCore::ContentExtensions::GraphBuilder::atomBuiltInCharacterClass):
+        (WebCore::ContentExtensions::GraphBuilder::quantifyAtom):
+        (WebCore::ContentExtensions::GraphBuilder::fail):
+        (WebCore::ContentExtensions::GraphBuilder::generateTransition):
+        (WebCore::ContentExtensions::GraphBuilder::sinkTrivialAtom):
+        (WebCore::ContentExtensions::GraphBuilder::sinkPendingAtomIfNecessary):
+        (WebCore::ContentExtensions::URLFilterParser::URLFilterParser):
+        (WebCore::ContentExtensions::URLFilterParser::addPattern):
+        (WebCore::ContentExtensions::GraphBuilder::m_lastAtom): Deleted.
+        (WebCore::ContentExtensions::URLFilterParser::parse): Deleted.
+        * contentextensions/URLFilterParser.h:
+        (WebCore::ContentExtensions::URLFilterParser::hasError): Deleted.
+        (WebCore::ContentExtensions::URLFilterParser::errorMessage): Deleted.
+
 2015-01-14  Alexey Proskuryakov  <ap@apple.com>
 
         Web Inspector and regular console use different source code locations for messages
index 155c68f..b7c4137 100644 (file)
                267726041A5DF6F2003C24DD /* URLFilterParser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 267726021A5DF6F2003C24DD /* URLFilterParser.cpp */; };
                267726051A5DF6F2003C24DD /* URLFilterParser.h in Headers */ = {isa = PBXBuildFile; fileRef = 267726031A5DF6F2003C24DD /* URLFilterParser.h */; };
                269239961505E1AA009E57FC /* JSIDBVersionChangeEvent.h in Headers */ = {isa = PBXBuildFile; fileRef = 269239921505E1AA009E57FC /* JSIDBVersionChangeEvent.h */; };
-               269397211A4A412F00E8349D /* NFANode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2693971F1A4A412F00E8349D /* NFANode.cpp */; };
                269397221A4A412F00E8349D /* NFANode.h in Headers */ = {isa = PBXBuildFile; fileRef = 269397201A4A412F00E8349D /* NFANode.h */; };
                269397241A4A5B6400E8349D /* NFA.h in Headers */ = {isa = PBXBuildFile; fileRef = 269397231A4A5B6400E8349D /* NFA.h */; };
                269397261A4A5FBD00E8349D /* NFA.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 269397251A4A5FBD00E8349D /* NFA.cpp */; };
                267726031A5DF6F2003C24DD /* URLFilterParser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = URLFilterParser.h; sourceTree = "<group>"; };
                269239911505E1AA009E57FC /* JSIDBVersionChangeEvent.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = JSIDBVersionChangeEvent.cpp; sourceTree = "<group>"; };
                269239921505E1AA009E57FC /* JSIDBVersionChangeEvent.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = JSIDBVersionChangeEvent.h; sourceTree = "<group>"; };
-               2693971F1A4A412F00E8349D /* NFANode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NFANode.cpp; sourceTree = "<group>"; };
                269397201A4A412F00E8349D /* NFANode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = NFANode.h; sourceTree = "<group>"; };
                269397231A4A5B6400E8349D /* NFA.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = NFA.h; sourceTree = "<group>"; };
                269397251A4A5FBD00E8349D /* NFA.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NFA.cpp; sourceTree = "<group>"; };
                                267725F91A5B3AD9003C24DD /* DFANode.h */,
                                269397251A4A5FBD00E8349D /* NFA.cpp */,
                                269397231A4A5B6400E8349D /* NFA.h */,
-                               2693971F1A4A412F00E8349D /* NFANode.cpp */,
                                269397201A4A412F00E8349D /* NFANode.h */,
                                267725FA1A5B3AD9003C24DD /* NFAToDFA.cpp */,
                                267725FB1A5B3AD9003C24DD /* NFAToDFA.h */,
                                B2227A960D00BF220071B782 /* SVGPreserveAspectRatio.cpp in Sources */,
                                B543B85717EB758F003BE93A /* SVGPropertyInfo.cpp in Sources */,
                                B2227A990D00BF220071B782 /* SVGRadialGradientElement.cpp in Sources */,
-                               269397211A4A412F00E8349D /* NFANode.cpp in Sources */,
                                B2227A9D0D00BF220071B782 /* SVGRectElement.cpp in Sources */,
                                BC2274780E8366E200E7F975 /* SVGRenderStyle.cpp in Sources */,
                                BC22747A0E8366E200E7F975 /* SVGRenderStyleDefs.cpp in Sources */,
index 3a7190e..e032964 100644 (file)
@@ -64,16 +64,16 @@ void ContentExtensionsBackend::setRuleList(const String& identifier, const Vecto
 #endif
 
     NFA nfa;
+    URLFilterParser urlFilterParser(nfa);
     for (unsigned ruleIndex = 0; ruleIndex < ruleList.size(); ++ruleIndex) {
         const ContentExtensionRule& contentExtensionRule = ruleList[ruleIndex];
         const ContentExtensionRule::Trigger& trigger = contentExtensionRule.trigger();
         ASSERT(trigger.urlFilter.length());
 
-        URLFilterParser urlFilterParser;
-        urlFilterParser.parse(trigger.urlFilter, ruleIndex, nfa);
+        String error = urlFilterParser.addPattern(trigger.urlFilter, ruleIndex);
 
-        if (urlFilterParser.hasError()) {
-            dataLogF("Error while parsing %s: %s", trigger.urlFilter.utf8().data(), urlFilterParser.errorMessage().utf8().data());
+        if (!error.isNull()) {
+            dataLogF("Error while parsing %s: %s\n", trigger.urlFilter.utf8().data(), error.utf8().data());
             continue;
         }
     }
index 70fb98a..bbd58b0 100644 (file)
@@ -154,13 +154,13 @@ void DFA::debugPrintDot() const
             }
         }
 
-        Vector<unsigned> correspondingDFANodes = m_nodes[i].correspondingDFANodes;
-        ASSERT(!correspondingDFANodes.isEmpty());
+        Vector<unsigned> correspondingNFANodes = m_nodes[i].correspondingNFANodes;
+        ASSERT(!correspondingNFANodes.isEmpty());
         dataLogF("<BR/>NFA Nodes: ");
-        for (unsigned correspondingDFANodeIndex = 0; correspondingDFANodeIndex < correspondingDFANodes.size(); ++correspondingDFANodeIndex) {
+        for (unsigned correspondingDFANodeIndex = 0; correspondingDFANodeIndex < correspondingNFANodes.size(); ++correspondingDFANodeIndex) {
             if (correspondingDFANodeIndex)
                 dataLogF(", ");
-            dataLogF("%d", correspondingDFANodes[correspondingDFANodeIndex]);
+            dataLogF("%d", correspondingNFANodes[correspondingDFANodeIndex]);
         }
 
         dataLogF(">]");
index 46b5183..bce23d6 100644 (file)
@@ -44,7 +44,7 @@ public:
     Vector<uint64_t> actions;
 
 #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
-    Vector<unsigned> correspondingDFANodes;
+    Vector<unsigned> correspondingNFANodes;
 #endif
 };
 
index 3ed619a..1ab990a 100644 (file)
@@ -40,10 +40,10 @@ NFA::NFA()
 {
 }
 
-unsigned NFA::createNode(uint64_t ruleId)
+unsigned NFA::createNode()
 {
     unsigned nextId = m_nodes.size();
-    m_nodes.append(NFANode(ruleId));
+    m_nodes.append(NFANode());
     return nextId;
 }
 
@@ -66,10 +66,10 @@ void NFA::addEpsilonTransition(unsigned from, unsigned to)
     addResult.iterator->value.add(to);
 }
 
-void NFA::setFinal(unsigned node)
+void NFA::setFinal(unsigned node, uint64_t ruleId)
 {
-    ASSERT(node < m_nodes.size());
-    m_nodes[node].isFinal = true;
+    ASSERT(!m_nodes[node].finalRuleIds.contains(ruleId));
+    m_nodes[node].finalRuleIds.append(ruleId);
 }
 
 unsigned NFA::graphSize() const
@@ -79,7 +79,7 @@ unsigned NFA::graphSize() const
 
 void NFA::restoreToGraphSize(unsigned size)
 {
-    ASSERT(size > 1);
+    ASSERT(size >= 1);
     ASSERT(size <= graphSize());
 
     m_nodes.shrink(size);
@@ -87,6 +87,12 @@ void NFA::restoreToGraphSize(unsigned size)
 
 #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
 
+void NFA::addRuleId(unsigned node, uint64_t ruleId)
+{
+    ASSERT(!m_nodes[node].ruleIds.contains(ruleId));
+    m_nodes[node].ruleIds.append(ruleId);
+}
+
 static void printRange(bool firstRange, uint16_t rangeStart, uint16_t rangeEnd, uint16_t epsilonTransitionCharacter)
 {
     if (!firstRange)
@@ -152,12 +158,33 @@ void NFA::debugPrintDot() const
     dataLogF("    node [shape=circle];\n");
     dataLogF("    {\n");
     for (unsigned i = 0; i < m_nodes.size(); ++i) {
-        if (m_nodes[i].ruleId  == std::numeric_limits<uint64_t>::max())
-            dataLogF("         %d [label=\"Node %d\"]", i, i);
-        else
-            dataLogF("         %d [label=<Node %d<BR/>(Rule %llu)>]", i, i, m_nodes[i].ruleId);
+        dataLogF("         %d [label=<Node %d", i, i);
+
+        const Vector<uint64_t>& originalRules = m_nodes[i].ruleIds;
+        if (!originalRules.isEmpty()) {
+            dataLogF("<BR/>(Rules: ");
+            for (unsigned ruleIndex = 0; ruleIndex < originalRules.size(); ++ruleIndex) {
+                if (ruleIndex)
+                    dataLogF(", ");
+                dataLogF("%llu", originalRules[ruleIndex]);
+            }
+            dataLogF(")");
+        }
+
+        const Vector<uint64_t>& finalRules = m_nodes[i].finalRuleIds;
+        if (!finalRules.isEmpty()) {
+            dataLogF("<BR/>(Final: ");
+            for (unsigned ruleIndex = 0; ruleIndex < finalRules.size(); ++ruleIndex) {
+                if (ruleIndex)
+                    dataLogF(", ");
+                dataLogF("%llu", finalRules[ruleIndex]);
+            }
+            dataLogF(")");
+        }
+
+        dataLogF(">]");
 
-        if (m_nodes[i].isFinal)
+        if (!finalRules.isEmpty())
             dataLogF(" [shape=doublecircle]");
 
         dataLogF(";\n");
index a8baff6..dda74f8 100644 (file)
@@ -30,7 +30,6 @@
 
 #include "ContentExtensionsDebugging.h"
 #include "NFANode.h"
-#include <limits>
 #include <wtf/Vector.h>
 
 namespace WebCore {
@@ -45,17 +44,21 @@ class NFA {
 public:
     NFA();
     unsigned root() const { return m_root; }
-    unsigned createNode(uint64_t ruleId = std::numeric_limits<uint64_t>::max());
+    unsigned createNode();
 
     void addTransition(unsigned from, unsigned to, char character);
     void addEpsilonTransition(unsigned from, unsigned to);
-    void setFinal(unsigned node);
+    void setFinal(unsigned node, uint64_t ruleId);
 
     unsigned graphSize() const;
     void restoreToGraphSize(unsigned);
 
 #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
+    void addRuleId(unsigned node, uint64_t ruleId);
+
     void debugPrintDot() const;
+#else
+    void addRuleId(unsigned, uint64_t) { }
 #endif
 
 private:
diff --git a/Source/WebCore/contentextensions/NFANode.cpp b/Source/WebCore/contentextensions/NFANode.cpp
deleted file mode 100644 (file)
index 369917c..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (C) 2014 Apple Inc. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-#include "NFANode.h"
-
-#if ENABLE(CONTENT_EXTENSIONS)
-
-namespace WebCore {
-
-namespace ContentExtensions {
-
-NFANode::NFANode(uint64_t ruleId)
-    : isFinal(false)
-    , ruleId(ruleId)
-{
-}
-
-}
-
-} // namespace WebCore
-
-#endif // ENABLE(CONTENT_EXTENSIONS)
index ea24377..90084bb 100644 (file)
 
 #if ENABLE(CONTENT_EXTENSIONS)
 
+#include "ContentExtensionsDebugging.h"
 #include <wtf/HashMap.h>
 #include <wtf/HashSet.h>
+#include <wtf/Vector.h>
 
 namespace WebCore {
 
@@ -38,11 +40,12 @@ namespace ContentExtensions {
 // A NFANode abstract the transition table out of a NFA state.
 class NFANode {
 public:
-    NFANode(uint64_t ruleId);
-
     HashMap<uint16_t, HashSet<unsigned, DefaultHash<unsigned>::Hash, WTF::UnsignedWithZeroKeyHashTraits<unsigned>>> transitions;
-    bool isFinal;
-    const uint64_t ruleId;
+
+    Vector<uint64_t> finalRuleIds;
+#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
+    Vector<uint64_t> ruleIds;
+#endif
 };
 
 }
index 9564f62..cd8115c 100644 (file)
@@ -219,10 +219,9 @@ struct NodeIdSetToUniqueNodeIdSetTranslator {
         HashSet<uint64_t, DefaultHash<uint64_t>::Hash, WTF::UnsignedWithZeroKeyHashTraits<uint64_t>> actions;
         for (unsigned nfaNodeId : source.nodeIdSet) {
             const NFANode& nfaNode = source.nfaGraph[nfaNodeId];
-            if (nfaNode.isFinal)
-                actions.add(nfaNode.ruleId);
+            actions.add(nfaNode.finalRuleIds.begin(), nfaNode.finalRuleIds.end());
 #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
-            newDFANode.correspondingDFANodes.append(nfaNodeId);
+            newDFANode.correspondingNFANodes.append(nfaNodeId);
 #endif
         }
         copyToVector(actions, newDFANode.actions);
index 38841be..2b8f786 100644 (file)
@@ -35,6 +35,35 @@ namespace WebCore {
 
 namespace ContentExtensions {
 
+const uint16_t hasNonCharacterMask = 0x0080;
+const uint16_t characterMask = 0x0007F;
+const uint16_t newlineClassIDBuiltinMask = 0x100;
+
+static TrivialAtom trivialAtomFromASCIICharacter(char character)
+{
+    ASSERT(isASCII(character));
+
+    return static_cast<uint16_t>(character);
+}
+
+enum class TrivialAtomQuantifier : uint16_t {
+    ZeroOrOne = 0x1000,
+    ZeroToMany = 0x2000,
+    OneToMany = 0x4000
+};
+
+static void quantifyTrivialAtom(TrivialAtom& trivialAtom, TrivialAtomQuantifier quantifier)
+{
+    ASSERT(trivialAtom & (hasNonCharacterMask | characterMask));
+    ASSERT(!(trivialAtom & 0xf000));
+    trivialAtom |= static_cast<uint16_t>(quantifier);
+}
+
+static TrivialAtom trivialAtomForNewlineClassIDBuiltin()
+{
+    return hasNonCharacterMask | newlineClassIDBuiltinMask;
+}
+
 class GraphBuilder {
 private:
     struct BoundedSubGraph {
@@ -42,11 +71,11 @@ private:
         unsigned end;
     };
 public:
-    GraphBuilder(NFA& nfa, uint64_t patternId)
+    GraphBuilder(NFA& nfa, PrefixTreeEntry& prefixTreeRoot, uint64_t patternId)
         : m_nfa(nfa)
         , m_patternId(patternId)
         , m_activeGroup({ nfa.root(), nfa.root() })
-        , m_lastAtom(m_activeGroup)
+        , m_lastPrefixTreeEntry(&prefixTreeRoot)
     {
     }
 
@@ -54,8 +83,11 @@ public:
     {
         if (hasError())
             return;
+
+        sinkPendingAtomIfNecessary();
+
         if (m_activeGroup.start != m_activeGroup.end)
-            m_nfa.setFinal(m_activeGroup.end);
+            m_nfa.setFinal(m_activeGroup.end, m_patternId);
         else
             fail(ASCIILiteral("The pattern cannot match anything."));
     }
@@ -75,12 +107,13 @@ public:
         if (hasError())
             return;
 
+        sinkPendingAtomIfNecessary();
+
+        char asciiChararacter = static_cast<char>(character);
         m_hasValidAtom = true;
-        unsigned newEnd = m_nfa.createNode(m_patternId);
-        m_nfa.addTransition(m_lastAtom.end, newEnd, static_cast<char>(character));
-        m_lastAtom.start = m_lastAtom.end;
-        m_lastAtom.end = newEnd;
-        m_activeGroup.end = m_lastAtom.end;
+
+        ASSERT(m_lastPrefixTreeEntry);
+        m_pendingTrivialAtom = trivialAtomFromASCIICharacter(asciiChararacter);
     }
 
     void atomBuiltInCharacterClass(JSC::Yarr::BuiltInCharacterClassID builtInCharacterClassID, bool inverted)
@@ -89,14 +122,9 @@ public:
             return;
 
         if (builtInCharacterClassID == JSC::Yarr::NewlineClassID && inverted) {
-            // FIXME: handle new line properly.
             m_hasValidAtom = true;
-            unsigned newEnd = m_nfa.createNode(m_patternId);
-            for (unsigned i = 1; i < 128; ++i)
-                m_nfa.addTransition(m_lastAtom.end, newEnd, i);
-            m_lastAtom.start = m_lastAtom.end;
-            m_lastAtom.end = newEnd;
-            m_activeGroup.end = m_lastAtom.end;
+            ASSERT(m_lastPrefixTreeEntry);
+            m_pendingTrivialAtom = trivialAtomForNewlineClassIDBuiltin();
         } else
             fail(ASCIILiteral("Character class is not supported."));
     }
@@ -112,13 +140,13 @@ public:
             return;
         }
 
+        ASSERT(m_lastPrefixTreeEntry);
         if (!minimum && maximum == 1)
-            m_nfa.addEpsilonTransition(m_lastAtom.start, m_lastAtom.end);
-        else if (!minimum && maximum == JSC::Yarr::quantifyInfinite) {
-            m_nfa.addEpsilonTransition(m_lastAtom.start, m_lastAtom.end);
-            m_nfa.addEpsilonTransition(m_lastAtom.end, m_lastAtom.start);
-        } else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite)
-            m_nfa.addEpsilonTransition(m_lastAtom.end, m_lastAtom.start);
+            quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::ZeroOrOne);
+        else if (!minimum && maximum == JSC::Yarr::quantifyInfinite)
+            quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::ZeroToMany);
+        else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite)
+            quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::OneToMany);
         else
             fail(ASCIILiteral("Arbitrary atom repetitions are not supported."));
     }
@@ -189,7 +217,6 @@ public:
         fail(ASCIILiteral("Disjunctions are not supported yet."));
     }
 
-
 private:
     bool hasError() const
     {
@@ -200,43 +227,154 @@ private:
     {
         if (hasError())
             return;
+
+        if (m_newPrefixSubtreeRoot)
+            m_newPrefixSubtreeRoot->nextPattern.remove(m_newPrefixStaringPoint);
+
         m_errorMessage = errorMessage;
     }
 
+    void generateTransition(TrivialAtom trivialAtom, unsigned source, unsigned target)
+    {
+        if (trivialAtom & hasNonCharacterMask) {
+            ASSERT(trivialAtom & newlineClassIDBuiltinMask);
+            for (unsigned i = 1; i < 128; ++i)
+                m_nfa.addTransition(source, target, i);
+        } else
+            m_nfa.addTransition(source, target, static_cast<char>(trivialAtom & characterMask));
+    }
+
+    BoundedSubGraph sinkTrivialAtom(TrivialAtom trivialAtom, unsigned start)
+    {
+        if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::ZeroOrOne)) {
+            unsigned newEnd = m_nfa.createNode();
+            m_nfa.addRuleId(newEnd, m_patternId);
+            generateTransition(trivialAtom, start, newEnd);
+            m_nfa.addEpsilonTransition(start, newEnd);
+            return { start, newEnd };
+        }
+
+        if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::ZeroToMany)) {
+            unsigned repeatStart = m_nfa.createNode();
+            m_nfa.addRuleId(repeatStart, m_patternId);
+            unsigned repeatEnd = m_nfa.createNode();
+            m_nfa.addRuleId(repeatEnd, m_patternId);
+
+            generateTransition(trivialAtom, repeatStart, repeatEnd);
+            m_nfa.addEpsilonTransition(repeatEnd, repeatStart);
+
+            m_nfa.addEpsilonTransition(start, repeatStart);
+
+            unsigned kleenEnd = m_nfa.createNode();
+            m_nfa.addRuleId(kleenEnd, m_patternId);
+            m_nfa.addEpsilonTransition(repeatEnd, kleenEnd);
+            m_nfa.addEpsilonTransition(start, kleenEnd);
+            return { start, kleenEnd };
+        }
+
+        if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::OneToMany)) {
+            unsigned repeatStart = m_nfa.createNode();
+            m_nfa.addRuleId(repeatStart, m_patternId);
+            unsigned repeatEnd = m_nfa.createNode();
+            m_nfa.addRuleId(repeatEnd, m_patternId);
+
+            generateTransition(trivialAtom, repeatStart, repeatEnd);
+            m_nfa.addEpsilonTransition(repeatEnd, repeatStart);
+
+            m_nfa.addEpsilonTransition(start, repeatStart);
+
+            unsigned afterRepeat = m_nfa.createNode();
+            m_nfa.addRuleId(afterRepeat, m_patternId);
+            m_nfa.addEpsilonTransition(repeatEnd, afterRepeat);
+            return { start, afterRepeat };
+        }
+
+        unsigned newEnd = m_nfa.createNode();
+        m_nfa.addRuleId(newEnd, m_patternId);
+        generateTransition(trivialAtom, start, newEnd);
+        return { start, newEnd };
+    }
+
+    void sinkPendingAtomIfNecessary()
+    {
+        ASSERT(m_lastPrefixTreeEntry);
+
+        if (!m_hasValidAtom)
+            return;
+
+        ASSERT(m_pendingTrivialAtom);
+
+        auto nextEntry = m_lastPrefixTreeEntry->nextPattern.find(m_pendingTrivialAtom);
+        if (nextEntry != m_lastPrefixTreeEntry->nextPattern.end()) {
+            m_lastPrefixTreeEntry = nextEntry->value.get();
+            m_nfa.addRuleId(m_lastPrefixTreeEntry->nfaNode, m_patternId);
+        } else {
+            std::unique_ptr<PrefixTreeEntry> nextPrefixTreeEntry = std::make_unique<PrefixTreeEntry>();
+
+            BoundedSubGraph newSubGraph = sinkTrivialAtom(m_pendingTrivialAtom, m_lastPrefixTreeEntry->nfaNode);
+            nextPrefixTreeEntry->nfaNode = newSubGraph.end;
+
+            auto addResult = m_lastPrefixTreeEntry->nextPattern.set(m_pendingTrivialAtom, WTF::move(nextPrefixTreeEntry));
+            ASSERT(addResult.isNewEntry);
+
+            m_newPrefixSubtreeRoot = m_lastPrefixTreeEntry;
+            m_newPrefixStaringPoint = m_pendingTrivialAtom;
+
+            m_lastPrefixTreeEntry = addResult.iterator->value.get();
+        }
+        ASSERT(m_lastPrefixTreeEntry);
+
+        m_activeGroup.end = m_lastPrefixTreeEntry->nfaNode;
+        m_pendingTrivialAtom = 0;
+        m_hasValidAtom = false;
+    }
+
     NFA& m_nfa;
     const uint64_t m_patternId;
 
     BoundedSubGraph m_activeGroup;
 
+    PrefixTreeEntry* m_lastPrefixTreeEntry;
     bool m_hasValidAtom = false;
-    BoundedSubGraph m_lastAtom;
+    TrivialAtom m_pendingTrivialAtom = 0;
+
+    PrefixTreeEntry* m_newPrefixSubtreeRoot = nullptr;
+    TrivialAtom m_newPrefixStaringPoint = 0;
 
     String m_errorMessage;
 };
 
-void URLFilterParser::parse(const String& pattern, uint64_t patternId, NFA& nfa)
+URLFilterParser::URLFilterParser(NFA& nfa)
+    : m_nfa(nfa)
+{
+    m_prefixTreeRoot.nfaNode = nfa.root();
+}
+
+String URLFilterParser::addPattern(const String& pattern, uint64_t patternId)
 {
     if (!pattern.containsOnlyASCII())
-        m_errorMessage = ASCIILiteral("URLFilterParser only supports ASCII patterns.");
+        return ASCIILiteral("URLFilterParser only supports ASCII patterns.");
     ASSERT(!pattern.isEmpty());
 
     if (pattern.isEmpty())
-        return;
+        return ASCIILiteral("Empty pattern.");
+
+    unsigned oldSize = m_nfa.graphSize();
 
-    unsigned oldSize = nfa.graphSize();
+    String error;
 
-    GraphBuilder graphBuilder(nfa, patternId);
-    const char* error = JSC::Yarr::parse(graphBuilder, pattern, 0);
-    if (error)
-        m_errorMessage = String(error);
-    else
+    GraphBuilder graphBuilder(m_nfa, m_prefixTreeRoot, patternId);
+    error = String(JSC::Yarr::parse(graphBuilder, pattern, 0));
+    if (error.isNull())
         graphBuilder.finalize();
 
-    if (!error)
-        m_errorMessage = graphBuilder.errorMessage();
+    if (error.isNull())
+        error = graphBuilder.errorMessage();
+
+    if (!error.isNull())
+        m_nfa.restoreToGraphSize(oldSize);
 
-    if (hasError())
-        nfa.restoreToGraphSize(oldSize);
+    return error;
 }
 
 } // namespace ContentExtensions
index 3ec8024..a66b630 100644 (file)
@@ -28,6 +28,7 @@
 
 #if ENABLE(CONTENT_EXTENSIONS)
 
+#include <wtf/HashMap.h>
 #include <wtf/text/WTFString.h>
 
 namespace WebCore {
@@ -36,20 +37,22 @@ namespace ContentExtensions {
 
 class NFA;
 
+typedef uint16_t TrivialAtom;
+
+struct PrefixTreeEntry {
+    unsigned nfaNode;
+    HashMap<TrivialAtom, std::unique_ptr<PrefixTreeEntry>> nextPattern;
+};
+
+
 class URLFilterParser {
 public:
-    void parse(const String& pattern, uint64_t patternId, NFA&);
-
-    bool hasError() const { return !m_errorMessage.isNull(); }
-    String errorMessage() const { return m_errorMessage; }
+    explicit URLFilterParser(NFA&);
+    String addPattern(const String& pattern, uint64_t patternId);
 
 private:
-    struct BoundedSubGraph {
-        unsigned start;
-        unsigned end;
-    };
-
-    String m_errorMessage;
+    NFA& m_nfa;
+    PrefixTreeEntry m_prefixTreeRoot;
 };
 
 } // namespace ContentExtensions