Extend URL filter's Term definition to support groups/subpatterns
[WebKit-https.git] / Source / WebCore / contentextensions / URLFilterParser.cpp
index ea557cb..4c7d9ce 100644 (file)
 
 #include "NFA.h"
 #include <JavaScriptCore/YarrParser.h>
+#include <wtf/BitVector.h>
+#include <wtf/Deque.h>
 
 namespace WebCore {
 
 namespace ContentExtensions {
 
-const uint16_t hasNonCharacterMask = 0x0080;
-const uint16_t characterMask = 0x0007F;
-const uint16_t newlineClassIDBuiltinMask = 0x100;
-const uint16_t caseInsensitiveMask = 0x200;
+enum class AtomQuantifier : uint8_t {
+    One,
+    ZeroOrOne,
+    ZeroOrMore,
+    OneOrMore
+};
 
-static TrivialAtom trivialAtomFromASCIICharacter(char character, bool caseSensitive)
-{
-    ASSERT(isASCII(character));
+class Term {
+public:
+    Term()
+    {
+    }
 
-    if (caseSensitive || !isASCIIAlpha(character))
-        return static_cast<uint16_t>(character);
+    Term(char character, bool isCaseSensitive)
+        : m_termType(TermType::CharacterSet)
+    {
+        new (NotNull, &m_atomData.characterSet) CharacterSet();
+        addCharacter(character, isCaseSensitive);
+    }
 
-    return static_cast<uint16_t>(toASCIILower(character)) | caseInsensitiveMask;
-}
+    enum UniversalTransitionTag { UniversalTransition };
+    explicit Term(UniversalTransitionTag)
+        : m_termType(TermType::CharacterSet)
+    {
+        new (NotNull, &m_atomData.characterSet) CharacterSet();
+        for (unsigned i = 0; i < 128; ++i)
+            m_atomData.characterSet.characters.set(i);
+    }
 
-enum class TrivialAtomQuantifier : uint16_t {
-    ZeroOrOne = 0x1000,
-    ZeroToMany = 0x2000,
-    OneToMany = 0x4000
-};
+    enum CharacterSetTermTag { CharacterSetTerm };
+    explicit Term(CharacterSetTermTag, bool isInverted)
+        : m_termType(TermType::CharacterSet)
+    {
+        new (NotNull, &m_atomData.characterSet) CharacterSet();
+        m_atomData.characterSet.inverted = isInverted;
+    }
 
-static void quantifyTrivialAtom(TrivialAtom& trivialAtom, TrivialAtomQuantifier quantifier)
-{
-    ASSERT(trivialAtom & (hasNonCharacterMask | characterMask));
-    ASSERT(!(trivialAtom & 0xf000));
-    trivialAtom |= static_cast<uint16_t>(quantifier);
-}
+    enum GroupTermTag { GroupTerm };
+    explicit Term(GroupTermTag)
+        : m_termType(TermType::Group)
+    {
+        new (NotNull, &m_atomData.group) Group();
+    }
 
-static TrivialAtom trivialAtomForNewlineClassIDBuiltin()
-{
-    return hasNonCharacterMask | newlineClassIDBuiltinMask;
-}
+    Term(const Term& other)
+        : m_termType(other.m_termType)
+        , m_quantifier(other.m_quantifier)
+    {
+        switch (m_termType) {
+        case TermType::Empty:
+        case TermType::Deleted:
+            break;
+        case TermType::CharacterSet:
+            new (NotNull, &m_atomData.characterSet) CharacterSet(other.m_atomData.characterSet);
+            break;
+        case TermType::Group:
+            new (NotNull, &m_atomData.group) Group(other.m_atomData.group);
+            break;
+        }
+    }
+
+    Term(Term&& other)
+        : m_termType(WTF::move(other.m_termType))
+        , m_quantifier(WTF::move(other.m_quantifier))
+    {
+        switch (m_termType) {
+        case TermType::Empty:
+        case TermType::Deleted:
+            break;
+        case TermType::CharacterSet:
+            new (NotNull, &m_atomData.characterSet) CharacterSet(WTF::move(other.m_atomData.characterSet));
+            break;
+        case TermType::Group:
+            new (NotNull, &m_atomData.group) Group(WTF::move(other.m_atomData.group));
+            break;
+        }
+        other.destroy();
+    }
+
+    enum EmptyValueTag { EmptyValue };
+    Term(EmptyValueTag)
+        : m_termType(TermType::Empty)
+    {
+    }
+
+    enum DeletedValueTag { DeletedValue };
+    Term(DeletedValueTag)
+        : m_termType(TermType::Deleted)
+    {
+    }
+
+    ~Term()
+    {
+        destroy();
+    }
+
+    bool isValid() const
+    {
+        return m_termType != TermType::Empty && m_termType != TermType::Deleted;
+    }
+
+    void addCharacter(UChar character, bool isCaseSensitive)
+    {
+        ASSERT(isASCII(character));
+
+        ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::CharacterSet);
+        if (m_termType != TermType::CharacterSet)
+            return;
+
+        if (isCaseSensitive || !isASCIIAlpha(character))
+            m_atomData.characterSet.characters.set(character);
+        else {
+            m_atomData.characterSet.characters.set(toASCIIUpper(character));
+            m_atomData.characterSet.characters.set(toASCIILower(character));
+        }
+    }
+
+    void extendGroupSubpattern(const Term& term)
+    {
+        ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::Group);
+        if (m_termType != TermType::Group)
+            return;
+        m_atomData.group.terms.append(term);
+    }
+
+    void quantify(const AtomQuantifier& quantifier)
+    {
+        ASSERT_WITH_MESSAGE(m_quantifier == AtomQuantifier::One, "Transition to quantified term should only happen once.");
+        m_quantifier = quantifier;
+    }
+
+    unsigned generateGraph(NFA& nfa, uint64_t patternId, unsigned start) const
+    {
+        ASSERT(isValid());
+
+        switch (m_quantifier) {
+        case AtomQuantifier::One: {
+            unsigned newEnd = generateSubgraphForAtom(nfa, patternId, start);
+            return newEnd;
+        }
+        case AtomQuantifier::ZeroOrOne: {
+            unsigned newEnd = generateSubgraphForAtom(nfa, patternId, start);
+            nfa.addEpsilonTransition(start, newEnd);
+            return newEnd;
+        }
+        case AtomQuantifier::ZeroOrMore: {
+            unsigned repeatStart = nfa.createNode();
+            nfa.addRuleId(repeatStart, patternId);
+            nfa.addEpsilonTransition(start, repeatStart);
+
+            unsigned repeatEnd = generateSubgraphForAtom(nfa, patternId, repeatStart);
+            nfa.addEpsilonTransition(repeatEnd, repeatStart);
+
+            unsigned kleenEnd = nfa.createNode();
+            nfa.addRuleId(kleenEnd, patternId);
+            nfa.addEpsilonTransition(repeatEnd, kleenEnd);
+            nfa.addEpsilonTransition(start, kleenEnd);
+            return kleenEnd;
+        }
+        case AtomQuantifier::OneOrMore: {
+            unsigned repeatStart = nfa.createNode();
+            nfa.addRuleId(repeatStart, patternId);
+            nfa.addEpsilonTransition(start, repeatStart);
+
+            unsigned repeatEnd = generateSubgraphForAtom(nfa, patternId, repeatStart);
+            nfa.addEpsilonTransition(repeatEnd, repeatStart);
+
+            unsigned afterRepeat = nfa.createNode();
+            nfa.addRuleId(afterRepeat, patternId);
+            nfa.addEpsilonTransition(repeatEnd, afterRepeat);
+            return afterRepeat;
+        }
+        }
+    }
+
+    Term& operator=(const Term& other)
+    {
+        destroy();
+        new (NotNull, this) Term(other);
+        return *this;
+    }
+
+    Term& operator=(Term&& other)
+    {
+        destroy();
+        new (NotNull, this) Term(WTF::move(other));
+        return *this;
+    }
+
+    bool operator==(const Term& other) const
+    {
+        if (other.m_termType != m_termType || other.m_quantifier != m_quantifier)
+            return false;
+
+        switch (m_termType) {
+        case TermType::Empty:
+        case TermType::Deleted:
+            return true;
+        case TermType::CharacterSet:
+            return m_atomData.characterSet == other.m_atomData.characterSet;
+        case TermType::Group:
+            return m_atomData.group == other.m_atomData.group;
+        }
+        ASSERT_NOT_REACHED();
+        return false;
+    }
+
+    unsigned hash() const
+    {
+        unsigned primary = static_cast<unsigned>(m_termType) << 16 | static_cast<unsigned>(m_quantifier);
+        unsigned secondary = 0;
+        switch (m_termType) {
+        case TermType::Empty:
+            secondary = 52184393;
+            break;
+        case TermType::Deleted:
+            secondary = 40342988;
+            break;
+        case TermType::CharacterSet:
+            secondary = m_atomData.characterSet.hash();
+            break;
+        case TermType::Group:
+            secondary = m_atomData.group.hash();
+            break;
+        }
+        return WTF::pairIntHash(primary, secondary);
+    }
+
+    bool isEmptyValue() const
+    {
+        return m_termType == TermType::Empty;
+    }
+
+    bool isDeletedValue() const
+    {
+        return m_termType == TermType::Deleted;
+    }
 
-class GraphBuilder {
 private:
-    struct BoundedSubGraph {
-        unsigned start;
-        unsigned end;
+    bool isUniversalTransition() const
+    {
+        return m_termType == TermType::CharacterSet
+            && ((m_atomData.characterSet.inverted && !m_atomData.characterSet.characters.bitCount())
+                || (!m_atomData.characterSet.inverted && m_atomData.characterSet.characters.bitCount() == 128));
+    }
+
+    unsigned generateSubgraphForAtom(NFA& nfa, uint64_t patternId, unsigned source) const
+    {
+        switch (m_termType) {
+        case TermType::Empty:
+        case TermType::Deleted:
+            ASSERT_NOT_REACHED();
+            return -1;
+        case TermType::CharacterSet: {
+            unsigned target = nfa.createNode();
+            nfa.addRuleId(target, patternId);
+            if (isUniversalTransition())
+                nfa.addTransitionsOnAnyCharacter(source, target);
+            else {
+                if (!m_atomData.characterSet.inverted) {
+                    for (const auto& characterIterator : m_atomData.characterSet.characters.setBits())
+                        nfa.addTransition(source, target, static_cast<char>(characterIterator));
+                } else {
+                    for (unsigned i = 1; i < m_atomData.characterSet.characters.size(); ++i) {
+                        if (m_atomData.characterSet.characters.get(i))
+                            continue;
+                        nfa.addTransition(source, target, static_cast<char>(i));
+                    }
+                }
+            }
+            return target;
+        }
+        case TermType::Group: {
+            unsigned lastTarget = source;
+            for (const Term& term : m_atomData.group.terms)
+                lastTarget = term.generateGraph(nfa, patternId, lastTarget);
+            return lastTarget;
+        }
+        }
+    }
+
+    void destroy()
+    {
+        switch (m_termType) {
+        case TermType::Empty:
+        case TermType::Deleted:
+            break;
+        case TermType::CharacterSet:
+            m_atomData.characterSet.~CharacterSet();
+            break;
+        case TermType::Group:
+            m_atomData.group.~Group();
+            break;
+        }
+        m_termType = TermType::Deleted;
+    }
+
+    enum class TermType : uint8_t {
+        Empty,
+        Deleted,
+        CharacterSet,
+        Group
+    };
+
+    TermType m_termType { TermType::Empty };
+    AtomQuantifier m_quantifier { AtomQuantifier::One };
+
+    struct CharacterSet {
+        bool inverted { false };
+        BitVector characters { 128 };
+
+        bool operator==(const CharacterSet& other) const
+        {
+            return other.inverted == inverted && other.characters == characters;
+        }
+
+        unsigned hash() const
+        {
+            return WTF::pairIntHash(inverted, characters.hash());
+        }
     };
+
+    struct Group {
+        Vector<Term> terms;
+
+        bool operator==(const Group& other) const
+        {
+            return other.terms == terms;
+        }
+
+        unsigned hash() const
+        {
+            unsigned hash = 6421749;
+            for (const Term& term : terms) {
+                unsigned termHash = term.hash();
+                hash = (hash << 16) ^ ((termHash << 11) ^ hash);
+                hash += hash >> 11;
+            }
+            return hash;
+        }
+    };
+
+    union AtomData {
+        AtomData()
+            : invalidTerm(0)
+        {
+        }
+        ~AtomData()
+        {
+        }
+
+        char invalidTerm;
+        CharacterSet characterSet;
+        Group group;
+    } m_atomData;
+};
+
+struct TermHash {
+    static unsigned hash(const Term& term) { return term.hash(); }
+    static bool equal(const Term& a, const Term& b) { return a == b; }
+    static const bool safeToCompareToEmptyOrDeleted = true;
+};
+
+struct TermHashTraits : public WTF::CustomHashTraits<Term> { };
+
+struct PrefixTreeEntry {
+    unsigned nfaNode;
+    HashMap<Term, std::unique_ptr<PrefixTreeEntry>, TermHash, TermHashTraits> nextPattern;
+};
+
+class GraphBuilder {
 public:
     GraphBuilder(NFA& nfa, PrefixTreeEntry& prefixTreeRoot, bool patternIsCaseSensitive, uint64_t patternId)
         : m_nfa(nfa)
         , m_patternIsCaseSensitive(patternIsCaseSensitive)
         , m_patternId(patternId)
-        , m_activeGroup({ nfa.root(), nfa.root() })
+        , m_subtreeStart(nfa.root())
+        , m_subtreeEnd(nfa.root())
         , m_lastPrefixTreeEntry(&prefixTreeRoot)
     {
     }
@@ -89,10 +424,15 @@ public:
         if (hasError())
             return;
 
-        sinkPendingAtomIfNecessary();
+        sinkFloatingTermIfNecessary();
+
+        if (!m_openGroups.isEmpty()) {
+            fail(ASCIILiteral("The expression has unclosed groups."));
+            return;
+        }
 
-        if (m_activeGroup.start != m_activeGroup.end)
-            m_nfa.setFinal(m_activeGroup.end, m_patternId);
+        if (m_subtreeStart != m_subtreeEnd)
+            m_nfa.setFinal(m_subtreeEnd, m_patternId);
         else
             fail(ASCIILiteral("The pattern cannot match anything."));
     }
@@ -104,21 +444,19 @@ public:
 
     void atomPatternCharacter(UChar character)
     {
+        if (hasError())
+            return;
+
         if (!isASCII(character)) {
             fail(ASCIILiteral("Only ASCII characters are supported in pattern."));
             return;
         }
 
-        if (hasError())
-            return;
-
-        sinkPendingAtomIfNecessary();
+        sinkFloatingTermIfNecessary();
+        ASSERT(!m_floatingTerm.isValid());
 
         char asciiChararacter = static_cast<char>(character);
-        m_hasValidAtom = true;
-
-        ASSERT(m_lastPrefixTreeEntry);
-        m_pendingTrivialAtom = trivialAtomFromASCIICharacter(asciiChararacter, m_patternIsCaseSensitive);
+        m_floatingTerm = Term(asciiChararacter, m_patternIsCaseSensitive);
     }
 
     void atomBuiltInCharacterClass(JSC::Yarr::BuiltInCharacterClassID builtInCharacterClassID, bool inverted)
@@ -126,11 +464,12 @@ public:
         if (hasError())
             return;
 
-        if (builtInCharacterClassID == JSC::Yarr::NewlineClassID && inverted) {
-            m_hasValidAtom = true;
-            ASSERT(m_lastPrefixTreeEntry);
-            m_pendingTrivialAtom = trivialAtomForNewlineClassIDBuiltin();
-        } else
+        sinkFloatingTermIfNecessary();
+        ASSERT(!m_floatingTerm.isValid());
+
+        if (builtInCharacterClassID == JSC::Yarr::NewlineClassID && inverted)
+            m_floatingTerm = Term(Term::UniversalTransition);
+        else
             fail(ASCIILiteral("Character class is not supported."));
     }
 
@@ -139,32 +478,22 @@ public:
         if (hasError())
             return;
 
-        ASSERT(m_hasValidAtom);
-        if (!m_hasValidAtom) {
-            fail(ASCIILiteral("Quantifier without corresponding atom to quantify."));
-            return;
-        }
+        if (!m_floatingTerm.isValid())
+            fail(ASCIILiteral("Quantifier without corresponding term to quantify."));
 
-        ASSERT(m_lastPrefixTreeEntry);
         if (!minimum && maximum == 1)
-            quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::ZeroOrOne);
+            m_floatingTerm.quantify(AtomQuantifier::ZeroOrOne);
         else if (!minimum && maximum == JSC::Yarr::quantifyInfinite)
-            quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::ZeroToMany);
+            m_floatingTerm.quantify(AtomQuantifier::ZeroOrMore);
         else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite)
-            quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::OneToMany);
+            m_floatingTerm.quantify(AtomQuantifier::OneOrMore);
         else
             fail(ASCIILiteral("Arbitrary atom repetitions are not supported."));
     }
 
-    NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned)
+    void atomBackReference(unsigned)
     {
         fail(ASCIILiteral("Patterns cannot contain backreferences."));
-        ASSERT_NOT_REACHED();
-    }
-
-    void atomCharacterClassAtom(UChar)
-    {
-        fail(ASCIILiteral("Character class atoms are not supported yet."));
     }
 
     void assertionBOL()
@@ -182,29 +511,62 @@ public:
         fail(ASCIILiteral("Word boundaries assertions are not supported yet."));
     }
 
-    void atomCharacterClassBegin(bool = false)
+    void atomCharacterClassBegin(bool inverted = false)
     {
-        fail(ASCIILiteral("Character class atoms are not supported yet."));
+        if (hasError())
+            return;
+
+        sinkFloatingTermIfNecessary();
+        ASSERT(!m_floatingTerm.isValid());
+
+        m_floatingTerm = Term(Term::CharacterSetTerm, inverted);
     }
 
-    void atomCharacterClassRange(UChar, UChar)
+    void atomCharacterClassAtom(UChar character)
     {
-        fail(ASCIILiteral("Character class ranges are not supported yet."));
+        if (hasError())
+            return;
+
+        if (!isASCII(character)) {
+            fail(ASCIILiteral("Non ASCII Character in a character set."));
+            return;
+        }
+
+        m_floatingTerm.addCharacter(character, m_patternIsCaseSensitive);
     }
 
-    void atomCharacterClassBuiltIn(JSC::Yarr::BuiltInCharacterClassID, bool)
+    void atomCharacterClassRange(UChar a, UChar b)
     {
-        fail(ASCIILiteral("Buildins character class atoms are not supported yet."));
+        if (hasError())
+            return;
+
+        if (!a || !b || !isASCII(a) || !isASCII(b)) {
+            fail(ASCIILiteral("Non ASCII Character in a character range of a character set."));
+            return;
+        }
+
+        for (unsigned i = a; i <= b; ++i)
+            m_floatingTerm.addCharacter(static_cast<UChar>(i), m_patternIsCaseSensitive);
     }
 
     void atomCharacterClassEnd()
     {
-        fail(ASCIILiteral("Character class are not supported yet."));
+        // Nothing to do here. The character set atom may have a quantifier, we sink the atom lazily.
+    }
+
+    void atomCharacterClassBuiltIn(JSC::Yarr::BuiltInCharacterClassID, bool)
+    {
+        fail(ASCIILiteral("Builtins character class atoms are not supported yet."));
     }
 
     void atomParenthesesSubpatternBegin(bool = true)
     {
-        fail(ASCIILiteral("Groups are not supported yet."));
+        if (hasError())
+            return;
+
+        sinkFloatingTermIfNecessary();
+
+        m_openGroups.append(Term(Term::GroupTerm));
     }
 
     void atomParentheticalAssertionBegin(bool = false)
@@ -214,7 +576,13 @@ public:
 
     void atomParenthesesEnd()
     {
-        fail(ASCIILiteral("Groups are not supported yet."));
+        if (hasError())
+            return;
+
+        sinkFloatingTermIfNecessary();
+        ASSERT(!m_floatingTerm.isValid());
+
+        m_floatingTerm = m_openGroups.takeLast();
     }
 
     void disjunction()
@@ -239,127 +607,71 @@ private:
         m_errorMessage = errorMessage;
     }
 
-    void generateTransition(TrivialAtom trivialAtom, unsigned source, unsigned target)
-    {
-        if (trivialAtom & hasNonCharacterMask) {
-            ASSERT(trivialAtom & newlineClassIDBuiltinMask);
-            for (unsigned i = 1; i < 128; ++i)
-                m_nfa.addTransition(source, target, i);
-        } else {
-            if (trivialAtom & caseInsensitiveMask) {
-                char character = static_cast<char>(trivialAtom & characterMask);
-                m_nfa.addTransition(source, target, character);
-                m_nfa.addTransition(source, target, toASCIIUpper(character));
-            } else
-                m_nfa.addTransition(source, target, static_cast<char>(trivialAtom & characterMask));
-        }
-    }
-
-    BoundedSubGraph sinkTrivialAtom(TrivialAtom trivialAtom, unsigned start)
+    void sinkFloatingTermIfNecessary()
     {
-        if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::ZeroOrOne)) {
-            unsigned newEnd = m_nfa.createNode();
-            m_nfa.addRuleId(newEnd, m_patternId);
-            generateTransition(trivialAtom, start, newEnd);
-            m_nfa.addEpsilonTransition(start, newEnd);
-            return { start, newEnd };
-        }
-
-        if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::ZeroToMany)) {
-            unsigned repeatStart = m_nfa.createNode();
-            m_nfa.addRuleId(repeatStart, m_patternId);
-            unsigned repeatEnd = m_nfa.createNode();
-            m_nfa.addRuleId(repeatEnd, m_patternId);
-
-            generateTransition(trivialAtom, repeatStart, repeatEnd);
-            m_nfa.addEpsilonTransition(repeatEnd, repeatStart);
-
-            m_nfa.addEpsilonTransition(start, repeatStart);
-
-            unsigned kleenEnd = m_nfa.createNode();
-            m_nfa.addRuleId(kleenEnd, m_patternId);
-            m_nfa.addEpsilonTransition(repeatEnd, kleenEnd);
-            m_nfa.addEpsilonTransition(start, kleenEnd);
-            return { start, kleenEnd };
-        }
-
-        if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::OneToMany)) {
-            unsigned repeatStart = m_nfa.createNode();
-            m_nfa.addRuleId(repeatStart, m_patternId);
-            unsigned repeatEnd = m_nfa.createNode();
-            m_nfa.addRuleId(repeatEnd, m_patternId);
-
-            generateTransition(trivialAtom, repeatStart, repeatEnd);
-            m_nfa.addEpsilonTransition(repeatEnd, repeatStart);
-
-            m_nfa.addEpsilonTransition(start, repeatStart);
-
-            unsigned afterRepeat = m_nfa.createNode();
-            m_nfa.addRuleId(afterRepeat, m_patternId);
-            m_nfa.addEpsilonTransition(repeatEnd, afterRepeat);
-            return { start, afterRepeat };
-        }
-
-        unsigned newEnd = m_nfa.createNode();
-        m_nfa.addRuleId(newEnd, m_patternId);
-        generateTransition(trivialAtom, start, newEnd);
-        return { start, newEnd };
-    }
+        if (!m_floatingTerm.isValid())
+            return;
 
-    void sinkPendingAtomIfNecessary()
-    {
         ASSERT(m_lastPrefixTreeEntry);
 
-        if (!m_hasValidAtom)
+        if (!m_openGroups.isEmpty()) {
+            m_openGroups.last().extendGroupSubpattern(m_floatingTerm);
+            m_floatingTerm = Term();
             return;
+        }
 
-        ASSERT(m_pendingTrivialAtom);
-
-        auto nextEntry = m_lastPrefixTreeEntry->nextPattern.find(m_pendingTrivialAtom);
+        auto nextEntry = m_lastPrefixTreeEntry->nextPattern.find(m_floatingTerm);
         if (nextEntry != m_lastPrefixTreeEntry->nextPattern.end()) {
             m_lastPrefixTreeEntry = nextEntry->value.get();
             m_nfa.addRuleId(m_lastPrefixTreeEntry->nfaNode, m_patternId);
         } else {
             std::unique_ptr<PrefixTreeEntry> nextPrefixTreeEntry = std::make_unique<PrefixTreeEntry>();
 
-            BoundedSubGraph newSubGraph = sinkTrivialAtom(m_pendingTrivialAtom, m_lastPrefixTreeEntry->nfaNode);
-            nextPrefixTreeEntry->nfaNode = newSubGraph.end;
+            unsigned newEnd = m_floatingTerm.generateGraph(m_nfa, m_patternId, m_lastPrefixTreeEntry->nfaNode);
+            nextPrefixTreeEntry->nfaNode = newEnd;
 
-            auto addResult = m_lastPrefixTreeEntry->nextPattern.set(m_pendingTrivialAtom, WTF::move(nextPrefixTreeEntry));
+            auto addResult = m_lastPrefixTreeEntry->nextPattern.set(m_floatingTerm, WTF::move(nextPrefixTreeEntry));
             ASSERT(addResult.isNewEntry);
 
-            m_newPrefixSubtreeRoot = m_lastPrefixTreeEntry;
-            m_newPrefixStaringPoint = m_pendingTrivialAtom;
+            if (!m_newPrefixSubtreeRoot) {
+                m_newPrefixSubtreeRoot = m_lastPrefixTreeEntry;
+                m_newPrefixStaringPoint = m_floatingTerm;
+            }
 
             m_lastPrefixTreeEntry = addResult.iterator->value.get();
         }
-        ASSERT(m_lastPrefixTreeEntry);
+        m_subtreeEnd = m_lastPrefixTreeEntry->nfaNode;
 
-        m_activeGroup.end = m_lastPrefixTreeEntry->nfaNode;
-        m_pendingTrivialAtom = 0;
-        m_hasValidAtom = false;
+        m_floatingTerm = Term();
+        ASSERT(m_lastPrefixTreeEntry);
     }
 
     NFA& m_nfa;
     bool m_patternIsCaseSensitive;
     const uint64_t m_patternId;
 
-    BoundedSubGraph m_activeGroup;
+    unsigned m_subtreeStart { 0 };
+    unsigned m_subtreeEnd { 0 };
 
     PrefixTreeEntry* m_lastPrefixTreeEntry;
-    bool m_hasValidAtom = false;
-    TrivialAtom m_pendingTrivialAtom = 0;
+    Deque<Term> m_openGroups;
+    Term m_floatingTerm;
 
     PrefixTreeEntry* m_newPrefixSubtreeRoot = nullptr;
-    TrivialAtom m_newPrefixStaringPoint = 0;
+    Term m_newPrefixStaringPoint;
 
     String m_errorMessage;
 };
 
 URLFilterParser::URLFilterParser(NFA& nfa)
     : m_nfa(nfa)
+    , m_prefixTreeRoot(std::make_unique<PrefixTreeEntry>())
+{
+    m_prefixTreeRoot->nfaNode = nfa.root();
+}
+
+URLFilterParser::~URLFilterParser()
 {
-    m_prefixTreeRoot.nfaNode = nfa.root();
 }
 
 String URLFilterParser::addPattern(const String& pattern, bool patternIsCaseSensitive, uint64_t patternId)
@@ -375,7 +687,7 @@ String URLFilterParser::addPattern(const String& pattern, bool patternIsCaseSens
 
     String error;
 
-    GraphBuilder graphBuilder(m_nfa, m_prefixTreeRoot, patternIsCaseSensitive, patternId);
+    GraphBuilder graphBuilder(m_nfa, *m_prefixTreeRoot, patternIsCaseSensitive, patternId);
     error = String(JSC::Yarr::parse(graphBuilder, pattern, 0));
     if (error.isNull())
         graphBuilder.finalize();