2 * Copyright (C) 2014, 2015 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
27 #include "URLFilterParser.h"
29 #if ENABLE(CONTENT_EXTENSIONS)
32 #include <JavaScriptCore/YarrParser.h>
33 #include <wtf/BitVector.h>
34 #include <wtf/Deque.h>
38 namespace ContentExtensions {
40 enum class AtomQuantifier : uint8_t {
53 Term(char character, bool isCaseSensitive)
54 : m_termType(TermType::CharacterSet)
56 new (NotNull, &m_atomData.characterSet) CharacterSet();
57 addCharacter(character, isCaseSensitive);
60 enum UniversalTransitionTag { UniversalTransition };
61 explicit Term(UniversalTransitionTag)
62 : m_termType(TermType::CharacterSet)
64 new (NotNull, &m_atomData.characterSet) CharacterSet();
65 for (unsigned i = 0; i < 128; ++i)
66 m_atomData.characterSet.characters.set(i);
69 enum CharacterSetTermTag { CharacterSetTerm };
70 explicit Term(CharacterSetTermTag, bool isInverted)
71 : m_termType(TermType::CharacterSet)
73 new (NotNull, &m_atomData.characterSet) CharacterSet();
74 m_atomData.characterSet.inverted = isInverted;
77 enum GroupTermTag { GroupTerm };
78 explicit Term(GroupTermTag)
79 : m_termType(TermType::Group)
81 new (NotNull, &m_atomData.group) Group();
84 Term(const Term& other)
85 : m_termType(other.m_termType)
86 , m_quantifier(other.m_quantifier)
90 case TermType::Deleted:
92 case TermType::CharacterSet:
93 new (NotNull, &m_atomData.characterSet) CharacterSet(other.m_atomData.characterSet);
96 new (NotNull, &m_atomData.group) Group(other.m_atomData.group);
102 : m_termType(WTF::move(other.m_termType))
103 , m_quantifier(WTF::move(other.m_quantifier))
105 switch (m_termType) {
106 case TermType::Empty:
107 case TermType::Deleted:
109 case TermType::CharacterSet:
110 new (NotNull, &m_atomData.characterSet) CharacterSet(WTF::move(other.m_atomData.characterSet));
112 case TermType::Group:
113 new (NotNull, &m_atomData.group) Group(WTF::move(other.m_atomData.group));
119 enum EmptyValueTag { EmptyValue };
121 : m_termType(TermType::Empty)
125 enum DeletedValueTag { DeletedValue };
126 Term(DeletedValueTag)
127 : m_termType(TermType::Deleted)
138 return m_termType != TermType::Empty && m_termType != TermType::Deleted;
141 void addCharacter(UChar character, bool isCaseSensitive)
143 ASSERT(isASCII(character));
145 ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::CharacterSet);
146 if (m_termType != TermType::CharacterSet)
149 if (isCaseSensitive || !isASCIIAlpha(character))
150 m_atomData.characterSet.characters.set(character);
152 m_atomData.characterSet.characters.set(toASCIIUpper(character));
153 m_atomData.characterSet.characters.set(toASCIILower(character));
157 void extendGroupSubpattern(const Term& term)
159 ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::Group);
160 if (m_termType != TermType::Group)
162 m_atomData.group.terms.append(term);
165 void quantify(const AtomQuantifier& quantifier)
167 ASSERT_WITH_MESSAGE(m_quantifier == AtomQuantifier::One, "Transition to quantified term should only happen once.");
168 m_quantifier = quantifier;
171 unsigned generateGraph(NFA& nfa, uint64_t patternId, unsigned start) const
175 switch (m_quantifier) {
176 case AtomQuantifier::One: {
177 unsigned newEnd = generateSubgraphForAtom(nfa, patternId, start);
180 case AtomQuantifier::ZeroOrOne: {
181 unsigned newEnd = generateSubgraphForAtom(nfa, patternId, start);
182 nfa.addEpsilonTransition(start, newEnd);
185 case AtomQuantifier::ZeroOrMore: {
186 unsigned repeatStart = nfa.createNode();
187 nfa.addRuleId(repeatStart, patternId);
188 nfa.addEpsilonTransition(start, repeatStart);
190 unsigned repeatEnd = generateSubgraphForAtom(nfa, patternId, repeatStart);
191 nfa.addEpsilonTransition(repeatEnd, repeatStart);
193 unsigned kleenEnd = nfa.createNode();
194 nfa.addRuleId(kleenEnd, patternId);
195 nfa.addEpsilonTransition(repeatEnd, kleenEnd);
196 nfa.addEpsilonTransition(start, kleenEnd);
199 case AtomQuantifier::OneOrMore: {
200 unsigned repeatStart = nfa.createNode();
201 nfa.addRuleId(repeatStart, patternId);
202 nfa.addEpsilonTransition(start, repeatStart);
204 unsigned repeatEnd = generateSubgraphForAtom(nfa, patternId, repeatStart);
205 nfa.addEpsilonTransition(repeatEnd, repeatStart);
207 unsigned afterRepeat = nfa.createNode();
208 nfa.addRuleId(afterRepeat, patternId);
209 nfa.addEpsilonTransition(repeatEnd, afterRepeat);
215 Term& operator=(const Term& other)
218 new (NotNull, this) Term(other);
222 Term& operator=(Term&& other)
225 new (NotNull, this) Term(WTF::move(other));
229 bool operator==(const Term& other) const
231 if (other.m_termType != m_termType || other.m_quantifier != m_quantifier)
234 switch (m_termType) {
235 case TermType::Empty:
236 case TermType::Deleted:
238 case TermType::CharacterSet:
239 return m_atomData.characterSet == other.m_atomData.characterSet;
240 case TermType::Group:
241 return m_atomData.group == other.m_atomData.group;
243 ASSERT_NOT_REACHED();
247 unsigned hash() const
249 unsigned primary = static_cast<unsigned>(m_termType) << 16 | static_cast<unsigned>(m_quantifier);
250 unsigned secondary = 0;
251 switch (m_termType) {
252 case TermType::Empty:
253 secondary = 52184393;
255 case TermType::Deleted:
256 secondary = 40342988;
258 case TermType::CharacterSet:
259 secondary = m_atomData.characterSet.hash();
261 case TermType::Group:
262 secondary = m_atomData.group.hash();
265 return WTF::pairIntHash(primary, secondary);
268 bool isEmptyValue() const
270 return m_termType == TermType::Empty;
273 bool isDeletedValue() const
275 return m_termType == TermType::Deleted;
279 bool isUniversalTransition() const
281 return m_termType == TermType::CharacterSet
282 && ((m_atomData.characterSet.inverted && !m_atomData.characterSet.characters.bitCount())
283 || (!m_atomData.characterSet.inverted && m_atomData.characterSet.characters.bitCount() == 128));
286 unsigned generateSubgraphForAtom(NFA& nfa, uint64_t patternId, unsigned source) const
288 switch (m_termType) {
289 case TermType::Empty:
290 case TermType::Deleted:
291 ASSERT_NOT_REACHED();
293 case TermType::CharacterSet: {
294 unsigned target = nfa.createNode();
295 nfa.addRuleId(target, patternId);
296 if (isUniversalTransition())
297 nfa.addTransitionsOnAnyCharacter(source, target);
299 if (!m_atomData.characterSet.inverted) {
300 for (const auto& characterIterator : m_atomData.characterSet.characters.setBits())
301 nfa.addTransition(source, target, static_cast<char>(characterIterator));
303 for (unsigned i = 1; i < m_atomData.characterSet.characters.size(); ++i) {
304 if (m_atomData.characterSet.characters.get(i))
306 nfa.addTransition(source, target, static_cast<char>(i));
312 case TermType::Group: {
313 unsigned lastTarget = source;
314 for (const Term& term : m_atomData.group.terms)
315 lastTarget = term.generateGraph(nfa, patternId, lastTarget);
323 switch (m_termType) {
324 case TermType::Empty:
325 case TermType::Deleted:
327 case TermType::CharacterSet:
328 m_atomData.characterSet.~CharacterSet();
330 case TermType::Group:
331 m_atomData.group.~Group();
334 m_termType = TermType::Deleted;
337 enum class TermType : uint8_t {
344 TermType m_termType { TermType::Empty };
345 AtomQuantifier m_quantifier { AtomQuantifier::One };
347 struct CharacterSet {
348 bool inverted { false };
349 BitVector characters { 128 };
351 bool operator==(const CharacterSet& other) const
353 return other.inverted == inverted && other.characters == characters;
356 unsigned hash() const
358 return WTF::pairIntHash(inverted, characters.hash());
365 bool operator==(const Group& other) const
367 return other.terms == terms;
370 unsigned hash() const
372 unsigned hash = 6421749;
373 for (const Term& term : terms) {
374 unsigned termHash = term.hash();
375 hash = (hash << 16) ^ ((termHash << 11) ^ hash);
392 CharacterSet characterSet;
398 static unsigned hash(const Term& term) { return term.hash(); }
399 static bool equal(const Term& a, const Term& b) { return a == b; }
400 static const bool safeToCompareToEmptyOrDeleted = true;
403 struct TermHashTraits : public WTF::CustomHashTraits<Term> { };
405 struct PrefixTreeEntry {
407 HashMap<Term, std::unique_ptr<PrefixTreeEntry>, TermHash, TermHashTraits> nextPattern;
412 GraphBuilder(NFA& nfa, PrefixTreeEntry& prefixTreeRoot, bool patternIsCaseSensitive, uint64_t patternId)
414 , m_patternIsCaseSensitive(patternIsCaseSensitive)
415 , m_patternId(patternId)
416 , m_subtreeStart(nfa.root())
417 , m_subtreeEnd(nfa.root())
418 , m_lastPrefixTreeEntry(&prefixTreeRoot)
427 sinkFloatingTermIfNecessary();
429 if (!m_openGroups.isEmpty()) {
430 fail(ASCIILiteral("The expression has unclosed groups."));
434 if (m_subtreeStart != m_subtreeEnd)
435 m_nfa.setFinal(m_subtreeEnd, m_patternId);
437 fail(ASCIILiteral("The pattern cannot match anything."));
440 const String& errorMessage() const
442 return m_errorMessage;
445 void atomPatternCharacter(UChar character)
450 if (!isASCII(character)) {
451 fail(ASCIILiteral("Only ASCII characters are supported in pattern."));
455 sinkFloatingTermIfNecessary();
456 ASSERT(!m_floatingTerm.isValid());
458 char asciiChararacter = static_cast<char>(character);
459 m_floatingTerm = Term(asciiChararacter, m_patternIsCaseSensitive);
462 void atomBuiltInCharacterClass(JSC::Yarr::BuiltInCharacterClassID builtInCharacterClassID, bool inverted)
467 sinkFloatingTermIfNecessary();
468 ASSERT(!m_floatingTerm.isValid());
470 if (builtInCharacterClassID == JSC::Yarr::NewlineClassID && inverted)
471 m_floatingTerm = Term(Term::UniversalTransition);
473 fail(ASCIILiteral("Character class is not supported."));
476 void quantifyAtom(unsigned minimum, unsigned maximum, bool)
481 if (!m_floatingTerm.isValid())
482 fail(ASCIILiteral("Quantifier without corresponding term to quantify."));
484 if (!minimum && maximum == 1)
485 m_floatingTerm.quantify(AtomQuantifier::ZeroOrOne);
486 else if (!minimum && maximum == JSC::Yarr::quantifyInfinite)
487 m_floatingTerm.quantify(AtomQuantifier::ZeroOrMore);
488 else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite)
489 m_floatingTerm.quantify(AtomQuantifier::OneOrMore);
491 fail(ASCIILiteral("Arbitrary atom repetitions are not supported."));
494 void atomBackReference(unsigned)
496 fail(ASCIILiteral("Patterns cannot contain backreferences."));
501 fail(ASCIILiteral("Line boundary assertions are not supported yet."));
506 fail(ASCIILiteral("Line boundary assertions are not supported yet."));
509 void assertionWordBoundary(bool)
511 fail(ASCIILiteral("Word boundaries assertions are not supported yet."));
514 void atomCharacterClassBegin(bool inverted = false)
519 sinkFloatingTermIfNecessary();
520 ASSERT(!m_floatingTerm.isValid());
522 m_floatingTerm = Term(Term::CharacterSetTerm, inverted);
525 void atomCharacterClassAtom(UChar character)
530 if (!isASCII(character)) {
531 fail(ASCIILiteral("Non ASCII Character in a character set."));
535 m_floatingTerm.addCharacter(character, m_patternIsCaseSensitive);
538 void atomCharacterClassRange(UChar a, UChar b)
543 if (!a || !b || !isASCII(a) || !isASCII(b)) {
544 fail(ASCIILiteral("Non ASCII Character in a character range of a character set."));
548 for (unsigned i = a; i <= b; ++i)
549 m_floatingTerm.addCharacter(static_cast<UChar>(i), m_patternIsCaseSensitive);
552 void atomCharacterClassEnd()
554 // Nothing to do here. The character set atom may have a quantifier, we sink the atom lazily.
557 void atomCharacterClassBuiltIn(JSC::Yarr::BuiltInCharacterClassID, bool)
559 fail(ASCIILiteral("Builtins character class atoms are not supported yet."));
562 void atomParenthesesSubpatternBegin(bool = true)
567 sinkFloatingTermIfNecessary();
569 m_openGroups.append(Term(Term::GroupTerm));
572 void atomParentheticalAssertionBegin(bool = false)
574 fail(ASCIILiteral("Groups are not supported yet."));
577 void atomParenthesesEnd()
582 sinkFloatingTermIfNecessary();
583 ASSERT(!m_floatingTerm.isValid());
585 m_floatingTerm = m_openGroups.takeLast();
590 fail(ASCIILiteral("Disjunctions are not supported yet."));
594 bool hasError() const
596 return !m_errorMessage.isNull();
599 void fail(const String& errorMessage)
604 if (m_newPrefixSubtreeRoot)
605 m_newPrefixSubtreeRoot->nextPattern.remove(m_newPrefixStaringPoint);
607 m_errorMessage = errorMessage;
610 void sinkFloatingTermIfNecessary()
612 if (!m_floatingTerm.isValid())
615 ASSERT(m_lastPrefixTreeEntry);
617 if (!m_openGroups.isEmpty()) {
618 m_openGroups.last().extendGroupSubpattern(m_floatingTerm);
619 m_floatingTerm = Term();
623 auto nextEntry = m_lastPrefixTreeEntry->nextPattern.find(m_floatingTerm);
624 if (nextEntry != m_lastPrefixTreeEntry->nextPattern.end()) {
625 m_lastPrefixTreeEntry = nextEntry->value.get();
626 m_nfa.addRuleId(m_lastPrefixTreeEntry->nfaNode, m_patternId);
628 std::unique_ptr<PrefixTreeEntry> nextPrefixTreeEntry = std::make_unique<PrefixTreeEntry>();
630 unsigned newEnd = m_floatingTerm.generateGraph(m_nfa, m_patternId, m_lastPrefixTreeEntry->nfaNode);
631 nextPrefixTreeEntry->nfaNode = newEnd;
633 auto addResult = m_lastPrefixTreeEntry->nextPattern.set(m_floatingTerm, WTF::move(nextPrefixTreeEntry));
634 ASSERT(addResult.isNewEntry);
636 if (!m_newPrefixSubtreeRoot) {
637 m_newPrefixSubtreeRoot = m_lastPrefixTreeEntry;
638 m_newPrefixStaringPoint = m_floatingTerm;
641 m_lastPrefixTreeEntry = addResult.iterator->value.get();
643 m_subtreeEnd = m_lastPrefixTreeEntry->nfaNode;
645 m_floatingTerm = Term();
646 ASSERT(m_lastPrefixTreeEntry);
650 bool m_patternIsCaseSensitive;
651 const uint64_t m_patternId;
653 unsigned m_subtreeStart { 0 };
654 unsigned m_subtreeEnd { 0 };
656 PrefixTreeEntry* m_lastPrefixTreeEntry;
657 Deque<Term> m_openGroups;
660 PrefixTreeEntry* m_newPrefixSubtreeRoot = nullptr;
661 Term m_newPrefixStaringPoint;
663 String m_errorMessage;
666 URLFilterParser::URLFilterParser(NFA& nfa)
668 , m_prefixTreeRoot(std::make_unique<PrefixTreeEntry>())
670 m_prefixTreeRoot->nfaNode = nfa.root();
673 URLFilterParser::~URLFilterParser()
677 String URLFilterParser::addPattern(const String& pattern, bool patternIsCaseSensitive, uint64_t patternId)
679 if (!pattern.containsOnlyASCII())
680 return ASCIILiteral("URLFilterParser only supports ASCII patterns.");
681 ASSERT(!pattern.isEmpty());
683 if (pattern.isEmpty())
684 return ASCIILiteral("Empty pattern.");
686 unsigned oldSize = m_nfa.graphSize();
690 GraphBuilder graphBuilder(m_nfa, *m_prefixTreeRoot, patternIsCaseSensitive, patternId);
691 error = String(JSC::Yarr::parse(graphBuilder, pattern, 0));
693 graphBuilder.finalize();
696 error = graphBuilder.errorMessage();
699 m_nfa.restoreToGraphSize(oldSize);
704 } // namespace ContentExtensions
705 } // namespace WebCore
707 #endif // ENABLE(CONTENT_EXTENSIONS)