Extend URL filter's Term definition to support groups/subpatterns
[WebKit-https.git] / Source / WebCore / contentextensions / URLFilterParser.cpp
1 /*
2  * Copyright (C) 2014, 2015 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLFilterParser.h"
28
29 #if ENABLE(CONTENT_EXTENSIONS)
30
31 #include "NFA.h"
32 #include <JavaScriptCore/YarrParser.h>
33 #include <wtf/BitVector.h>
34 #include <wtf/Deque.h>
35
36 namespace WebCore {
37
38 namespace ContentExtensions {
39
40 enum class AtomQuantifier : uint8_t {
41     One,
42     ZeroOrOne,
43     ZeroOrMore,
44     OneOrMore
45 };
46
47 class Term {
48 public:
49     Term()
50     {
51     }
52
53     Term(char character, bool isCaseSensitive)
54         : m_termType(TermType::CharacterSet)
55     {
56         new (NotNull, &m_atomData.characterSet) CharacterSet();
57         addCharacter(character, isCaseSensitive);
58     }
59
60     enum UniversalTransitionTag { UniversalTransition };
61     explicit Term(UniversalTransitionTag)
62         : m_termType(TermType::CharacterSet)
63     {
64         new (NotNull, &m_atomData.characterSet) CharacterSet();
65         for (unsigned i = 0; i < 128; ++i)
66             m_atomData.characterSet.characters.set(i);
67     }
68
69     enum CharacterSetTermTag { CharacterSetTerm };
70     explicit Term(CharacterSetTermTag, bool isInverted)
71         : m_termType(TermType::CharacterSet)
72     {
73         new (NotNull, &m_atomData.characterSet) CharacterSet();
74         m_atomData.characterSet.inverted = isInverted;
75     }
76
77     enum GroupTermTag { GroupTerm };
78     explicit Term(GroupTermTag)
79         : m_termType(TermType::Group)
80     {
81         new (NotNull, &m_atomData.group) Group();
82     }
83
84     Term(const Term& other)
85         : m_termType(other.m_termType)
86         , m_quantifier(other.m_quantifier)
87     {
88         switch (m_termType) {
89         case TermType::Empty:
90         case TermType::Deleted:
91             break;
92         case TermType::CharacterSet:
93             new (NotNull, &m_atomData.characterSet) CharacterSet(other.m_atomData.characterSet);
94             break;
95         case TermType::Group:
96             new (NotNull, &m_atomData.group) Group(other.m_atomData.group);
97             break;
98         }
99     }
100
101     Term(Term&& other)
102         : m_termType(WTF::move(other.m_termType))
103         , m_quantifier(WTF::move(other.m_quantifier))
104     {
105         switch (m_termType) {
106         case TermType::Empty:
107         case TermType::Deleted:
108             break;
109         case TermType::CharacterSet:
110             new (NotNull, &m_atomData.characterSet) CharacterSet(WTF::move(other.m_atomData.characterSet));
111             break;
112         case TermType::Group:
113             new (NotNull, &m_atomData.group) Group(WTF::move(other.m_atomData.group));
114             break;
115         }
116         other.destroy();
117     }
118
119     enum EmptyValueTag { EmptyValue };
120     Term(EmptyValueTag)
121         : m_termType(TermType::Empty)
122     {
123     }
124
125     enum DeletedValueTag { DeletedValue };
126     Term(DeletedValueTag)
127         : m_termType(TermType::Deleted)
128     {
129     }
130
131     ~Term()
132     {
133         destroy();
134     }
135
136     bool isValid() const
137     {
138         return m_termType != TermType::Empty && m_termType != TermType::Deleted;
139     }
140
141     void addCharacter(UChar character, bool isCaseSensitive)
142     {
143         ASSERT(isASCII(character));
144
145         ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::CharacterSet);
146         if (m_termType != TermType::CharacterSet)
147             return;
148
149         if (isCaseSensitive || !isASCIIAlpha(character))
150             m_atomData.characterSet.characters.set(character);
151         else {
152             m_atomData.characterSet.characters.set(toASCIIUpper(character));
153             m_atomData.characterSet.characters.set(toASCIILower(character));
154         }
155     }
156
157     void extendGroupSubpattern(const Term& term)
158     {
159         ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::Group);
160         if (m_termType != TermType::Group)
161             return;
162         m_atomData.group.terms.append(term);
163     }
164
165     void quantify(const AtomQuantifier& quantifier)
166     {
167         ASSERT_WITH_MESSAGE(m_quantifier == AtomQuantifier::One, "Transition to quantified term should only happen once.");
168         m_quantifier = quantifier;
169     }
170
171     unsigned generateGraph(NFA& nfa, uint64_t patternId, unsigned start) const
172     {
173         ASSERT(isValid());
174
175         switch (m_quantifier) {
176         case AtomQuantifier::One: {
177             unsigned newEnd = generateSubgraphForAtom(nfa, patternId, start);
178             return newEnd;
179         }
180         case AtomQuantifier::ZeroOrOne: {
181             unsigned newEnd = generateSubgraphForAtom(nfa, patternId, start);
182             nfa.addEpsilonTransition(start, newEnd);
183             return newEnd;
184         }
185         case AtomQuantifier::ZeroOrMore: {
186             unsigned repeatStart = nfa.createNode();
187             nfa.addRuleId(repeatStart, patternId);
188             nfa.addEpsilonTransition(start, repeatStart);
189
190             unsigned repeatEnd = generateSubgraphForAtom(nfa, patternId, repeatStart);
191             nfa.addEpsilonTransition(repeatEnd, repeatStart);
192
193             unsigned kleenEnd = nfa.createNode();
194             nfa.addRuleId(kleenEnd, patternId);
195             nfa.addEpsilonTransition(repeatEnd, kleenEnd);
196             nfa.addEpsilonTransition(start, kleenEnd);
197             return kleenEnd;
198         }
199         case AtomQuantifier::OneOrMore: {
200             unsigned repeatStart = nfa.createNode();
201             nfa.addRuleId(repeatStart, patternId);
202             nfa.addEpsilonTransition(start, repeatStart);
203
204             unsigned repeatEnd = generateSubgraphForAtom(nfa, patternId, repeatStart);
205             nfa.addEpsilonTransition(repeatEnd, repeatStart);
206
207             unsigned afterRepeat = nfa.createNode();
208             nfa.addRuleId(afterRepeat, patternId);
209             nfa.addEpsilonTransition(repeatEnd, afterRepeat);
210             return afterRepeat;
211         }
212         }
213     }
214
215     Term& operator=(const Term& other)
216     {
217         destroy();
218         new (NotNull, this) Term(other);
219         return *this;
220     }
221
222     Term& operator=(Term&& other)
223     {
224         destroy();
225         new (NotNull, this) Term(WTF::move(other));
226         return *this;
227     }
228
229     bool operator==(const Term& other) const
230     {
231         if (other.m_termType != m_termType || other.m_quantifier != m_quantifier)
232             return false;
233
234         switch (m_termType) {
235         case TermType::Empty:
236         case TermType::Deleted:
237             return true;
238         case TermType::CharacterSet:
239             return m_atomData.characterSet == other.m_atomData.characterSet;
240         case TermType::Group:
241             return m_atomData.group == other.m_atomData.group;
242         }
243         ASSERT_NOT_REACHED();
244         return false;
245     }
246
247     unsigned hash() const
248     {
249         unsigned primary = static_cast<unsigned>(m_termType) << 16 | static_cast<unsigned>(m_quantifier);
250         unsigned secondary = 0;
251         switch (m_termType) {
252         case TermType::Empty:
253             secondary = 52184393;
254             break;
255         case TermType::Deleted:
256             secondary = 40342988;
257             break;
258         case TermType::CharacterSet:
259             secondary = m_atomData.characterSet.hash();
260             break;
261         case TermType::Group:
262             secondary = m_atomData.group.hash();
263             break;
264         }
265         return WTF::pairIntHash(primary, secondary);
266     }
267
268     bool isEmptyValue() const
269     {
270         return m_termType == TermType::Empty;
271     }
272
273     bool isDeletedValue() const
274     {
275         return m_termType == TermType::Deleted;
276     }
277
278 private:
279     bool isUniversalTransition() const
280     {
281         return m_termType == TermType::CharacterSet
282             && ((m_atomData.characterSet.inverted && !m_atomData.characterSet.characters.bitCount())
283                 || (!m_atomData.characterSet.inverted && m_atomData.characterSet.characters.bitCount() == 128));
284     }
285
286     unsigned generateSubgraphForAtom(NFA& nfa, uint64_t patternId, unsigned source) const
287     {
288         switch (m_termType) {
289         case TermType::Empty:
290         case TermType::Deleted:
291             ASSERT_NOT_REACHED();
292             return -1;
293         case TermType::CharacterSet: {
294             unsigned target = nfa.createNode();
295             nfa.addRuleId(target, patternId);
296             if (isUniversalTransition())
297                 nfa.addTransitionsOnAnyCharacter(source, target);
298             else {
299                 if (!m_atomData.characterSet.inverted) {
300                     for (const auto& characterIterator : m_atomData.characterSet.characters.setBits())
301                         nfa.addTransition(source, target, static_cast<char>(characterIterator));
302                 } else {
303                     for (unsigned i = 1; i < m_atomData.characterSet.characters.size(); ++i) {
304                         if (m_atomData.characterSet.characters.get(i))
305                             continue;
306                         nfa.addTransition(source, target, static_cast<char>(i));
307                     }
308                 }
309             }
310             return target;
311         }
312         case TermType::Group: {
313             unsigned lastTarget = source;
314             for (const Term& term : m_atomData.group.terms)
315                 lastTarget = term.generateGraph(nfa, patternId, lastTarget);
316             return lastTarget;
317         }
318         }
319     }
320
321     void destroy()
322     {
323         switch (m_termType) {
324         case TermType::Empty:
325         case TermType::Deleted:
326             break;
327         case TermType::CharacterSet:
328             m_atomData.characterSet.~CharacterSet();
329             break;
330         case TermType::Group:
331             m_atomData.group.~Group();
332             break;
333         }
334         m_termType = TermType::Deleted;
335     }
336
337     enum class TermType : uint8_t {
338         Empty,
339         Deleted,
340         CharacterSet,
341         Group
342     };
343
344     TermType m_termType { TermType::Empty };
345     AtomQuantifier m_quantifier { AtomQuantifier::One };
346
347     struct CharacterSet {
348         bool inverted { false };
349         BitVector characters { 128 };
350
351         bool operator==(const CharacterSet& other) const
352         {
353             return other.inverted == inverted && other.characters == characters;
354         }
355
356         unsigned hash() const
357         {
358             return WTF::pairIntHash(inverted, characters.hash());
359         }
360     };
361
362     struct Group {
363         Vector<Term> terms;
364
365         bool operator==(const Group& other) const
366         {
367             return other.terms == terms;
368         }
369
370         unsigned hash() const
371         {
372             unsigned hash = 6421749;
373             for (const Term& term : terms) {
374                 unsigned termHash = term.hash();
375                 hash = (hash << 16) ^ ((termHash << 11) ^ hash);
376                 hash += hash >> 11;
377             }
378             return hash;
379         }
380     };
381
382     union AtomData {
383         AtomData()
384             : invalidTerm(0)
385         {
386         }
387         ~AtomData()
388         {
389         }
390
391         char invalidTerm;
392         CharacterSet characterSet;
393         Group group;
394     } m_atomData;
395 };
396
397 struct TermHash {
398     static unsigned hash(const Term& term) { return term.hash(); }
399     static bool equal(const Term& a, const Term& b) { return a == b; }
400     static const bool safeToCompareToEmptyOrDeleted = true;
401 };
402
403 struct TermHashTraits : public WTF::CustomHashTraits<Term> { };
404
405 struct PrefixTreeEntry {
406     unsigned nfaNode;
407     HashMap<Term, std::unique_ptr<PrefixTreeEntry>, TermHash, TermHashTraits> nextPattern;
408 };
409
410 class GraphBuilder {
411 public:
412     GraphBuilder(NFA& nfa, PrefixTreeEntry& prefixTreeRoot, bool patternIsCaseSensitive, uint64_t patternId)
413         : m_nfa(nfa)
414         , m_patternIsCaseSensitive(patternIsCaseSensitive)
415         , m_patternId(patternId)
416         , m_subtreeStart(nfa.root())
417         , m_subtreeEnd(nfa.root())
418         , m_lastPrefixTreeEntry(&prefixTreeRoot)
419     {
420     }
421
422     void finalize()
423     {
424         if (hasError())
425             return;
426
427         sinkFloatingTermIfNecessary();
428
429         if (!m_openGroups.isEmpty()) {
430             fail(ASCIILiteral("The expression has unclosed groups."));
431             return;
432         }
433
434         if (m_subtreeStart != m_subtreeEnd)
435             m_nfa.setFinal(m_subtreeEnd, m_patternId);
436         else
437             fail(ASCIILiteral("The pattern cannot match anything."));
438     }
439
440     const String& errorMessage() const
441     {
442         return m_errorMessage;
443     }
444
445     void atomPatternCharacter(UChar character)
446     {
447         if (hasError())
448             return;
449
450         if (!isASCII(character)) {
451             fail(ASCIILiteral("Only ASCII characters are supported in pattern."));
452             return;
453         }
454
455         sinkFloatingTermIfNecessary();
456         ASSERT(!m_floatingTerm.isValid());
457
458         char asciiChararacter = static_cast<char>(character);
459         m_floatingTerm = Term(asciiChararacter, m_patternIsCaseSensitive);
460     }
461
462     void atomBuiltInCharacterClass(JSC::Yarr::BuiltInCharacterClassID builtInCharacterClassID, bool inverted)
463     {
464         if (hasError())
465             return;
466
467         sinkFloatingTermIfNecessary();
468         ASSERT(!m_floatingTerm.isValid());
469
470         if (builtInCharacterClassID == JSC::Yarr::NewlineClassID && inverted)
471             m_floatingTerm = Term(Term::UniversalTransition);
472         else
473             fail(ASCIILiteral("Character class is not supported."));
474     }
475
476     void quantifyAtom(unsigned minimum, unsigned maximum, bool)
477     {
478         if (hasError())
479             return;
480
481         if (!m_floatingTerm.isValid())
482             fail(ASCIILiteral("Quantifier without corresponding term to quantify."));
483
484         if (!minimum && maximum == 1)
485             m_floatingTerm.quantify(AtomQuantifier::ZeroOrOne);
486         else if (!minimum && maximum == JSC::Yarr::quantifyInfinite)
487             m_floatingTerm.quantify(AtomQuantifier::ZeroOrMore);
488         else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite)
489             m_floatingTerm.quantify(AtomQuantifier::OneOrMore);
490         else
491             fail(ASCIILiteral("Arbitrary atom repetitions are not supported."));
492     }
493
494     void atomBackReference(unsigned)
495     {
496         fail(ASCIILiteral("Patterns cannot contain backreferences."));
497     }
498
499     void assertionBOL()
500     {
501         fail(ASCIILiteral("Line boundary assertions are not supported yet."));
502     }
503
504     void assertionEOL()
505     {
506         fail(ASCIILiteral("Line boundary assertions are not supported yet."));
507     }
508
509     void assertionWordBoundary(bool)
510     {
511         fail(ASCIILiteral("Word boundaries assertions are not supported yet."));
512     }
513
514     void atomCharacterClassBegin(bool inverted = false)
515     {
516         if (hasError())
517             return;
518
519         sinkFloatingTermIfNecessary();
520         ASSERT(!m_floatingTerm.isValid());
521
522         m_floatingTerm = Term(Term::CharacterSetTerm, inverted);
523     }
524
525     void atomCharacterClassAtom(UChar character)
526     {
527         if (hasError())
528             return;
529
530         if (!isASCII(character)) {
531             fail(ASCIILiteral("Non ASCII Character in a character set."));
532             return;
533         }
534
535         m_floatingTerm.addCharacter(character, m_patternIsCaseSensitive);
536     }
537
538     void atomCharacterClassRange(UChar a, UChar b)
539     {
540         if (hasError())
541             return;
542
543         if (!a || !b || !isASCII(a) || !isASCII(b)) {
544             fail(ASCIILiteral("Non ASCII Character in a character range of a character set."));
545             return;
546         }
547
548         for (unsigned i = a; i <= b; ++i)
549             m_floatingTerm.addCharacter(static_cast<UChar>(i), m_patternIsCaseSensitive);
550     }
551
552     void atomCharacterClassEnd()
553     {
554         // Nothing to do here. The character set atom may have a quantifier, we sink the atom lazily.
555     }
556
557     void atomCharacterClassBuiltIn(JSC::Yarr::BuiltInCharacterClassID, bool)
558     {
559         fail(ASCIILiteral("Builtins character class atoms are not supported yet."));
560     }
561
562     void atomParenthesesSubpatternBegin(bool = true)
563     {
564         if (hasError())
565             return;
566
567         sinkFloatingTermIfNecessary();
568
569         m_openGroups.append(Term(Term::GroupTerm));
570     }
571
572     void atomParentheticalAssertionBegin(bool = false)
573     {
574         fail(ASCIILiteral("Groups are not supported yet."));
575     }
576
577     void atomParenthesesEnd()
578     {
579         if (hasError())
580             return;
581
582         sinkFloatingTermIfNecessary();
583         ASSERT(!m_floatingTerm.isValid());
584
585         m_floatingTerm = m_openGroups.takeLast();
586     }
587
588     void disjunction()
589     {
590         fail(ASCIILiteral("Disjunctions are not supported yet."));
591     }
592
593 private:
594     bool hasError() const
595     {
596         return !m_errorMessage.isNull();
597     }
598
599     void fail(const String& errorMessage)
600     {
601         if (hasError())
602             return;
603
604         if (m_newPrefixSubtreeRoot)
605             m_newPrefixSubtreeRoot->nextPattern.remove(m_newPrefixStaringPoint);
606
607         m_errorMessage = errorMessage;
608     }
609
610     void sinkFloatingTermIfNecessary()
611     {
612         if (!m_floatingTerm.isValid())
613             return;
614
615         ASSERT(m_lastPrefixTreeEntry);
616
617         if (!m_openGroups.isEmpty()) {
618             m_openGroups.last().extendGroupSubpattern(m_floatingTerm);
619             m_floatingTerm = Term();
620             return;
621         }
622
623         auto nextEntry = m_lastPrefixTreeEntry->nextPattern.find(m_floatingTerm);
624         if (nextEntry != m_lastPrefixTreeEntry->nextPattern.end()) {
625             m_lastPrefixTreeEntry = nextEntry->value.get();
626             m_nfa.addRuleId(m_lastPrefixTreeEntry->nfaNode, m_patternId);
627         } else {
628             std::unique_ptr<PrefixTreeEntry> nextPrefixTreeEntry = std::make_unique<PrefixTreeEntry>();
629
630             unsigned newEnd = m_floatingTerm.generateGraph(m_nfa, m_patternId, m_lastPrefixTreeEntry->nfaNode);
631             nextPrefixTreeEntry->nfaNode = newEnd;
632
633             auto addResult = m_lastPrefixTreeEntry->nextPattern.set(m_floatingTerm, WTF::move(nextPrefixTreeEntry));
634             ASSERT(addResult.isNewEntry);
635
636             if (!m_newPrefixSubtreeRoot) {
637                 m_newPrefixSubtreeRoot = m_lastPrefixTreeEntry;
638                 m_newPrefixStaringPoint = m_floatingTerm;
639             }
640
641             m_lastPrefixTreeEntry = addResult.iterator->value.get();
642         }
643         m_subtreeEnd = m_lastPrefixTreeEntry->nfaNode;
644
645         m_floatingTerm = Term();
646         ASSERT(m_lastPrefixTreeEntry);
647     }
648
649     NFA& m_nfa;
650     bool m_patternIsCaseSensitive;
651     const uint64_t m_patternId;
652
653     unsigned m_subtreeStart { 0 };
654     unsigned m_subtreeEnd { 0 };
655
656     PrefixTreeEntry* m_lastPrefixTreeEntry;
657     Deque<Term> m_openGroups;
658     Term m_floatingTerm;
659
660     PrefixTreeEntry* m_newPrefixSubtreeRoot = nullptr;
661     Term m_newPrefixStaringPoint;
662
663     String m_errorMessage;
664 };
665
666 URLFilterParser::URLFilterParser(NFA& nfa)
667     : m_nfa(nfa)
668     , m_prefixTreeRoot(std::make_unique<PrefixTreeEntry>())
669 {
670     m_prefixTreeRoot->nfaNode = nfa.root();
671 }
672
673 URLFilterParser::~URLFilterParser()
674 {
675 }
676
677 String URLFilterParser::addPattern(const String& pattern, bool patternIsCaseSensitive, uint64_t patternId)
678 {
679     if (!pattern.containsOnlyASCII())
680         return ASCIILiteral("URLFilterParser only supports ASCII patterns.");
681     ASSERT(!pattern.isEmpty());
682
683     if (pattern.isEmpty())
684         return ASCIILiteral("Empty pattern.");
685
686     unsigned oldSize = m_nfa.graphSize();
687
688     String error;
689
690     GraphBuilder graphBuilder(m_nfa, *m_prefixTreeRoot, patternIsCaseSensitive, patternId);
691     error = String(JSC::Yarr::parse(graphBuilder, pattern, 0));
692     if (error.isNull())
693         graphBuilder.finalize();
694
695     if (error.isNull())
696         error = graphBuilder.errorMessage();
697
698     if (!error.isNull())
699         m_nfa.restoreToGraphSize(oldSize);
700
701     return error;
702 }
703
704 } // namespace ContentExtensions
705 } // namespace WebCore
706
707 #endif // ENABLE(CONTENT_EXTENSIONS)