Fix the !ENABLE(WEBGL) build after r180609
[WebKit-https.git] / Source / WebCore / contentextensions / URLFilterParser.cpp
1 /*
2  * Copyright (C) 2014, 2015 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "URLFilterParser.h"
28
29 #if ENABLE(CONTENT_EXTENSIONS)
30
31 #include "NFA.h"
32 #include <JavaScriptCore/YarrParser.h>
33 #include <wtf/BitVector.h>
34
35 namespace WebCore {
36
37 namespace ContentExtensions {
38
39 const uint16_t hasNonCharacterMask = 0x0080;
40 const uint16_t characterMask = 0x0007F;
41 const uint16_t newlineClassIDBuiltinMask = 0x100;
42 const uint16_t caseInsensitiveMask = 0x200;
43
44 static TrivialAtom trivialAtomFromASCIICharacter(char character, bool caseSensitive)
45 {
46     ASSERT(isASCII(character));
47
48     if (caseSensitive || !isASCIIAlpha(character))
49         return static_cast<uint16_t>(character);
50
51     return static_cast<uint16_t>(toASCIILower(character)) | caseInsensitiveMask;
52 }
53
54 enum class AtomQuantifier : uint16_t {
55     One = 0,
56     ZeroOrOne = 0x1000,
57     ZeroOrMore = 0x2000,
58     OneOrMore = 0x4000
59 };
60
61 static void quantifyTrivialAtom(TrivialAtom& trivialAtom, AtomQuantifier quantifier)
62 {
63     ASSERT(trivialAtom & (hasNonCharacterMask | characterMask));
64     ASSERT(!(trivialAtom & 0xf000));
65     trivialAtom |= static_cast<uint16_t>(quantifier);
66 }
67
68 static AtomQuantifier trivialAtomQuantifier(TrivialAtom trivialAtom)
69 {
70     switch (trivialAtom & 0xf000) {
71     case static_cast<unsigned>(AtomQuantifier::One):
72         return AtomQuantifier::One;
73     case static_cast<unsigned>(AtomQuantifier::ZeroOrOne):
74         return AtomQuantifier::ZeroOrOne;
75     case static_cast<unsigned>(AtomQuantifier::ZeroOrMore):
76         return AtomQuantifier::ZeroOrMore;
77     case static_cast<unsigned>(AtomQuantifier::OneOrMore):
78         return AtomQuantifier::OneOrMore;
79     }
80     ASSERT_NOT_REACHED();
81     return AtomQuantifier::One;
82 }
83
84 static TrivialAtom trivialAtomForNewlineClassIDBuiltin()
85 {
86     return hasNonCharacterMask | newlineClassIDBuiltinMask;
87 }
88
89 class GraphBuilder {
90     struct BoundedSubGraph {
91         unsigned start;
92         unsigned end;
93     };
94
95 public:
96     GraphBuilder(NFA& nfa, PrefixTreeEntry& prefixTreeRoot, bool patternIsCaseSensitive, uint64_t patternId)
97         : m_nfa(nfa)
98         , m_patternIsCaseSensitive(patternIsCaseSensitive)
99         , m_patternId(patternId)
100         , m_activeGroup({ nfa.root(), nfa.root() })
101         , m_lastPrefixTreeEntry(&prefixTreeRoot)
102     {
103     }
104
105     void finalize()
106     {
107         if (hasError())
108             return;
109
110         sinkPendingAtomIfNecessary();
111
112         if (m_activeGroup.start != m_activeGroup.end)
113             m_nfa.setFinal(m_activeGroup.end, m_patternId);
114         else
115             fail(ASCIILiteral("The pattern cannot match anything."));
116     }
117
118     const String& errorMessage() const
119     {
120         return m_errorMessage;
121     }
122
123     void atomPatternCharacter(UChar character)
124     {
125         if (hasError())
126             return;
127
128         if (!isASCII(character)) {
129             fail(ASCIILiteral("Only ASCII characters are supported in pattern."));
130             return;
131         }
132
133         sinkPendingAtomIfNecessary();
134         ASSERT(m_floatingAtomType == FloatingAtomType::Invalid);
135         ASSERT(!m_pendingTrivialAtom);
136
137         char asciiChararacter = static_cast<char>(character);
138         m_pendingTrivialAtom = trivialAtomFromASCIICharacter(asciiChararacter, m_patternIsCaseSensitive);
139         m_floatingAtomType = FloatingAtomType::Trivial;
140     }
141
142     void atomBuiltInCharacterClass(JSC::Yarr::BuiltInCharacterClassID builtInCharacterClassID, bool inverted)
143     {
144         if (hasError())
145             return;
146
147         sinkPendingAtomIfNecessary();
148         ASSERT(m_floatingAtomType == FloatingAtomType::Invalid);
149         ASSERT(!m_pendingTrivialAtom);
150
151         if (builtInCharacterClassID == JSC::Yarr::NewlineClassID && inverted) {
152             m_pendingTrivialAtom = trivialAtomForNewlineClassIDBuiltin();
153             m_floatingAtomType = FloatingAtomType::Trivial;
154         } else
155             fail(ASCIILiteral("Character class is not supported."));
156     }
157
158     void quantifyAtom(unsigned minimum, unsigned maximum, bool)
159     {
160         if (hasError())
161             return;
162
163         switch (m_floatingAtomType) {
164         case FloatingAtomType::Invalid:
165             fail(ASCIILiteral("Quantifier without corresponding atom to quantify."));
166             break;
167
168         case FloatingAtomType::Trivial:
169             if (!minimum && maximum == 1)
170                 quantifyTrivialAtom(m_pendingTrivialAtom, AtomQuantifier::ZeroOrOne);
171             else if (!minimum && maximum == JSC::Yarr::quantifyInfinite)
172                 quantifyTrivialAtom(m_pendingTrivialAtom, AtomQuantifier::ZeroOrMore);
173             else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite)
174                 quantifyTrivialAtom(m_pendingTrivialAtom, AtomQuantifier::OneOrMore);
175             else
176                 fail(ASCIILiteral("Arbitrary atom repetitions are not supported."));
177             break;
178         case FloatingAtomType::CharacterSet: {
179             ASSERT(m_characterSetQuantifier == AtomQuantifier::One);
180             if (!minimum && maximum == 1)
181                 m_characterSetQuantifier = AtomQuantifier::ZeroOrOne;
182             else if (!minimum && maximum == JSC::Yarr::quantifyInfinite)
183                 m_characterSetQuantifier = AtomQuantifier::ZeroOrMore;
184             else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite)
185                 m_characterSetQuantifier = AtomQuantifier::OneOrMore;
186             else
187                 fail(ASCIILiteral("Arbitrary character set repetitions are not supported."));
188             break;
189         }
190         }
191     }
192
193     void atomBackReference(unsigned)
194     {
195         fail(ASCIILiteral("Patterns cannot contain backreferences."));
196     }
197
198     void assertionBOL()
199     {
200         fail(ASCIILiteral("Line boundary assertions are not supported yet."));
201     }
202
203     void assertionEOL()
204     {
205         fail(ASCIILiteral("Line boundary assertions are not supported yet."));
206     }
207
208     void assertionWordBoundary(bool)
209     {
210         fail(ASCIILiteral("Word boundaries assertions are not supported yet."));
211     }
212
213     void atomCharacterClassBegin(bool inverted = false)
214     {
215         if (hasError())
216             return;
217
218         sinkPendingAtomIfNecessary();
219
220         ASSERT_WITH_MESSAGE(!m_pendingCharacterSet.bitCount(), "We should not have nested character classes.");
221         ASSERT(m_floatingAtomType == FloatingAtomType::Invalid);
222
223         m_buildMode = BuildMode::DirectGeneration;
224         m_lastPrefixTreeEntry = nullptr;
225
226         m_isInvertedCharacterSet = inverted;
227         m_floatingAtomType = FloatingAtomType::CharacterSet;
228     }
229
230     void atomCharacterClassAtom(UChar character)
231     {
232         if (hasError())
233             return;
234
235         ASSERT(m_floatingAtomType == FloatingAtomType::CharacterSet);
236
237         if (!isASCII(character)) {
238             fail(ASCIILiteral("Non ASCII Character in a character set."));
239             return;
240         }
241         m_pendingCharacterSet.set(character);
242     }
243
244     void atomCharacterClassRange(UChar a, UChar b)
245     {
246         if (hasError())
247             return;
248
249         ASSERT(m_floatingAtomType == FloatingAtomType::CharacterSet);
250
251         if (!a || !b || !isASCII(a) || !isASCII(b)) {
252             fail(ASCIILiteral("Non ASCII Character in a character range of a character set."));
253             return;
254         }
255         for (unsigned i = a; i <= b; ++i)
256             m_pendingCharacterSet.set(i);
257     }
258
259     void atomCharacterClassEnd()
260     {
261         // Nothing to do here. The character set atom may have a quantifier, we sink the atom lazily.
262     }
263
264     void atomCharacterClassBuiltIn(JSC::Yarr::BuiltInCharacterClassID, bool)
265     {
266         fail(ASCIILiteral("Builtins character class atoms are not supported yet."));
267     }
268
269     void atomParenthesesSubpatternBegin(bool = true)
270     {
271         fail(ASCIILiteral("Groups are not supported yet."));
272     }
273
274     void atomParentheticalAssertionBegin(bool = false)
275     {
276         fail(ASCIILiteral("Groups are not supported yet."));
277     }
278
279     void atomParenthesesEnd()
280     {
281         fail(ASCIILiteral("Groups are not supported yet."));
282     }
283
284     void disjunction()
285     {
286         fail(ASCIILiteral("Disjunctions are not supported yet."));
287     }
288
289 private:
290     bool hasError() const
291     {
292         return !m_errorMessage.isNull();
293     }
294
295     void fail(const String& errorMessage)
296     {
297         if (hasError())
298             return;
299
300         if (m_newPrefixSubtreeRoot)
301             m_newPrefixSubtreeRoot->nextPattern.remove(m_newPrefixStaringPoint);
302
303         m_errorMessage = errorMessage;
304     }
305
306     BoundedSubGraph sinkAtom(std::function<void(unsigned, unsigned)> transitionFunction, AtomQuantifier quantifier, unsigned start)
307     {
308         switch (quantifier) {
309         case AtomQuantifier::One: {
310             unsigned newEnd = m_nfa.createNode();
311             m_nfa.addRuleId(newEnd, m_patternId);
312             transitionFunction(start, newEnd);
313             return { start, newEnd };
314         }
315         case AtomQuantifier::ZeroOrOne: {
316             unsigned newEnd = m_nfa.createNode();
317             m_nfa.addRuleId(newEnd, m_patternId);
318             transitionFunction(start, newEnd);
319             return { start, newEnd };
320         }
321         case AtomQuantifier::ZeroOrMore: {
322             unsigned repeatStart = m_nfa.createNode();
323             m_nfa.addRuleId(repeatStart, m_patternId);
324             unsigned repeatEnd = m_nfa.createNode();
325             m_nfa.addRuleId(repeatEnd, m_patternId);
326
327             transitionFunction(repeatStart, repeatEnd);
328             m_nfa.addEpsilonTransition(repeatEnd, repeatStart);
329
330             m_nfa.addEpsilonTransition(start, repeatStart);
331
332             unsigned kleenEnd = m_nfa.createNode();
333             m_nfa.addRuleId(kleenEnd, m_patternId);
334             m_nfa.addEpsilonTransition(repeatEnd, kleenEnd);
335             m_nfa.addEpsilonTransition(start, kleenEnd);
336             return { start, kleenEnd };
337         }
338         case AtomQuantifier::OneOrMore: {
339             unsigned repeatStart = m_nfa.createNode();
340             m_nfa.addRuleId(repeatStart, m_patternId);
341             unsigned repeatEnd = m_nfa.createNode();
342             m_nfa.addRuleId(repeatEnd, m_patternId);
343
344             transitionFunction(repeatStart, repeatEnd);
345             m_nfa.addEpsilonTransition(repeatEnd, repeatStart);
346
347             m_nfa.addEpsilonTransition(start, repeatStart);
348
349             unsigned afterRepeat = m_nfa.createNode();
350             m_nfa.addRuleId(afterRepeat, m_patternId);
351             m_nfa.addEpsilonTransition(repeatEnd, afterRepeat);
352             return { start, afterRepeat };
353         }
354         }
355     }
356
357     void generateTransition(TrivialAtom trivialAtom, unsigned source, unsigned target)
358     {
359         if (trivialAtom & hasNonCharacterMask) {
360             ASSERT(trivialAtom & newlineClassIDBuiltinMask);
361             m_nfa.addTransitionsOnAnyCharacter(source, target);
362         } else {
363             if (trivialAtom & caseInsensitiveMask) {
364                 char character = static_cast<char>(trivialAtom & characterMask);
365                 m_nfa.addTransition(source, target, character);
366                 m_nfa.addTransition(source, target, toASCIIUpper(character));
367             } else
368                 m_nfa.addTransition(source, target, static_cast<char>(trivialAtom & characterMask));
369         }
370     }
371
372     BoundedSubGraph sinkTrivialAtom(TrivialAtom trivialAtom, unsigned start)
373     {
374         auto transitionFunction = [this, trivialAtom](unsigned source, unsigned target)
375         {
376             generateTransition(trivialAtom, source, target);
377         };
378         return sinkAtom(transitionFunction, trivialAtomQuantifier(trivialAtom), start);
379     }
380
381     void generateTransition(const BitVector& characterSet, bool isInverted, unsigned source, unsigned target)
382     {
383         ASSERT(characterSet.bitCount());
384         if (!isInverted) {
385             for (const auto& characterIterator : characterSet.setBits()) {
386                 char character = static_cast<char>(characterIterator);
387                 if (!m_patternIsCaseSensitive && isASCIIAlpha(character)) {
388                     m_nfa.addTransition(source, target, toASCIIUpper(character));
389                     m_nfa.addTransition(source, target, toASCIILower(character));
390                 } else
391                     m_nfa.addTransition(source, target, character);
392             }
393         } else {
394             for (unsigned i = 1; i < characterSet.size(); ++i) {
395                 if (characterSet.get(i))
396                     continue;
397                 char character = static_cast<char>(i);
398
399                 if (!m_patternIsCaseSensitive && (characterSet.get(toASCIIUpper(character)) || characterSet.get(toASCIILower(character))))
400                     continue;
401
402                 m_nfa.addTransition(source, target, character);
403             }
404         }
405     }
406
407     BoundedSubGraph sinkCharacterSet(const BitVector& characterSet, bool isInverted, unsigned start)
408     {
409         auto transitionFunction = [this, &characterSet, isInverted](unsigned source, unsigned target)
410         {
411             generateTransition(characterSet, isInverted, source, target);
412         };
413         return sinkAtom(transitionFunction, m_characterSetQuantifier, start);
414     }
415
416     void sinkPendingAtomIfNecessary()
417     {
418         if (m_floatingAtomType == FloatingAtomType::Invalid)
419             return;
420
421         switch (m_buildMode) {
422         case BuildMode::PrefixTree: {
423             ASSERT(m_lastPrefixTreeEntry);
424             ASSERT_WITH_MESSAGE(m_floatingAtomType == FloatingAtomType::Trivial, "Only trivial atoms are handled with a prefix tree.");
425
426             auto nextEntry = m_lastPrefixTreeEntry->nextPattern.find(m_pendingTrivialAtom);
427             if (nextEntry != m_lastPrefixTreeEntry->nextPattern.end()) {
428                 m_lastPrefixTreeEntry = nextEntry->value.get();
429                 m_nfa.addRuleId(m_lastPrefixTreeEntry->nfaNode, m_patternId);
430             } else {
431                 std::unique_ptr<PrefixTreeEntry> nextPrefixTreeEntry = std::make_unique<PrefixTreeEntry>();
432
433                 BoundedSubGraph newSubGraph = sinkTrivialAtom(m_pendingTrivialAtom, m_lastPrefixTreeEntry->nfaNode);
434                 nextPrefixTreeEntry->nfaNode = newSubGraph.end;
435
436                 auto addResult = m_lastPrefixTreeEntry->nextPattern.set(m_pendingTrivialAtom, WTF::move(nextPrefixTreeEntry));
437                 ASSERT(addResult.isNewEntry);
438
439                 if (!m_newPrefixSubtreeRoot) {
440                     m_newPrefixSubtreeRoot = m_lastPrefixTreeEntry;
441                     m_newPrefixStaringPoint = m_pendingTrivialAtom;
442                 }
443
444                 m_lastPrefixTreeEntry = addResult.iterator->value.get();
445             }
446             m_activeGroup.end = m_lastPrefixTreeEntry->nfaNode;
447             ASSERT(m_lastPrefixTreeEntry);
448             break;
449         }
450         case BuildMode::DirectGeneration: {
451             ASSERT(!m_lastPrefixTreeEntry);
452
453             switch (m_floatingAtomType) {
454             case FloatingAtomType::Invalid:
455                 ASSERT_NOT_REACHED();
456                 break;
457             case FloatingAtomType::Trivial: {
458                 BoundedSubGraph newSubGraph = sinkTrivialAtom(m_pendingTrivialAtom, m_activeGroup.end);
459                 m_activeGroup.end = newSubGraph.end;
460                 break;
461             }
462             case FloatingAtomType::CharacterSet:
463                 if (!m_pendingCharacterSet.bitCount()) {
464                     fail(ASCIILiteral("Empty character set."));
465                     return;
466                 }
467                 BoundedSubGraph newSubGraph = sinkCharacterSet(m_pendingCharacterSet, m_isInvertedCharacterSet, m_activeGroup.end);
468                 m_activeGroup.end = newSubGraph.end;
469
470                 m_isInvertedCharacterSet = false;
471                 m_characterSetQuantifier = AtomQuantifier::One;
472                 m_pendingCharacterSet.clearAll();
473                 break;
474             }
475             break;
476             }
477         }
478
479         m_pendingTrivialAtom = 0;
480         m_floatingAtomType = FloatingAtomType::Invalid;
481     }
482
483     NFA& m_nfa;
484     bool m_patternIsCaseSensitive;
485     const uint64_t m_patternId;
486
487     BoundedSubGraph m_activeGroup;
488
489     PrefixTreeEntry* m_lastPrefixTreeEntry;
490     enum class FloatingAtomType {
491         Invalid,
492         Trivial,
493         CharacterSet
494     };
495     FloatingAtomType m_floatingAtomType { FloatingAtomType::Invalid };
496     TrivialAtom m_pendingTrivialAtom = 0;
497
498     bool m_isInvertedCharacterSet { false };
499     BitVector m_pendingCharacterSet { 128 };
500     AtomQuantifier m_characterSetQuantifier { AtomQuantifier::One };
501
502     enum class BuildMode {
503         PrefixTree,
504         DirectGeneration
505     };
506     BuildMode m_buildMode { BuildMode::PrefixTree };
507
508     PrefixTreeEntry* m_newPrefixSubtreeRoot = nullptr;
509     TrivialAtom m_newPrefixStaringPoint = 0;
510
511     String m_errorMessage;
512 };
513
514 URLFilterParser::URLFilterParser(NFA& nfa)
515     : m_nfa(nfa)
516 {
517     m_prefixTreeRoot.nfaNode = nfa.root();
518 }
519
520 String URLFilterParser::addPattern(const String& pattern, bool patternIsCaseSensitive, uint64_t patternId)
521 {
522     if (!pattern.containsOnlyASCII())
523         return ASCIILiteral("URLFilterParser only supports ASCII patterns.");
524     ASSERT(!pattern.isEmpty());
525
526     if (pattern.isEmpty())
527         return ASCIILiteral("Empty pattern.");
528
529     unsigned oldSize = m_nfa.graphSize();
530
531     String error;
532
533     GraphBuilder graphBuilder(m_nfa, m_prefixTreeRoot, patternIsCaseSensitive, patternId);
534     error = String(JSC::Yarr::parse(graphBuilder, pattern, 0));
535     if (error.isNull())
536         graphBuilder.finalize();
537
538     if (error.isNull())
539         error = graphBuilder.errorMessage();
540
541     if (!error.isNull())
542         m_nfa.restoreToGraphSize(oldSize);
543
544     return error;
545 }
546
547 } // namespace ContentExtensions
548 } // namespace WebCore
549
550 #endif // ENABLE(CONTENT_EXTENSIONS)