2 * Copyright (C) 2014, 2015 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
27 #include "URLFilterParser.h"
29 #if ENABLE(CONTENT_EXTENSIONS)
32 #include <JavaScriptCore/YarrParser.h>
36 namespace ContentExtensions {
38 const uint16_t hasNonCharacterMask = 0x0080;
39 const uint16_t characterMask = 0x0007F;
40 const uint16_t newlineClassIDBuiltinMask = 0x100;
41 const uint16_t caseInsensitiveMask = 0x200;
43 static TrivialAtom trivialAtomFromASCIICharacter(char character, bool caseSensitive)
45 ASSERT(isASCII(character));
47 if (caseSensitive || !isASCIIAlpha(character))
48 return static_cast<uint16_t>(character);
50 return static_cast<uint16_t>(toASCIILower(character)) | caseInsensitiveMask;
53 enum class TrivialAtomQuantifier : uint16_t {
59 static void quantifyTrivialAtom(TrivialAtom& trivialAtom, TrivialAtomQuantifier quantifier)
61 ASSERT(trivialAtom & (hasNonCharacterMask | characterMask));
62 ASSERT(!(trivialAtom & 0xf000));
63 trivialAtom |= static_cast<uint16_t>(quantifier);
66 static TrivialAtom trivialAtomForNewlineClassIDBuiltin()
68 return hasNonCharacterMask | newlineClassIDBuiltinMask;
73 struct BoundedSubGraph {
78 GraphBuilder(NFA& nfa, PrefixTreeEntry& prefixTreeRoot, bool patternIsCaseSensitive, uint64_t patternId)
80 , m_patternIsCaseSensitive(patternIsCaseSensitive)
81 , m_patternId(patternId)
82 , m_activeGroup({ nfa.root(), nfa.root() })
83 , m_lastPrefixTreeEntry(&prefixTreeRoot)
92 sinkPendingAtomIfNecessary();
94 if (m_activeGroup.start != m_activeGroup.end)
95 m_nfa.setFinal(m_activeGroup.end, m_patternId);
97 fail(ASCIILiteral("The pattern cannot match anything."));
100 const String& errorMessage() const
102 return m_errorMessage;
105 void atomPatternCharacter(UChar character)
107 if (!isASCII(character)) {
108 fail(ASCIILiteral("Only ASCII characters are supported in pattern."));
115 sinkPendingAtomIfNecessary();
117 char asciiChararacter = static_cast<char>(character);
118 m_hasValidAtom = true;
120 ASSERT(m_lastPrefixTreeEntry);
121 m_pendingTrivialAtom = trivialAtomFromASCIICharacter(asciiChararacter, m_patternIsCaseSensitive);
124 void atomBuiltInCharacterClass(JSC::Yarr::BuiltInCharacterClassID builtInCharacterClassID, bool inverted)
129 if (builtInCharacterClassID == JSC::Yarr::NewlineClassID && inverted) {
130 m_hasValidAtom = true;
131 ASSERT(m_lastPrefixTreeEntry);
132 m_pendingTrivialAtom = trivialAtomForNewlineClassIDBuiltin();
134 fail(ASCIILiteral("Character class is not supported."));
137 void quantifyAtom(unsigned minimum, unsigned maximum, bool)
142 ASSERT(m_hasValidAtom);
143 if (!m_hasValidAtom) {
144 fail(ASCIILiteral("Quantifier without corresponding atom to quantify."));
148 ASSERT(m_lastPrefixTreeEntry);
149 if (!minimum && maximum == 1)
150 quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::ZeroOrOne);
151 else if (!minimum && maximum == JSC::Yarr::quantifyInfinite)
152 quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::ZeroToMany);
153 else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite)
154 quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::OneToMany);
156 fail(ASCIILiteral("Arbitrary atom repetitions are not supported."));
159 NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned)
161 fail(ASCIILiteral("Patterns cannot contain backreferences."));
162 ASSERT_NOT_REACHED();
165 void atomCharacterClassAtom(UChar)
167 fail(ASCIILiteral("Character class atoms are not supported yet."));
172 fail(ASCIILiteral("Line boundary assertions are not supported yet."));
177 fail(ASCIILiteral("Line boundary assertions are not supported yet."));
180 void assertionWordBoundary(bool)
182 fail(ASCIILiteral("Word boundaries assertions are not supported yet."));
185 void atomCharacterClassBegin(bool = false)
187 fail(ASCIILiteral("Character class atoms are not supported yet."));
190 void atomCharacterClassRange(UChar, UChar)
192 fail(ASCIILiteral("Character class ranges are not supported yet."));
195 void atomCharacterClassBuiltIn(JSC::Yarr::BuiltInCharacterClassID, bool)
197 fail(ASCIILiteral("Buildins character class atoms are not supported yet."));
200 void atomCharacterClassEnd()
202 fail(ASCIILiteral("Character class are not supported yet."));
205 void atomParenthesesSubpatternBegin(bool = true)
207 fail(ASCIILiteral("Groups are not supported yet."));
210 void atomParentheticalAssertionBegin(bool = false)
212 fail(ASCIILiteral("Groups are not supported yet."));
215 void atomParenthesesEnd()
217 fail(ASCIILiteral("Groups are not supported yet."));
222 fail(ASCIILiteral("Disjunctions are not supported yet."));
226 bool hasError() const
228 return !m_errorMessage.isNull();
231 void fail(const String& errorMessage)
236 if (m_newPrefixSubtreeRoot)
237 m_newPrefixSubtreeRoot->nextPattern.remove(m_newPrefixStaringPoint);
239 m_errorMessage = errorMessage;
242 void generateTransition(TrivialAtom trivialAtom, unsigned source, unsigned target)
244 if (trivialAtom & hasNonCharacterMask) {
245 ASSERT(trivialAtom & newlineClassIDBuiltinMask);
246 for (unsigned i = 1; i < 128; ++i)
247 m_nfa.addTransition(source, target, i);
249 if (trivialAtom & caseInsensitiveMask) {
250 char character = static_cast<char>(trivialAtom & characterMask);
251 m_nfa.addTransition(source, target, character);
252 m_nfa.addTransition(source, target, toASCIIUpper(character));
254 m_nfa.addTransition(source, target, static_cast<char>(trivialAtom & characterMask));
258 BoundedSubGraph sinkTrivialAtom(TrivialAtom trivialAtom, unsigned start)
260 if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::ZeroOrOne)) {
261 unsigned newEnd = m_nfa.createNode();
262 m_nfa.addRuleId(newEnd, m_patternId);
263 generateTransition(trivialAtom, start, newEnd);
264 m_nfa.addEpsilonTransition(start, newEnd);
265 return { start, newEnd };
268 if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::ZeroToMany)) {
269 unsigned repeatStart = m_nfa.createNode();
270 m_nfa.addRuleId(repeatStart, m_patternId);
271 unsigned repeatEnd = m_nfa.createNode();
272 m_nfa.addRuleId(repeatEnd, m_patternId);
274 generateTransition(trivialAtom, repeatStart, repeatEnd);
275 m_nfa.addEpsilonTransition(repeatEnd, repeatStart);
277 m_nfa.addEpsilonTransition(start, repeatStart);
279 unsigned kleenEnd = m_nfa.createNode();
280 m_nfa.addRuleId(kleenEnd, m_patternId);
281 m_nfa.addEpsilonTransition(repeatEnd, kleenEnd);
282 m_nfa.addEpsilonTransition(start, kleenEnd);
283 return { start, kleenEnd };
286 if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::OneToMany)) {
287 unsigned repeatStart = m_nfa.createNode();
288 m_nfa.addRuleId(repeatStart, m_patternId);
289 unsigned repeatEnd = m_nfa.createNode();
290 m_nfa.addRuleId(repeatEnd, m_patternId);
292 generateTransition(trivialAtom, repeatStart, repeatEnd);
293 m_nfa.addEpsilonTransition(repeatEnd, repeatStart);
295 m_nfa.addEpsilonTransition(start, repeatStart);
297 unsigned afterRepeat = m_nfa.createNode();
298 m_nfa.addRuleId(afterRepeat, m_patternId);
299 m_nfa.addEpsilonTransition(repeatEnd, afterRepeat);
300 return { start, afterRepeat };
303 unsigned newEnd = m_nfa.createNode();
304 m_nfa.addRuleId(newEnd, m_patternId);
305 generateTransition(trivialAtom, start, newEnd);
306 return { start, newEnd };
309 void sinkPendingAtomIfNecessary()
311 ASSERT(m_lastPrefixTreeEntry);
316 ASSERT(m_pendingTrivialAtom);
318 auto nextEntry = m_lastPrefixTreeEntry->nextPattern.find(m_pendingTrivialAtom);
319 if (nextEntry != m_lastPrefixTreeEntry->nextPattern.end()) {
320 m_lastPrefixTreeEntry = nextEntry->value.get();
321 m_nfa.addRuleId(m_lastPrefixTreeEntry->nfaNode, m_patternId);
323 std::unique_ptr<PrefixTreeEntry> nextPrefixTreeEntry = std::make_unique<PrefixTreeEntry>();
325 BoundedSubGraph newSubGraph = sinkTrivialAtom(m_pendingTrivialAtom, m_lastPrefixTreeEntry->nfaNode);
326 nextPrefixTreeEntry->nfaNode = newSubGraph.end;
328 auto addResult = m_lastPrefixTreeEntry->nextPattern.set(m_pendingTrivialAtom, WTF::move(nextPrefixTreeEntry));
329 ASSERT(addResult.isNewEntry);
331 m_newPrefixSubtreeRoot = m_lastPrefixTreeEntry;
332 m_newPrefixStaringPoint = m_pendingTrivialAtom;
334 m_lastPrefixTreeEntry = addResult.iterator->value.get();
336 ASSERT(m_lastPrefixTreeEntry);
338 m_activeGroup.end = m_lastPrefixTreeEntry->nfaNode;
339 m_pendingTrivialAtom = 0;
340 m_hasValidAtom = false;
344 bool m_patternIsCaseSensitive;
345 const uint64_t m_patternId;
347 BoundedSubGraph m_activeGroup;
349 PrefixTreeEntry* m_lastPrefixTreeEntry;
350 bool m_hasValidAtom = false;
351 TrivialAtom m_pendingTrivialAtom = 0;
353 PrefixTreeEntry* m_newPrefixSubtreeRoot = nullptr;
354 TrivialAtom m_newPrefixStaringPoint = 0;
356 String m_errorMessage;
359 URLFilterParser::URLFilterParser(NFA& nfa)
362 m_prefixTreeRoot.nfaNode = nfa.root();
365 String URLFilterParser::addPattern(const String& pattern, bool patternIsCaseSensitive, uint64_t patternId)
367 if (!pattern.containsOnlyASCII())
368 return ASCIILiteral("URLFilterParser only supports ASCII patterns.");
369 ASSERT(!pattern.isEmpty());
371 if (pattern.isEmpty())
372 return ASCIILiteral("Empty pattern.");
374 unsigned oldSize = m_nfa.graphSize();
378 GraphBuilder graphBuilder(m_nfa, m_prefixTreeRoot, patternIsCaseSensitive, patternId);
379 error = String(JSC::Yarr::parse(graphBuilder, pattern, 0));
381 graphBuilder.finalize();
384 error = graphBuilder.errorMessage();
387 m_nfa.restoreToGraphSize(oldSize);
392 } // namespace ContentExtensions
393 } // namespace WebCore
395 #endif // ENABLE(CONTENT_EXTENSIONS)