2 * Copyright (C) 2014, 2015 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
27 #include "URLFilterParser.h"
29 #if ENABLE(CONTENT_EXTENSIONS)
32 #include <JavaScriptCore/YarrParser.h>
36 namespace ContentExtensions {
38 const uint16_t hasNonCharacterMask = 0x0080;
39 const uint16_t characterMask = 0x0007F;
40 const uint16_t newlineClassIDBuiltinMask = 0x100;
42 static TrivialAtom trivialAtomFromASCIICharacter(char character)
44 ASSERT(isASCII(character));
46 return static_cast<uint16_t>(character);
49 enum class TrivialAtomQuantifier : uint16_t {
55 static void quantifyTrivialAtom(TrivialAtom& trivialAtom, TrivialAtomQuantifier quantifier)
57 ASSERT(trivialAtom & (hasNonCharacterMask | characterMask));
58 ASSERT(!(trivialAtom & 0xf000));
59 trivialAtom |= static_cast<uint16_t>(quantifier);
62 static TrivialAtom trivialAtomForNewlineClassIDBuiltin()
64 return hasNonCharacterMask | newlineClassIDBuiltinMask;
69 struct BoundedSubGraph {
74 GraphBuilder(NFA& nfa, PrefixTreeEntry& prefixTreeRoot, uint64_t patternId)
76 , m_patternId(patternId)
77 , m_activeGroup({ nfa.root(), nfa.root() })
78 , m_lastPrefixTreeEntry(&prefixTreeRoot)
87 sinkPendingAtomIfNecessary();
89 if (m_activeGroup.start != m_activeGroup.end)
90 m_nfa.setFinal(m_activeGroup.end, m_patternId);
92 fail(ASCIILiteral("The pattern cannot match anything."));
95 const String& errorMessage() const
97 return m_errorMessage;
100 void atomPatternCharacter(UChar character)
102 if (!isASCII(character)) {
103 fail(ASCIILiteral("Only ASCII characters are supported in pattern."));
110 sinkPendingAtomIfNecessary();
112 char asciiChararacter = static_cast<char>(character);
113 m_hasValidAtom = true;
115 ASSERT(m_lastPrefixTreeEntry);
116 m_pendingTrivialAtom = trivialAtomFromASCIICharacter(asciiChararacter);
119 void atomBuiltInCharacterClass(JSC::Yarr::BuiltInCharacterClassID builtInCharacterClassID, bool inverted)
124 if (builtInCharacterClassID == JSC::Yarr::NewlineClassID && inverted) {
125 m_hasValidAtom = true;
126 ASSERT(m_lastPrefixTreeEntry);
127 m_pendingTrivialAtom = trivialAtomForNewlineClassIDBuiltin();
129 fail(ASCIILiteral("Character class is not supported."));
132 void quantifyAtom(unsigned minimum, unsigned maximum, bool)
137 ASSERT(m_hasValidAtom);
138 if (!m_hasValidAtom) {
139 fail(ASCIILiteral("Quantifier without corresponding atom to quantify."));
143 ASSERT(m_lastPrefixTreeEntry);
144 if (!minimum && maximum == 1)
145 quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::ZeroOrOne);
146 else if (!minimum && maximum == JSC::Yarr::quantifyInfinite)
147 quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::ZeroToMany);
148 else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite)
149 quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::OneToMany);
151 fail(ASCIILiteral("Arbitrary atom repetitions are not supported."));
154 NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned)
156 fail(ASCIILiteral("Patterns cannot contain backreferences."));
157 ASSERT_NOT_REACHED();
160 void atomCharacterClassAtom(UChar)
162 fail(ASCIILiteral("Character class atoms are not supported yet."));
167 fail(ASCIILiteral("Line boundary assertions are not supported yet."));
172 fail(ASCIILiteral("Line boundary assertions are not supported yet."));
175 void assertionWordBoundary(bool)
177 fail(ASCIILiteral("Word boundaries assertions are not supported yet."));
180 void atomCharacterClassBegin(bool = false)
182 fail(ASCIILiteral("Character class atoms are not supported yet."));
185 void atomCharacterClassRange(UChar, UChar)
187 fail(ASCIILiteral("Character class ranges are not supported yet."));
190 void atomCharacterClassBuiltIn(JSC::Yarr::BuiltInCharacterClassID, bool)
192 fail(ASCIILiteral("Buildins character class atoms are not supported yet."));
195 void atomCharacterClassEnd()
197 fail(ASCIILiteral("Character class are not supported yet."));
200 void atomParenthesesSubpatternBegin(bool = true)
202 fail(ASCIILiteral("Groups are not supported yet."));
205 void atomParentheticalAssertionBegin(bool = false)
207 fail(ASCIILiteral("Groups are not supported yet."));
210 void atomParenthesesEnd()
212 fail(ASCIILiteral("Groups are not supported yet."));
217 fail(ASCIILiteral("Disjunctions are not supported yet."));
221 bool hasError() const
223 return !m_errorMessage.isNull();
226 void fail(const String& errorMessage)
231 if (m_newPrefixSubtreeRoot)
232 m_newPrefixSubtreeRoot->nextPattern.remove(m_newPrefixStaringPoint);
234 m_errorMessage = errorMessage;
237 void generateTransition(TrivialAtom trivialAtom, unsigned source, unsigned target)
239 if (trivialAtom & hasNonCharacterMask) {
240 ASSERT(trivialAtom & newlineClassIDBuiltinMask);
241 for (unsigned i = 1; i < 128; ++i)
242 m_nfa.addTransition(source, target, i);
244 m_nfa.addTransition(source, target, static_cast<char>(trivialAtom & characterMask));
247 BoundedSubGraph sinkTrivialAtom(TrivialAtom trivialAtom, unsigned start)
249 if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::ZeroOrOne)) {
250 unsigned newEnd = m_nfa.createNode();
251 m_nfa.addRuleId(newEnd, m_patternId);
252 generateTransition(trivialAtom, start, newEnd);
253 m_nfa.addEpsilonTransition(start, newEnd);
254 return { start, newEnd };
257 if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::ZeroToMany)) {
258 unsigned repeatStart = m_nfa.createNode();
259 m_nfa.addRuleId(repeatStart, m_patternId);
260 unsigned repeatEnd = m_nfa.createNode();
261 m_nfa.addRuleId(repeatEnd, m_patternId);
263 generateTransition(trivialAtom, repeatStart, repeatEnd);
264 m_nfa.addEpsilonTransition(repeatEnd, repeatStart);
266 m_nfa.addEpsilonTransition(start, repeatStart);
268 unsigned kleenEnd = m_nfa.createNode();
269 m_nfa.addRuleId(kleenEnd, m_patternId);
270 m_nfa.addEpsilonTransition(repeatEnd, kleenEnd);
271 m_nfa.addEpsilonTransition(start, kleenEnd);
272 return { start, kleenEnd };
275 if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::OneToMany)) {
276 unsigned repeatStart = m_nfa.createNode();
277 m_nfa.addRuleId(repeatStart, m_patternId);
278 unsigned repeatEnd = m_nfa.createNode();
279 m_nfa.addRuleId(repeatEnd, m_patternId);
281 generateTransition(trivialAtom, repeatStart, repeatEnd);
282 m_nfa.addEpsilonTransition(repeatEnd, repeatStart);
284 m_nfa.addEpsilonTransition(start, repeatStart);
286 unsigned afterRepeat = m_nfa.createNode();
287 m_nfa.addRuleId(afterRepeat, m_patternId);
288 m_nfa.addEpsilonTransition(repeatEnd, afterRepeat);
289 return { start, afterRepeat };
292 unsigned newEnd = m_nfa.createNode();
293 m_nfa.addRuleId(newEnd, m_patternId);
294 generateTransition(trivialAtom, start, newEnd);
295 return { start, newEnd };
298 void sinkPendingAtomIfNecessary()
300 ASSERT(m_lastPrefixTreeEntry);
305 ASSERT(m_pendingTrivialAtom);
307 auto nextEntry = m_lastPrefixTreeEntry->nextPattern.find(m_pendingTrivialAtom);
308 if (nextEntry != m_lastPrefixTreeEntry->nextPattern.end()) {
309 m_lastPrefixTreeEntry = nextEntry->value.get();
310 m_nfa.addRuleId(m_lastPrefixTreeEntry->nfaNode, m_patternId);
312 std::unique_ptr<PrefixTreeEntry> nextPrefixTreeEntry = std::make_unique<PrefixTreeEntry>();
314 BoundedSubGraph newSubGraph = sinkTrivialAtom(m_pendingTrivialAtom, m_lastPrefixTreeEntry->nfaNode);
315 nextPrefixTreeEntry->nfaNode = newSubGraph.end;
317 auto addResult = m_lastPrefixTreeEntry->nextPattern.set(m_pendingTrivialAtom, WTF::move(nextPrefixTreeEntry));
318 ASSERT(addResult.isNewEntry);
320 m_newPrefixSubtreeRoot = m_lastPrefixTreeEntry;
321 m_newPrefixStaringPoint = m_pendingTrivialAtom;
323 m_lastPrefixTreeEntry = addResult.iterator->value.get();
325 ASSERT(m_lastPrefixTreeEntry);
327 m_activeGroup.end = m_lastPrefixTreeEntry->nfaNode;
328 m_pendingTrivialAtom = 0;
329 m_hasValidAtom = false;
333 const uint64_t m_patternId;
335 BoundedSubGraph m_activeGroup;
337 PrefixTreeEntry* m_lastPrefixTreeEntry;
338 bool m_hasValidAtom = false;
339 TrivialAtom m_pendingTrivialAtom = 0;
341 PrefixTreeEntry* m_newPrefixSubtreeRoot = nullptr;
342 TrivialAtom m_newPrefixStaringPoint = 0;
344 String m_errorMessage;
347 URLFilterParser::URLFilterParser(NFA& nfa)
350 m_prefixTreeRoot.nfaNode = nfa.root();
353 String URLFilterParser::addPattern(const String& pattern, uint64_t patternId)
355 if (!pattern.containsOnlyASCII())
356 return ASCIILiteral("URLFilterParser only supports ASCII patterns.");
357 ASSERT(!pattern.isEmpty());
359 if (pattern.isEmpty())
360 return ASCIILiteral("Empty pattern.");
362 unsigned oldSize = m_nfa.graphSize();
366 GraphBuilder graphBuilder(m_nfa, m_prefixTreeRoot, patternId);
367 error = String(JSC::Yarr::parse(graphBuilder, pattern, 0));
369 graphBuilder.finalize();
372 error = graphBuilder.errorMessage();
375 m_nfa.restoreToGraphSize(oldSize);
380 } // namespace ContentExtensions
381 } // namespace WebCore
383 #endif // ENABLE(CONTENT_EXTENSIONS)