2 * Copyright (C) 2014-2015 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
29 #if ENABLE(CONTENT_EXTENSIONS)
31 #include "ContentExtensionsDebugging.h"
33 #include <unicode/utypes.h>
34 #include <wtf/ASCIICType.h>
35 #include <wtf/HashMap.h>
36 #include <wtf/Vector.h>
37 #include <wtf/text/StringBuilder.h>
38 #include <wtf/text/WTFString.h>
42 namespace ContentExtensions {
44 enum class AtomQuantifier : uint8_t {
54 Term(char character, bool isCaseSensitive);
56 enum UniversalTransitionTag { UniversalTransition };
57 explicit Term(UniversalTransitionTag);
59 enum CharacterSetTermTag { CharacterSetTerm };
60 explicit Term(CharacterSetTermTag, bool isInverted);
62 enum GroupTermTag { GroupTerm };
63 explicit Term(GroupTermTag);
65 enum EndOfLineAssertionTermTag { EndOfLineAssertionTerm };
66 explicit Term(EndOfLineAssertionTermTag);
68 Term(const Term& other);
71 enum EmptyValueTag { EmptyValue };
74 enum DeletedValueTag { DeletedValue };
75 Term(DeletedValueTag);
81 // CharacterSet terms only.
82 void addCharacter(UChar character, bool isCaseSensitive);
85 void extendGroupSubpattern(const Term&);
87 void quantify(const AtomQuantifier&);
89 unsigned generateGraph(NFA&, unsigned start, const ActionList& finalActions) const;
91 bool isEndOfLineAssertion() const;
93 bool matchesAtLeastOneCharacter() const;
95 // Matches any string, the empty string included.
96 // This is very conservative. Patterns matching any string can return false here.
97 bool isKnownToMatchAnyString() const;
99 // Return true if the term can only match input of a single fixed length.
100 bool hasFixedLength() const;
102 Term& operator=(const Term& other);
103 Term& operator=(Term&& other);
105 bool operator==(const Term& other) const;
107 unsigned hash() const;
109 bool isEmptyValue() const;
110 bool isDeletedValue() const;
112 #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
113 String toString() const;
117 // This is exact for character sets but conservative for groups.
118 // The return value can be false for a group equivalent to a universal transition.
119 bool isUniversalTransition() const;
121 unsigned generateSubgraphForAtom(NFA&, unsigned source) const;
125 enum class TermType : uint8_t {
132 TermType m_termType { TermType::Empty };
133 AtomQuantifier m_quantifier { AtomQuantifier::One };
137 void set(UChar character)
139 RELEASE_ASSERT(character < 128);
140 m_characters[character / 64] |= (uint64_t(1) << (character % 64));
143 bool get(UChar character) const
145 RELEASE_ASSERT(character < 128);
146 return m_characters[character / 64] & (uint64_t(1) << (character % 64));
155 bool inverted() const { return m_inverted; }
157 unsigned bitCount() const
159 return WTF::bitCount(m_characters[0]) + WTF::bitCount(m_characters[1]);
162 bool operator==(const CharacterSet& other) const
164 return other.m_inverted == m_inverted
165 && other.m_characters[0] == m_characters[0]
166 && other.m_characters[1] == m_characters[1];
169 unsigned hash() const
171 return WTF::pairIntHash(WTF::pairIntHash(WTF::intHash(m_characters[0]), WTF::intHash(m_characters[1])), m_inverted);
174 bool m_inverted { false };
175 uint64_t m_characters[2] { 0, 0 };
181 bool operator==(const Group& other) const
183 return other.terms == terms;
186 unsigned hash() const
188 unsigned hash = 6421749;
189 for (const Term& term : terms) {
190 unsigned termHash = term.hash();
191 hash = (hash << 16) ^ ((termHash << 11) ^ hash);
208 CharacterSet characterSet;
213 #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
214 inline String quantifierToString(AtomQuantifier quantifier)
216 switch (quantifier) {
217 case AtomQuantifier::One:
219 case AtomQuantifier::ZeroOrOne:
221 case AtomQuantifier::ZeroOrMore:
223 case AtomQuantifier::OneOrMore:
228 inline String Term::toString() const
230 switch (m_termType) {
231 case TermType::Empty:
233 case TermType::Deleted:
235 case TermType::CharacterSet: {
236 StringBuilder builder;
238 for (UChar c = 0; c < 128; c++) {
239 if (m_atomData.characterSet.get(c)) {
240 if (isASCIIPrintable(c) && !isASCIISpace(c))
243 builder.append('\\');
245 builder.appendNumber(c);
250 builder.append(quantifierToString(m_quantifier));
251 return builder.toString();
253 case TermType::Group: {
254 StringBuilder builder;
256 for (const Term& term : m_atomData.group.terms)
257 builder.append(term.toString());
259 builder.append(quantifierToString(m_quantifier));
260 return builder.toString();
267 static unsigned hash(const Term& term) { return term.hash(); }
268 static bool equal(const Term& a, const Term& b) { return a == b; }
269 static const bool safeToCompareToEmptyOrDeleted = true;
272 struct TermHashTraits : public WTF::CustomHashTraits<Term> { };
278 inline Term::Term(char character, bool isCaseSensitive)
279 : m_termType(TermType::CharacterSet)
281 new (NotNull, &m_atomData.characterSet) CharacterSet();
282 addCharacter(character, isCaseSensitive);
285 inline Term::Term(UniversalTransitionTag)
286 : m_termType(TermType::CharacterSet)
288 new (NotNull, &m_atomData.characterSet) CharacterSet();
289 for (UChar i = 0; i < 128; ++i)
290 m_atomData.characterSet.set(i);
293 inline Term::Term(CharacterSetTermTag, bool isInverted)
294 : m_termType(TermType::CharacterSet)
296 new (NotNull, &m_atomData.characterSet) CharacterSet();
298 m_atomData.characterSet.invert();
301 inline Term::Term(GroupTermTag)
302 : m_termType(TermType::Group)
304 new (NotNull, &m_atomData.group) Group();
307 inline Term::Term(EndOfLineAssertionTermTag)
308 : Term(CharacterSetTerm, false)
310 m_atomData.characterSet.set(0);
313 inline Term::Term(const Term& other)
314 : m_termType(other.m_termType)
315 , m_quantifier(other.m_quantifier)
317 switch (m_termType) {
318 case TermType::Empty:
319 case TermType::Deleted:
321 case TermType::CharacterSet:
322 new (NotNull, &m_atomData.characterSet) CharacterSet(other.m_atomData.characterSet);
324 case TermType::Group:
325 new (NotNull, &m_atomData.group) Group(other.m_atomData.group);
330 inline Term::Term(Term&& other)
331 : m_termType(WTF::move(other.m_termType))
332 , m_quantifier(WTF::move(other.m_quantifier))
334 switch (m_termType) {
335 case TermType::Empty:
336 case TermType::Deleted:
338 case TermType::CharacterSet:
339 new (NotNull, &m_atomData.characterSet) CharacterSet(WTF::move(other.m_atomData.characterSet));
341 case TermType::Group:
342 new (NotNull, &m_atomData.group) Group(WTF::move(other.m_atomData.group));
348 inline Term::Term(EmptyValueTag)
349 : m_termType(TermType::Empty)
353 inline Term::Term(DeletedValueTag)
354 : m_termType(TermType::Deleted)
363 inline bool Term::isValid() const
365 return m_termType != TermType::Empty && m_termType != TermType::Deleted;
368 inline void Term::addCharacter(UChar character, bool isCaseSensitive)
370 ASSERT(isASCII(character));
372 ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::CharacterSet);
373 if (m_termType != TermType::CharacterSet)
376 if (isCaseSensitive || !isASCIIAlpha(character))
377 m_atomData.characterSet.set(character);
379 m_atomData.characterSet.set(toASCIIUpper(character));
380 m_atomData.characterSet.set(toASCIILower(character));
384 inline void Term::extendGroupSubpattern(const Term& term)
386 ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::Group);
387 if (m_termType != TermType::Group)
389 m_atomData.group.terms.append(term);
392 inline void Term::quantify(const AtomQuantifier& quantifier)
394 ASSERT_WITH_MESSAGE(m_quantifier == AtomQuantifier::One, "Transition to quantified term should only happen once.");
395 m_quantifier = quantifier;
398 inline unsigned Term::Term::generateGraph(NFA& nfa, unsigned start, const ActionList& finalActions) const
404 switch (m_quantifier) {
405 case AtomQuantifier::One: {
406 newEnd = generateSubgraphForAtom(nfa, start);
409 case AtomQuantifier::ZeroOrOne: {
410 newEnd = generateSubgraphForAtom(nfa, start);
411 nfa.addEpsilonTransition(start, newEnd);
414 case AtomQuantifier::ZeroOrMore: {
415 unsigned repeatStart = nfa.createNode();
416 nfa.addEpsilonTransition(start, repeatStart);
418 unsigned repeatEnd = generateSubgraphForAtom(nfa, repeatStart);
419 nfa.addEpsilonTransition(repeatEnd, repeatStart);
421 unsigned kleenEnd = nfa.createNode();
422 nfa.addEpsilonTransition(repeatEnd, kleenEnd);
423 nfa.addEpsilonTransition(start, kleenEnd);
427 case AtomQuantifier::OneOrMore: {
428 unsigned repeatStart = nfa.createNode();
429 nfa.addEpsilonTransition(start, repeatStart);
431 unsigned repeatEnd = generateSubgraphForAtom(nfa, repeatStart);
432 nfa.addEpsilonTransition(repeatEnd, repeatStart);
434 unsigned afterRepeat = nfa.createNode();
435 nfa.addEpsilonTransition(repeatEnd, afterRepeat);
436 newEnd = afterRepeat;
440 nfa.setActions(newEnd, finalActions);
444 inline bool Term::isEndOfLineAssertion() const
446 return m_termType == TermType::CharacterSet && m_atomData.characterSet.bitCount() == 1 && m_atomData.characterSet.get(0);
449 inline bool Term::matchesAtLeastOneCharacter() const
453 if (m_quantifier == AtomQuantifier::ZeroOrOne || m_quantifier == AtomQuantifier::ZeroOrMore)
455 if (isEndOfLineAssertion())
458 if (m_termType == TermType::Group) {
459 for (const Term& term : m_atomData.group.terms) {
460 if (term.matchesAtLeastOneCharacter())
468 inline bool Term::isKnownToMatchAnyString() const
472 switch (m_termType) {
473 case TermType::Empty:
474 case TermType::Deleted:
475 ASSERT_NOT_REACHED();
477 case TermType::CharacterSet:
478 // ".*" is the only simple term matching any string.
479 return isUniversalTransition() && m_quantifier == AtomQuantifier::ZeroOrMore;
481 case TermType::Group: {
482 // There are infinitely many ways to match anything with groups, we just handle simple cases
483 if (m_atomData.group.terms.size() != 1)
486 const Term& firstTermInGroup = m_atomData.group.terms.first();
487 // -(.*) with any quantifier.
488 if (firstTermInGroup.isKnownToMatchAnyString())
491 if (firstTermInGroup.isUniversalTransition()) {
492 // -(.)*, (.+)*, (.?)* etc.
493 if (m_quantifier == AtomQuantifier::ZeroOrMore)
497 if (m_quantifier == AtomQuantifier::ZeroOrOne && firstTermInGroup.m_quantifier == AtomQuantifier::OneOrMore)
501 if (m_quantifier == AtomQuantifier::OneOrMore && firstTermInGroup.m_quantifier == AtomQuantifier::ZeroOrOne)
510 inline bool Term::hasFixedLength() const
514 switch (m_termType) {
515 case TermType::Empty:
516 case TermType::Deleted:
517 ASSERT_NOT_REACHED();
519 case TermType::CharacterSet:
520 return m_quantifier == AtomQuantifier::One;
521 case TermType::Group: {
522 if (m_quantifier != AtomQuantifier::One)
524 for (const Term& term : m_atomData.group.terms) {
525 if (!term.hasFixedLength())
534 inline Term& Term::operator=(const Term& other)
537 new (NotNull, this) Term(other);
541 inline Term& Term::operator=(Term&& other)
544 new (NotNull, this) Term(WTF::move(other));
548 inline bool Term::operator==(const Term& other) const
550 if (other.m_termType != m_termType || other.m_quantifier != m_quantifier)
553 switch (m_termType) {
554 case TermType::Empty:
555 case TermType::Deleted:
557 case TermType::CharacterSet:
558 return m_atomData.characterSet == other.m_atomData.characterSet;
559 case TermType::Group:
560 return m_atomData.group == other.m_atomData.group;
562 ASSERT_NOT_REACHED();
566 inline unsigned Term::hash() const
568 unsigned primary = static_cast<unsigned>(m_termType) << 16 | static_cast<unsigned>(m_quantifier);
569 unsigned secondary = 0;
570 switch (m_termType) {
571 case TermType::Empty:
572 secondary = 52184393;
574 case TermType::Deleted:
575 secondary = 40342988;
577 case TermType::CharacterSet:
578 secondary = m_atomData.characterSet.hash();
580 case TermType::Group:
581 secondary = m_atomData.group.hash();
584 return WTF::pairIntHash(primary, secondary);
587 inline bool Term::isEmptyValue() const
589 return m_termType == TermType::Empty;
592 inline bool Term::isDeletedValue() const
594 return m_termType == TermType::Deleted;
597 inline bool Term::isUniversalTransition() const
601 switch (m_termType) {
602 case TermType::Empty:
603 case TermType::Deleted:
604 ASSERT_NOT_REACHED();
606 case TermType::CharacterSet:
607 return (m_atomData.characterSet.inverted() && !m_atomData.characterSet.bitCount())
608 || (!m_atomData.characterSet.inverted() && m_atomData.characterSet.bitCount() == 128);
609 case TermType::Group:
610 return m_atomData.group.terms.size() == 1 && m_atomData.group.terms.first().isUniversalTransition();
615 inline unsigned Term::generateSubgraphForAtom(NFA& nfa, unsigned source) const
617 switch (m_termType) {
618 case TermType::Empty:
619 case TermType::Deleted:
620 ASSERT_NOT_REACHED();
622 case TermType::CharacterSet: {
623 unsigned target = nfa.createNode();
624 if (isUniversalTransition())
625 nfa.addTransitionsOnAnyCharacter(source, target);
627 if (!m_atomData.characterSet.inverted()) {
628 for (UChar i = 0; i < 128; ++i) {
629 if (m_atomData.characterSet.get(i))
630 nfa.addTransition(source, target, static_cast<char>(i));
633 for (UChar i = 1; i < 128; ++i) {
634 if (!m_atomData.characterSet.get(i))
635 nfa.addTransition(source, target, static_cast<char>(i));
641 case TermType::Group: {
642 unsigned lastTarget = source;
643 for (const Term& term : m_atomData.group.terms)
644 lastTarget = term.generateGraph(nfa, lastTarget, ActionList());
650 inline void Term::destroy()
652 switch (m_termType) {
653 case TermType::Empty:
654 case TermType::Deleted:
656 case TermType::CharacterSet:
657 m_atomData.characterSet.~CharacterSet();
659 case TermType::Group:
660 m_atomData.group.~Group();
663 m_termType = TermType::Deleted;
666 } // namespace ContentExtensions
667 } // namespace WebCore
669 #endif // ENABLE(CONTENT_EXTENSIONS)