2010-11-17 Peter Varga <pvarga@inf.u-szeged.hu>
authorcommit-queue@webkit.org <commit-queue@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Wed, 17 Nov 2010 09:42:41 +0000 (09:42 +0000)
committercommit-queue@webkit.org <commit-queue@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Wed, 17 Nov 2010 09:42:41 +0000 (09:42 +0000)
        Reviewed by Gavin Barraclough.

        Collect the beginning characters in a RegExp pattern for look-up
        optimization
        https://bugs.webkit.org/show_bug.cgi?id=45748

        Extend the YARR's parser with an algorithm which collects the potential
        beginning characters from a RegExp pattern for later look-up optimization.

        * yarr/RegexCompiler.cpp:
        (JSC::Yarr::BeginCharHelper::BeginCharHelper):
        (JSC::Yarr::BeginCharHelper::addBeginChar):
        (JSC::Yarr::BeginCharHelper::merge):
        (JSC::Yarr::BeginCharHelper::addCharacter):
        (JSC::Yarr::BeginCharHelper::linkHotTerms):
        (JSC::Yarr::RegexPatternConstructor::RegexPatternConstructor):
        (JSC::Yarr::RegexPatternConstructor::addBeginTerm):
        (JSC::Yarr::RegexPatternConstructor::setupDisjunctionBeginTerms):
        (JSC::Yarr::RegexPatternConstructor::setupAlternativeBeginTerms):
        (JSC::Yarr::RegexPatternConstructor::setupBeginChars):
        (JSC::Yarr::compileRegex):
        * yarr/RegexPattern.h:
        (JSC::Yarr::TermChain::TermChain):
        (JSC::Yarr::BeginChar::BeginChar):
        (JSC::Yarr::RegexPattern::RegexPattern):
        (JSC::Yarr::RegexPattern::reset):

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@72180 268f45cc-cd09-0410-ab3c-d52691b4dbfc

JavaScriptCore/ChangeLog
JavaScriptCore/yarr/RegexCompiler.cpp
JavaScriptCore/yarr/RegexPattern.h

index 6737862..512ae90 100644 (file)
@@ -1,3 +1,32 @@
+2010-11-17  Peter Varga  <pvarga@inf.u-szeged.hu>
+
+        Reviewed by Gavin Barraclough.
+
+        Collect the beginning characters in a RegExp pattern for look-up
+        optimization
+        https://bugs.webkit.org/show_bug.cgi?id=45748
+
+        Extend the YARR's parser with an algorithm which collects the potential
+        beginning characters from a RegExp pattern for later look-up optimization.
+
+        * yarr/RegexCompiler.cpp:
+        (JSC::Yarr::BeginCharHelper::BeginCharHelper):
+        (JSC::Yarr::BeginCharHelper::addBeginChar):
+        (JSC::Yarr::BeginCharHelper::merge):
+        (JSC::Yarr::BeginCharHelper::addCharacter):
+        (JSC::Yarr::BeginCharHelper::linkHotTerms):
+        (JSC::Yarr::RegexPatternConstructor::RegexPatternConstructor):
+        (JSC::Yarr::RegexPatternConstructor::addBeginTerm):
+        (JSC::Yarr::RegexPatternConstructor::setupDisjunctionBeginTerms):
+        (JSC::Yarr::RegexPatternConstructor::setupAlternativeBeginTerms):
+        (JSC::Yarr::RegexPatternConstructor::setupBeginChars):
+        (JSC::Yarr::compileRegex):
+        * yarr/RegexPattern.h:
+        (JSC::Yarr::TermChain::TermChain):
+        (JSC::Yarr::BeginChar::BeginChar):
+        (JSC::Yarr::RegexPattern::RegexPattern):
+        (JSC::Yarr::RegexPattern::reset):
+
 2010-11-17  Sheriff Bot  <webkit.review.bot@gmail.com>
 
         Unreviewed, rolling out r72160.
 2010-11-17  Sheriff Bot  <webkit.review.bot@gmail.com>
 
         Unreviewed, rolling out r72160.
index 9f9e028..ccfc94e 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2009 Apple Inc. All rights reserved.
 /*
  * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -235,11 +236,117 @@ private:
     Vector<CharacterRange> m_rangesUnicode;
 };
 
     Vector<CharacterRange> m_rangesUnicode;
 };
 
+struct BeginCharHelper {
+    BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false)
+        : m_beginChars(beginChars)
+        , m_isCaseInsensitive(isCaseInsensitive)
+    {}
+
+    void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount)
+    {
+        if (quantityType == QuantifierFixedCount && quantityCount > 1) {
+            // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/
+            beginChar.value |= beginChar.value << 16;
+            beginChar.mask |= beginChar.mask << 16;
+            addCharacter(beginChar);
+        } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size())
+            // In case of characters with fixed quantifier we should check the next character as well.
+            linkHotTerms(beginChar, hotTerms);
+        else
+            // In case of greedy matching the next character checking is unnecessary therefore we just store
+            // the first character.
+            addCharacter(beginChar);
+    }
+
+    // Merge two following BeginChars in the vector to reduce the number of character checks.
+    void merge(unsigned size)
+    {
+        for (unsigned i = 0; i < size; i++) {
+            BeginChar* curr = &m_beginChars->at(i);
+            BeginChar* next = &m_beginChars->at(i + 1);
+
+            // If the current and the next size of value is different we should skip the merge process
+            // because the 16bit and 32bit values are unmergable.
+            if (curr->value <= 0xFFFF && next->value > 0xFFFF)
+                continue;
+
+            unsigned diff = curr->value ^ next->value;
+
+            curr->mask |= diff;
+            curr->value |= curr->mask;
+
+            m_beginChars->remove(i + 1);
+            size--;
+        }
+    }
+
+private:
+    void addCharacter(BeginChar beginChar)
+    {
+        unsigned pos = 0;
+        unsigned range = m_beginChars->size();
+
+        // binary chop, find position to insert char.
+        while (range) {
+            unsigned index = range >> 1;
+
+            int val = m_beginChars->at(pos+index).value - beginChar.value;
+            if (!val)
+                return;
+            if (val < 0)
+                range = index;
+            else {
+                pos += (index+1);
+                range -= (index+1);
+            }
+        }
+
+        if (pos == m_beginChars->size())
+            m_beginChars->append(beginChar);
+        else
+            m_beginChars->insert(pos, beginChar);
+    }
+
+    // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object.
+    void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms)
+    {
+        for (unsigned i = 0; i < hotTerms->size(); i++) {
+            PatternTerm hotTerm = hotTerms->at(i).term;
+            ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter);
+
+            UChar characterNext = hotTerm.patternCharacter;
+
+            // Append a character to an existing BeginChar object.
+            if (characterNext <= 0x7f) {
+                unsigned mask = 0;
+
+                if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) {
+                    mask = 32;
+                    characterNext = toASCIILower(characterNext);
+                }
+
+                addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16)));
+            } else {
+                UChar upper, lower;
+                if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) {
+                    addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask));
+                    addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask));
+                } else
+                    addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask));
+            }
+        }
+    }
+
+    Vector<BeginChar>* m_beginChars;
+    bool m_isCaseInsensitive;
+};
+
 class RegexPatternConstructor {
 public:
     RegexPatternConstructor(RegexPattern& pattern)
         : m_pattern(pattern)
         , m_characterClassConstructor(pattern.m_ignoreCase)
 class RegexPatternConstructor {
 public:
     RegexPatternConstructor(RegexPattern& pattern)
         : m_pattern(pattern)
         , m_characterClassConstructor(pattern.m_ignoreCase)
+        , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase)
         , m_invertParentheticalAssertion(false)
     {
     }
         , m_invertParentheticalAssertion(false)
     {
     }
@@ -649,12 +756,153 @@ public:
             loopDisjunction->m_alternatives.clear();
         }
     }
             loopDisjunction->m_alternatives.clear();
         }
     }
-    
-    
+
+    bool addBeginTerm(PatternTerm term, Vector<TermChain>* beginTerms, PatternAlternative* alternative, unsigned numTerms, unsigned termIndex, unsigned depth)
+    {
+        if (term.quantityType == QuantifierFixedCount) {
+            beginTerms->append(TermChain(term));
+            if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1)
+                setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1);
+        } else if (termIndex != numTerms - 1) {
+            beginTerms->append(TermChain(term));
+            return true;
+        }
+
+        return false;
+    }
+
+    // This function collects the terms which are potentially matching the first number of depth characters in the result.
+    // If this function returns false then it found at least one term which makes the beginning character
+    // look-up optimization inefficient.
+    bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth)
+    {
+        for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
+            PatternAlternative* alternative = disjunction->m_alternatives[alt];
+
+            if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth))
+                return false;
+        }
+
+        return true;
+    }
+
+    bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth)
+    {
+        bool checkNext = true;
+        unsigned numTerms = alternative->m_terms.size();
+
+        while (checkNext && termIndex < numTerms) {
+            PatternTerm term = alternative->m_terms[termIndex];
+            checkNext = false;
+
+            switch (term.type) {
+            case PatternTerm::TypeAssertionBOL:
+            case PatternTerm::TypeAssertionEOL:
+            case PatternTerm::TypeAssertionWordBoundary:
+                return false;
+
+            case PatternTerm::TypeBackReference:
+            case PatternTerm::TypeForwardReference:
+                return false;
+
+            case PatternTerm::TypePatternCharacter:
+                if (addBeginTerm(term, beginTerms, alternative, numTerms, termIndex, depth)) {
+                    termIndex++;
+                    checkNext = true;
+                }
+                break;
+
+            case PatternTerm::TypeCharacterClass:
+                return false;
+
+            case PatternTerm::TypeParentheticalAssertion:
+                if (term.invertOrCapture)
+                    return false;
+
+            case PatternTerm::TypeParenthesesSubpattern:
+                if (term.quantityType != QuantifierFixedCount) {
+                    if (termIndex == numTerms - 1)
+                        break;
+
+                    termIndex++;
+                    checkNext = true;
+
+                }
+
+                if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth))
+                    return false;
+
+                break;
+            }
+        }
+
+        return true;
+    }
+
+    void setupBeginChars()
+    {
+        Vector<TermChain> beginTerms;
+        bool containsFixedCharacter = false;
+
+        if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1)
+                && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) {
+            unsigned size = beginTerms.size();
+
+            // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization.
+            if (!size)
+                return;
+
+            m_pattern.m_containsBeginChars = true;
+
+            for (unsigned i = 0; i < size; i++) {
+                PatternTerm term = beginTerms[i].term;
+
+                // We have just collected PatternCharacter terms, other terms are not allowed.
+                ASSERT(term.type == PatternTerm::TypePatternCharacter);
+
+                if (term.quantityType == QuantifierFixedCount)
+                    containsFixedCharacter = true;
+
+                UChar character = term.patternCharacter;
+                unsigned mask = 0;
+
+                if (character <= 0x7f) {
+                    if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) {
+                        mask = 32;
+                        character = toASCIILower(character);
+                    }
+
+                    m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+                } else {
+                    UChar upper, lower;
+                    if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) {
+                        m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+                        m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+                    } else
+                        m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+                }
+            }
+
+            // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient.
+            if (!containsFixedCharacter) {
+                m_pattern.m_containsBeginChars = false;
+                return;
+            }
+
+            size = m_pattern.m_beginChars.size();
+
+            if (size > 2)
+                m_beginCharHelper.merge(size - 1);
+            else if (size <= 1)
+                m_pattern.m_containsBeginChars = false;
+        }
+    }
+
 private:
     RegexPattern& m_pattern;
     PatternAlternative* m_alternative;
     CharacterClassConstructor m_characterClassConstructor;
 private:
     RegexPattern& m_pattern;
     PatternAlternative* m_alternative;
     CharacterClassConstructor m_characterClassConstructor;
+    BeginCharHelper m_beginCharHelper;
     bool m_invertCharacterClass;
     bool m_invertParentheticalAssertion;
 };
     bool m_invertCharacterClass;
     bool m_invertParentheticalAssertion;
 };
@@ -687,6 +935,7 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern)
     constructor.optimizeBOL();
         
     constructor.setupOffsets();
     constructor.optimizeBOL();
         
     constructor.setupOffsets();
+    constructor.setupBeginChars();
 
     return 0;
 };
 
     return 0;
 };
index eecbd43..be31fcd 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2009 Apple Inc. All rights reserved.
 /*
  * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -283,11 +284,36 @@ CharacterClass* nondigitsCreate();
 CharacterClass* nonspacesCreate();
 CharacterClass* nonwordcharCreate();
 
 CharacterClass* nonspacesCreate();
 CharacterClass* nonwordcharCreate();
 
+struct TermChain {
+    TermChain(PatternTerm term)
+        : term(term)
+    {}
+
+    PatternTerm term;
+    Vector<TermChain> hotTerms;
+};
+
+struct BeginChar {
+    BeginChar()
+        : value(0)
+        , mask(0)
+    {}
+
+    BeginChar(unsigned value, unsigned mask)
+        : value(value)
+        , mask(mask)
+    {}
+
+    unsigned value;
+    unsigned mask;
+};
+
 struct RegexPattern {
     RegexPattern(bool ignoreCase, bool multiline)
         : m_ignoreCase(ignoreCase)
         , m_multiline(multiline)
         , m_containsBackreferences(false)
 struct RegexPattern {
     RegexPattern(bool ignoreCase, bool multiline)
         : m_ignoreCase(ignoreCase)
         , m_multiline(multiline)
         , m_containsBackreferences(false)
+        , m_containsBeginChars(false)
         , m_containsBOL(false)
         , m_numSubpatterns(0)
         , m_maxBackReference(0)
         , m_containsBOL(false)
         , m_numSubpatterns(0)
         , m_maxBackReference(0)
@@ -313,6 +339,7 @@ struct RegexPattern {
         m_maxBackReference = 0;
 
         m_containsBackreferences = false;
         m_maxBackReference = 0;
 
         m_containsBackreferences = false;
+        m_containsBeginChars = false;
         m_containsBOL = false;
 
         newlineCached = 0;
         m_containsBOL = false;
 
         newlineCached = 0;
@@ -327,6 +354,7 @@ struct RegexPattern {
         m_disjunctions.clear();
         deleteAllValues(m_userCharacterClasses);
         m_userCharacterClasses.clear();
         m_disjunctions.clear();
         deleteAllValues(m_userCharacterClasses);
         m_userCharacterClasses.clear();
+        m_beginChars.clear();
     }
 
     bool containsIllegalBackReference()
     }
 
     bool containsIllegalBackReference()
@@ -380,12 +408,14 @@ struct RegexPattern {
     bool m_ignoreCase : 1;
     bool m_multiline : 1;
     bool m_containsBackreferences : 1;
     bool m_ignoreCase : 1;
     bool m_multiline : 1;
     bool m_containsBackreferences : 1;
+    bool m_containsBeginChars : 1;
     bool m_containsBOL : 1;
     unsigned m_numSubpatterns;
     unsigned m_maxBackReference;
     PatternDisjunction* m_body;
     Vector<PatternDisjunction*, 4> m_disjunctions;
     Vector<CharacterClass*> m_userCharacterClasses;
     bool m_containsBOL : 1;
     unsigned m_numSubpatterns;
     unsigned m_maxBackReference;
     PatternDisjunction* m_body;
     Vector<PatternDisjunction*, 4> m_disjunctions;
     Vector<CharacterClass*> m_userCharacterClasses;
+    Vector<BeginChar> m_beginChars;
 
 private:
     CharacterClass* newlineCached;
 
 private:
     CharacterClass* newlineCached;