Bug 39795 - Add support for YARR JIT generation of greedy quantified parens at the...
authorbarraclough@apple.com <barraclough@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 27 May 2010 03:44:28 +0000 (03:44 +0000)
committerbarraclough@apple.com <barraclough@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 27 May 2010 03:44:28 +0000 (03:44 +0000)
Reviewed by Oliver Hunt.

If the last item in a main disjunction is a quantified set of parentheses,
this is easier to code generate for than the general case for quantified
parentheses. This is because we never need to backtrack into the parentheses
- the first match will be the final and accepted match.

This patch also somewhat reverts a recent change to when fallback to PCRE
occurs. At the minute the compiler is tracking on patterns which will
require JIT fallback. This is handy from a performance perspective (it saves
the failed attempt at JIT compilation), but it means introducing knowledge
of the JITs capabilities into the other layers of the regex compilers. For
the specific feature of back-references, add a flag tracking their presence
on the pattern, and make these expressions fallback without attempting to
JIT. For parentheses, return to detecting which cases are have or have not
been handled during JIT compilation.

18% progression on tagcloud, ~1.5% overall on sunspidey.

* yarr/RegexCompiler.cpp:
(JSC::Yarr::RegexPatternConstructor::atomBackReference):
(JSC::Yarr::RegexPatternConstructor::quantifyAtom):
* yarr/RegexJIT.cpp:
(JSC::Yarr::RegexGenerator::TermGenerationState::isLastTerm):
(JSC::Yarr::RegexGenerator::TermGenerationState::isMainDisjunction):
(JSC::Yarr::RegexGenerator::generateParenthesesGreedyNoBacktrack):
(JSC::Yarr::RegexGenerator::generateTerm):
(JSC::Yarr::RegexGenerator::RegexGenerator):
(JSC::Yarr::RegexGenerator::shouldFallBack):
(JSC::Yarr::jitCompileRegex):
* yarr/RegexPattern.h:
(JSC::Yarr::RegexPattern::RegexPattern):
(JSC::Yarr::RegexPattern::reset):

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@60267 268f45cc-cd09-0410-ab3c-d52691b4dbfc

JavaScriptCore/ChangeLog
JavaScriptCore/yarr/RegexCompiler.cpp
JavaScriptCore/yarr/RegexJIT.cpp
JavaScriptCore/yarr/RegexPattern.h

index 33c1418..d689596 100644 (file)
@@ -1,3 +1,41 @@
+2010-05-26  Gavin Barraclough  <barraclough@apple.com>
+
+        Reviewed by Oliver Hunt.
+
+        Bug 39795 - Add support for YARR JIT generation of greedy quantified parens at the end of the main disjunction.
+
+        If the last item in a main disjunction is a quantified set of parentheses,
+        this is easier to code generate for than the general case for quantified
+        parentheses. This is because we never need to backtrack into the parentheses
+        - the first match will be the final and accepted match.
+
+        This patch also somewhat reverts a recent change to when fallback to PCRE
+        occurs. At the minute the compiler is tracking on patterns which will
+        require JIT fallback. This is handy from a performance perspective (it saves
+        the failed attempt at JIT compilation), but it means introducing knowledge
+        of the JITs capabilities into the other layers of the regex compilers. For
+        the specific feature of back-references, add a flag tracking their presence
+        on the pattern, and make these expressions fallback without attempting to
+        JIT. For parentheses, return to detecting which cases are have or have not
+        been handled during JIT compilation.
+
+        18% progression on tagcloud, ~1.5% overall on sunspidey.
+
+        * yarr/RegexCompiler.cpp:
+        (JSC::Yarr::RegexPatternConstructor::atomBackReference):
+        (JSC::Yarr::RegexPatternConstructor::quantifyAtom):
+        * yarr/RegexJIT.cpp:
+        (JSC::Yarr::RegexGenerator::TermGenerationState::isLastTerm):
+        (JSC::Yarr::RegexGenerator::TermGenerationState::isMainDisjunction):
+        (JSC::Yarr::RegexGenerator::generateParenthesesGreedyNoBacktrack):
+        (JSC::Yarr::RegexGenerator::generateTerm):
+        (JSC::Yarr::RegexGenerator::RegexGenerator):
+        (JSC::Yarr::RegexGenerator::shouldFallBack):
+        (JSC::Yarr::jitCompileRegex):
+        * yarr/RegexPattern.h:
+        (JSC::Yarr::RegexPattern::RegexPattern):
+        (JSC::Yarr::RegexPattern::reset):
+
 2010-05-26  Geoffrey Garen  <ggaren@apple.com>
 
         Reviewed by Sam Weinig.
 2010-05-26  Geoffrey Garen  <ggaren@apple.com>
 
         Reviewed by Sam Weinig.
index 9fbe213..bcfc188 100644 (file)
@@ -372,7 +372,7 @@ public:
     void atomBackReference(unsigned subpatternId)
     {
         ASSERT(subpatternId);
     void atomBackReference(unsigned subpatternId)
     {
         ASSERT(subpatternId);
-        m_pattern.m_shouldFallBack = true;
+        m_pattern.m_containsBackreferences = true;
         m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
 
         if (subpatternId > m_pattern.m_numSubpatterns) {
         m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
 
         if (subpatternId > m_pattern.m_numSubpatterns) {
@@ -448,9 +448,6 @@ public:
             return;
         }
 
             return;
         }
 
-        if (max > 1 && term.type == PatternTerm::TypeParenthesesSubpattern)
-            m_pattern.m_shouldFallBack = true;
-
         if (min == 0)
             term.quantify(max, greedy   ? QuantifierGreedy : QuantifierNonGreedy);
         else if (min == max)
         if (min == 0)
             term.quantify(max, greedy   ? QuantifierGreedy : QuantifierNonGreedy);
         else if (min == max)
index e33dba0..b277a9c 100644 (file)
@@ -345,6 +345,15 @@ class RegexGenerator : private MacroAssembler {
             ASSERT(alternativeValid());
             return alternative()->m_terms[t];
         }
             ASSERT(alternativeValid());
             return alternative()->m_terms[t];
         }
+        bool isLastTerm()
+        {
+            ASSERT(alternativeValid());
+            return (t + 1) == alternative()->m_terms.size();
+        }
+        bool isMainDisjunction()
+        {
+            return !disjunction->m_parent;
+        }
 
         PatternTerm& lookaheadTerm()
         {
 
         PatternTerm& lookaheadTerm()
         {
@@ -902,6 +911,11 @@ class RegexGenerator : private MacroAssembler {
         PatternDisjunction* disjunction = term.parentheses.disjunction;
         ASSERT(term.quantityCount == 1);
 
         PatternDisjunction* disjunction = term.parentheses.disjunction;
         ASSERT(term.quantityCount == 1);
 
+        if (!term.parentheses.isCopy) {
+            m_shouldFallBack = true;
+            return;
+        }
+
         unsigned preCheckedCount = ((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)) ? disjunction->m_minimumSize : 0;
 
         unsigned parenthesesFrameLocation = term.frameLocation;
         unsigned preCheckedCount = ((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)) ? disjunction->m_minimumSize : 0;
 
         unsigned parenthesesFrameLocation = term.frameLocation;
@@ -989,6 +1003,58 @@ class RegexGenerator : private MacroAssembler {
         }
     }
 
         }
     }
 
+    void generateParenthesesGreedyNoBacktrack(TermGenerationState& state)
+    {
+        PatternTerm& parenthesesTerm = state.term();
+        PatternDisjunction* disjunction = parenthesesTerm.parentheses.disjunction;
+        ASSERT(parenthesesTerm.type == PatternTerm::TypeParenthesesSubpattern);
+        ASSERT(parenthesesTerm.quantityCount != 1); // Handled by generateParenthesesSingle.
+
+        // Capturing not yet implemented!
+        if (parenthesesTerm.invertOrCapture) {
+            m_shouldFallBack = true;
+            return;
+        }
+
+        // Quantification limit not yet implemented!
+        if (parenthesesTerm.quantityCount != 0xffffffff) {
+            m_shouldFallBack = true;
+            return;
+        }
+
+        TermGenerationState parenthesesState(disjunction, state.checkedTotal);
+
+        Label matchAgain(this);
+        for (parenthesesState.resetAlternative(); parenthesesState.alternativeValid(); parenthesesState.nextAlternative()) {
+
+            PatternAlternative* alternative = parenthesesState.alternative();
+            optimizeAlternative(alternative);
+
+            int countToCheck = alternative->m_minimumSize;
+            if (countToCheck) {
+                parenthesesState.addBacktrackJump(jumpIfNoAvailableInput(countToCheck));
+                parenthesesState.checkedTotal += countToCheck;
+            }
+
+            for (parenthesesState.resetTerm(); parenthesesState.termValid(); parenthesesState.nextTerm())
+                generateTerm(parenthesesState);
+
+            // If we get here, we matched! Limit not yet supported, so just try to match more!
+            jump(matchAgain);
+            
+            parenthesesState.linkAlternativeBacktracks(this);
+            // We get here if the alternative fails to match - fall through to the next iteration, or out of the loop.
+
+            if (countToCheck) {
+                sub32(Imm32(countToCheck), index);
+                parenthesesState.checkedTotal -= countToCheck;
+            }
+        }
+
+        // If the last alternative falls through to here, we have a failed match...
+        // Which means that we match whatever we have matched up to this point (even if nothing).
+    }
+
     void generateParentheticalAssertion(TermGenerationState& state)
     {
         PatternTerm& term = state.term();
     void generateParentheticalAssertion(TermGenerationState& state)
     {
         PatternTerm& term = state.term();
@@ -1100,15 +1166,24 @@ class RegexGenerator : private MacroAssembler {
             break;
 
         case PatternTerm::TypeBackReference:
             break;
 
         case PatternTerm::TypeBackReference:
-            ASSERT_NOT_REACHED();
+            m_shouldFallBack = true;
             break;
 
         case PatternTerm::TypeForwardReference:
             break;
 
         case PatternTerm::TypeParenthesesSubpattern:
             break;
 
         case PatternTerm::TypeForwardReference:
             break;
 
         case PatternTerm::TypeParenthesesSubpattern:
-            ASSERT((term.quantityCount == 1) && !term.parentheses.isCopy); // must fallback to pcre before this point
-            generateParenthesesSingle(state);
+            if (term.quantityCount == 1) {
+                generateParenthesesSingle(state);
+                break;
+            } else if (state.isLastTerm() && state.isMainDisjunction()) { // Is this is the last term of the main disjunction?
+                // If this has a greedy quantifier, then it will never need to backtrack!
+                if (term.quantityType == QuantifierGreedy) {
+                    generateParenthesesGreedyNoBacktrack(state);
+                    break;
+                }
+            }
+            m_shouldFallBack = true;
             break;
 
         case PatternTerm::TypeParentheticalAssertion:
             break;
 
         case PatternTerm::TypeParentheticalAssertion:
@@ -1361,6 +1436,7 @@ class RegexGenerator : private MacroAssembler {
 public:
     RegexGenerator(RegexPattern& pattern)
         : m_pattern(pattern)
 public:
     RegexGenerator(RegexPattern& pattern)
         : m_pattern(pattern)
+        , m_shouldFallBack(false)
     {
     }
 
     {
     }
 
@@ -1390,28 +1466,34 @@ public:
         jitObject.set(patchBuffer.finalizeCode());
     }
 
         jitObject.set(patchBuffer.finalizeCode());
     }
 
+    bool shouldFallBack()
+    {
+        return m_shouldFallBack;
+    }
+
 private:
     RegexPattern& m_pattern;
 private:
     RegexPattern& m_pattern;
+    bool m_shouldFallBack;
     Vector<AlternativeBacktrackRecord> m_backtrackRecords;
 };
 
 void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline)
 {
     RegexPattern pattern(ignoreCase, multiline);
     Vector<AlternativeBacktrackRecord> m_backtrackRecords;
 };
 
 void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline)
 {
     RegexPattern pattern(ignoreCase, multiline);
-
     if ((error = compileRegex(patternString, pattern)))
         return;
     if ((error = compileRegex(patternString, pattern)))
         return;
-
     numSubpatterns = pattern.m_numSubpatterns;
 
     numSubpatterns = pattern.m_numSubpatterns;
 
-    if (pattern.m_shouldFallBack) {
-        JSRegExpIgnoreCaseOption ignoreCaseOption = ignoreCase ? JSRegExpIgnoreCase : JSRegExpDoNotIgnoreCase;
-        JSRegExpMultilineOption multilineOption = multiline ? JSRegExpMultiline : JSRegExpSingleLine;
-        jitObject.setFallback(jsRegExpCompile(reinterpret_cast<const UChar*>(patternString.data()), patternString.size(), ignoreCaseOption, multilineOption, &numSubpatterns, &error));
-    } else {
+    if (!pattern.m_containsBackreferences) {
         RegexGenerator generator(pattern);
         generator.compile(globalData, jitObject);
         RegexGenerator generator(pattern);
         generator.compile(globalData, jitObject);
+        if (!generator.shouldFallBack())
+            return;
     }
     }
+
+    JSRegExpIgnoreCaseOption ignoreCaseOption = ignoreCase ? JSRegExpIgnoreCase : JSRegExpDoNotIgnoreCase;
+    JSRegExpMultilineOption multilineOption = multiline ? JSRegExpMultiline : JSRegExpSingleLine;
+    jitObject.setFallback(jsRegExpCompile(reinterpret_cast<const UChar*>(patternString.data()), patternString.size(), ignoreCaseOption, multilineOption, &numSubpatterns, &error));
 }
 
 }}
 }
 
 }}
index 3271cc1..61d6ad6 100644 (file)
@@ -271,7 +271,7 @@ struct RegexPattern {
         , m_multiline(multiline)
         , m_numSubpatterns(0)
         , m_maxBackReference(0)
         , m_multiline(multiline)
         , m_numSubpatterns(0)
         , m_maxBackReference(0)
-        , m_shouldFallBack(false)
+        , m_containsBackreferences(false)
         , newlineCached(0)
         , digitsCached(0)
         , spacesCached(0)
         , newlineCached(0)
         , digitsCached(0)
         , spacesCached(0)
@@ -293,7 +293,7 @@ struct RegexPattern {
         m_numSubpatterns = 0;
         m_maxBackReference = 0;
 
         m_numSubpatterns = 0;
         m_maxBackReference = 0;
 
-        m_shouldFallBack = false;
+        m_containsBackreferences = false;
 
         newlineCached = 0;
         digitsCached = 0;
 
         newlineCached = 0;
         digitsCached = 0;
@@ -361,7 +361,7 @@ struct RegexPattern {
     bool m_multiline;
     unsigned m_numSubpatterns;
     unsigned m_maxBackReference;
     bool m_multiline;
     unsigned m_numSubpatterns;
     unsigned m_maxBackReference;
-    bool m_shouldFallBack;
+    bool m_containsBackreferences;
     PatternDisjunction* m_body;
     Vector<PatternDisjunction*, 4> m_disjunctions;
     Vector<CharacterClass*> m_userCharacterClasses;
     PatternDisjunction* m_body;
     Vector<PatternDisjunction*, 4> m_disjunctions;
     Vector<CharacterClass*> m_userCharacterClasses;