Extend YARR Interpreter with beginning character look-up optimization
authorossy@webkit.org <ossy@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Wed, 17 Nov 2010 11:03:40 +0000 (11:03 +0000)
committerossy@webkit.org <ossy@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Wed, 17 Nov 2010 11:03:40 +0000 (11:03 +0000)
https://bugs.webkit.org/show_bug.cgi?id=45751

Patch by Peter Varga <pvarga@inf.u-szeged.hu> on 2010-11-17
Reviewed by Gavin Barraclough.

Add beginning character look-up optimization which sets the start
index to the first possible successful pattern match.
Extend YARR Interpreter with lookupForBeginChars function which
implements the beginning character look-up optimization.

* yarr/RegexInterpreter.cpp:
(JSC::Yarr::Interpreter::InputStream::readPair):
(JSC::Yarr::Interpreter::InputStream::isNotAvailableInput):
(JSC::Yarr::Interpreter::lookupForBeginChars):
(JSC::Yarr::Interpreter::matchDisjunction):
(JSC::Yarr::Interpreter::interpret):
* yarr/RegexInterpreter.h:
(JSC::Yarr::BytecodePattern::BytecodePattern):

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@72186 268f45cc-cd09-0410-ab3c-d52691b4dbfc

JavaScriptCore/ChangeLog
JavaScriptCore/yarr/RegexInterpreter.cpp
JavaScriptCore/yarr/RegexInterpreter.h

index cb6c07c..1be329a 100644 (file)
@@ -1,3 +1,24 @@
+2010-11-17  Peter Varga  <pvarga@inf.u-szeged.hu>
+
+        Reviewed by Gavin Barraclough.
+
+        Extend YARR Interpreter with beginning character look-up optimization
+        https://bugs.webkit.org/show_bug.cgi?id=45751
+
+        Add beginning character look-up optimization which sets the start
+        index to the first possible successful pattern match.
+        Extend YARR Interpreter with lookupForBeginChars function which
+        implements the beginning character look-up optimization.
+
+        * yarr/RegexInterpreter.cpp:
+        (JSC::Yarr::Interpreter::InputStream::readPair):
+        (JSC::Yarr::Interpreter::InputStream::isNotAvailableInput):
+        (JSC::Yarr::Interpreter::lookupForBeginChars):
+        (JSC::Yarr::Interpreter::matchDisjunction):
+        (JSC::Yarr::Interpreter::interpret):
+        * yarr/RegexInterpreter.h:
+        (JSC::Yarr::BytecodePattern::BytecodePattern):
+
 2010-11-17  Alexis Menard  <alexis.menard@nokia.com>, Simon Hausmann  <simon.hausmann@nokia.com>
 
         Reviewed by Kenneth Christiansen, Tor Arne Vestbø.
index 80440d9..dc3024a 100644 (file)
@@ -196,6 +196,12 @@ public:
             return -1;
         }
 
+        int readPair()
+        {
+            ASSERT(pos + 1 < length);
+            return input[pos] | input[pos + 1] << 16;
+        }
+
         int readChecked(int position)
         {
             ASSERT(position < 0);
@@ -263,6 +269,11 @@ public:
             return (pos + position) == length;
         }
 
+        bool isNotAvailableInput(int position)
+        {
+            return (pos + position) > length;
+        }
+
     private:
         const UChar* input;
         unsigned pos;
@@ -993,10 +1004,39 @@ public:
         return JSRegExpErrorNoMatch;
     }
 
+    void lookupForBeginChars()
+    {
+        int character;
+        bool firstSingleCharFound;
+
+        while (true) {
+            if (input.isNotAvailableInput(2))
+                return;
+
+            firstSingleCharFound = false;
+
+            character = input.readPair();
+
+            for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) {
+                BeginChar bc = pattern->m_beginChars[i];
+
+                if (!firstSingleCharFound && bc.value <= 0xFFFF) {
+                    firstSingleCharFound = true;
+                    character &= 0xFFFF;
+                }
+
+                if ((character | bc.mask) == bc.value)
+                    return;
+            }
+
+            input.next();
+        }
+    }
+
 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
 #define BACKTRACK() { --context->term; goto backtrack; }
 #define currentTerm() (disjunction->terms[context->term])
-    JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+    JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false)
     {
         if (!--remainingMatchCount)
             return JSRegExpErrorHitLimit;
@@ -1004,6 +1044,9 @@ public:
         if (btrack)
             BACKTRACK();
 
+        if (pattern->m_containsBeginChars && isBody)
+            lookupForBeginChars();
+
         context->matchBegin = input.getPos();
         context->term = 0;
 
@@ -1168,6 +1211,10 @@ public:
                 return JSRegExpNoMatch;
 
             input.next();
+
+            if (pattern->m_containsBeginChars && isBody)
+                lookupForBeginChars();
+
             context->matchBegin = input.getPos();
 
             if (currentTerm().alternative.onceThrough)
@@ -1284,7 +1331,7 @@ public:
 
         DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
 
-        JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context);
+        JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true);
         if (result == JSRegExpMatch) {
             output[0] = context->matchBegin;
             output[1] = context->matchEnd;
index fe775a4..dae8f9d 100644 (file)
@@ -324,6 +324,7 @@ struct BytecodePattern : FastAllocBase {
         : m_body(body)
         , m_ignoreCase(pattern.m_ignoreCase)
         , m_multiline(pattern.m_multiline)
+        , m_containsBeginChars(pattern.m_containsBeginChars)
         , m_allocator(allocator)
     {
         newlineCharacterClass = pattern.newlineCharacterClass();
@@ -335,6 +336,8 @@ struct BytecodePattern : FastAllocBase {
         // array, so that it won't delete them on destruction.  We'll
         // take responsibility for that.
         pattern.m_userCharacterClasses.clear();
+
+        m_beginChars.append(pattern.m_beginChars);
     }
 
     ~BytecodePattern()
@@ -346,12 +349,16 @@ struct BytecodePattern : FastAllocBase {
     OwnPtr<ByteDisjunction> m_body;
     bool m_ignoreCase;
     bool m_multiline;
+    bool m_containsBeginChars;
     // Each BytecodePattern is associated with a RegExp, each RegExp is associated
     // with a JSGlobalData.  Cache a pointer to out JSGlobalData's m_regexAllocator.
     BumpPointerAllocator* m_allocator;
 
     CharacterClass* newlineCharacterClass;
     CharacterClass* wordcharCharacterClass;
+
+    Vector<BeginChar> m_beginChars;
+
 private:
     Vector<ByteDisjunction*> m_allParenthesesInfo;
     Vector<CharacterClass*> m_userCharacterClasses;