Add case-insensitive checks to DFA bytecode.
authorachristensen@apple.com <achristensen@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Wed, 25 Mar 2015 23:05:36 +0000 (23:05 +0000)
committerachristensen@apple.com <achristensen@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Wed, 25 Mar 2015 23:05:36 +0000 (23:05 +0000)
https://bugs.webkit.org/show_bug.cgi?id=142977

Reviewed by Benjamin Poulain.

* contentextensions/DFABytecode.h:
(WebCore::ContentExtensions::instructionSizeWithArguments):
* contentextensions/DFABytecodeCompiler.cpp:
(WebCore::ContentExtensions::DFABytecodeCompiler::emitCheckValue):
(WebCore::ContentExtensions::DFABytecodeCompiler::emitCheckValueRange):
Add case-insensitive bytecode.
(WebCore::ContentExtensions::DFABytecodeCompiler::compileNodeTransitions):
Check to see if case-insensitive bytecodes can be used.
(WebCore::ContentExtensions::DFABytecodeCompiler::compileCheckForRange):
* contentextensions/DFABytecodeCompiler.h:
(WebCore::ContentExtensions::DFABytecodeCompiler::Range::Range):
Added Range structure to be able to count the ranges in a future patch deciding if we want to use jump tables.
* contentextensions/DFABytecodeInterpreter.cpp:
(WebCore::ContentExtensions::DFABytecodeInterpreter::interpret):
Interpret case-insensitive bytecodes.

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@181980 268f45cc-cd09-0410-ab3c-d52691b4dbfc

Source/WebCore/ChangeLog
Source/WebCore/contentextensions/DFABytecode.h
Source/WebCore/contentextensions/DFABytecodeCompiler.cpp
Source/WebCore/contentextensions/DFABytecodeCompiler.h
Source/WebCore/contentextensions/DFABytecodeInterpreter.cpp

index 7f56dc9..e5967be 100644 (file)
@@ -1,3 +1,26 @@
+2015-03-25  Alex Christensen  <achristensen@webkit.org>
+
+        Add case-insensitive checks to DFA bytecode.
+        https://bugs.webkit.org/show_bug.cgi?id=142977
+
+        Reviewed by Benjamin Poulain.
+
+        * contentextensions/DFABytecode.h:
+        (WebCore::ContentExtensions::instructionSizeWithArguments):
+        * contentextensions/DFABytecodeCompiler.cpp:
+        (WebCore::ContentExtensions::DFABytecodeCompiler::emitCheckValue):
+        (WebCore::ContentExtensions::DFABytecodeCompiler::emitCheckValueRange):
+        Add case-insensitive bytecode.
+        (WebCore::ContentExtensions::DFABytecodeCompiler::compileNodeTransitions):
+        Check to see if case-insensitive bytecodes can be used.
+        (WebCore::ContentExtensions::DFABytecodeCompiler::compileCheckForRange):
+        * contentextensions/DFABytecodeCompiler.h:
+        (WebCore::ContentExtensions::DFABytecodeCompiler::Range::Range):
+        Added Range structure to be able to count the ranges in a future patch deciding if we want to use jump tables.
+        * contentextensions/DFABytecodeInterpreter.cpp:
+        (WebCore::ContentExtensions::DFABytecodeInterpreter::interpret):
+        Interpret case-insensitive bytecodes.
+
 2015-03-25  Sam Weinig  <sam@webkit.org>
 
         Address additional review feedback from https://bugs.webkit.org/show_bug.cgi?id=143059.
index b716c25..ec4b31e 100644 (file)
@@ -39,13 +39,15 @@ enum class DFABytecodeInstruction : uint8_t {
     // CheckValue has two arguments:
     // The value to check (1 byte),
     // The index to jump to if the values are equal (4 bytes).
-    CheckValue,
+    CheckValueCaseInsensitive,
+    CheckValueCaseSensitive,
 
     // Jump to an offset if the input value is within a certain range.
     // The lower value (1 byte).
     // The higher value (1 byte).
     // The index to jump to if the value is in the range (4 bytes).
-    CheckValueRange,
+    CheckValueRangeCaseInsensitive,
+    CheckValueRangeCaseSensitive,
 
     // AppendAction has one argument:
     // The action to append (4 bytes).
@@ -67,9 +69,11 @@ enum class DFABytecodeInstruction : uint8_t {
 static inline size_t instructionSizeWithArguments(DFABytecodeInstruction instruction)
 {
     switch (instruction) {
-    case DFABytecodeInstruction::CheckValue:
+    case DFABytecodeInstruction::CheckValueCaseSensitive:
+    case DFABytecodeInstruction::CheckValueCaseInsensitive:
         return sizeof(DFABytecodeInstruction) + sizeof(uint8_t) + sizeof(unsigned);
-    case DFABytecodeInstruction::CheckValueRange:
+    case DFABytecodeInstruction::CheckValueRangeCaseSensitive:
+    case DFABytecodeInstruction::CheckValueRangeCaseInsensitive:
         return sizeof(DFABytecodeInstruction) + sizeof(uint8_t) + sizeof(uint8_t) + sizeof(unsigned);
     case DFABytecodeInstruction::AppendAction:
         return sizeof(DFABytecodeInstruction) + sizeof(unsigned);
index 5989ed4..2434afb 100644 (file)
@@ -69,24 +69,23 @@ void DFABytecodeCompiler::emitJump(unsigned destinationNodeIndex)
     append<unsigned>(m_bytecode, 0); // This value will be set when linking.
 }
 
-void DFABytecodeCompiler::emitCheckValue(uint8_t value, unsigned destinationNodeIndex)
+void DFABytecodeCompiler::emitCheckValue(uint8_t value, unsigned destinationNodeIndex, bool caseSensitive)
 {
-    append<DFABytecodeInstruction>(m_bytecode, DFABytecodeInstruction::CheckValue);
+    append<DFABytecodeInstruction>(m_bytecode, caseSensitive ? DFABytecodeInstruction::CheckValueCaseSensitive : DFABytecodeInstruction::CheckValueCaseInsensitive);
     append<uint8_t>(m_bytecode, value);
     m_linkRecords.append(std::make_pair(m_bytecode.size(), destinationNodeIndex));
     append<unsigned>(m_bytecode, 0); // This value will be set when linking.
 }
 
-void DFABytecodeCompiler::emitCheckValueRange(uint8_t lowValue, uint8_t highValue, unsigned destinationNodeIndex)
+void DFABytecodeCompiler::emitCheckValueRange(uint8_t lowValue, uint8_t highValue, unsigned destinationNodeIndex, bool caseSensitive)
 {
-    ASSERT_WITH_MESSAGE(lowValue != highValue, "A single value check should be emitted for single values.");
-    ASSERT_WITH_MESSAGE(lowValue < highValue, "The instruction semantic impose lowValue is smaller than highValue.");
+    ASSERT_WITH_MESSAGE(lowValue < highValue, "The instruction semantic impose lowValue is strictly less than highValue.");
 
-    append<DFABytecodeInstruction>(m_bytecode, DFABytecodeInstruction::CheckValueRange);
+    append<DFABytecodeInstruction>(m_bytecode, caseSensitive ? DFABytecodeInstruction::CheckValueRangeCaseSensitive : DFABytecodeInstruction::CheckValueRangeCaseInsensitive);
     append<uint8_t>(m_bytecode, lowValue);
     append<uint8_t>(m_bytecode, highValue);
     m_linkRecords.append(std::make_pair(m_bytecode.size(), destinationNodeIndex));
-    append<unsigned>(m_bytecode, 0);
+    append<unsigned>(m_bytecode, 0); // This value will be set when linking.
 }
 
 void DFABytecodeCompiler::emitTerminate()
@@ -113,56 +112,84 @@ void DFABytecodeCompiler::compileNode(unsigned index)
 
 void DFABytecodeCompiler::compileNodeTransitions(const DFANode& node)
 {
+    unsigned destinations[128];
+    const unsigned noDestination = std::numeric_limits<unsigned>::max();
+    for (uint16_t i = 0; i < 128; i++) {
+        auto it = node.transitions.find(i);
+        if (it == node.transitions.end())
+            destinations[i] = noDestination;
+        else
+            destinations[i] = it->value;
+    }
+
+    Vector<Range> ranges;
+    uint8_t rangeMin;
     bool hasRangeMin = false;
-    uint16_t rangeMin;
-    unsigned rangeDestination = 0;
-
-    for (unsigned char i = 0; i < 128; ++i) {
-        auto transitionIterator = node.transitions.find(i);
-        if (transitionIterator == node.transitions.end()) {
-            if (hasRangeMin) {
-                ASSERT_WITH_MESSAGE(!(node.hasFallbackTransition && node.fallbackTransition == rangeDestination), "Individual transitions to the fallback transitions should have been eliminated by the optimizer.");
-
-                unsigned char lastHighValue = i - 1;
-                compileCheckForRange(rangeMin, lastHighValue, rangeDestination);
-                hasRangeMin = false;
+    for (uint8_t i = 0; i < 128; i++) {
+        if (hasRangeMin) {
+            ASSERT_WITH_MESSAGE(!(node.hasFallbackTransition && node.fallbackTransition == destinations[rangeMin]), "Individual transitions to the fallback transitions should have been eliminated by the optimizer.");
+            if (destinations[i] != destinations[rangeMin]) {
+
+                // This is the end of a range. Check if it can be case insensitive.
+                uint8_t rangeMax = i - 1;
+                bool caseSensitive = true;
+                if (rangeMin >= 'A' && rangeMax <= 'Z') {
+                    caseSensitive = false;
+                    for (uint8_t rangeIndex = rangeMin; rangeIndex <= rangeMax; rangeIndex++) {
+                        if (destinations[rangeMin] != destinations[toASCIILower(rangeIndex)]) {
+                            caseSensitive = true;
+                            break;
+                        }
+                    }
+                }
+
+                if (!caseSensitive) {
+                    // If all the lower-case destinations are the same as the upper-case destinations,
+                    // then they will be covered by a case-insensitive range and will not need their own range.
+                    for (uint8_t rangeIndex = rangeMin; rangeIndex <= rangeMax; rangeIndex++) {
+                        ASSERT(destinations[rangeMin] == destinations[toASCIILower(rangeIndex)]);
+                        destinations[toASCIILower(rangeIndex)] = noDestination;
+                    }
+                    ranges.append(Range(toASCIILower(rangeMin), toASCIILower(rangeMax), destinations[rangeMin], caseSensitive));
+                } else
+                    ranges.append(Range(rangeMin, rangeMax, destinations[rangeMin], caseSensitive));
+
+                if (destinations[i] == noDestination)
+                    hasRangeMin = false;
+                else
+                    rangeMin = i;
             }
-            continue;
-        }
-
-        if (!hasRangeMin) {
-            hasRangeMin = true;
-            rangeMin = transitionIterator->key;
-            rangeDestination = transitionIterator->value;
         } else {
-            if (transitionIterator->value == rangeDestination)
-                continue;
-
-            unsigned char lastHighValue = i - 1;
-            compileCheckForRange(rangeMin, lastHighValue, rangeDestination);
-            rangeMin = i;
-            rangeDestination = transitionIterator->value;
+            if (destinations[i] != noDestination) {
+                rangeMin = i;
+                hasRangeMin = true;
+            }
         }
     }
-    if (hasRangeMin)
-        compileCheckForRange(rangeMin, 127, rangeDestination);
+    if (hasRangeMin) {
+        // Ranges are appended after passing the end of them.
+        // If a range goes to 127, we will have an uncommitted rangeMin because the loop does not check 128.
+        // If a range goes to 127, there will never be values higher than it, so checking for case-insensitive ranges would always fail.
+        ranges.append(Range(rangeMin, 127, destinations[rangeMin], true));
+    }
 
+    for (const auto& range : ranges)
+        compileCheckForRange(range);
     if (node.hasFallbackTransition)
         emitJump(node.fallbackTransition);
     else
         emitTerminate();
 }
 
-void DFABytecodeCompiler::compileCheckForRange(uint16_t lowValue, uint16_t highValue, unsigned destinationNodeIndex)
+void DFABytecodeCompiler::compileCheckForRange(const Range& range)
 {
-    ASSERT_WITH_MESSAGE(lowValue < 128, "The DFA engine only supports the ASCII alphabet.");
-    ASSERT_WITH_MESSAGE(highValue < 128, "The DFA engine only supports the ASCII alphabet.");
-    ASSERT(lowValue <= highValue);
+    ASSERT_WITH_MESSAGE(range.max < 128, "The DFA engine only supports the ASCII alphabet.");
+    ASSERT(range.min <= range.max);
 
-    if (lowValue == highValue)
-        emitCheckValue(lowValue, destinationNodeIndex);
+    if (range.min == range.max)
+        emitCheckValue(range.min, range.destination, range.caseSensitive);
     else
-        emitCheckValueRange(lowValue, highValue, destinationNodeIndex);
+        emitCheckValueRange(range.min, range.max, range.destination, range.caseSensitive);
 }
 
 void DFABytecodeCompiler::compile()
index 46659bf..1a56e5a 100644 (file)
@@ -49,15 +49,28 @@ public:
     void compile();
 
 private:
+    struct Range {
+        Range(uint8_t min, uint8_t max, unsigned destination, bool caseSensitive)
+            : min(min)
+            , max(max)
+            , destination(destination)
+            , caseSensitive(caseSensitive)
+        {
+        }
+        uint8_t min;
+        uint8_t max;
+        unsigned destination;
+        bool caseSensitive;
+    };
     void compileNode(unsigned);
     void compileNodeTransitions(const DFANode&);
-    void compileCheckForRange(uint16_t lowValue, uint16_t highValue, unsigned destinationNodeIndex);
+    void compileCheckForRange(const Range&);
 
     void emitAppendAction(unsigned);
     void emitTestFlagsAndAppendAction(uint16_t flags, unsigned);
     void emitJump(unsigned destinationNodeIndex);
-    void emitCheckValue(uint8_t value, unsigned destinationNodeIndex);
-    void emitCheckValueRange(uint8_t lowValue, uint8_t highValue, unsigned destinationNodeIndex);
+    void emitCheckValue(uint8_t value, unsigned destinationNodeIndex, bool caseSensitive);
+    void emitCheckValueRange(uint8_t lowValue, uint8_t highValue, unsigned destinationNodeIndex, bool caseSensitive);
     void emitTerminate();
 
     Vector<DFABytecode>& m_bytecode;
index 4e4ac28..6c2696e 100644 (file)
@@ -85,33 +85,46 @@ DFABytecodeInterpreter::Actions DFABytecodeInterpreter::interpret(const CString&
             case DFABytecodeInstruction::Terminate:
                 goto nextDFA;
                     
-            case DFABytecodeInstruction::CheckValue:
+            case DFABytecodeInstruction::CheckValueCaseSensitive:
+            case DFABytecodeInstruction::CheckValueCaseInsensitive: {
                 if (urlIndexIsAfterEndOfString)
                     goto nextDFA;
 
+                char character = url[urlIndex];
+                if (static_cast<DFABytecodeInstruction>(m_bytecode[programCounter]) == DFABytecodeInstruction::CheckValueCaseInsensitive)
+                    character = toASCIILower(character);
+
                 // Check to see if the next character in the url is the value stored with the bytecode.
-                if (url[urlIndex] == getBits<uint8_t>(m_bytecode, m_bytecodeLength, programCounter + sizeof(DFABytecode))) {
+                if (character == getBits<uint8_t>(m_bytecode, m_bytecodeLength, programCounter + sizeof(DFABytecode))) {
                     programCounter = getBits<unsigned>(m_bytecode, m_bytecodeLength, programCounter + sizeof(DFABytecode) + sizeof(uint8_t));
-                    if (!url[urlIndex])
+                    if (!character)
                         urlIndexIsAfterEndOfString = true;
                     urlIndex++; // This represents an edge in the DFA.
-                } else
-                    programCounter += instructionSizeWithArguments(DFABytecodeInstruction::CheckValue);
+                } else {
+                    programCounter += instructionSizeWithArguments(DFABytecodeInstruction::CheckValueCaseSensitive);
+                    ASSERT(instructionSizeWithArguments(DFABytecodeInstruction::CheckValueCaseSensitive) == instructionSizeWithArguments(DFABytecodeInstruction::CheckValueCaseInsensitive));
+                }
                 break;
+            }
                     
-            case DFABytecodeInstruction::CheckValueRange: {
+            case DFABytecodeInstruction::CheckValueRangeCaseSensitive:
+            case DFABytecodeInstruction::CheckValueRangeCaseInsensitive: {
                 if (urlIndexIsAfterEndOfString)
                     goto nextDFA;
                 
                 char character = url[urlIndex];
+                if (static_cast<DFABytecodeInstruction>(m_bytecode[programCounter]) == DFABytecodeInstruction::CheckValueRangeCaseInsensitive)
+                    character = toASCIILower(character);
                 if (character >= getBits<uint8_t>(m_bytecode, m_bytecodeLength, programCounter + sizeof(DFABytecode))
                     && character <= getBits<uint8_t>(m_bytecode, m_bytecodeLength, programCounter + sizeof(DFABytecode) + sizeof(uint8_t))) {
                     programCounter = getBits<unsigned>(m_bytecode, m_bytecodeLength, programCounter + sizeof(DFABytecode) + sizeof(uint8_t) + sizeof(uint8_t));
                     if (!character)
                         urlIndexIsAfterEndOfString = true;
                     urlIndex++; // This represents an edge in the DFA.
-                } else
-                    programCounter += instructionSizeWithArguments(DFABytecodeInstruction::CheckValueRange);
+                } else {
+                    programCounter += instructionSizeWithArguments(DFABytecodeInstruction::CheckValueRangeCaseSensitive);
+                    ASSERT(instructionSizeWithArguments(DFABytecodeInstruction::CheckValueRangeCaseSensitive) == instructionSizeWithArguments(DFABytecodeInstruction::CheckValueRangeCaseInsensitive));
+                }
                 break;
             }