Bogus uses of regexp matching should realize that they will OOM before they start...
authorfpizlo@apple.com <fpizlo@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Fri, 27 May 2016 14:59:46 +0000 (14:59 +0000)
committerfpizlo@apple.com <fpizlo@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Fri, 27 May 2016 14:59:46 +0000 (14:59 +0000)
https://bugs.webkit.org/show_bug.cgi?id=158142

Reviewed by Michael Saboff.

Refactored the RegExpObject::matchGlobal() code so that there is less duplication. Took
advantage of this to make the code more resilient in case of absurd situations: if the
result array gets large, it proceeds with a dry run to detect how many matches there will
be. This allows it to OOM before it starts swapping.

This also improves the overall performance of the code by using lightweight substrings and
skipping the whole intermediate argument array.

This makes some jsfunfuzz tests run a lot faster and use a lot less memory.

* builtins/RegExpPrototype.js:
* CMakeLists.txt:
* JavaScriptCore.xcodeproj/project.pbxproj:
* runtime/MatchResult.cpp: Added.
(JSC::MatchResult::dump):
* runtime/MatchResult.h:
(JSC::MatchResult::empty):
(MatchResult::empty): Deleted.
* runtime/RegExpObject.cpp:
(JSC::RegExpObject::match):
(JSC::collectMatches):
(JSC::RegExpObject::matchGlobal):
* runtime/StringObject.h:
(JSC::jsStringWithReuse):
(JSC::jsSubstring):
* tests/stress/big-match.js: Added. Make sure that this optimization doesn't break big matches.

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@201451 268f45cc-cd09-0410-ab3c-d52691b4dbfc

Source/JavaScriptCore/CMakeLists.txt
Source/JavaScriptCore/ChangeLog
Source/JavaScriptCore/JavaScriptCore.xcodeproj/project.pbxproj
Source/JavaScriptCore/builtins/RegExpPrototype.js
Source/JavaScriptCore/runtime/MatchResult.cpp [new file with mode: 0644]
Source/JavaScriptCore/runtime/MatchResult.h
Source/JavaScriptCore/runtime/RegExpObject.cpp
Source/JavaScriptCore/runtime/StringObject.h
Source/JavaScriptCore/tests/stress/big-match.js [new file with mode: 0644]

index 13108dc..50c5540 100644 (file)
@@ -744,6 +744,7 @@ set(JavaScriptCore_SOURCES
     runtime/MapConstructor.cpp
     runtime/MapIteratorPrototype.cpp
     runtime/MapPrototype.cpp
+    runtime/MatchResult.cpp
     runtime/MathCommon.cpp
     runtime/MathObject.cpp
     runtime/MemoryStatistics.cpp
index f998e58..763b276 100644 (file)
@@ -1,3 +1,37 @@
+2016-05-26  Filip Pizlo  <fpizlo@apple.com>
+
+        Bogus uses of regexp matching should realize that they will OOM before they start swapping
+        https://bugs.webkit.org/show_bug.cgi?id=158142
+
+        Reviewed by Michael Saboff.
+        
+        Refactored the RegExpObject::matchGlobal() code so that there is less duplication. Took
+        advantage of this to make the code more resilient in case of absurd situations: if the
+        result array gets large, it proceeds with a dry run to detect how many matches there will
+        be. This allows it to OOM before it starts swapping.
+        
+        This also improves the overall performance of the code by using lightweight substrings and
+        skipping the whole intermediate argument array.
+        
+        This makes some jsfunfuzz tests run a lot faster and use a lot less memory.
+        
+        * builtins/RegExpPrototype.js:
+        * CMakeLists.txt:
+        * JavaScriptCore.xcodeproj/project.pbxproj:
+        * runtime/MatchResult.cpp: Added.
+        (JSC::MatchResult::dump):
+        * runtime/MatchResult.h:
+        (JSC::MatchResult::empty):
+        (MatchResult::empty): Deleted.
+        * runtime/RegExpObject.cpp:
+        (JSC::RegExpObject::match):
+        (JSC::collectMatches):
+        (JSC::RegExpObject::matchGlobal):
+        * runtime/StringObject.h:
+        (JSC::jsStringWithReuse):
+        (JSC::jsSubstring):
+        * tests/stress/big-match.js: Added. Make sure that this optimization doesn't break big matches.
+
 2016-05-26  Gavin & Ellie Barraclough  <barraclough@apple.com>
 
         Static table property lookup should not require getOwnPropertySlot override.
index 3aa8a72..8d20866 100644 (file)
                DC605B5E1CE26EA200593718 /* ProfilerEvent.h in Headers */ = {isa = PBXBuildFile; fileRef = DC605B5A1CE26E9800593718 /* ProfilerEvent.h */; settings = {ATTRIBUTES = (Private, ); }; };
                DC605B5F1CE26EA500593718 /* ProfilerUID.cpp in Sources */ = {isa = PBXBuildFile; fileRef = DC605B5B1CE26E9800593718 /* ProfilerUID.cpp */; };
                DC605B601CE26EA700593718 /* ProfilerUID.h in Headers */ = {isa = PBXBuildFile; fileRef = DC605B5C1CE26E9800593718 /* ProfilerUID.h */; settings = {ATTRIBUTES = (Private, ); }; };
+               DC69AA661CF7A1F200C6272F /* MatchResult.cpp in Sources */ = {isa = PBXBuildFile; fileRef = DC69AA651CF7A1EF00C6272F /* MatchResult.cpp */; };
                DC7997831CDE9FA0004D4A09 /* TagRegistersMode.h in Headers */ = {isa = PBXBuildFile; fileRef = DC7997821CDE9F9E004D4A09 /* TagRegistersMode.h */; settings = {ATTRIBUTES = (Private, ); }; };
                DC7997841CDE9FA2004D4A09 /* TagRegistersMode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = DC7997811CDE9F9E004D4A09 /* TagRegistersMode.cpp */; };
                DCEE22091CEB9895000C2396 /* DFGBackwardsCFG.h in Headers */ = {isa = PBXBuildFile; fileRef = DCEE22061CEB9890000C2396 /* DFGBackwardsCFG.h */; };
                DC605B5A1CE26E9800593718 /* ProfilerEvent.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ProfilerEvent.h; path = profiler/ProfilerEvent.h; sourceTree = "<group>"; };
                DC605B5B1CE26E9800593718 /* ProfilerUID.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ProfilerUID.cpp; path = profiler/ProfilerUID.cpp; sourceTree = "<group>"; };
                DC605B5C1CE26E9800593718 /* ProfilerUID.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ProfilerUID.h; path = profiler/ProfilerUID.h; sourceTree = "<group>"; };
+               DC69AA651CF7A1EF00C6272F /* MatchResult.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MatchResult.cpp; sourceTree = "<group>"; };
                DC7997811CDE9F9E004D4A09 /* TagRegistersMode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = TagRegistersMode.cpp; sourceTree = "<group>"; };
                DC7997821CDE9F9E004D4A09 /* TagRegistersMode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = TagRegistersMode.h; sourceTree = "<group>"; };
                DCEE22061CEB9890000C2396 /* DFGBackwardsCFG.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = DFGBackwardsCFG.h; path = dfg/DFGBackwardsCFG.h; sourceTree = "<group>"; };
                7EF6E0BB0EB7A1EC0079AFAF /* runtime */ = {
                        isa = PBXGroup;
                        children = (
-                               708EBE231CE8F35000453146 /* IntlObjectInlines.h */,
-                               DCF3D5641CD29468003D5C65 /* LazyClassStructure.cpp */,
-                               DCF3D5651CD29468003D5C65 /* LazyClassStructure.h */,
-                               DCF3D5661CD29468003D5C65 /* LazyClassStructureInlines.h */,
-                               DCF3D5671CD29468003D5C65 /* LazyProperty.h */,
-                               DCF3D5681CD29468003D5C65 /* LazyPropertyInlines.h */,
                                BCF605110E203EF800B9A64D /* ArgList.cpp */,
                                BCF605120E203EF800B9A64D /* ArgList.h */,
                                0FE0500C1AA9091100D33B33 /* ArgumentsMode.h */,
                                A1D792FB1B43864B004516F5 /* IntlNumberFormatPrototype.h */,
                                A12BBFF31B044A9800664B69 /* IntlObject.cpp */,
                                A12BBFF11B044A8B00664B69 /* IntlObject.h */,
+                               708EBE231CE8F35000453146 /* IntlObjectInlines.h */,
                                86BF642A148DB2B5004DE36A /* Intrinsic.h */,
                                FE4D55B71AE716CA0052E459 /* IterationStatus.h */,
                                70113D491A8DB093003848C4 /* IteratorOperations.cpp */,
                                1442566015EDE98D0066A49B /* JSWithScope.h */,
                                65C7A1710A8EAACB00FA37EA /* JSWrapperObject.cpp */,
                                65C7A1720A8EAACB00FA37EA /* JSWrapperObject.h */,
+                               DCF3D5641CD29468003D5C65 /* LazyClassStructure.cpp */,
+                               DCF3D5651CD29468003D5C65 /* LazyClassStructure.h */,
+                               DCF3D5661CD29468003D5C65 /* LazyClassStructureInlines.h */,
+                               DCF3D5671CD29468003D5C65 /* LazyProperty.h */,
+                               DCF3D5681CD29468003D5C65 /* LazyPropertyInlines.h */,
                                A7E2EA6A0FB460CF00601F06 /* LiteralParser.cpp */,
                                A7E2EA690FB460CF00601F06 /* LiteralParser.h */,
                                F692A8680255597D01FF60F7 /* Lookup.cpp */,
                                A74DEF8E182D991400522C22 /* MapIteratorPrototype.h */,
                                A700873B17CBE8D300C3E643 /* MapPrototype.cpp */,
                                A700873C17CBE8D300C3E643 /* MapPrototype.h */,
+                               DC69AA651CF7A1EF00C6272F /* MatchResult.cpp */,
                                8612E4CB1522918400C836BE /* MatchResult.h */,
                                4340A4821A9051AF00D73CCA /* MathCommon.cpp */,
                                4340A4831A9051AF00D73CCA /* MathCommon.h */,
                                14469DE1107EC7E700650446 /* NativeErrorPrototype.cpp in Sources */,
                                E33E8D201B9013DE00346B52 /* NativeStdFunctionCell.cpp in Sources */,
                                148F21B7107EC5470042EC2C /* Nodes.cpp in Sources */,
+                               DC69AA661CF7A1F200C6272F /* MatchResult.cpp in Sources */,
                                E3963CEE1B73F75000EB4CE5 /* NodesAnalyzeModule.cpp in Sources */,
                                655EB29B10CE2581001A990E /* NodesCodegen.cpp in Sources */,
                                6546F5211A32B313006F07D5 /* NullGetterFunction.cpp in Sources */,
index f54ceea..2957f14 100644 (file)
@@ -98,6 +98,9 @@ function match(strArg)
     regexp.lastIndex = 0;
     let resultList = [];
 
+    // FIXME: It would be great to implement a solution similar to what we do in
+    // RegExpObject::matchGlobal(). It's not clear if this is possible, since this loop has
+    // effects. https://bugs.webkit.org/show_bug.cgi?id=158145
     const maximumReasonableMatchSize = 100000000;
 
     while (true) {
diff --git a/Source/JavaScriptCore/runtime/MatchResult.cpp b/Source/JavaScriptCore/runtime/MatchResult.cpp
new file mode 100644 (file)
index 0000000..cc52b31
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2016 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+#include "MatchResult.h"
+
+namespace JSC {
+
+void MatchResult::dump(PrintStream& out) const
+{
+    if (start == WTF::notFound)
+        out.print("notFound");
+    else
+        out.print(start, "...", end);
+}
+
+} // namespace JSC
+
index 347bbe9..13dd7f0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2012 Apple Inc. All rights reserved.
+ * Copyright (C) 2012, 2016 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
 #ifndef MatchResult_h
 #define MatchResult_h
 
+#include <wtf/PrintStream.h>
+#include <wtf/Vector.h> // for notFound
+
+namespace JSC {
+
 typedef uint64_t EncodedMatchResult;
 
 struct MatchResult {
@@ -69,9 +74,13 @@ struct MatchResult {
     {
         return start == end;
     }
+    
+    void dump(PrintStream&) const;
 
     size_t start;
     size_t end;
 };
 
+} // namespace JSC
+
 #endif
index e8d8021..403cf78 100644 (file)
@@ -169,6 +169,65 @@ MatchResult RegExpObject::match(ExecState* exec, JSGlobalObject* globalObject, J
     return matchInline(exec, globalObject, string);
 }
 
+template<typename FixEndFunc>
+JSValue collectMatches(VM& vm, ExecState* exec, JSString* string, const String& s, RegExpConstructor* constructor, RegExp* regExp, const FixEndFunc& fixEnd)
+{
+    MatchResult result = constructor->performMatch(vm, regExp, string, s, 0);
+    if (!result)
+        return jsNull();
+    
+    static unsigned maxSizeForDirectPath = 100000;
+    
+    JSArray* array = constructEmptyArray(exec, nullptr);
+
+    auto iterate = [&] () {
+        size_t end = result.end;
+        size_t length = end - result.start;
+        array->push(exec, JSRopeString::createSubstringOfResolved(vm, string, result.start, length));
+        if (!length)
+            end = fixEnd(end);
+        result = constructor->performMatch(vm, regExp, string, s, end);
+    };
+    
+    do {
+        if (array->length() >= maxSizeForDirectPath) {
+            // First do a throw-away match to see how many matches we'll get.
+            unsigned matchCount = 0;
+            MatchResult savedResult = result;
+            do {
+                if (array->length() + matchCount >= MAX_STORAGE_VECTOR_LENGTH) {
+                    throwOutOfMemoryError(exec);
+                    return jsUndefined();
+                }
+                
+                size_t end = result.end;
+                matchCount++;
+                if (result.empty())
+                    end = fixEnd(end);
+                
+                // Using RegExpConstructor::performMatch() instead of calling RegExp::match()
+                // directly is a surprising but profitable choice: it means that when we do OOM, we
+                // will leave the cached result in the state it ought to have had just before the
+                // OOM! On the other hand, if this loop concludes that the result is small enough,
+                // then the iterate() loop below will overwrite the cached result anyway.
+                result = constructor->performMatch(vm, regExp, string, s, end);
+            } while (result);
+            
+            // OK, we have a sensible number of matches. Now we can create them for reals.
+            result = savedResult;
+            do
+                iterate();
+            while (result);
+            
+            return array;
+        }
+        
+        iterate();
+    } while (result);
+    
+    return array;
+}
+
 JSValue RegExpObject::matchGlobal(ExecState* exec, JSGlobalObject* globalObject, JSString* string)
 {
     RegExp* regExp = this->regExp();
@@ -183,52 +242,21 @@ JSValue RegExpObject::matchGlobal(ExecState* exec, JSGlobalObject* globalObject,
 
     String s = string->value(exec);
     RegExpConstructor* regExpConstructor = globalObject->regExpConstructor();
-    MatchResult result = regExpConstructor->performMatch(*vm, regExp, string, s, 0);
-
-    // return array of matches
-    MarkedArgumentBuffer list;
-    // We defend ourselves from crazy.
-    const size_t maximumReasonableMatchSize = 1000000000;
-
+    
     if (regExp->unicode()) {
         unsigned stringLength = s.length();
-        while (result) {
-            if (list.size() > maximumReasonableMatchSize) {
-                throwOutOfMemoryError(exec);
-                return jsUndefined();
-            }
-
-            size_t end = result.end;
-            size_t length = end - result.start;
-            list.append(jsSubstring(exec, s, result.start, length));
-            if (!length)
-                end = advanceStringUnicode(s, stringLength, end);
-            result = regExpConstructor->performMatch(*vm, regExp, string, s, end);
-        }
-    } else {
-        while (result) {
-            if (list.size() > maximumReasonableMatchSize) {
-                throwOutOfMemoryError(exec);
-                return jsUndefined();
-            }
-
-            size_t end = result.end;
-            size_t length = end - result.start;
-            list.append(jsSubstring(exec, s, result.start, length));
-            if (!length)
-                ++end;
-            result = regExpConstructor->performMatch(*vm, regExp, string, s, end);
-        }
-    }
-
-    if (list.isEmpty()) {
-        // if there are no matches at all, it's important to return
-        // Null instead of an empty array, because this matches
-        // other browsers and because Null is a false value.
-        return jsNull();
+        return collectMatches(
+            *vm, exec, string, s, regExpConstructor, regExp,
+            [&] (size_t end) -> size_t {
+                return advanceStringUnicode(s, stringLength, end);
+            });
     }
-
-    return constructArray(exec, static_cast<ArrayAllocationProfile*>(0), list);
+    
+    return collectMatches(
+        *vm, exec, string, s, regExpConstructor, regExp,
+        [&] (size_t end) -> size_t {
+            return end + 1;
+        });
 }
 
 } // namespace JSC
index ed2a5b5..37e517e 100644 (file)
@@ -94,6 +94,11 @@ static inline JSString* jsStringWithReuse(ExecState* exec, JSValue originalValue
 }
 
 // Helper that tries to use the JSString substring sharing mechanism if 'originalValue' is a JSString.
+// FIXME: It would be even better if toString returned a JSString*, or if anyone who called
+// toString with the intent of later calling this functon first created a jsString from the String
+// that toString returned. That way, we'd get the substring optimization even when the input was
+// not a JSString.
+// https://bugs.webkit.org/show_bug.cgi?id=158140
 static inline JSString* jsSubstring(ExecState* exec, JSValue originalValue, const String& string, unsigned offset, unsigned length)
 {
     if (originalValue.isString()) {
diff --git a/Source/JavaScriptCore/tests/stress/big-match.js b/Source/JavaScriptCore/tests/stress/big-match.js
new file mode 100644 (file)
index 0000000..4eb441c
--- /dev/null
@@ -0,0 +1,20 @@
+"use strict";
+
+var bigString = "x";
+while (bigString.length < 200000)
+    bigString = bigString + bigString;
+
+if (bigString.length != 262144)
+    throw "Error: bad string length: " + bigString.length;
+
+var result = /x/g[Symbol.match](bigString);
+
+if (result.length != 262144)
+    throw "Error: bad result array length: " + result.length;
+
+for (var i = 0; i < result.length; ++i) {
+    if (result[i] != "x")
+        throw "Error: array does not contain \"x\" at i = " + i + ": " + result[i];
+}
+
+