Identify memcpy loops in b3 master
authorjustin_michaud@apple.com <justin_michaud@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Wed, 21 Aug 2019 06:30:05 +0000 (06:30 +0000)
committerjustin_michaud@apple.com <justin_michaud@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Wed, 21 Aug 2019 06:30:05 +0000 (06:30 +0000)
https://bugs.webkit.org/show_bug.cgi?id=200181

Reviewed by Saam Barati.

JSTests:

* microbenchmarks/memcpy-loop.js: Added.
(doTest):
(let.arr1):
* microbenchmarks/memcpy-typed-loop-large.js: Added.
(doTest):
(let.arr1.new.Int32Array.1000000.let.arr2.new.Int32Array.1000000):
(arr2):
* microbenchmarks/memcpy-typed-loop-small.js: Added.
(doTest):
(16.let.arr1.new.Int32Array.size.let.arr2.new.Int32Array.size):
(16.arr2):
* microbenchmarks/memcpy-typed-loop-speculative.js: Added.
(doTest):
(let.arr1.new.Int32Array.10.let.arr2.new.Int32Array.10):
(arr2):
* microbenchmarks/memcpy-wasm-large.js: Added.
(typeof.WebAssembly.string_appeared_here.eq):
(typeof.WebAssembly.string_appeared_here.const.1.new.WebAssembly.Instance.new.WebAssembly.Module.new.Uint8Array):
* microbenchmarks/memcpy-wasm-medium.js: Added.
(typeof.WebAssembly.string_appeared_here.eq):
(typeof.WebAssembly.string_appeared_here.const.1.new.WebAssembly.Instance.new.WebAssembly.Module.new.Uint8Array):
* microbenchmarks/memcpy-wasm-small.js: Added.
(typeof.WebAssembly.string_appeared_here.eq):
(typeof.WebAssembly.string_appeared_here.const.1.new.WebAssembly.Instance.new.WebAssembly.Module.new.Uint8Array):
* microbenchmarks/memcpy-wasm.js: Added.
(typeof.WebAssembly.string_appeared_here.eq):
(typeof.WebAssembly.string_appeared_here.const.1.new.WebAssembly.Instance.new.WebAssembly.Module.new.Uint8Array):
* stress/memcpy-typed-loops.js: Added.
(noLoop):
(invalidStart):
(const.size.10.let.arr1.new.Int32Array.size.let.arr2.new.Int32Array.size):
(arr2):
* wasm/function-tests/memcpy-wasm-loop.js: Added.
(0.GetLocal.3.I32Const.1.I32Add.SetLocal.3.Br.1.End.End.End.WebAssembly):
(string_appeared_here):

Source/JavaScriptCore:

Add a new pass in B3 to identify one type of forward byte copy loop and replace it with a call to a custom version of memcpy
that will not cause GC tearing and have the correct behaviour when overlapping regions are passed in.

Microbenchmarks show memcpy-typed-loop-large is about 6x faster, and everything else is neutral. The optimization is disabled
on arm for now, until we add a memcpy implementation for it.

* JavaScriptCore.xcodeproj/project.pbxproj:
* Sources.txt:
* b3/B3Generate.cpp:
(JSC::B3::generateToAir):
* b3/B3ReduceLoopStrength.cpp: Added.
(JSC::B3::fastForwardCopy32):
(JSC::B3::ReduceLoopStrength::AddrInfo::appendAddr):
(JSC::B3::ReduceLoopStrength::ReduceLoopStrength):
(JSC::B3::ReduceLoopStrength::reduceByteCopyLoopsToMemcpy):
(JSC::B3::ReduceLoopStrength::hoistValue):
(JSC::B3::ReduceLoopStrength::run):
(JSC::B3::reduceLoopStrength):
* b3/B3ReduceLoopStrength.h: Added.
* b3/testb3.h:
* b3/testb3_1.cpp:
(run):
* b3/testb3_8.cpp:
(testFastForwardCopy32):
(testByteCopyLoop):
(testByteCopyLoopStartIsLoopDependent):
(testByteCopyLoopBoundIsLoopDependent):
(addCopyTests):

git-svn-id: http://svn.webkit.org/repository/webkit/trunk@248938 268f45cc-cd09-0410-ab3c-d52691b4dbfc

20 files changed:
JSTests/ChangeLog
JSTests/microbenchmarks/memcpy-loop.js [new file with mode: 0644]
JSTests/microbenchmarks/memcpy-typed-loop-large.js [new file with mode: 0644]
JSTests/microbenchmarks/memcpy-typed-loop-small.js [new file with mode: 0644]
JSTests/microbenchmarks/memcpy-typed-loop-speculative.js [new file with mode: 0644]
JSTests/microbenchmarks/memcpy-wasm-large.js [new file with mode: 0644]
JSTests/microbenchmarks/memcpy-wasm-medium.js [new file with mode: 0644]
JSTests/microbenchmarks/memcpy-wasm-small.js [new file with mode: 0644]
JSTests/microbenchmarks/memcpy-wasm.js [new file with mode: 0644]
JSTests/stress/memcpy-typed-loops.js [new file with mode: 0644]
JSTests/wasm/function-tests/memcpy-wasm-loop.js [new file with mode: 0644]
Source/JavaScriptCore/ChangeLog
Source/JavaScriptCore/JavaScriptCore.xcodeproj/project.pbxproj
Source/JavaScriptCore/Sources.txt
Source/JavaScriptCore/b3/B3Generate.cpp
Source/JavaScriptCore/b3/B3ReduceLoopStrength.cpp [new file with mode: 0644]
Source/JavaScriptCore/b3/B3ReduceLoopStrength.h [new file with mode: 0644]
Source/JavaScriptCore/b3/testb3.h
Source/JavaScriptCore/b3/testb3_1.cpp
Source/JavaScriptCore/b3/testb3_8.cpp

index b37bd7e..c03c2e5 100644 (file)
@@ -1,3 +1,46 @@
+2019-08-20  Justin Michaud  <justin_michaud@apple.com>
+
+        Identify memcpy loops in b3
+        https://bugs.webkit.org/show_bug.cgi?id=200181
+
+        Reviewed by Saam Barati.
+
+        * microbenchmarks/memcpy-loop.js: Added.
+        (doTest):
+        (let.arr1):
+        * microbenchmarks/memcpy-typed-loop-large.js: Added.
+        (doTest):
+        (let.arr1.new.Int32Array.1000000.let.arr2.new.Int32Array.1000000):
+        (arr2):
+        * microbenchmarks/memcpy-typed-loop-small.js: Added.
+        (doTest):
+        (16.let.arr1.new.Int32Array.size.let.arr2.new.Int32Array.size):
+        (16.arr2):
+        * microbenchmarks/memcpy-typed-loop-speculative.js: Added.
+        (doTest):
+        (let.arr1.new.Int32Array.10.let.arr2.new.Int32Array.10):
+        (arr2):
+        * microbenchmarks/memcpy-wasm-large.js: Added.
+        (typeof.WebAssembly.string_appeared_here.eq):
+        (typeof.WebAssembly.string_appeared_here.const.1.new.WebAssembly.Instance.new.WebAssembly.Module.new.Uint8Array):
+        * microbenchmarks/memcpy-wasm-medium.js: Added.
+        (typeof.WebAssembly.string_appeared_here.eq):
+        (typeof.WebAssembly.string_appeared_here.const.1.new.WebAssembly.Instance.new.WebAssembly.Module.new.Uint8Array):
+        * microbenchmarks/memcpy-wasm-small.js: Added.
+        (typeof.WebAssembly.string_appeared_here.eq):
+        (typeof.WebAssembly.string_appeared_here.const.1.new.WebAssembly.Instance.new.WebAssembly.Module.new.Uint8Array):
+        * microbenchmarks/memcpy-wasm.js: Added.
+        (typeof.WebAssembly.string_appeared_here.eq):
+        (typeof.WebAssembly.string_appeared_here.const.1.new.WebAssembly.Instance.new.WebAssembly.Module.new.Uint8Array):
+        * stress/memcpy-typed-loops.js: Added.
+        (noLoop):
+        (invalidStart):
+        (const.size.10.let.arr1.new.Int32Array.size.let.arr2.new.Int32Array.size):
+        (arr2):
+        * wasm/function-tests/memcpy-wasm-loop.js: Added.
+        (0.GetLocal.3.I32Const.1.I32Add.SetLocal.3.Br.1.End.End.End.WebAssembly):
+        (string_appeared_here):
+
 2019-08-20  Yusuke Suzuki  <ysuzuki@apple.com>
 
         [JSC] Array.prototype.toString should not get "join" function each time
diff --git a/JSTests/microbenchmarks/memcpy-loop.js b/JSTests/microbenchmarks/memcpy-loop.js
new file mode 100644 (file)
index 0000000..9197d38
--- /dev/null
@@ -0,0 +1,26 @@
+function doTest(arr1) {
+    let arr2 = []
+    for (let i=0; i<arr1.length; ++i) {
+        arr2[i] = arr1[i]
+    }
+    return arr2
+}
+noInline(doTest);
+
+let arr1 = []
+for (let i=0; i<1000; ++i) {
+    arr1[i] = { test: i }
+}
+
+for (let i=0; i<100000; ++i) doTest(arr1)
+
+if (doTest(arr1) == arr1)
+    throw "Error: did not copy"
+
+if (doTest(arr1)[5].test != 5)
+    throw "Error: bad copy"
+
+doTest(arr1)[5].test = 42
+
+if (arr1[5].test != 42)
+    throw "Error: bad copy"
diff --git a/JSTests/microbenchmarks/memcpy-typed-loop-large.js b/JSTests/microbenchmarks/memcpy-typed-loop-large.js
new file mode 100644 (file)
index 0000000..5cf43d8
--- /dev/null
@@ -0,0 +1,27 @@
+function doTest(arr1, arr2) {
+    if (arr1.length != arr2.length)
+        return []
+    for (let i=0; i<arr1.length; ++i) {
+        arr2[i] = arr1[i]
+    }
+    return arr2
+}
+noInline(doTest);
+
+for (let i=0; i<5000; ++i) doTest([], [])
+
+let arr1 = new Int32Array(1000000)
+let arr2 = new Int32Array(1000000)
+for (let i=0; i<arr1.length; ++i) {
+    arr1[i] = i
+}
+
+for (let i=0; i<1000; ++i) doTest(arr1, arr2)
+
+arr2 = new Int32Array(arr1.length)
+doTest(arr1, arr2)
+
+for (let i=0; i<arr1.length; ++i) {
+    if (arr2[i] != arr1[i])
+        throw "Error: bad copy: " + i + " " + arr1[i] + " " + arr2[i]
+}
diff --git a/JSTests/microbenchmarks/memcpy-typed-loop-small.js b/JSTests/microbenchmarks/memcpy-typed-loop-small.js
new file mode 100644 (file)
index 0000000..34dbd35
--- /dev/null
@@ -0,0 +1,27 @@
+function doTest(arr1, arr2) {
+    if (arr1.length != arr2.length)
+        return []
+    for (let i=0; i<arr1.length; ++i) {
+        arr2[i] = arr1[i]
+    }
+    return arr2
+}
+noInline(doTest);
+
+for (size of [1, 3, 8, 10, 12, 16]) {
+    let arr1 = new Int32Array(size)
+    let arr2 = new Int32Array(size)
+    for (let i=0; i<arr1.length; ++i) {
+        arr1[i] = i
+    }
+
+    for (let i=0; i<50000000; ++i) doTest(arr1, arr2)
+
+    arr2 = new Int32Array(arr1.length)
+    doTest(arr1, arr2)
+
+    for (let i=0; i<arr1.length; ++i) {
+        if (arr2[i] != arr1[i])
+            throw "Error: bad copy: " + i + " " + arr1[i] + " " + arr2[i]
+    }
+}
diff --git a/JSTests/microbenchmarks/memcpy-typed-loop-speculative.js b/JSTests/microbenchmarks/memcpy-typed-loop-speculative.js
new file mode 100644 (file)
index 0000000..b918838
--- /dev/null
@@ -0,0 +1,23 @@
+function doTest(arr1, arr2) {
+    for (let i=0; i<arr1.length; ++i) {
+        arr2[i] = arr1[i]
+    }
+    return arr2
+}
+noInline(doTest);
+
+let arr1 = new Int32Array(10)
+let arr2 = new Int32Array(10)
+for (let i=0; i<arr1.length; ++i) {
+    arr1[i] = i
+}
+
+for (let i=0; i<500000; ++i) doTest(arr1, arr2)
+
+arr2 = new Int32Array(arr1.length)
+doTest(arr1, arr2)
+
+for (let i=0; i<arr1.length; ++i) {
+    if (arr2[i] != arr1[i])
+        throw "Error: bad copy: " + i + " " + arr1[i] + " " + arr2[i]
+}
diff --git a/JSTests/microbenchmarks/memcpy-wasm-large.js b/JSTests/microbenchmarks/memcpy-wasm-large.js
new file mode 100644 (file)
index 0000000..7e93526
--- /dev/null
@@ -0,0 +1,34 @@
+// Source in wasm/stress/memcpy-wasm
+if (typeof WebAssembly === "object") {
+
+function eq(a, b) {
+    if (a !== b)
+        throw new Error("Not equal: " + a + " " + b);
+}
+
+let pages = 64
+let memory = new WebAssembly.Memory({initial: pages, maximum: pages});
+
+let i32 = new Int32Array(memory.buffer);
+for (let i = 0; i < 100000; i++) {
+    i32[i] = i;
+}
+
+const $1 = new WebAssembly.Instance(new WebAssembly.Module(new Uint8Array([
+0,97,115,109,1,0,0,0,1,7,1,96,3,127,127,127,0,2,12,1,2,106,115,3,109,101,109,2,1,pages,pages,3,2,1,0,6,1,0,7,13,1,9,100,111,95,109,101,109,99,112,121,0,0,10,57,1,55,1,1,127,65,0,33,3,3,64,2,64,32,2,32,3,70,13,0,32,1,65,4,108,32,3,65,4,108,106,32,0,32,3,65,4,108,106,40,0,0,54,0,0,32,3,65,1,106,33,3,12,1,11,11,11
+])), { js: { mem: memory } });
+
+for (let i=0; i<500; ++i)
+    $1.exports.do_memcpy(0,50000,40000);
+
+for (let i = 0; i < 50000; i++) {
+    eq(i32[i], i);
+}
+for (let i = 50000; i < 50000+40000; i++) {
+    eq(i32[i], i-50000);
+}
+for (let i = 50000+40000; i < 100000; i++) {
+    eq(i32[i], i);
+}
+
+}
diff --git a/JSTests/microbenchmarks/memcpy-wasm-medium.js b/JSTests/microbenchmarks/memcpy-wasm-medium.js
new file mode 100644 (file)
index 0000000..16f0359
--- /dev/null
@@ -0,0 +1,35 @@
+// Source in wasm/stress/memcpy-wasm
+
+if (typeof WebAssembly === "object") {
+
+function eq(a, b) {
+    if (a !== b)
+        throw new Error("Not equal: " + a + " " + b);
+}
+
+let memory = new WebAssembly.Memory({initial:1, maximum:1});
+
+let i32 = new Int32Array(memory.buffer);
+for (let i = 0; i < 10; i++) {
+    i32[i] = i;
+}
+
+const $1 = new WebAssembly.Instance(new WebAssembly.Module(new Uint8Array([
+0,97,115,109,1,0,0,0,1,7,1,96,3,127,127,127,0,2,12,1,2,106,115,3,109,101,109,2,1,1,1,3,2,1,0,6,1,0,7,13,1,9,100,111,95,109,101,109,99,112,121,0,0,10,57,1,55,1,1,127,65,0,33,3,3,64,2,64,32,2,32,3,70,13,0,32,1,65,4,108,32,3,65,4,108,106,32,0,32,3,65,4,108,106,40,0,0,54,0,0,32,3,65,1,106,33,3,12,1,11,11,11
+])), { js: { mem: memory } });
+
+for (let i=0; i<1000000; ++i)
+    $1.exports.do_memcpy(0,5,5);
+
+eq(i32[0], 0);
+eq(i32[1], 1);
+eq(i32[2], 2);
+eq(i32[3], 3);
+eq(i32[4], 4);
+eq(i32[5], 0);
+eq(i32[6], 1);
+eq(i32[7], 2);
+eq(i32[8], 3);
+eq(i32[9], 4);
+
+}
diff --git a/JSTests/microbenchmarks/memcpy-wasm-small.js b/JSTests/microbenchmarks/memcpy-wasm-small.js
new file mode 100644 (file)
index 0000000..9c7a585
--- /dev/null
@@ -0,0 +1,28 @@
+// Source in wasm/stress/memcpy-wasm
+if (typeof WebAssembly === "object") {
+
+function eq(a, b) {
+    if (a !== b)
+        throw new Error("Not equal: " + a + " " + b);
+}
+
+let memory = new WebAssembly.Memory({initial:1, maximum:1});
+
+let i32 = new Int32Array(memory.buffer);
+for (let i = 0; i < 10; i++) {
+    i32[i] = i;
+}
+
+const $1 = new WebAssembly.Instance(new WebAssembly.Module(new Uint8Array([
+0,97,115,109,1,0,0,0,1,7,1,96,3,127,127,127,0,2,12,1,2,106,115,3,109,101,109,2,1,1,1,3,2,1,0,6,1,0,7,13,1,9,100,111,95,109,101,109,99,112,121,0,0,10,57,1,55,1,1,127,65,0,33,3,3,64,2,64,32,2,32,3,70,13,0,32,1,65,4,108,32,3,65,4,108,106,32,0,32,3,65,4,108,106,40,0,0,54,0,0,32,3,65,1,106,33,3,12,1,11,11,11
+])), { js: { mem: memory } });
+
+for (let i=0; i<1000000; ++i)
+    $1.exports.do_memcpy(0,2,2);
+
+eq(i32[0], 0);
+eq(i32[1], 1);
+eq(i32[2], 0);
+eq(i32[3], 1);
+
+}
diff --git a/JSTests/microbenchmarks/memcpy-wasm.js b/JSTests/microbenchmarks/memcpy-wasm.js
new file mode 100644 (file)
index 0000000..91cbff0
--- /dev/null
@@ -0,0 +1,33 @@
+// Source in wasm/stress/memcpy-wasm
+if (typeof WebAssembly === "object") {
+
+function eq(a, b) {
+    if (a !== b)
+        throw new Error("Not equal: " + a + " " + b);
+}
+
+let memory = new WebAssembly.Memory({initial:1, maximum:1});
+
+let i32 = new Int32Array(memory.buffer);
+for (let i = 0; i < 2000; i++) {
+    i32[i] = i;
+}
+
+const $1 = new WebAssembly.Instance(new WebAssembly.Module(new Uint8Array([
+0,97,115,109,1,0,0,0,1,7,1,96,3,127,127,127,0,2,12,1,2,106,115,3,109,101,109,2,1,1,1,3,2,1,0,6,1,0,7,13,1,9,100,111,95,109,101,109,99,112,121,0,0,10,57,1,55,1,1,127,65,0,33,3,3,64,2,64,32,2,32,3,70,13,0,32,1,65,4,108,32,3,65,4,108,106,32,0,32,3,65,4,108,106,40,0,0,54,0,0,32,3,65,1,106,33,3,12,1,11,11,11
+])), { js: { mem: memory } });
+
+for (let i=0; i<100000; ++i)
+    $1.exports.do_memcpy(0,500,300);
+
+for (let i = 0; i < 500; i++) {
+    eq(i32[i], i);
+}
+for (let i = 500; i < 500+300; i++) {
+    eq(i32[i], i-500);
+}
+for (let i = 500+300; i < 1000; i++) {
+    eq(i32[i], i);
+}
+
+}
diff --git a/JSTests/stress/memcpy-typed-loops.js b/JSTests/stress/memcpy-typed-loops.js
new file mode 100644 (file)
index 0000000..ddcc645
--- /dev/null
@@ -0,0 +1,39 @@
+function noLoop(arr1, arr2) {
+    let i=0
+    if (arr1.size%2==0)
+        i = 0
+    else i = 0
+    arr2[i] = arr1[i]
+    return arr2
+}
+noInline(noLoop);
+
+function invalidStart(arr1, arr2) {
+    if (arr1.length != arr2.length)
+        return []
+    let i = 0
+    do {
+        ++i
+        arr2[i] = arr1[i]
+    } while (i < arr1.length-1)
+    return arr2
+}
+noInline(invalidStart);
+
+const size = 10
+let arr1 = new Int32Array(size)
+let arr2 = new Int32Array(size)
+for (let i=0; i<arr1.length; ++i) {
+    arr1[i] = i
+}
+
+for (let i=0; i<10000000; ++i) noLoop(arr1, arr2)
+for (let i=0; i<10000000; ++i) invalidStart(arr1, arr2)
+
+arr2 = new Int32Array(arr1.length)
+invalidStart(arr1, arr2)
+
+for (let i=1; i<arr1.length; ++i) {
+    if (arr2[i] != arr1[i] || arr2[0] != 0)
+        throw "Error: bad copy: " + i + " " + arr1[i] + " " + arr2[i]
+}
diff --git a/JSTests/wasm/function-tests/memcpy-wasm-loop.js b/JSTests/wasm/function-tests/memcpy-wasm-loop.js
new file mode 100644 (file)
index 0000000..f1741a6
--- /dev/null
@@ -0,0 +1,90 @@
+import * as assert from '../assert.js';
+import Builder from '../Builder.js';
+
+let memory = new WebAssembly.Memory({initial:1, maximum:1});
+
+let i32 = new Int32Array(memory.buffer);
+for (let i = 0; i < 100; i++) {
+    i32[i] = i;
+}
+
+const $1 = new WebAssembly.Instance(new WebAssembly.Module((new Builder())
+      .Type().End()
+      .Import()
+        .Memory("js", "mem", {initial:1, maximum:1})
+      .End()
+      .Function().End()
+      .Global().End()
+      .Export()
+          .Function("do_memcpy")
+      .End()
+      .Code()
+        .Function("do_memcpy", { params: ["i32","i32","i32"], ret: "void" }, ["i32"])
+            .I32Const(0)
+            .SetLocal(3)
+            .Loop("void")
+            .Block("void", b =>
+               b.GetLocal(2)
+               .GetLocal(3)
+               .I32Eq()
+               .BrIf(0)
+
+               .GetLocal(1)
+               .I32Const(4)
+               .I32Mul()
+               .GetLocal(3)
+               .I32Const(4)
+               .I32Mul()
+               .I32Add()
+
+               .GetLocal(0)
+               // Intentional bug: no multiply here
+               .GetLocal(3)
+               .I32Const(4)
+               .I32Mul()
+               .I32Add()
+               .I32Load(0,0)
+
+               .I32Store(0,0)
+
+               .GetLocal(3)
+               .I32Const(1)
+               .I32Add()
+               .SetLocal(3)
+               .Br(1)
+              )
+            .End()
+        .End()
+      .End().WebAssembly().get()), { js: { mem: memory } });
+
+for (let i=0; i<500; ++i)
+    $1.exports.do_memcpy(0,50,30);
+
+for (let i = 0; i < 50; i++) {
+    assert.eq(i32[i], i);
+}
+for (let i = 50; i < 50+30; i++) {
+    assert.eq(i32[i], i-50);
+}
+for (let i = 50+30; i < 100; i++) {
+    assert.eq(i32[i], i);
+}
+
+$1.exports.do_memcpy(0,5,10);
+for (let i = 0; i < 5; i++) {
+    assert.eq(i32[i], i);
+}
+for (let i = 5; i < 10; i++) {
+    assert.eq(i32[i], i-5);
+}
+for (let i = 10; i < 15; i++) {
+    assert.eq(i32[i], i-10);
+}
+for (let i = 15; i < 20; i++) {
+    assert.eq(i32[i], i);
+}
+
+assert.throws(() => $1.exports.do_memcpy(0,16384-5,6), Error, "Out of bounds memory access (evaluating 'func(...args)')")
+for (let i = 0; i < 5; i++) {
+    assert.eq(i32[16384-5 + i], i);
+}
index 737fac5..2fdd74e 100644 (file)
@@ -1,3 +1,39 @@
+2019-08-20  Justin Michaud  <justin_michaud@apple.com>
+
+        Identify memcpy loops in b3
+        https://bugs.webkit.org/show_bug.cgi?id=200181
+
+        Reviewed by Saam Barati.
+
+        Add a new pass in B3 to identify one type of forward byte copy loop and replace it with a call to a custom version of memcpy
+        that will not cause GC tearing and have the correct behaviour when overlapping regions are passed in. 
+
+        Microbenchmarks show memcpy-typed-loop-large is about 6x faster, and everything else is neutral. The optimization is disabled
+        on arm for now, until we add a memcpy implementation for it.
+
+        * JavaScriptCore.xcodeproj/project.pbxproj:
+        * Sources.txt:
+        * b3/B3Generate.cpp:
+        (JSC::B3::generateToAir):
+        * b3/B3ReduceLoopStrength.cpp: Added.
+        (JSC::B3::fastForwardCopy32):
+        (JSC::B3::ReduceLoopStrength::AddrInfo::appendAddr):
+        (JSC::B3::ReduceLoopStrength::ReduceLoopStrength):
+        (JSC::B3::ReduceLoopStrength::reduceByteCopyLoopsToMemcpy):
+        (JSC::B3::ReduceLoopStrength::hoistValue):
+        (JSC::B3::ReduceLoopStrength::run):
+        (JSC::B3::reduceLoopStrength):
+        * b3/B3ReduceLoopStrength.h: Added.
+        * b3/testb3.h:
+        * b3/testb3_1.cpp:
+        (run):
+        * b3/testb3_8.cpp:
+        (testFastForwardCopy32):
+        (testByteCopyLoop):
+        (testByteCopyLoopStartIsLoopDependent):
+        (testByteCopyLoopBoundIsLoopDependent):
+        (addCopyTests):
+
 2019-08-20  Devin Rousso  <drousso@apple.com>
 
         Unreviewed, speculative build fix for High Sierra after r248925
index c0637c4..daea466 100644 (file)
                70ECA6061AFDBEA200449739 /* JSTemplateObjectDescriptor.h in Headers */ = {isa = PBXBuildFile; fileRef = 70ECA6011AFDBEA200449739 /* JSTemplateObjectDescriptor.h */; settings = {ATTRIBUTES = (Private, ); }; };
                70ECA6091AFDBEA200449739 /* TemplateObjectDescriptor.h in Headers */ = {isa = PBXBuildFile; fileRef = 70ECA6041AFDBEA200449739 /* TemplateObjectDescriptor.h */; settings = {ATTRIBUTES = (Private, ); }; };
                72AAF7CE1D0D31B3005E60BE /* JSCustomGetterSetterFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 72AAF7CC1D0D318B005E60BE /* JSCustomGetterSetterFunction.h */; };
+               73E3799422E0EF6500933565 /* B3ReduceLoopStrength.h in Headers */ = {isa = PBXBuildFile; fileRef = 73E3799322E0EF4F00933565 /* B3ReduceLoopStrength.h */; };
                7593C898BE714A64BE93A6E7 /* WasmContextInlines.h in Headers */ = {isa = PBXBuildFile; fileRef = A27958D7FA1142B0AC9E364D /* WasmContextInlines.h */; settings = {ATTRIBUTES = (Private, ); }; };
                790081391E95A8EC0052D7CD /* WasmModule.h in Headers */ = {isa = PBXBuildFile; fileRef = 790081371E95A8EC0052D7CD /* WasmModule.h */; settings = {ATTRIBUTES = (Private, ); }; };
                7905BB691D12050E0019FE57 /* InlineAccess.h in Headers */ = {isa = PBXBuildFile; fileRef = 7905BB671D12050E0019FE57 /* InlineAccess.h */; settings = {ATTRIBUTES = (Private, ); }; };
                70ECA6041AFDBEA200449739 /* TemplateObjectDescriptor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = TemplateObjectDescriptor.h; sourceTree = "<group>"; };
                72AAF7CB1D0D318B005E60BE /* JSCustomGetterSetterFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = JSCustomGetterSetterFunction.cpp; sourceTree = "<group>"; };
                72AAF7CC1D0D318B005E60BE /* JSCustomGetterSetterFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = JSCustomGetterSetterFunction.h; sourceTree = "<group>"; };
+               73E3799322E0EF4F00933565 /* B3ReduceLoopStrength.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = B3ReduceLoopStrength.h; path = b3/B3ReduceLoopStrength.h; sourceTree = "<group>"; };
+               73E3799522E0EF9100933565 /* B3ReduceLoopStrength.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = B3ReduceLoopStrength.cpp; path = b3/B3ReduceLoopStrength.cpp; sourceTree = "<group>"; };
                77B25CB2C3094A92A38E1DB3 /* JSModuleLoader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = JSModuleLoader.h; sourceTree = "<group>"; };
                790081361E95A8EC0052D7CD /* WasmModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WasmModule.cpp; sourceTree = "<group>"; };
                790081371E95A8EC0052D7CD /* WasmModule.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WasmModule.h; sourceTree = "<group>"; };
                                0F725CA61C503DED00AD943A /* B3PureCSE.h */,
                                43422A641C16221E00E2EB98 /* B3ReduceDoubleToFloat.cpp */,
                                43422A651C16221E00E2EB98 /* B3ReduceDoubleToFloat.h */,
+                               73E3799522E0EF9100933565 /* B3ReduceLoopStrength.cpp */,
+                               73E3799322E0EF4F00933565 /* B3ReduceLoopStrength.h */,
                                0FEC85B71BE1462F0080FF74 /* B3ReduceStrength.cpp */,
                                0FEC85B81BE1462F0080FF74 /* B3ReduceStrength.h */,
                                0FEC84EA1BDACDAC0080FF74 /* B3SlotBaseValue.cpp */,
                                0FEC852D1BDACDAC0080FF74 /* B3ProcedureInlines.h in Headers */,
                                0F725CAA1C503DED00AD943A /* B3PureCSE.h in Headers */,
                                43422A671C16267800E2EB98 /* B3ReduceDoubleToFloat.h in Headers */,
+                               73E3799422E0EF6500933565 /* B3ReduceLoopStrength.h in Headers */,
                                0FEC85BD1BE1462F0080FF74 /* B3ReduceStrength.h in Headers */,
                                0FEC85351BDACDAC0080FF74 /* B3SlotBaseValue.h in Headers */,
                                0F2BBD961C5FF3F50023EF23 /* B3SparseCollection.h in Headers */,
index 7289cc4..0c36de0 100644 (file)
@@ -156,6 +156,7 @@ b3/B3PhiChildren.cpp
 b3/B3Procedure.cpp
 b3/B3PureCSE.cpp
 b3/B3ReduceDoubleToFloat.cpp
+b3/B3ReduceLoopStrength.cpp
 b3/B3ReduceStrength.cpp
 b3/B3SSACalculator.cpp
 b3/B3SlotBaseValue.cpp
index ba2fa08..4394b62 100644 (file)
@@ -48,6 +48,7 @@
 #include "B3Procedure.h"
 #include "B3PureCSE.h"
 #include "B3ReduceDoubleToFloat.h"
+#include "B3ReduceLoopStrength.h"
 #include "B3ReduceStrength.h"
 #include "B3TimingScope.h"
 #include "B3Validate.h"
@@ -91,6 +92,7 @@ void generateToAir(Procedure& procedure)
             eliminateCommonSubexpressions(procedure);
         eliminateDeadCode(procedure);
         inferSwitches(procedure);
+        reduceLoopStrength(procedure);
         if (Options::useB3TailDup())
             duplicateTails(procedure);
         fixSSA(procedure);
diff --git a/Source/JavaScriptCore/b3/B3ReduceLoopStrength.cpp b/Source/JavaScriptCore/b3/B3ReduceLoopStrength.cpp
new file mode 100644 (file)
index 0000000..b08087b
--- /dev/null
@@ -0,0 +1,544 @@
+/*
+ * Copyright (C) 2019 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+#include "B3ReduceLoopStrength.h"
+
+#if ENABLE(B3_JIT)
+
+#include "B3BlockInsertionSet.h"
+#include "B3ConstPtrValue.h"
+#include "B3EnsureLoopPreHeaders.h"
+#include "B3PhaseScope.h"
+#include "B3ProcedureInlines.h"
+#include "B3ValueInlines.h"
+#include "B3Variable.h"
+#include "B3VariableValue.h"
+#include <wtf/GraphNodeWorklist.h>
+#include <wtf/IndexSet.h>
+#include <wtf/SmallPtrSet.h>
+#include <wtf/Vector.h>
+
+namespace JSC { namespace B3 {
+
+#if CPU(X86_64)
+/*
+ * The semantics of B3 require that loads and stores are not reduced in granularity.
+ * If we would use system memcpy, then it is possible that we would get a byte copy loop,
+ * and this could cause GC tearing.
+ */
+void fastForwardCopy32(uint32_t* dst, const uint32_t* src, size_t size)
+{
+    uint64_t i;
+    int64_t counter;
+    uint64_t tmp, tmp2;
+
+    asm volatile(
+        "test %q[size], %q[size]\t\n"
+        "jz 900f\t\n" // exit
+        "xorq %q[i], %q[i]\t\n"
+        // if size < 8
+        "cmp $8, %q[size]\t\n"
+        "jl 100f\t\n" // backporch
+        // if (dst + size*4 <= src || src + size*4 <= dst)
+        "leaq (%q[src], %q[size], 4), %q[tmp]\t\n"
+        "cmp %q[tmp], %q[dst]\t\n"
+        "jae 500f\t\n" // simd no overlap
+        "leaq (%q[dst], %q[size], 4), %q[tmp]\t\n"
+        "cmp %q[tmp], %q[src]\t\n"
+        "jae 500f\t\n" // simd no overlap
+        "jmp 100f\t\n" // Backporch
+
+        // Backporch (assumes we can write at least one int)
+        "100:\t\n"
+        // counter = (size - i)%4
+        "mov %q[size], %q[counter]\t\n"
+        "subq %q[i], %q[counter]\t\n"
+
+        // If we are a multiple of 4, unroll the loop
+        // We already know we are nonzero
+        "and $3, %q[counter]\t\n"
+        "jz 200f\t\n" // unrolled loop
+
+        "negq %q[counter]\t\n"
+        ".balign 32\t\n"
+        "101:\t\n"
+        "mov (%q[src], %q[i], 4), %k[tmp]\t\n"
+        "mov %k[tmp], (%q[dst], %q[i], 4)\t\n"
+        "incq %q[i]\t\n"
+        "incq %q[counter]\t\n"
+        "jnz 101b\t\n"
+        // Do we still have anything left?
+        "cmpq %q[size], %q[i]\t\n"
+        "je 900f\t\n" // exit
+        // Then go to the unrolled loop
+        "jmp 200f\t\n"
+
+        // Unrolled loop (assumes multiple of 4, can do one iteration)
+        "200:\t\n"
+        "shr $2, %q[counter]\t\n"
+        "negq %q[counter]\t\n"
+        ".balign 32\t\n"
+        "201:\t\n"
+        "mov (%q[src], %q[i], 4), %k[tmp]\t\n"
+        "mov %k[tmp], (%q[dst], %q[i], 4)\t\n"
+        "mov 4(%q[src], %q[i], 4), %k[tmp2]\t\n"
+        "mov %k[tmp2], 4(%q[dst], %q[i], 4)\t\n"
+        "mov 8(%q[src], %q[i], 4), %k[tmp2]\t\n"
+        "mov %k[tmp2], 8(%q[dst], %q[i], 4)\t\n"
+        "mov 12(%q[src], %q[i], 4), %k[tmp]\t\n"
+        "mov %k[tmp], 12(%q[dst], %q[i], 4)\t\n"
+        "addq $4, %q[i]\t\n"
+        "cmpq %q[size], %q[i]\t\n"
+        "jb 201b\t\n"
+        "jmp 900f\t\n" // exit
+
+        // simd no overlap
+        // counter = -(size/8);
+        "500:\t\n"
+        "mov %q[size], %q[counter]\t\n"
+        "shrq $3, %q[counter]\t\n"
+        "negq %q[counter]\t\n"
+        ".balign 32\t\n"
+        "501:\t\n"
+        "movups (%q[src], %q[i], 4), %%xmm0\t\n"
+        "movups 16(%q[src], %q[i], 4), %%xmm1\t\n"
+        "movups %%xmm0, (%q[dst], %q[i], 4)\t\n"
+        "movups %%xmm1, 16(%q[dst], %q[i], 4)\t\n"
+        "addq $8, %q[i]\t\n"
+        "incq %q[counter]\t\n"
+        "jnz 501b\t\n"
+        // if (i == size)
+        "cmpq %q[size], %q[i]\t\n"
+        "je 900f\t\n" // end
+        "jmp 100b\t\n" // backporch
+        "900:\t\n"
+    : [i] "=&c" (i), [counter] "=&r" (counter), [tmp] "=&a" (tmp),  [tmp2] "=&r" (tmp2)
+    : [dst] "D" (dst), [src] "S" (src), [size] "r" (size)
+    : "xmm0", "xmm1");
+}
+#endif
+
+class ReduceLoopStrength {
+    static const bool verbose = false;
+
+    struct AddrInfo {
+        Value* appendAddr(Procedure& proc, BasicBlock* block, Value* addr)
+        {
+            block->appendNew<Value>(proc, Identity, addr->origin(), addr);
+            return addr;
+        }
+
+        Value* insideLoopAddr;
+
+        Value* arrayBase;
+        Value* index;
+    };
+
+public:
+    ReduceLoopStrength(Procedure& proc)
+        : m_proc(proc)
+        , m_insertionSet(proc)
+        , m_blockInsertionSet(proc)
+        , m_root(proc.at(0))
+        , m_phiChildren(proc)
+    {
+    }
+
+#if CPU(X86_64)
+    void reduceByteCopyLoopsToMemcpy()
+    {
+        HashSet<Value*> patternMatchedValues;
+
+        Value* store = m_value;
+        if (store->opcode() != Store)
+            return;
+        patternMatchedValues.add(store);
+
+        Value* load = store->child(0);
+        uint32_t elemSize = sizeofType(load->type());
+        if (load->opcode() != Load || elemSize != 4)
+            return;
+        patternMatchedValues.add(load);
+
+        if (verbose)
+            dataLogLn("Found store/load:", *store);
+
+        auto extractArrayInfo = [&] (Value* value) -> Optional<AddrInfo> {
+            patternMatchedValues.add(value);
+
+            Optional<AddrInfo> addr = { AddrInfo() };
+
+            Value* sum = value;
+            if (value->opcode() == ZExt32)
+                sum = sum->child(0);
+
+            if (sum->opcode() != Add || sum->numChildren() != 2)
+                return { };
+
+            addr->insideLoopAddr = sum;
+            auto extractLoopIndexInSubtree = [&] (Value* value) -> Value* {
+                // Match arrBase[i*elemSize]
+                if (value->opcode() == Shl
+                    && value->child(0)->opcode() == Phi
+                    && value->child(1)->isInt(WTF::fastLog2(elemSize)))
+                    return value->child(0);
+                if (value->opcode() == Shl
+                    && value->child(0)->opcode() == ZExt32
+                    && value->child(0)->child(0)->opcode() == Phi
+                    && value->child(1)->isInt(WTF::fastLog2(elemSize)))
+                    return value->child(0)->child(0);
+                return nullptr;
+            };
+
+            // Try to find a known pattern applied to a single phi, which we can assume is our loop index.
+            Value* left = extractLoopIndexInSubtree(sum->child(0));
+            Value* right = extractLoopIndexInSubtree(sum->child(1));
+            if (left && !right) {
+                addr->index = left;
+                addr->arrayBase = sum->child(1);
+            } else if (right && !left) {
+                addr->index = right;
+                addr->arrayBase = sum->child(0);
+            } else
+                return { };
+
+            patternMatchedValues.add(addr->index);
+            patternMatchedValues.add(addr->arrayBase);
+
+            return addr;
+        };
+
+        auto destination = extractArrayInfo(store->child(1));
+        auto source = extractArrayInfo(load->child(0));
+
+        if (!source || !destination || source->index != destination->index)
+            return;
+
+        if (destination->arrayBase->type() != Int64 || source->arrayBase->type() != Int64)
+            return;
+
+        if (verbose)
+            dataLogLn("Found array info: ", *source->arrayBase, "[", *source->index, "] = ", *destination->arrayBase, "[", *destination->index, "]");
+
+        // Find the loop header, footer and inner blocks.
+        // Verify that there is one entrance to and one exit from the loop.
+        auto* loop = m_proc.naturalLoops().innerMostLoopOf(m_block);
+        if (!loop)
+            return;
+        BasicBlock* loopHead = loop->header();
+        BasicBlock* loopFoot = nullptr;
+        BasicBlock* loopPreheader = nullptr;
+        BasicBlock* loopPostfooter = nullptr;
+        SmallPtrSet<BasicBlock*> loopInnerBlocks;
+
+        {
+            for (unsigned i = 0; i < loop->size(); ++i)
+                loopInnerBlocks.add(loop->at(i));
+
+            if (!loopInnerBlocks.contains(load->owner))
+                return;
+
+            if (!loopInnerBlocks.contains(store->owner))
+                return;
+
+            SmallPtrSet<BasicBlock*> loopEntrances;
+            SmallPtrSet<BasicBlock*> loopExits;
+            for (auto* block : loopInnerBlocks) {
+                for (auto successor : block->successors()) {
+                    if (!loopInnerBlocks.contains(successor.block()))
+                        loopExits.add(successor.block());
+                }
+                for (auto* predecessor : block->predecessors()) {
+                    if (!loopInnerBlocks.contains(predecessor))
+                        loopEntrances.add(predecessor);
+                }
+            }
+
+            if (loopExits.size() != 1) {
+                if (verbose) {
+                    dataLog("Loop has incorrect number of exits: ");
+                    for (BasicBlock* block : loopExits)
+                        dataLog(*block, " ");
+                    dataLogLn();
+                }
+                return;
+            }
+            loopPostfooter = *loopExits.begin();
+
+            if (loopEntrances.size() != 1) {
+                if (verbose) {
+                    dataLog("Loop has incorrect number of entrances: ");
+                    for (BasicBlock* block : loopEntrances)
+                        dataLog(*block, " ");
+                    dataLogLn();
+                }
+                return;
+            }
+            loopPreheader = *loopEntrances.begin();
+
+            if (!loopHead->predecessors().contains(loopPreheader)) {
+                if (verbose)
+                    dataLogLn("Loop head does not follow preheader");
+                return;
+            }
+
+            for (BasicBlock* predecessor : loopPostfooter->predecessors()) {
+                if (loopInnerBlocks.contains(predecessor)) {
+                    // There is only one loop exit.
+                    RELEASE_ASSERT(!loopFoot);
+                    loopFoot = predecessor;
+                }
+            }
+        }
+
+        if (verbose) {
+            dataLog("Found loop head:", *loopHead, " foot:", *loopFoot, " prehead ", *loopPreheader, " postfoot ", *loopPostfooter, " inner blocks: ");
+            for (BasicBlock* block : loopInnerBlocks)
+                dataLog(*block, " ");
+            dataLogLn();
+        }
+
+        // Verify that the index is a monotonic inductive variable.
+        Value* startingIndex = nullptr;
+        {
+            if (m_phiChildren.at(source->index).size() != 2)
+                return;
+
+            Value* updateIndex = nullptr;
+            for (Value* up : m_phiChildren.at(source->index)) {
+                if (m_proc.dominators().dominates(up->owner, loopPreheader))
+                    startingIndex = up;
+                else
+                    updateIndex = up;
+            }
+
+            if (!updateIndex || !startingIndex || !loopInnerBlocks.contains(updateIndex->owner))
+                return;
+            patternMatchedValues.add(updateIndex);
+            patternMatchedValues.add(startingIndex);
+
+            updateIndex = updateIndex->child(0);
+            startingIndex = startingIndex->child(0);
+            if (updateIndex->opcode() != Add || !updateIndex->child(1)->isInt(1) || updateIndex->child(0) != source->index)
+                return;
+
+            if (!startingIndex->isInt(0))
+                return;
+        }
+
+        if (verbose)
+            dataLogLn("Found starting index:", *startingIndex);
+
+        // Identify the loop exit condition.
+        Value* exitCondition = nullptr;
+        // Is this an exit condition or a continuation condition?
+        bool conditionInverted = false;
+        // Do we check the index before or after using it?
+        bool checkBeforeWrite = false;
+        {
+            Value* branch = loopFoot->last();
+            if (branch->opcode() != Branch)
+                return;
+            patternMatchedValues.add(branch);
+            exitCondition = branch->child(0);
+
+            conditionInverted = loopPostfooter != loopFoot->successor(0).block();
+            checkBeforeWrite = m_proc.dominators().strictlyDominates(loopFoot, m_block);
+        }
+
+        if (verbose)
+            dataLogLn("Found exit condition: ", *exitCondition, " inverted: ", conditionInverted, " checks before the first write: ", checkBeforeWrite);
+
+        // Idenfity the loop bound by matching loop bound expressions.
+        Value* loopBound = nullptr;
+        {
+            auto extractLoopBound = [&] (Value* index, Value* bound) -> Value* {
+                // Match i+1 when we do a write first followed by the check for the next iteration
+                if (!checkBeforeWrite && index->opcode() == Add && index->child(0) == source->index && index->child(1)->isInt(1))
+                    return bound;
+                return nullptr;
+            };
+
+            if (exitCondition->opcode() == GreaterThan && conditionInverted) {
+                // enter loop again if size > i
+                loopBound = extractLoopBound(exitCondition->child(1), exitCondition->child(0));
+            } else if (exitCondition->opcode() == LessThan && !conditionInverted) {
+                // enter loop again if size < i
+                loopBound = extractLoopBound(exitCondition->child(1), exitCondition->child(0));
+            }
+
+            if (!loopBound) {
+                if (verbose)
+                    dataLogLn("Unable to extract bound from exit condition ", *exitCondition);
+                return;
+            }
+        }
+
+        // Make sure there are no other effectful things happening.
+        // This also guarantees that we found the only branch in the loop. This means that our
+        // load/store must happen iff our index update and branch do.
+        for (BasicBlock* block : loopInnerBlocks) {
+            for (unsigned index = 0; index < block->size(); ++index) {
+                Value* value = block->at(index);
+                if (patternMatchedValues.contains(value) || value->opcode() == Jump)
+                    continue;
+
+                Effects effects = value->effects();
+                effects.readsLocalState = false;
+                if (effects != Effects::none()) {
+                    if (verbose)
+                        dataLogLn("Not byte copy loop, node ", *value, " has effects");
+                    return;
+                }
+            }
+        }
+
+        if (!hoistValue(loopPreheader, source->arrayBase, false)
+            || !hoistValue(loopPreheader, destination->arrayBase, false)
+            || !hoistValue(loopPreheader, loopBound, false))
+            return;
+
+        if (verbose)
+            dataLogLn("Found byte copy loop: ", *source->arrayBase, "[", *source->index, "] = ", *destination->arrayBase, "[", *destination->index, "] from index ", *startingIndex, " to max index ", *loopBound, " stride: ", elemSize);
+
+        bool hoistResult = hoistValue(loopPreheader, source->arrayBase);
+        hoistResult &= hoistValue(loopPreheader, destination->arrayBase);
+        hoistResult &= hoistValue(loopPreheader, loopBound);
+        ASSERT_UNUSED(hoistResult, hoistResult);
+
+        auto origin = loopPostfooter->at(0)->origin();
+
+        BasicBlock* memcpy = m_blockInsertionSet.insertBefore(loopPostfooter, loopPostfooter->frequency());
+        memcpy->setSuccessors(FrequentedBlock(loopPostfooter));
+        memcpy->addPredecessor(loopPreheader);
+        for (BasicBlock* pred : loopPostfooter->predecessors())
+            loopPostfooter->removePredecessor(pred);
+        loopPostfooter->addPredecessor(memcpy);
+        loopPreheader->setSuccessors(memcpy);
+
+        Effects effects = Effects::forCall();
+        memcpy->appendNew<CCallValue>(m_proc, B3::Void, origin, effects,
+            memcpy->appendNew<ConstPtrValue>(m_proc, origin, tagCFunctionPtr<void*>(fastForwardCopy32, B3CCallPtrTag)),
+            destination->appendAddr(m_proc, memcpy, destination->arrayBase),
+            source->appendAddr(m_proc, memcpy, source->arrayBase),
+            loopBound);
+
+        memcpy->appendNew<Value>(m_proc, Jump, origin);
+    }
+
+    bool hoistValue(BasicBlock* into, Value* value, bool canMutate = true)
+    {
+        if (m_proc.dominators().dominates(value->owner, into))
+            return true;
+
+        Effects effects = value->effects();
+        if (effects != Effects::none()) {
+            if (verbose)
+                dataLogLn("Cannot hoist ", *value, " into ", *into, " since it has effects");
+            return false;
+        }
+
+        for (Value* child : value->children()) {
+            if (!hoistValue(into, child, false))
+                return false;
+        }
+
+        if (canMutate) {
+            for (Value* child : value->children())
+                hoistValue(into, child);
+
+            value->owner->at(value->index()) = m_proc.add<Value>(Nop, Void, value->origin());
+            into->appendNonTerminal(value);
+        }
+
+        return true;
+    }
+
+    bool run()
+    {
+        if (verbose)
+            dataLogLn("ReduceLoopStrength examining proc: ", m_proc);
+        bool result = false;
+
+        do {
+            m_changed = false;
+
+            for (BasicBlock* block : m_proc.blocksInPreOrder()) {
+                m_block = block;
+
+                for (m_index = 0; m_index < block->size(); ++m_index) {
+                    m_value = m_block->at(m_index);
+                    m_value->performSubstitution();
+                    reduceByteCopyLoopsToMemcpy();
+                    m_insertionSet.execute(m_block);
+                    m_changed |= m_blockInsertionSet.execute();
+
+                    if (m_changed) {
+                        m_proc.resetReachability();
+                        m_proc.invalidateCFG();
+                        m_proc.resetValueOwners();
+                        result = true;
+                        break;
+                    }
+                }
+
+                if (m_changed)
+                    break;
+            }
+        } while (m_changed && m_proc.optLevel() >= 2);
+
+        if (result && verbose)
+            dataLogLn("ReduceLoopStrength produced proc: ", m_proc);
+
+        return result;
+    }
+#else
+    bool run()
+    {
+        return false;
+    }
+#endif
+
+    Procedure& m_proc;
+    InsertionSet m_insertionSet;
+    BlockInsertionSet m_blockInsertionSet;
+    BasicBlock* m_root { nullptr };
+    BasicBlock* m_block { nullptr };
+    unsigned m_index { 0 };
+    Value* m_value { nullptr };
+    bool m_changed { false };
+    PhiChildren m_phiChildren;
+};
+
+bool reduceLoopStrength(Procedure& proc)
+{
+    PhaseScope phaseScope(proc, "reduceLoopStrength");
+    return ReduceLoopStrength(proc).run();
+}
+
+} } // namespace JSC::B3
+
+#endif // ENABLE(B3_JIT)
diff --git a/Source/JavaScriptCore/b3/B3ReduceLoopStrength.h b/Source/JavaScriptCore/b3/B3ReduceLoopStrength.h
new file mode 100644 (file)
index 0000000..101b2b2
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2019 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#pragma once
+
+#if ENABLE(B3_JIT)
+
+namespace JSC { namespace B3 {
+
+class Procedure;
+
+// Apply transformations that match and simplify entire loops.
+
+bool reduceLoopStrength(Procedure&);
+#if CPU(X86_64)
+JS_EXPORT_PRIVATE void fastForwardCopy32(uint32_t* dst, const uint32_t* src, size_t);
+#endif
+
+} } // namespace JSC::B3
+
+#endif // ENABLE(B3_JIT)
+
index 1c070d1..b504096 100644 (file)
@@ -48,6 +48,7 @@
 #include "B3MoveConstants.h"
 #include "B3NativeTraits.h"
 #include "B3Procedure.h"
+#include "B3ReduceLoopStrength.h"
 #include "B3ReduceStrength.h"
 #include "B3SlotBaseValue.h"
 #include "B3StackSlot.h"
@@ -1016,6 +1017,13 @@ void addAtomicTests(const char* filter, Deque<RefPtr<SharedTask<void()>>>&);
 void addLoadTests(const char* filter, Deque<RefPtr<SharedTask<void()>>>&);
 void addTupleTests(const char* filter, Deque<RefPtr<SharedTask<void()>>>&);
 
+void testFastForwardCopy32();
+void testByteCopyLoop();
+void testByteCopyLoopStartIsLoopDependent();
+void testByteCopyLoopBoundIsLoopDependent();
+
+void addCopyTests(const char* filter, Deque<RefPtr<SharedTask<void()>>>&);
+
 bool shouldRun(const char* filter, const char* testName);
 
 #endif // ENABLE(B3_JIT)
index 24953b7..27acbe6 100644 (file)
@@ -538,6 +538,8 @@ void run(const char* filter)
     addLoadTests(filter, tasks);
     addTupleTests(filter, tasks);
 
+    addCopyTests(filter, tasks);
+
     RUN(testSpillGP());
     RUN(testSpillFP());
 
index 76c8366..5acf3f6 100644 (file)
@@ -866,4 +866,234 @@ void addLoadTests(const char* filter, Deque<RefPtr<SharedTask<void()>>>& tasks)
     RUN(testLoad<uint16_t>(Load16Z, -1000000000));
 }
 
+void testFastForwardCopy32()
+{
+#if CPU(X86_64)
+    for (const bool aligned : { true, false }) {
+        for (const bool overlap : { false, true }) {
+            for (size_t arrsize : { 1, 4, 5, 6, 8, 10, 12, 16, 20, 40, 100, 1000}) {
+                size_t overlapAmount = 5;
+
+                uint32_t* arr1, *arr2;
+
+                if (overlap) {
+                    arr1 = new uint32_t[arrsize * 2];
+                    arr2 = arr1 + (arrsize - overlapAmount);
+                } else {
+                    arr1 = new uint32_t[arrsize];
+                    arr2 = new uint32_t[arrsize];
+                }
+
+                if (!aligned && arrsize < 3)
+                    continue;
+                if (overlap && arrsize <= overlapAmount + 3)
+                    continue;
+
+                if (!aligned) {
+                    ++arr1;
+                    ++arr2;
+                    arrsize -= 1;
+                    overlapAmount -= 1;
+                }
+
+                for (size_t i = 0; i < arrsize; ++i)
+                    arr1[i] = i;
+
+                fastForwardCopy32(arr2, arr1, arrsize);
+
+                if (overlap) {
+                    for (size_t i = 0; i < arrsize - overlapAmount; ++i)
+                        CHECK(arr2[i] == i);
+                    for (size_t i = arrsize - overlapAmount; i < arrsize; ++i)
+                        CHECK(arr2[i] == i - (arrsize - overlapAmount));
+                } else {
+                    for (size_t i = 0; i < arrsize; ++i)
+                        CHECK(arr2[i] == i);
+                }
+
+                if (!aligned) {
+                    --arr1;
+                    --arr2;
+                }
+
+                if (!overlap) {
+                    delete[] arr1;
+                    delete[] arr2;
+                } else
+                    delete[] arr1;
+            }
+        }
+    }
+#endif
+}
+
+void testByteCopyLoop()
+{
+    Procedure proc;
+    BasicBlock* root = proc.addBlock();
+    BasicBlock* head = proc.addBlock();
+    BasicBlock* update = proc.addBlock();
+    BasicBlock* continuation = proc.addBlock();
+
+    auto* arraySrc = root->appendNew<ArgumentRegValue>(proc, Origin(), GPRInfo::argumentGPR0);
+    auto* arrayDst = root->appendNew<ArgumentRegValue>(proc, Origin(), GPRInfo::argumentGPR1);
+    auto* arraySize = root->appendNew<ArgumentRegValue>(proc, Origin(), GPRInfo::argumentGPR2);
+    auto* one = root->appendNew<Const32Value>(proc, Origin(), 1);
+    auto* two = root->appendNew<Const32Value>(proc, Origin(), 2);
+    UpsilonValue* startingIndex = root->appendNew<UpsilonValue>(proc, Origin(), root->appendNew<Const32Value>(proc, Origin(), 0));
+    root->appendNew<Value>(proc, Jump, Origin());
+    root->setSuccessors(FrequentedBlock(head));
+
+    auto* index = head->appendNew<Value>(proc, Phi, Int32, Origin());
+    startingIndex->setPhi(index);
+    auto* loadIndex = head->appendNew<Value>(proc, Add, Origin(), arraySrc, head->appendNew<Value>(proc, Shl, Origin(), index, two));
+    auto* storeIndex = head->appendNew<Value>(proc, Add, Origin(), arrayDst, head->appendNew<Value>(proc, Shl, Origin(), index, two));
+    head->appendNew<MemoryValue>(proc, Store, Origin(), head->appendNew<MemoryValue>(proc, Load, Int32, Origin(), loadIndex), storeIndex);
+    auto* newIndex = head->appendNew<Value>(proc, Add, Origin(), index, one);
+    auto* cmpValue = head->appendNew<Value>(proc, GreaterThan, Origin(), newIndex, arraySize);
+    head->appendNew<Value>(proc, Branch, Origin(), cmpValue);
+    head->setSuccessors(FrequentedBlock(continuation), FrequentedBlock(update));
+
+    UpsilonValue* updateIndex = update->appendNew<UpsilonValue>(proc, Origin(), newIndex);
+    updateIndex->setPhi(index);
+    update->appendNew<Value>(proc, Jump, Origin());
+    update->setSuccessors(FrequentedBlock(head));
+
+    continuation->appendNewControlValue(proc, Return, Origin());
+
+    int* arr1 = new int[3];
+    int* arr2 = new int[3];
+
+    arr1[0] = 0;
+    arr1[1] = 0;
+    arr1[2] = 0;
+    arr2[0] = 1;
+    arr2[1] = 2;
+    arr2[2] = 3;
+
+    compileAndRun<void>(proc, arr2, arr1, 3);
+
+    CHECK_EQ(arr1[0], 1);
+    CHECK_EQ(arr1[1], 2);
+    CHECK_EQ(arr1[2], 3);
+
+    delete[] arr1;
+    delete [] arr2;
+}
+
+void testByteCopyLoopStartIsLoopDependent()
+{
+    Procedure proc;
+    BasicBlock* root = proc.addBlock();
+    BasicBlock* head = proc.addBlock();
+    BasicBlock* update = proc.addBlock();
+    BasicBlock* continuation = proc.addBlock();
+
+    auto* arraySrc = root->appendNew<ArgumentRegValue>(proc, Origin(), GPRInfo::argumentGPR0);
+    auto* arrayDst = root->appendNew<ArgumentRegValue>(proc, Origin(), GPRInfo::argumentGPR1);
+    auto* arraySize = root->appendNew<ArgumentRegValue>(proc, Origin(), GPRInfo::argumentGPR2);
+    auto* one = root->appendNew<Const32Value>(proc, Origin(), 1);
+    auto* two = root->appendNew<Const32Value>(proc, Origin(), 2);
+    root->appendNew<Value>(proc, Jump, Origin());
+    root->setSuccessors(FrequentedBlock(head));
+
+    UpsilonValue* startingIndex = head->appendNew<UpsilonValue>(proc, Origin(), head->appendNew<Const32Value>(proc, Origin(), 0));
+    auto* index = head->appendNew<Value>(proc, Phi, Int32, Origin());
+    startingIndex->setPhi(index);
+    auto* loadIndex = head->appendNew<Value>(proc, Add, Origin(), arraySrc, head->appendNew<Value>(proc, Shl, Origin(), index, two));
+    auto* storeIndex = head->appendNew<Value>(proc, Add, Origin(), arrayDst, head->appendNew<Value>(proc, Shl, Origin(), index, two));
+    head->appendNew<MemoryValue>(proc, Store, Origin(), head->appendNew<MemoryValue>(proc, Load, Int32, Origin(), loadIndex), storeIndex);
+    auto* newIndex = head->appendNew<Value>(proc, Add, Origin(), index, one);
+    auto* cmpValue = head->appendNew<Value>(proc, GreaterThan, Origin(), newIndex, arraySize);
+    head->appendNew<Value>(proc, Branch, Origin(), cmpValue);
+    head->setSuccessors(FrequentedBlock(continuation), FrequentedBlock(update));
+
+    UpsilonValue* updateIndex = update->appendNew<UpsilonValue>(proc, Origin(), newIndex);
+    updateIndex->setPhi(index);
+    update->appendNew<Value>(proc, Jump, Origin());
+    update->setSuccessors(FrequentedBlock(head));
+
+    continuation->appendNewControlValue(proc, Return, Origin());
+
+    int* arr1 = new int[3];
+    int* arr2 = new int[3];
+
+    arr1[0] = 0;
+    arr1[1] = 0;
+    arr1[2] = 0;
+    arr2[0] = 1;
+    arr2[1] = 2;
+    arr2[2] = 3;
+
+    compileAndRun<void>(proc, arr2, arr1, 0);
+
+    CHECK_EQ(arr1[0], 1);
+    CHECK_EQ(arr1[1], 0);
+    CHECK_EQ(arr1[2], 0);
+
+    delete[] arr1;
+    delete [] arr2;
+}
+
+void testByteCopyLoopBoundIsLoopDependent()
+{
+    Procedure proc;
+    BasicBlock* root = proc.addBlock();
+    BasicBlock* head = proc.addBlock();
+    BasicBlock* update = proc.addBlock();
+    BasicBlock* continuation = proc.addBlock();
+
+    auto* arraySrc = root->appendNew<ArgumentRegValue>(proc, Origin(), GPRInfo::argumentGPR0);
+    auto* arrayDst = root->appendNew<ArgumentRegValue>(proc, Origin(), GPRInfo::argumentGPR1);
+    auto* one = root->appendNew<Const32Value>(proc, Origin(), 1);
+    auto* two = root->appendNew<Const32Value>(proc, Origin(), 2);
+    UpsilonValue* startingIndex = root->appendNew<UpsilonValue>(proc, Origin(), root->appendNew<Const32Value>(proc, Origin(), 0));
+    root->appendNew<Value>(proc, Jump, Origin());
+    root->setSuccessors(FrequentedBlock(head));
+
+    auto* index = head->appendNew<Value>(proc, Phi, Int32, Origin());
+    startingIndex->setPhi(index);
+    auto* loadIndex = head->appendNew<Value>(proc, Add, Origin(), arraySrc, head->appendNew<Value>(proc, Shl, Origin(), index, two));
+    auto* storeIndex = head->appendNew<Value>(proc, Add, Origin(), arrayDst, head->appendNew<Value>(proc, Shl, Origin(), index, two));
+    head->appendNew<MemoryValue>(proc, Store, Origin(), head->appendNew<MemoryValue>(proc, Load, Int32, Origin(), loadIndex), storeIndex);
+    auto* newIndex = head->appendNew<Value>(proc, Add, Origin(), index, one);
+    auto* cmpValue = head->appendNew<Value>(proc, GreaterThan, Origin(), newIndex, index);
+    head->appendNew<Value>(proc, Branch, Origin(), cmpValue);
+    head->setSuccessors(FrequentedBlock(continuation), FrequentedBlock(update));
+
+    UpsilonValue* updateIndex = update->appendNew<UpsilonValue>(proc, Origin(), newIndex);
+    updateIndex->setPhi(index);
+    update->appendNew<Value>(proc, Jump, Origin());
+    update->setSuccessors(FrequentedBlock(head));
+
+    continuation->appendNewControlValue(proc, Return, Origin());
+
+    int* arr1 = new int[3];
+    int* arr2 = new int[3];
+
+    arr1[0] = 0;
+    arr1[1] = 0;
+    arr1[2] = 0;
+    arr2[0] = 1;
+    arr2[1] = 2;
+    arr2[2] = 3;
+
+    compileAndRun<void>(proc, arr2, arr1, 3);
+
+    CHECK_EQ(arr1[0], 1);
+    CHECK_EQ(arr1[1], 0);
+    CHECK_EQ(arr1[2], 0);
+
+    delete[] arr1;
+    delete [] arr2;
+}
+
+void addCopyTests(const char* filter, Deque<RefPtr<SharedTask<void()>>>& tasks)
+{
+    RUN(testFastForwardCopy32());
+    RUN(testByteCopyLoop());
+    RUN(testByteCopyLoopStartIsLoopDependent());
+    RUN(testByteCopyLoopBoundIsLoopDependent());
+}
+
 #endif // ENABLE(B3_JIT)