Add b3 macro lowering for CheckMul on arm64
authorjustin_michaud@apple.com <justin_michaud@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 11 Jul 2019 21:08:36 +0000 (21:08 +0000)
committerjustin_michaud@apple.com <justin_michaud@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 11 Jul 2019 21:08:36 +0000 (21:08 +0000)
https://bugs.webkit.org/show_bug.cgi?id=199251

Reviewed by Robin Morisset.

JSTests:

* microbenchmarks/check-mul-constant.js: Added.
(doTest):
* microbenchmarks/check-mul-no-constant.js: Added.
(doTest):
* microbenchmarks/check-mul-power-of-two.js: Added.
(doTest):

Source/JavaScriptCore:

- Lower CheckMul for 32-bit arguments on arm64 into a mul and then an overflow check.
- Add a new opcode to air on arm64 for smull (multiplySignExtend32).
- Fuse sign extend 32 + mul into smull (taking two 32-bit arguments and producing 64 bits).
- 1.25x speedup on power of two microbenchmark, 1.15x speedup on normal constant microbenchmark,
  and no change on the no-constant benchmark.
Also, skip some of the b3 tests that were failing before this patch so that the new tests can run
to completion.

* assembler/MacroAssemblerARM64.h:
(JSC::MacroAssemblerARM64::multiplySignExtend32):
* assembler/testmasm.cpp:
(JSC::testMul32SignExtend):
(JSC::run):
* b3/B3LowerMacros.cpp:
* b3/B3LowerToAir.cpp:
* b3/air/AirOpcode.opcodes:
* b3/testb3.cpp:
(JSC::B3::testMulArgs32SignExtend):
(JSC::B3::testMulImm32SignExtend):
(JSC::B3::testMemoryFence):
(JSC::B3::testStoreFence):
(JSC::B3::testLoadFence):
(JSC::B3::testPinRegisters):
(JSC::B3::run):

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@247363 268f45cc-cd09-0410-ab3c-d52691b4dbfc

12 files changed:
JSTests/ChangeLog
JSTests/microbenchmarks/check-mul-constant.js [new file with mode: 0644]
JSTests/microbenchmarks/check-mul-no-constant.js [new file with mode: 0644]
JSTests/microbenchmarks/check-mul-power-of-two.js [new file with mode: 0644]
Source/JavaScriptCore/ChangeLog
Source/JavaScriptCore/assembler/MacroAssemblerARM64.h
Source/JavaScriptCore/assembler/testmasm.cpp
Source/JavaScriptCore/b3/B3LowerMacros.cpp
Source/JavaScriptCore/b3/B3LowerToAir.cpp
Source/JavaScriptCore/b3/B3StackmapValue.h
Source/JavaScriptCore/b3/air/AirOpcode.opcodes
Source/JavaScriptCore/b3/testb3.cpp

index 132b78b..d0f1732 100644 (file)
@@ -1,3 +1,17 @@
+2019-07-11  Justin Michaud  <justin_michaud@apple.com>
+
+        Add b3 macro lowering for CheckMul on arm64
+        https://bugs.webkit.org/show_bug.cgi?id=199251
+
+        Reviewed by Robin Morisset.
+
+        * microbenchmarks/check-mul-constant.js: Added.
+        (doTest):
+        * microbenchmarks/check-mul-no-constant.js: Added.
+        (doTest):
+        * microbenchmarks/check-mul-power-of-two.js: Added.
+        (doTest):
+
 2019-07-10  Tadeu Zagallo  <tzagallo@apple.com>
 
         Optimize join of large empty arrays
diff --git a/JSTests/microbenchmarks/check-mul-constant.js b/JSTests/microbenchmarks/check-mul-constant.js
new file mode 100644 (file)
index 0000000..e4452ea
--- /dev/null
@@ -0,0 +1,13 @@
+function doTest(max) {
+    let sum = 0
+    for (let i=0; i<max; ++i) {
+        sum = sum + i * 304;
+    }
+    return sum
+}
+noInline(doTest);
+
+for (let i=0; i<100000; ++i) doTest(10000)
+
+if (doTest(10000) != 15198480000)
+    throw "Error: bad result: " + doTest(10000);
diff --git a/JSTests/microbenchmarks/check-mul-no-constant.js b/JSTests/microbenchmarks/check-mul-no-constant.js
new file mode 100644 (file)
index 0000000..962c2a0
--- /dev/null
@@ -0,0 +1,13 @@
+function doTest(max) {
+    let sum = 0
+    for (let i=0; i<max; ++i) {
+        sum = sum + i * i;
+    }
+    return sum
+}
+noInline(doTest);
+
+for (let i=0; i<100000; ++i) doTest(10000)
+
+if (doTest(10000) != 333283335000)
+    throw "Error: bad result: " + doTest(10000);
diff --git a/JSTests/microbenchmarks/check-mul-power-of-two.js b/JSTests/microbenchmarks/check-mul-power-of-two.js
new file mode 100644 (file)
index 0000000..9265883
--- /dev/null
@@ -0,0 +1,13 @@
+function doTest(max) {
+    let sum = 0
+    for (let i=0; i<max; ++i) {
+        sum = sum + i * 64;
+    }
+    return sum
+}
+noInline(doTest);
+
+for (let i=0; i<100000; ++i) doTest(10000)
+
+if (doTest(10000) != 3199680000)
+    throw "Error: bad result: " + doTest(10000);
index ad34109..bf0665c 100644 (file)
@@ -1,3 +1,35 @@
+2019-07-11  Justin Michaud  <justin_michaud@apple.com>
+
+        Add b3 macro lowering for CheckMul on arm64
+        https://bugs.webkit.org/show_bug.cgi?id=199251
+
+        Reviewed by Robin Morisset.
+
+        - Lower CheckMul for 32-bit arguments on arm64 into a mul and then an overflow check.
+        - Add a new opcode to air on arm64 for smull (multiplySignExtend32).
+        - Fuse sign extend 32 + mul into smull (taking two 32-bit arguments and producing 64 bits). 
+        - 1.25x speedup on power of two microbenchmark, 1.15x speedup on normal constant microbenchmark, 
+          and no change on the no-constant benchmark.
+        Also, skip some of the b3 tests that were failing before this patch so that the new tests can run
+        to completion.
+
+        * assembler/MacroAssemblerARM64.h:
+        (JSC::MacroAssemblerARM64::multiplySignExtend32):
+        * assembler/testmasm.cpp:
+        (JSC::testMul32SignExtend):
+        (JSC::run):
+        * b3/B3LowerMacros.cpp:
+        * b3/B3LowerToAir.cpp:
+        * b3/air/AirOpcode.opcodes:
+        * b3/testb3.cpp:
+        (JSC::B3::testMulArgs32SignExtend):
+        (JSC::B3::testMulImm32SignExtend):
+        (JSC::B3::testMemoryFence):
+        (JSC::B3::testStoreFence):
+        (JSC::B3::testLoadFence):
+        (JSC::B3::testPinRegisters):
+        (JSC::B3::run):
+
 2019-07-11  Yusuke Suzuki  <ysuzuki@apple.com>
 
         Unreviewed, revert r243617.
index 3f4c16d..2ea911e 100644 (file)
@@ -569,6 +569,11 @@ public:
         m_assembler.msub<64>(dest, mulLeft, mulRight, ARM64Registers::zr);
     }
 
+    void multiplySignExtend32(RegisterID left, RegisterID right, RegisterID dest)
+    {
+        m_assembler.smull(dest, left, right);
+    }
+
     void div32(RegisterID dividend, RegisterID divisor, RegisterID dest)
     {
         m_assembler.sdiv<32>(dest, dividend, divisor);
index 3cbaed8..c906b37 100644 (file)
@@ -351,6 +351,25 @@ void testMul32WithImmediates()
     }
 }
 
+#if CPU(ARM64)
+void testMul32SignExtend()
+{
+    for (auto value : int32Operands()) {
+        auto mul = compile([=] (CCallHelpers& jit) {
+            jit.emitFunctionPrologue();
+
+            jit.multiplySignExtend32(GPRInfo::argumentGPR0, GPRInfo::argumentGPR1, GPRInfo::returnValueGPR);
+
+            jit.emitFunctionEpilogue();
+            jit.ret();
+        });
+
+        for (auto value2 : int32Operands())
+            CHECK_EQ(invoke<long int>(mul, value, value2), ((long int) value) * ((long int) value2));
+    }
+}
+#endif
+
 #if CPU(X86) || CPU(X86_64) || CPU(ARM64)
 void testCompareFloat(MacroAssembler::DoubleCondition condition)
 {
@@ -1118,6 +1137,10 @@ void run(const char* filter)
     RUN(testCompareDouble(MacroAssembler::DoubleLessThanOrEqualOrUnordered));
     RUN(testMul32WithImmediates());
 
+#if CPU(ARM64)
+    RUN(testMul32SignExtend());
+#endif
+
 #if CPU(X86) || CPU(X86_64) || CPU(ARM64)
     RUN(testCompareFloat(MacroAssembler::DoubleEqual));
     RUN(testCompareFloat(MacroAssembler::DoubleNotEqual));
index 4c594d2..4d4eb17 100644 (file)
@@ -177,6 +177,36 @@ private:
                 break;
             }
 
+            case CheckMul: {
+                if (isARM64() && m_value->child(0)->type() == Int32) {
+                    CheckValue* checkMul = m_value->as<CheckValue>();
+
+                    Value* left = m_insertionSet.insert<Value>(m_index, SExt32, m_origin, m_value->child(0));
+                    Value* right = m_insertionSet.insert<Value>(m_index, SExt32, m_origin, m_value->child(1));
+                    Value* mulResult = m_insertionSet.insert<Value>(m_index, Mul, m_origin, left, right);
+                    Value* mulResult32 = m_insertionSet.insert<Value>(m_index, Trunc, m_origin, mulResult);
+                    Value* upperResult = m_insertionSet.insert<Value>(m_index, Trunc, m_origin,
+                        m_insertionSet.insert<Value>(m_index, SShr, m_origin, mulResult, m_insertionSet.insert<Const32Value>(m_index, m_origin, 32)));
+                    Value* signBit = m_insertionSet.insert<Value>(m_index, SShr, m_origin,
+                        mulResult32,
+                        m_insertionSet.insert<Const32Value>(m_index, m_origin, 31));
+                    Value* hasOverflowed = m_insertionSet.insert<Value>(m_index, NotEqual, m_origin, upperResult, signBit);
+
+                    CheckValue* check = m_insertionSet.insert<CheckValue>(m_index, Check, m_origin, hasOverflowed);
+                    check->setGenerator(checkMul->generator());
+                    check->clobberEarly(checkMul->earlyClobbered());
+                    check->clobberLate(checkMul->lateClobbered());
+                    auto children = checkMul->constrainedChildren();
+                    auto it = children.begin();
+                    for (std::advance(it, 2); it != children.end(); ++it)
+                        check->append(*it);
+
+                    m_value->replaceWithIdentity(mulResult32);
+                    m_changed = true;
+                }
+                break;
+            }
+
             case Switch: {
                 SwitchValue* switchValue = m_value->as<SwitchValue>();
                 Vector<SwitchCase> cases;
index b4098bc..58c0240 100644 (file)
@@ -2602,6 +2602,27 @@ private:
         }
 
         case Mul: {
+            if (m_value->type() == Int64
+                && isValidForm(MultiplySignExtend32, Arg::Tmp, Arg::Tmp, Arg::Tmp)
+                && m_value->child(0)->opcode() == SExt32
+                && !m_locked.contains(m_value->child(0))) {
+                Value* opLeft = m_value->child(0);
+                Value* left = opLeft->child(0);
+                Value* opRight = m_value->child(1);
+                Value* right = nullptr;
+
+                if (opRight->opcode() == SExt32 && !m_locked.contains(opRight->child(0))) {
+                    right = opRight->child(0);
+                } else if (m_value->child(1)->isRepresentableAs<int32_t>() && !m_locked.contains(m_value->child(1))) {
+                    // We just use the 64-bit const int as a 32 bit const int directly
+                    right = opRight;
+                }
+
+                if (right) {
+                    append(MultiplySignExtend32, tmp(left), tmp(right), tmp(m_value));
+                    return;
+                }
+            }
             appendBinOp<Mul32, Mul64, MulDouble, MulFloat, Commutative>(
                 m_value->child(0), m_value->child(1));
             return;
index be99d8c..1828e50 100644 (file)
@@ -220,6 +220,7 @@ public:
 
     class ConstrainedValueCollection {
     public:
+
         ConstrainedValueCollection(const StackmapValue& value)
             : m_value(value)
         {
@@ -233,6 +234,12 @@ public:
 
         class iterator {
         public:
+            using iterator_category = std::forward_iterator_tag;
+            using value_type = ConstrainedValue;
+            using difference_type = int;
+            using pointer = void;
+            using reference = ConstrainedValue;
+
             iterator()
                 : m_collection(nullptr)
                 , m_index(0)
index 11ada41..cb7d637 100644 (file)
@@ -261,6 +261,9 @@ arm64: MultiplyNeg32 U:G:32, U:G:32, ZD:G:32
 arm64: MultiplyNeg64 U:G:64, U:G:64, ZD:G:64
     Tmp, Tmp, Tmp
 
+arm64: MultiplySignExtend32 U:G:32, U:G:32, ZD:G:64
+    Tmp, Tmp, Tmp
+
 arm64: Div32 U:G:32, U:G:32, ZD:G:32
     Tmp, Tmp, Tmp
 
index 7f31679..c9fd18e 100644 (file)
@@ -1189,6 +1189,47 @@ void testMulArgs32(int a, int b)
     CHECK(compileAndRun<int>(proc, a, b) == a * b);
 }
 
+void testMulArgs32SignExtend(int a, int b)
+{
+    Procedure proc;
+    if (proc.optLevel() < 1)
+        return;
+    BasicBlock* root = proc.addBlock();
+    Value* arg1 = root->appendNew<Value>(
+        proc, Trunc, Origin(),
+        root->appendNew<ArgumentRegValue>(proc, Origin(), GPRInfo::argumentGPR0));
+    Value* arg2 = root->appendNew<Value>(
+        proc, Trunc, Origin(),
+        root->appendNew<ArgumentRegValue>(proc, Origin(), GPRInfo::argumentGPR1));
+    Value* arg164 = root->appendNew<Value>(proc, SExt32, Origin(), arg1);
+    Value* arg264 = root->appendNew<Value>(proc, SExt32, Origin(), arg2);
+    Value* mul = root->appendNew<Value>(proc, Mul, Origin(), arg164, arg264);
+    root->appendNewControlValue(proc, Return, Origin(), mul);
+
+    auto code = compileProc(proc);
+
+    CHECK(invoke<long int>(*code, a, b) == ((long int) a) * ((long int) b));
+}
+
+void testMulImm32SignExtend(const int a, int b)
+{
+    Procedure proc;
+    if (proc.optLevel() < 1)
+        return;
+    BasicBlock* root = proc.addBlock();
+    Value* arg1 = root->appendNew<Const64Value>(proc, Origin(), a);
+    Value* arg2 = root->appendNew<Value>(
+        proc, Trunc, Origin(),
+        root->appendNew<ArgumentRegValue>(proc, Origin(), GPRInfo::argumentGPR1));
+    Value* arg264 = root->appendNew<Value>(proc, SExt32, Origin(), arg2);
+    Value* mul = root->appendNew<Value>(proc, Mul, Origin(), arg1, arg264);
+    root->appendNewControlValue(proc, Return, Origin(), mul);
+
+    auto code = compileProc(proc);
+
+    CHECK(invoke<long int>(*code, b) == ((long int) a) * ((long int) b));
+}
+
 void testMulLoadTwice()
 {
     auto test = [&] () {
@@ -14635,55 +14676,55 @@ void testPatchpointTerminalReturnValue(bool successIsRare)
 void testMemoryFence()
 {
     Procedure proc;
-    
+
     BasicBlock* root = proc.addBlock();
-    
+
     root->appendNew<FenceValue>(proc, Origin());
     root->appendNew<Value>(proc, Return, Origin(), root->appendIntConstant(proc, Origin(), Int32, 42));
-    
+
     auto code = compileProc(proc);
     CHECK_EQ(invoke<int>(*code), 42);
     if (isX86())
         checkUsesInstruction(*code, "lock or $0x0, (%rsp)");
     if (isARM64())
-        checkUsesInstruction(*code, "dmb    ish");
+        checkUsesInstruction(*code, "dmb     ish");
     checkDoesNotUseInstruction(*code, "mfence");
-    checkDoesNotUseInstruction(*code, "dmb    ishst");
+    checkDoesNotUseInstruction(*code, "dmb     ishst");
 }
 
 void testStoreFence()
 {
     Procedure proc;
-    
+
     BasicBlock* root = proc.addBlock();
-    
+
     root->appendNew<FenceValue>(proc, Origin(), HeapRange::top(), HeapRange());
     root->appendNew<Value>(proc, Return, Origin(), root->appendIntConstant(proc, Origin(), Int32, 42));
-    
+
     auto code = compileProc(proc);
     CHECK_EQ(invoke<int>(*code), 42);
     checkDoesNotUseInstruction(*code, "lock");
     checkDoesNotUseInstruction(*code, "mfence");
     if (isARM64())
-        checkUsesInstruction(*code, "dmb    ishst");
+        checkUsesInstruction(*code, "dmb     ishst");
 }
 
 void testLoadFence()
 {
     Procedure proc;
-    
+
     BasicBlock* root = proc.addBlock();
-    
+
     root->appendNew<FenceValue>(proc, Origin(), HeapRange(), HeapRange::top());
     root->appendNew<Value>(proc, Return, Origin(), root->appendIntConstant(proc, Origin(), Int32, 42));
-    
+
     auto code = compileProc(proc);
     CHECK_EQ(invoke<int>(*code), 42);
     checkDoesNotUseInstruction(*code, "lock");
     checkDoesNotUseInstruction(*code, "mfence");
     if (isARM64())
-        checkUsesInstruction(*code, "dmb    ish");
-    checkDoesNotUseInstruction(*code, "dmb    ishst");
+        checkUsesInstruction(*code, "dmb     ish");
+    checkDoesNotUseInstruction(*code, "dmb     ishst");
 }
 
 void testTrappingLoad()
@@ -14961,7 +15002,7 @@ void testPinRegisters()
             usesCSRs |= csrs.get(regAtOffset.reg());
         CHECK_EQ(usesCSRs, !pin);
     };
-    
+
     go(true);
     go(false);
 }
@@ -17157,6 +17198,29 @@ void run(const char* filter)
     Deque<RefPtr<SharedTask<void()>>> tasks;
 
     auto shouldRun = [&] (const char* testName) -> bool {
+        // FIXME: These tests fail <https://bugs.webkit.org/show_bug.cgi?id=199330>.
+        if (!filter && isARM64()) {
+            for (auto& failingTest : {
+                "testReportUsedRegistersLateUseFollowedByEarlyDefDoesNotMarkUseAsDead",
+                "testNegFloatWithUselessDoubleConversion",
+                "testPinRegisters",
+            }) {
+                if (WTF::findIgnoringASCIICaseWithoutLength(testName, failingTest) != WTF::notFound) {
+                    dataLogLn("*** Warning: Skipping known-bad test: ", testName);
+                    return false;
+                }
+            }
+        }
+        if (!filter && isX86()) {
+            for (auto& failingTest : {
+                "testReportUsedRegistersLateUseFollowedByEarlyDefDoesNotMarkUseAsDead",
+            }) {
+                if (WTF::findIgnoringASCIICaseWithoutLength(testName, failingTest) != WTF::notFound) {
+                    dataLogLn("*** Warning: Skipping known-bad test: ", testName);
+                    return false;
+                }
+            }
+        }
         return !filter || WTF::findIgnoringASCIICaseWithoutLength(testName, filter) != WTF::notFound;
     };
 
@@ -17277,8 +17341,21 @@ void run(const char* filter)
     RUN(testMulImmArg(0, 2));
     RUN(testMulImmArg(1, 0));
     RUN(testMulImmArg(3, 3));
+    RUN(testMulImm32SignExtend(1, 2));
+    RUN(testMulImm32SignExtend(0, 2));
+    RUN(testMulImm32SignExtend(1, 0));
+    RUN(testMulImm32SignExtend(3, 3));
+    RUN(testMulImm32SignExtend(0xFFFFFFFF, 0xFFFFFFFF));
+    RUN(testMulImm32SignExtend(0xFFFFFFFE, 0xFFFFFFFF));
+    RUN(testMulImm32SignExtend(0xFFFFFFFF, 0xFFFFFFFE));
     RUN(testMulArgs32(1, 1));
     RUN(testMulArgs32(1, 2));
+    RUN(testMulArgs32(0xFFFFFFFF, 0xFFFFFFFF));
+    RUN(testMulArgs32(0xFFFFFFFE, 0xFFFFFFFF));
+    RUN(testMulArgs32SignExtend(1, 1));
+    RUN(testMulArgs32SignExtend(1, 2));
+    RUN(testMulArgs32SignExtend(0xFFFFFFFF, 0xFFFFFFFF));
+    RUN(testMulArgs32SignExtend(0xFFFFFFFE, 0xFFFFFFFF));
     RUN(testMulLoadTwice());
     RUN(testMulAddArgsLeft());
     RUN(testMulAddArgsRight());