Add a compare-results script to compare benchmark results
author: sbarati@apple.com <sbarati@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Fri, 8 Mar 2019 23:20:15 +0000 (23:20 +0000)
committer: sbarati@apple.com <sbarati@apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Fri, 8 Mar 2019 23:20:15 +0000 (23:20 +0000)
https://bugs.webkit.org/show_bug.cgi?id=195486
<rdar://problem/48723397>

Reviewed by Geoffrey Garen.

This patch adds a script to compare benchmark results using Welch's two-tailed t-test.
Initially, the script only understands PLT5/JetStream2/Speedometer2 results. It will be
easy to extend it to our other benchmarks.
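
For example, to compare two JSON result files (such as the output of run-benchmark;
the file names here are only placeholders):

    python Tools/Scripts/compare-results -a baseline.json -b patched.json

The script detects the benchmark type, prints the mean of each result set and the
p-value from the t-test, reports the relative change, and states whether the
difference is significant at p <= 0.05.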

* Scripts/compare-results: Added.
(readJSONFile):
(detectJetStream2):
(JetStream2Results):
(detectSpeedometer2):
(Speedometer2Results):
(detectPLT5):
(PLT5Results):
(detectBenchmark):
(biggerIsBetter):
(ttest):
(getOptions):
(main):

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@242658 268f45cc-cd09-0410-ab3c-d52691b4dbfc

Tools/ChangeLog
Tools/Scripts/compare-results [new file with mode: 0644]

diff --git a/Tools/ChangeLog b/Tools/ChangeLog
index 654b2de..6ae8ba0 100644
--- a/Tools/ChangeLog
+++ b/Tools/ChangeLog
@@ -1,3 +1,29 @@
+2019-03-08  Saam barati  <sbarati@apple.com>
+
+        Add a compare-results script to compare benchmark results
+        https://bugs.webkit.org/show_bug.cgi?id=195486
+        <rdar://problem/48723397>
+
+        Reviewed by Geoffrey Garen.
+
+        This patch adds a script to compare benchmark results using Welch's two-tailed t-test.
+        Initially, the script only understands PLT5/JetStream2/Speedometer2 results. It will be
+        easy to extend it to our other benchmarks.
+
+        * Scripts/compare-results: Added.
+        (readJSONFile):
+        (detectJetStream2):
+        (JetStream2Results):
+        (detectSpeedometer2):
+        (Speedometer2Results):
+        (detectPLT5):
+        (PLT5Results):
+        (detectBenchmark):
+        (biggerIsBetter):
+        (ttest):
+        (getOptions):
+        (main):
+
 2019-03-08  Stephanie Lewis  <slewis@apple.com>
 
         Ensure old tab state is cleared between iterations of run-benchmark
diff --git a/Tools/Scripts/compare-results b/Tools/Scripts/compare-results
new file mode 100644
index 0000000..1ddbddb
--- /dev/null
+++ b/Tools/Scripts/compare-results
@@ -0,0 +1,198 @@
+#!/usr/bin/env python -u
+
+# Copyright (C) 2019 Apple Inc. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1.  Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer. 
+# 2.  Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution. 
+# 3.  Neither the name of Apple Inc. ("Apple") nor the names of
+#     its contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+import argparse
+import json
+
+try:
+    from scipy import stats
+except ImportError:
+    print "ERROR: scipy package is not installed. Run `pip install scipy`"
+    sys.exit(1)
+
+try:
+    import numpy
+except ImportError:
+    print "ERROR: numpy package is not installed. Run `pip install numpy`"
+    sys.exit(1)
+
+def readJSONFile(path):
+    with open(path, 'r') as contents:
+        return json.loads(contents.read())
+
+Speedometer2 = "Speedometer2"
+JetStream2 = "JetStream2"
+PLT5 = "PLT5"
+
+def detectJetStream2(payload):
+    return "JetStream2.0" in payload
+
+def JetStream2Results(payload):
+    assert detectJetStream2(payload)
+
+    js = payload["JetStream2.0"]
+    iterations = len(js["tests"]["gaussian-blur"]["metrics"]["Score"]["current"])
+    results = []
+    for i in range(iterations):
+        scores = []
+        for test in js["tests"].keys():
+            scores.append(js["tests"][test]["metrics"]["Score"]["current"][i])
+        geomean = stats.gmean(scores)
+        
+        results.append(geomean)
+
+    return results
+
+def detectSpeedometer2(payload):
+    return "Speedometer-2" in payload
+
+def Speedometer2Results(payload):
+    assert detectSpeedometer2(payload)
+    results = []
+    for arr in payload["Speedometer-2"]["metrics"]["Score"]["current"]:
+        results.append(numpy.mean(arr))
+    return results
+
+def detectPLT5(payload):
+    if "iterations" not in payload:
+        return False
+    iterations = payload["iterations"]
+    if not isinstance(iterations, list):
+        return False
+    if not len(iterations):
+        return False
+    if "cold" not in iterations[0]:
+        return False
+    if "warm" not in iterations[0]:
+        return False
+    if "Geometric" not in iterations[0]:
+        return False
+    return True
+
+def PLT5Results(payload):
+    assert detectPLT5(payload)
+    results = []
+    for obj in payload["iterations"]:
+        results.append(obj["Geometric"])
+    return results
+
+def detectBenchmark(payload):
+    if detectJetStream2(payload):
+        return JetStream2
+    if detectSpeedometer2(payload):
+        return Speedometer2
+    if detectPLT5(payload):
+        return PLT5
+    return None
+
+def biggerIsBetter(benchmarkType):
+    if benchmarkType == JetStream2:
+        return True
+    if benchmarkType == Speedometer2:
+        return True
+    if benchmarkType == PLT5:
+        return False
+
+    print "Should not be reached."
+    assert False
+
+def ttest(benchmarkType, a, b):
+    # We use Welch's two-tailed t-test, which does not assume equal variances.
+    (tStatistic, pValue) = stats.ttest_ind(a, b, equal_var=False)
+    aMean = numpy.mean(a)
+    bMean = numpy.mean(b)
+    print "a mean = {:.5f}".format(aMean)
+    print "b mean = {:.5f}".format(bMean)
+
+    print "pValue = {:.10f}".format(pValue)
+
+    if biggerIsBetter(benchmarkType):
+        print "(Bigger means are better.)"
+        if aMean > bMean:
+            print "{:.3f} times worse".format((aMean / bMean))
+        else:
+            print "{:.3f} times better".format((bMean / aMean))
+    else:
+        print "(Smaller means are better.)"
+        if aMean > bMean:
+            print "{:.3f} times better".format((aMean / bMean))
+        else:
+            print "{:.3f} times worse".format((bMean / aMean))
+
+    if pValue <= 0.05:
+        print "Results ARE significant"
+    else:
+        print "Results ARE NOT significant"
+
+def getOptions():
+    parser = argparse.ArgumentParser(description="Compare two WebKit benchmark results. Pass in two JSON result files to compare them. This script prints the pValue along with the magnitude of the change.")
+
+    parser.add_argument("-a",
+        type=str,
+        required=True,
+        help="a of a/b. Path to JSON results file.")
+
+    parser.add_argument("-b",
+        type=str,
+        required=True,
+        help="b of a/b. Path to JSON results file.")
+
+    return parser.parse_known_args()[0]
+
+
+def main():
+    args = getOptions()
+
+    a = readJSONFile(args.a)
+    b = readJSONFile(args.b)
+
+    typeA = detectBenchmark(a)
+    typeB = detectBenchmark(b)
+
+    if typeA != typeB:
+        print "-a and -b are not the same benchmark. a={} b={}".format(typeA, typeB)
+        sys.exit(1)
+
+    if not (typeA and typeB):
+        print "Unknown benchmark type. a={} b={}".format(typeA, typeB)
+        sys.exit(1)
+    
+    if typeA == JetStream2:
+        ttest(typeA, JetStream2Results(a), JetStream2Results(b))
+    elif typeA == Speedometer2:
+        ttest(typeA, Speedometer2Results(a), Speedometer2Results(b))
+    elif typeA == PLT5:
+        ttest(typeA, PLT5Results(a), PLT5Results(b))
+    else:
+        print "Unknown benchmark type"
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
+