The results of A/B testing should state statistical significance
author rniwa@webkit.org <rniwa@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 9 Apr 2015 04:58:19 +0000 (04:58 +0000)
committer rniwa@webkit.org <rniwa@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 9 Apr 2015 04:58:19 +0000 (04:58 +0000)
https://bugs.webkit.org/show_bug.cgi?id=143552

Reviewed by Chris Dumez.

Added statistical comparisons between results for each configuration on analysis task page using
Welch's t-test. The probability, the t-statistic, and the degrees of freedom are reported.

* public/v2/app.js:
(App.TestGroupPane._populate): Report the list of statistical comparisons between every pair of
root configurations in the results. For example, if we've got A, B, C configurations then compare
A/B, A/C and B/C.
(App.TestGroupPane._computeStatisticalSignificance): Compute the statistical significance using
Welch's t-test. Report the probability by which two samples do not come from the same distribution.
(App.TestGroupPane._createConfigurationSummary): Include the array of results for this configuration.
Also renamed "items" to "requests" for clarity.

* public/v2/index.html: Added the template for showing statistical comparisons.

* public/v2/js/statistics.js: Renamed tDistributionQuantiles to tDistributionByOneSidedProbability
for clarity. Also factored out the functions to convert from one-sided probability to two-sided
probability and vice versa.
(Statistics.supportedConfidenceIntervalProbabilities):
(Statistics.confidenceIntervalDelta):
(Statistics.probabilityRangeForWelchsT): Added. Computes the lower bound and the upper bound for
the probability that two values are sampled from distinct distributions using Welch's t-test.
(Statistics.computeWelchsT): This function now takes two-sided probability like all other functions.
(.tDistributionByOneSidedProbability): Renamed from tDistributionQuantiles.
(.oneSidedToTwoSidedProbability): Extracted.
(.twoSidedToOneSidedProbability): Extracted.
(Statistics.MovingAverageStrategies): Converted the one-sided probability to the two-sided probability
now that computeWelchsT takes two-sided probability.

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@182587 268f45cc-cd09-0410-ab3c-d52691b4dbfc

Websites/perf.webkit.org/ChangeLog
Websites/perf.webkit.org/public/v2/app.js
Websites/perf.webkit.org/public/v2/index.html
Websites/perf.webkit.org/public/v2/js/statistics.js

index 20e094b..eef53c9 100644 (file)
@@ -1,5 +1,40 @@
 2015-04-08  Ryosuke Niwa  <rniwa@webkit.org>
 
 2015-04-08  Ryosuke Niwa  <rniwa@webkit.org>
 
+        The results of A/B testing should state statistical significance
+        https://bugs.webkit.org/show_bug.cgi?id=143552
+
+        Reviewed by Chris Dumez.
+
+        Added statistical comparisons between results for each configuration on analysis task page using
+        Welch's t-test. The probability as well as t-statistics and the degrees of freedoms are reported.
+
+        * public/v2/app.js:
+        (App.TestGroupPane._populate): Report the list of statistical comparison between every pair of
+        root configurations in the results. e.g. if we've got A, B, C configurations then compare A/B, A/C
+        and B/C.
+        (App.TestGroupPane._computeStatisticalSignificance): Compute the statistical significance using
+        Welch's t-test. Report the probability by which two samples do not come from the same distribution.
+        (App.TestGroupPane._createConfigurationSummary): Include the array of results for this configuration.
+        Also renamed "items" to "requests" for clarity.
+
+        * public/v2/index.html: Added the template for showing statistical comparisons.
+
+        * public/v2/js/statistics.js: Renamed tDistributionQuantiles to tDistributionByOneSidedProbability
+        for clarity. Also factored out the functions to convert from one-sided probability to two-sided
+        probability and vice versa.
+        (Statistics.supportedConfidenceIntervalProbabilities):
+        (Statistics.confidenceIntervalDelta):
+        (Statistics.probabilityRangeForWelchsT): Added. Computes the lower bound and the upper bound for
+        the probability that two values are sampled from distinct distributions using Welch's t-test.
+        (Statistics.computeWelchsT): This function now takes two-sided probability like all other functions.
+        (.tDistributionByOneSidedProbability): Renamed from tDistributionQuantiles.
+        (.oneSidedToTwoSidedProbability): Extracted.
+        (.twoSidedToOneSidedProbability): Extracted.
+        (Statistics.MovingAverageStrategies): Converted the one-sided probability to the two-sided probability
+        now that computeWelchsT takes two-sided probability.
+
+2015-04-08  Ryosuke Niwa  <rniwa@webkit.org>
+
         Unreviewed fix after r182496 for when the cached runs JSON doesn't exist.
 
         * public/v2/app.js:
         Unreviewed fix after r182496 for when the cached runs JSON doesn't exist.
 
         * public/v2/app.js:
index 8df73b1..392bb0d 100755 (executable)
@@ -1334,7 +1334,39 @@ App.TestGroupPane = Ember.ObjectProxy.extend({
         range.min -= margin;
 
         this.set('configurations', configurations);
         range.min -= margin;
 
         this.set('configurations', configurations);
+
+        var comparisons = [];
+        for (var i = 0; i < configurations.length - 1; i++) {
+            var summary1 = configurations[i].summary;
+            for (var j = i + 1; j < configurations.length; j++) {
+                var summary2 = configurations[j].summary;
+                comparisons.push({
+                    label: summary1.configLetter + ' / ' + summary2.configLetter,
+                    result: this._computeStatisticalSignificance(summary1.measuredValues, summary2.measuredValues)
+                });
+            }
+        }
+        this.set('comparisons', comparisons);
     }.observes('testResults', 'buildRequests'),
     }.observes('testResults', 'buildRequests'),
+    _computeStatisticalSignificance: function (values1, values2)
+    {
+        var tFormatter = d3.format('.3g');
+        var probabilityFormatter = d3.format('.2p');
+        var statistics = Statistics.probabilityRangeForWelchsT(values1, values2);
+        if (isNaN(statistics.t) || isNaN(statistics.degreesOfFreedom))
+            return 'N/A';
+
+        var details = ' (t=' + tFormatter(statistics.t) + ' df=' + tFormatter(statistics.degreesOfFreedom) + ')';
+
+        if (!statistics.range[0])
+            return 'Not statistically significant' + details;
+
+        var lowerLimit = probabilityFormatter(statistics.range[0]);
+        if (!statistics.range[1])
+            return 'Statistical significance > ' + lowerLimit + details;
+
+        return lowerLimit + ' < Statistical significance < ' + probabilityFormatter(statistics.range[1]) + details;
+    },
     _updateReferenceChart: function ()
     {
         var configurations = this.get('configurations');
     _updateReferenceChart: function ()
     {
         var configurations = this.get('configurations');
@@ -1458,12 +1490,13 @@ App.TestGroupPane = Ember.ObjectProxy.extend({
             revisionList: summaryRevisions,
             formattedValue: isNaN(mean) ? null : testResults.formatWithDeltaAndUnit(mean, ciDelta),
             value: mean,
             revisionList: summaryRevisions,
             formattedValue: isNaN(mean) ? null : testResults.formatWithDeltaAndUnit(mean, ciDelta),
             value: mean,
+            measuredValues: valuesInConfig,
             confidenceIntervalDelta: ciDelta,
             valueRange: range,
             statusLabel: App.BuildRequest.aggregateStatuses(requests),
         });
 
             confidenceIntervalDelta: ciDelta,
             valueRange: range,
             statusLabel: App.BuildRequest.aggregateStatuses(requests),
         });
 
-        return Ember.Object.create({summary: summary, items: requests, rootSet: rootSet});
+        return Ember.Object.create({summary: summary, requests: requests, rootSet: rootSet});
     },
 });
 
     },
 });
 
index 97d2ae4..9a88d21 100755 (executable)
                                 {{partial "testGroupRow"}}
                             {{/with}}
                         </tr>
                                 {{partial "testGroupRow"}}
                             {{/with}}
                         </tr>
-                        {{#each items}}
+                        {{#each requests}}
                             <tr class="request">
                                 {{#with ../this}}
                                     <td class="config-letter" {{action toggleShowRequestList this}}></td>
                             <tr class="request">
                                 {{#with ../this}}
                                     <td class="config-letter" {{action toggleShowRequestList this}}></td>
                         {{/each}}
                     </tbody>
                 {{/each}}
                         {{/each}}
                     </tbody>
                 {{/each}}
+                {{#each comparisons}}
+                    <tbody>
+                        <tr>
+                            <td colspan="2">{{label}}</td>
+                            {{#with ../this}}
+                                {{#each repositories}}
+                                    <td></td>
+                                {{/each}}
+                            {{/with}}
+                            <td colspan="2">{{result}}</td>
+                        </tr>
+                    </tbody>
+                {{/each}}
             </table>
             <div class="reference-chart">
                 {{#if referenceChart}}
             </table>
             <div class="reference-chart">
                 {{#if referenceChart}}
index a752240..69945df 100755 (executable)
@@ -26,21 +26,21 @@ var Statistics = new (function () {
 
     this.supportedConfidenceIntervalProbabilities = function () {
         var supportedProbabilities = [];
 
     this.supportedConfidenceIntervalProbabilities = function () {
         var supportedProbabilities = [];
-        for (var quantile in tDistributionQuantiles)
-            supportedProbabilities.push((1 - (1 - quantile) * 2).toFixed(2));
+        for (var probability in tDistributionByOneSidedProbability)
+            supportedProbabilities.push(oneSidedToTwoSidedProbability(probability).toFixed(2));
         return supportedProbabilities
     }
 
     // Computes the delta d s.t. (mean - d, mean + d) is the confidence interval with the specified probability in O(1).
     this.confidenceIntervalDelta = function (probability, numberOfSamples, sum, squareSum) {
         return supportedProbabilities
     }
 
     // Computes the delta d s.t. (mean - d, mean + d) is the confidence interval with the specified probability in O(1).
     this.confidenceIntervalDelta = function (probability, numberOfSamples, sum, squareSum) {
-        var quantile = (1 - (1 - probability) / 2);
-        if (!(quantile in tDistributionQuantiles)) {
+        var oneSidedProbability = twoSidedToOneSidedProbability(probability);
+        if (!(oneSidedProbability in tDistributionByOneSidedProbability)) {
             throw 'We only support ' + this.supportedConfidenceIntervalProbabilities().map(function (probability)
             { return probability * 100 + '%'; } ).join(', ') + ' confidence intervals.';
         }
         if (numberOfSamples - 2 < 0)
             return NaN;
             throw 'We only support ' + this.supportedConfidenceIntervalProbabilities().map(function (probability)
             { return probability * 100 + '%'; } ).join(', ') + ' confidence intervals.';
         }
         if (numberOfSamples - 2 < 0)
             return NaN;
-        var deltas = tDistributionQuantiles[quantile];
+        var deltas = tDistributionByOneSidedProbability[oneSidedProbability];
         var degreesOfFreedom = numberOfSamples - 1;
         if (degreesOfFreedom > deltas.length)
             throw 'We only support up to ' + deltas.length + ' degrees of freedom';
         var degreesOfFreedom = numberOfSamples - 1;
         if (degreesOfFreedom > deltas.length)
             throw 'We only support up to ' + deltas.length + ' degrees of freedom';
@@ -61,6 +61,25 @@ var Statistics = new (function () {
         return this.computeWelchsT(values1, 0, values1.length, values2, 0, values2.length, probability).significantlyDifferent;
     }
 
         return this.computeWelchsT(values1, 0, values1.length, values2, 0, values2.length, probability).significantlyDifferent;
     }
 
+    this.probabilityRangeForWelchsT = function (values1, values2) {
+        var result = this.computeWelchsT(values1, 0, values1.length, values2, 0, values2.length);
+        if (isNaN(result.t) || isNaN(result.degreesOfFreedom))
+            return {t: NaN, degreesOfFreedom:NaN, range: [null, null]};
+
+        var lowerBound = null;
+        var upperBound = null;
+        for (var probability in tDistributionByOneSidedProbability) {
+            var twoSidedProbability = oneSidedToTwoSidedProbability(probability);
+            if (result.t > tDistributionByOneSidedProbability[probability][Math.round(result.degreesOfFreedom - 1)])
+                lowerBound = twoSidedProbability;
+            else if (lowerBound) {
+                upperBound = twoSidedProbability;
+                break;
+            }
+        }
+        return {t: result.t, degreesOfFreedom: result.degreesOfFreedom, range: [lowerBound, upperBound]};
+    }
+
     this.computeWelchsT = function (values1, startIndex1, length1, values2, startIndex2, length2, probability) {
         var stat1 = sampleMeanAndVarianceForValues(values1, startIndex1, length1);
         var stat2 = sampleMeanAndVarianceForValues(values2, startIndex2, length2);
     this.computeWelchsT = function (values1, startIndex1, length1, values2, startIndex2, length2, probability) {
         var stat1 = sampleMeanAndVarianceForValues(values1, startIndex1, length1);
         var stat2 = sampleMeanAndVarianceForValues(values2, startIndex2, length2);
@@ -71,10 +90,11 @@ var Statistics = new (function () {
         var degreesOfFreedom = sumOfSampleVarianceOverSampleSize * sumOfSampleVarianceOverSampleSize
             / (stat1.variance * stat1.variance / stat1.size / stat1.size / stat1.degreesOfFreedom
                 + stat2.variance * stat2.variance / stat2.size / stat2.size / stat2.degreesOfFreedom);
         var degreesOfFreedom = sumOfSampleVarianceOverSampleSize * sumOfSampleVarianceOverSampleSize
             / (stat1.variance * stat1.variance / stat1.size / stat1.size / stat1.degreesOfFreedom
                 + stat2.variance * stat2.variance / stat2.size / stat2.size / stat2.degreesOfFreedom);
+        var minT = tDistributionByOneSidedProbability[twoSidedToOneSidedProbability(probability || 0.8)][Math.round(degreesOfFreedom - 1)];
         return {
             t: t,
             degreesOfFreedom: degreesOfFreedom,
         return {
             t: t,
             degreesOfFreedom: degreesOfFreedom,
-            significantlyDifferent: t > tDistributionQuantiles[probability || 0.9][Math.round(degreesOfFreedom - 1)],
+            significantlyDifferent: t > minT,
         };
     }
 
         };
     }
 
@@ -118,8 +138,7 @@ var Statistics = new (function () {
         recursivelySplitIntoTwoSegmentsAtMaxTIfSignificantlyDifferent(values, startIndex + argTMax, length - argTMax, minLength, segments);
     }
 
         recursivelySplitIntoTwoSegmentsAtMaxTIfSignificantlyDifferent(values, startIndex + argTMax, length - argTMax, minLength, segments);
     }
 
-    // One-sided t-distribution.
-    var tDistributionQuantiles = {
+    var tDistributionByOneSidedProbability = {
         0.9: [
             3.077684, 1.885618, 1.637744, 1.533206, 1.475884, 1.439756, 1.414924, 1.396815, 1.383029, 1.372184,
             1.363430, 1.356217, 1.350171, 1.345030, 1.340606, 1.336757, 1.333379, 1.330391, 1.327728, 1.325341,
         0.9: [
             3.077684, 1.885618, 1.637744, 1.533206, 1.475884, 1.439756, 1.414924, 1.396815, 1.383029, 1.372184,
             1.363430, 1.356217, 1.350171, 1.345030, 1.340606, 1.336757, 1.333379, 1.330391, 1.327728, 1.325341,
@@ -169,6 +188,8 @@ var Statistics = new (function () {
             2.373270, 2.372687, 2.372119, 2.371564, 2.371022, 2.370493, 2.369977, 2.369472, 2.368979, 2.368497,
             2.368026, 2.367566, 2.367115, 2.366674, 2.366243, 2.365821, 2.365407, 2.365002, 2.364606, 2.364217]
     };
             2.373270, 2.372687, 2.372119, 2.371564, 2.371022, 2.370493, 2.369977, 2.369472, 2.368979, 2.368497,
             2.368026, 2.367566, 2.367115, 2.366674, 2.366243, 2.365821, 2.365407, 2.365002, 2.364606, 2.364217]
     };
+    function oneSidedToTwoSidedProbability(probability) { return 2 * probability - 1; }
+    function twoSidedToOneSidedProbability(probability) { return (1 - (1 - probability) / 2); }
 
     this.MovingAverageStrategies = [
         {
 
     this.MovingAverageStrategies = [
         {
@@ -501,7 +522,7 @@ var Statistics = new (function () {
                 var results = new Array(values.length);
                 var p = false;
                 for (var i = 20; i < values.length - 5; i++)
                 var results = new Array(values.length);
                 var p = false;
                 for (var i = 20; i < values.length - 5; i++)
-                    results[i] = Statistics.testWelchsT(values.slice(i - 20, i), values.slice(i, i + 5), 0.99) ? 5 : 0;
+                    results[i] = Statistics.testWelchsT(values.slice(i - 20, i), values.slice(i, i + 5), 0.98) ? 5 : 0;
                 return results;
             }
         },
                 return results;
             }
         },