+2015-04-08 Ryosuke Niwa <rniwa@webkit.org>
+
+ The results of A/B testing should state statistical significance
+ https://bugs.webkit.org/show_bug.cgi?id=143552
+
+ Reviewed by Chris Dumez.
+
+ Added statistical comparisons between results for each configuration on the analysis task page using
+ Welch's t-test. The probability as well as the t-statistic and the degrees of freedom are reported.
+
+ * public/v2/app.js:
+ (App.TestGroupPane._populate): Report the list of statistical comparisons between every pair of
+ root configurations in the results. e.g. if we've got A, B, C configurations then compare A/B, A/C
+ and B/C.
+ (App.TestGroupPane._computeStatisticalSignificance): Compute the statistical significance using
+ Welch's t-test. Report the probability by which two samples do not come from the same distribution.
+ (App.TestGroupPane._createConfigurationSummary): Include the array of results for this configuration.
+ Also renamed "items" to "requests" for clarity.
+
+ * public/v2/index.html: Added the template for showing statistical comparisons.
+
+ * public/v2/js/statistics.js: Renamed tDistributionQuantiles to tDistributionByOneSidedProbability
+ for clarity. Also factored out the functions to convert from one-sided probability to two-sided
+ probability and vice versa.
+ (Statistics.supportedConfidenceIntervalProbabilities):
+ (Statistics.confidenceIntervalDelta):
+ (Statistics.probabilityRangeForWelchsT): Added. Computes the lower bound and the upper bound for
+ the probability that two values are sampled from distinct distributions using Welch's t-test.
+ (Statistics.computeWelchsT): This function now takes two-sided probability like all other functions.
+ (.tDistributionByOneSidedProbability): Renamed from tDistributionQuantiles.
+ (.oneSidedToTwoSidedProbability): Extracted.
+ (.twoSidedToOneSidedProbability): Extracted.
+ (Statistics.MovingAverageStrategies): Converted the one-sided probability to the two-sided probability
+ now that computeWelchsT takes two-sided probability.
+
2015-04-08 Ryosuke Niwa <rniwa@webkit.org>
Unreviewed fix after r182496 for when the cached runs JSON doesn't exist.
range.min -= margin;
this.set('configurations', configurations);
+
+ var comparisons = [];
+ for (var i = 0; i < configurations.length - 1; i++) {
+ var summary1 = configurations[i].summary;
+ for (var j = i + 1; j < configurations.length; j++) {
+ var summary2 = configurations[j].summary;
+ comparisons.push({
+ label: summary1.configLetter + ' / ' + summary2.configLetter,
+ result: this._computeStatisticalSignificance(summary1.measuredValues, summary2.measuredValues)
+ });
+ }
+ }
+ this.set('comparisons', comparisons);
}.observes('testResults', 'buildRequests'),
+ _computeStatisticalSignificance: function (values1, values2) // Returns a display string describing how significantly the two arrays of values differ.
+ {
+ var tFormatter = d3.format('.3g'); // d3-format specifier: three significant digits
+ var probabilityFormatter = d3.format('.2p'); // d3-format specifier: percentage with two significant digits
+ var statistics = Statistics.probabilityRangeForWelchsT(values1, values2);
+ if (isNaN(statistics.t) || isNaN(statistics.degreesOfFreedom))
+ return 'N/A';
+ // Every result other than 'N/A' is suffixed with the t value and the degrees of freedom.
+ var details = ' (t=' + tFormatter(statistics.t) + ' df=' + tFormatter(statistics.degreesOfFreedom) + ')';
+ // A falsy lower bound (range[0]) is reported as "not significant".
+ if (!statistics.range[0])
+ return 'Not statistically significant' + details;
+ // A lower bound with a falsy upper bound (range[1]) is reported as an open-ended "> lower bound".
+ var lowerLimit = probabilityFormatter(statistics.range[0]);
+ if (!statistics.range[1])
+ return 'Statistical significance > ' + lowerLimit + details;
+ // Both bounds are present: report the bracketing interval.
+ return lowerLimit + ' < Statistical significance < ' + probabilityFormatter(statistics.range[1]) + details;
+ },
_updateReferenceChart: function ()
{
var configurations = this.get('configurations');
revisionList: summaryRevisions,
formattedValue: isNaN(mean) ? null : testResults.formatWithDeltaAndUnit(mean, ciDelta),
value: mean,
+ measuredValues: valuesInConfig,
confidenceIntervalDelta: ciDelta,
valueRange: range,
statusLabel: App.BuildRequest.aggregateStatuses(requests),
});
- return Ember.Object.create({summary: summary, items: requests, rootSet: rootSet});
+ return Ember.Object.create({summary: summary, requests: requests, rootSet: rootSet});
},
});
this.supportedConfidenceIntervalProbabilities = function () {
var supportedProbabilities = [];
- for (var quantile in tDistributionQuantiles)
- supportedProbabilities.push((1 - (1 - quantile) * 2).toFixed(2));
+ for (var probability in tDistributionByOneSidedProbability)
+ supportedProbabilities.push(oneSidedToTwoSidedProbability(probability).toFixed(2));
return supportedProbabilities
}
// Computes the delta d s.t. (mean - d, mean + d) is the confidence interval with the specified probability in O(1).
this.confidenceIntervalDelta = function (probability, numberOfSamples, sum, squareSum) {
- var quantile = (1 - (1 - probability) / 2);
- if (!(quantile in tDistributionQuantiles)) {
+ var oneSidedProbability = twoSidedToOneSidedProbability(probability);
+ if (!(oneSidedProbability in tDistributionByOneSidedProbability)) {
throw 'We only support ' + this.supportedConfidenceIntervalProbabilities().map(function (probability)
{ return probability * 100 + '%'; } ).join(', ') + ' confidence intervals.';
}
if (numberOfSamples - 2 < 0)
return NaN;
- var deltas = tDistributionQuantiles[quantile];
+ var deltas = tDistributionByOneSidedProbability[oneSidedProbability];
var degreesOfFreedom = numberOfSamples - 1;
if (degreesOfFreedom > deltas.length)
throw 'We only support up to ' + deltas.length + ' degrees of freedom';
return this.computeWelchsT(values1, 0, values1.length, values2, 0, values2.length, probability).significantlyDifferent;
}
+ this.probabilityRangeForWelchsT = function (values1, values2) { // Returns {t, degreesOfFreedom, range: [lowerBound, upperBound]} where the bounds are two-sided probabilities taken from the t-table.
+ var result = this.computeWelchsT(values1, 0, values1.length, values2, 0, values2.length);
+ if (isNaN(result.t) || isNaN(result.degreesOfFreedom))
+ return {t: NaN, degreesOfFreedom:NaN, range: [null, null]};
+ // Either bound stays null when no table entry sets it in the loop below.
+ var lowerBound = null;
+ var upperBound = null;
+ for (var probability in tDistributionByOneSidedProbability) { // NOTE(review): relies on for-in visiting the table's probabilities in ascending order - confirm
+ var twoSidedProbability = oneSidedToTwoSidedProbability(probability);
+ if (result.t > tDistributionByOneSidedProbability[probability][Math.round(result.degreesOfFreedom - 1)]) // row index is the rounded degrees of freedom minus one; an out-of-range index yields undefined, making this comparison false
+ lowerBound = twoSidedProbability;
+ else if (lowerBound) { // first probability whose threshold t does not exceed, after at least one threshold was exceeded
+ upperBound = twoSidedProbability;
+ break;
+ }
+ }
+ return {t: result.t, degreesOfFreedom: result.degreesOfFreedom, range: [lowerBound, upperBound]};
+ }
+
this.computeWelchsT = function (values1, startIndex1, length1, values2, startIndex2, length2, probability) {
var stat1 = sampleMeanAndVarianceForValues(values1, startIndex1, length1);
var stat2 = sampleMeanAndVarianceForValues(values2, startIndex2, length2);
var degreesOfFreedom = sumOfSampleVarianceOverSampleSize * sumOfSampleVarianceOverSampleSize
/ (stat1.variance * stat1.variance / stat1.size / stat1.size / stat1.degreesOfFreedom
+ stat2.variance * stat2.variance / stat2.size / stat2.size / stat2.degreesOfFreedom);
+ var minT = tDistributionByOneSidedProbability[twoSidedToOneSidedProbability(probability || 0.8)][Math.round(degreesOfFreedom - 1)];
return {
t: t,
degreesOfFreedom: degreesOfFreedom,
- significantlyDifferent: t > tDistributionQuantiles[probability || 0.9][Math.round(degreesOfFreedom - 1)],
+ significantlyDifferent: t > minT,
};
}
recursivelySplitIntoTwoSegmentsAtMaxTIfSignificantlyDifferent(values, startIndex + argTMax, length - argTMax, minLength, segments);
}
- // One-sided t-distribution.
- var tDistributionQuantiles = {
+ var tDistributionByOneSidedProbability = {
0.9: [
3.077684, 1.885618, 1.637744, 1.533206, 1.475884, 1.439756, 1.414924, 1.396815, 1.383029, 1.372184,
1.363430, 1.356217, 1.350171, 1.345030, 1.340606, 1.336757, 1.333379, 1.330391, 1.327728, 1.325341,
2.373270, 2.372687, 2.372119, 2.371564, 2.371022, 2.370493, 2.369977, 2.369472, 2.368979, 2.368497,
2.368026, 2.367566, 2.367115, 2.366674, 2.366243, 2.365821, 2.365407, 2.365002, 2.364606, 2.364217]
};
+ function oneSidedToTwoSidedProbability(probability) { return 2 * probability - 1; } // Maps a one-sided probability p to the two-sided probability 2p - 1 (e.g. 0.75 -> 0.5); a numeric string argument is coerced by the arithmetic.
+ function twoSidedToOneSidedProbability(probability) { return (1 - (1 - probability) / 2); } // Maps a two-sided probability p to the one-sided probability 1 - (1 - p) / 2 (e.g. 0.5 -> 0.75).
this.MovingAverageStrategies = [
{
var results = new Array(values.length);
var p = false;
for (var i = 20; i < values.length - 5; i++)
- results[i] = Statistics.testWelchsT(values.slice(i - 20, i), values.slice(i, i + 5), 0.99) ? 5 : 0;
+ results[i] = Statistics.testWelchsT(values.slice(i - 20, i), values.slice(i, i + 5), 0.98) ? 5 : 0;
return results;
}
},