stats/stats_abba.py
# From: http://github.com/thumbtack/abba/python/abba/stats.py, commit 0a7fcda0
# Copyright (c) 2012 Thumbtack, Inc.
import collections
import math
from scipy import stats
def get_z_critical_value(alpha, two_tailed=True):
"""
Returns the z critical value for a particular alpha = 1 - confidence level. By default returns
a two-tailed z-value, meaning the actual tail probability is alpha / 2.
"""
if two_tailed:
alpha /= 2
return stats.distributions.norm.ppf(1 - alpha)
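
# Example usage (illustrative; float() normalizes the numpy scalar scipy returns):
#     >>> round(float(get_z_critical_value(0.05)), 2)  # two-tailed, 95% confidence
#     1.96
#     >>> round(float(get_z_critical_value(0.05, two_tailed=False)), 3)
#     1.645
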
# a value with confidence interval bounds (not necessarily centered around the point estimate)
ValueWithInterval = collections.namedtuple(
'ValueWithInterval',
('value', 'lower_bound', 'upper_bound'),
)
class ValueWithError(object):
"""
A value with standard error, from which a confidence interval can be derived.
"""
def __init__(self, value, error):
self.value = value
self.error = error
def confidence_interval_width(self, z_critical_value):
"""
z_critical_value should be the value at which the right-tail probability for a standard
normal distribution equals half the desired alpha = 1 - confidence level:
        P(Z > z_value) = alpha / 2
where Z is an N(0, 1) random variable. Use get_z_critical_value(), or see
http://en.wikipedia.org/wiki/Standard_normal_table.
"""
return z_critical_value * self.error
def value_with_interval(self, z_critical_value, estimated_value=None):
width = self.confidence_interval_width(z_critical_value)
return ValueWithInterval(
value=estimated_value if estimated_value is not None else self.value,
lower_bound=self.value - width,
upper_bound=self.value + width,
)
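
# Example usage (illustrative): a 95% interval around an estimate of 0.5 with
# standard error 0.05, using the two-tailed z critical value 1.96.
#     >>> interval = ValueWithError(0.5, 0.05).value_with_interval(1.96)
#     >>> (interval.value, round(interval.lower_bound, 3), round(interval.upper_bound, 3))
#     (0.5, 0.402, 0.598)
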
class BinomialDistribution(object):
def __init__(self, num_trials, probability):
self.num_trials = num_trials
self.probability = probability
self.expectation = num_trials * probability
self.standard_deviation = math.sqrt(self.expectation * (1 - probability))
self._binomial = stats.binom(num_trials, probability)
def mass(self, count):
return self._binomial.pmf(count)
def cdf(self, count):
return self._binomial.cdf(count)
def survival(self, count):
        # delegate to scipy's survival function for better precision in the tails
        return self._binomial.sf(count)
def inverse_cdf(self, probability):
return self._binomial.ppf(probability)
def inverse_survival(self, probability):
return self._binomial.isf(probability)
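
# Example usage (illustrative): a fair coin flipped 100 times.
#     >>> d = BinomialDistribution(100, 0.5)
#     >>> (d.expectation, d.standard_deviation)
#     (50.0, 5.0)
#     >>> round(float(d.mass(50)), 3)  # P(X == 50)
#     0.08
#     >>> int(d.inverse_cdf(0.5))      # median
#     50
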
class Proportion(object):
def __init__(self, num_successes, num_trials):
"""
Represents a binomial proportion with num_successes successful samples out of num_trials
total.
"""
self.num_successes = num_successes
self.num_trials = num_trials
def p_estimate(self, z_critical_value=0):
"""
Generate an adjusted estimate and error using the "Agresti-Coull Interval", see
http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Agresti-Coull_Interval.
The estimated value is an adjusted best estimate for the actual probability. For example, if
0 successes were observed out of 10 samples, it's unlikely the actual probability is zero,
so the adjusted estimate will be slightly above zero.
A z_critical_value of zero yields the ordinary Wald interval.
"""
adjusted_num_trials = float(self.num_trials + z_critical_value**2)
interval_center = (self.num_successes + z_critical_value**2 / 2) / adjusted_num_trials
standard_error = math.sqrt(interval_center * (1 - interval_center) / adjusted_num_trials)
return ValueWithError(interval_center, standard_error)
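
    # Example (illustrative): 0 successes out of 10 trials with z = 1.96. The
    # Agresti-Coull adjustment pulls the estimate away from zero:
    # (0 + 1.96**2 / 2) / (10 + 1.96**2) ~= 0.139.
    #     >>> estimate = Proportion(0, 10).p_estimate(z_critical_value=1.96)
    #     >>> (round(estimate.value, 3), round(estimate.error, 3))
    #     (0.139, 0.093)
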
def mixed_estimate(self, z_critical_value):
"""
        Returns a ValueWithInterval with an MLE value and upper/lower bounds from the
        Agresti-Coull interval.
"""
return (
self.p_estimate(z_critical_value=z_critical_value)
.value_with_interval(z_critical_value, estimated_value=self.p_estimate().value)
)
def confidence_interval_on_proportion(num_successes, num_trials, confidence_level=0.95):
    """Convenience function with a more straightforward interface."""
return Proportion(num_successes, num_trials).mixed_estimate(
get_z_critical_value(1 - confidence_level)
)
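
# Example usage (illustrative): 50 successes out of 200 trials at the default
# 95% confidence level. The point estimate is the MLE; the bounds come from
# the Agresti-Coull interval.
#     >>> result = confidence_interval_on_proportion(50, 200)
#     >>> (result.value, round(float(result.lower_bound), 3), round(float(result.upper_bound), 3))
#     (0.25, 0.195, 0.315)
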
class ProportionComparison(object):
def __init__(self, baseline, variation):
self.baseline = baseline
self.variation = variation
def difference_estimate(self, z_critical_value):
"""
Generate an estimate of the difference in success rates between the variation and the
baseline.
"""
baseline_p = self.baseline.p_estimate(z_critical_value=z_critical_value)
variation_p = self.variation.p_estimate(z_critical_value=z_critical_value)
difference = variation_p.value - baseline_p.value
standard_error = math.sqrt(baseline_p.error ** 2 + variation_p.error ** 2)
return ValueWithError(difference, standard_error)
def difference_ratio(self, z_critical_value):
"""
        Return the difference in success rates as a proportion of the baseline success rate.
"""
baseline_value = self.baseline.p_estimate(z_critical_value=z_critical_value).value
difference = self.difference_estimate(z_critical_value=z_critical_value)
ratio = difference.value / baseline_value
error = difference.error / baseline_value
return ValueWithError(ratio, error)
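
    # Example (illustrative): baseline 100/1000 vs. variation 130/1000, using
    # plain MLE estimates (z_critical_value=0).
    #     >>> comparison = ProportionComparison(Proportion(100, 1000), Proportion(130, 1000))
    #     >>> round(comparison.difference_estimate(0).value, 3)  # absolute
    #     0.03
    #     >>> round(comparison.difference_ratio(0).value, 3)     # relative
    #     0.3
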
def z_test(self, z_multiplier=1):
"""
Perform a large-sample z-test of null hypothesis H0: p_baseline == p_variation against
alternative hypothesis H1: p_baseline < p_variation. Return the (one-tailed) p-value.
z_multiplier: test z-value will be multiplied by this factor before computing a p-value.
See http://en.wikipedia.org/wiki/Statistical_hypothesis_testing#Common_test_statistics,
"Two-proportion z-test, pooled for d0 = 0".
"""
pooled_stats = Proportion(
self.baseline.num_successes + self.variation.num_successes,
self.baseline.num_trials + self.variation.num_trials,
)
        pooled_proportion = pooled_stats.p_estimate().value
        pooled_variance_of_difference = (
            pooled_proportion * (1 - pooled_proportion)
            * (1.0 / self.baseline.num_trials + 1.0 / self.variation.num_trials)
        )
pooled_standard_error_of_difference = math.sqrt(pooled_variance_of_difference)
test_z_value = self.difference_estimate(0).value / pooled_standard_error_of_difference
adjusted_p_value = stats.distributions.norm.sf(test_z_value * z_multiplier)
return adjusted_p_value
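
    # Example (illustrative): for baseline 100/1000 vs. variation 130/1000 the
    # pooled proportion is 230/2000 = 0.115, the pooled standard error is
    # sqrt(0.115 * 0.885 * 2/1000) ~= 0.0143, and z ~= 0.03 / 0.0143 ~= 2.10.
    #     >>> comparison = ProportionComparison(Proportion(100, 1000), Proportion(130, 1000))
    #     >>> round(float(comparison.z_test()), 3)
    #     0.018
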
def _binomial_coverage_interval(self, distribution, coverage_alpha):
"""
For the given binomial distribution, compute an interval that covers at least (1 -
coverage_alpha) of the total probability mass, centered at the expectation (unless we're at
        the boundary). Uses the exact binomial inverse CDF and survival functions.
"""
if distribution.num_trials < 1000:
# don't even bother trying to optimize for small-ish sample sizes
return (0, distribution.num_trials)
else:
return (
int(math.floor(distribution.inverse_cdf(coverage_alpha / 2))),
int(math.ceil(distribution.inverse_survival(coverage_alpha / 2))),
)
def _probability_union(self, probability, num_tests):
"""
Given the probability of an event, compute the probability that it happens at least once in
num_tests independent tests. This is used to adjust a p-value for multiple comparisons.
When used to adjust alpha instead, this is called a Sidak correction (the logic is the same,
the formula is inverted):
http://en.wikipedia.org/wiki/Bonferroni_correction#.C5.A0id.C3.A1k_correction
"""
return 1 - (1 - probability)**num_tests
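
    # Example (illustrative): an event with probability 0.05 occurs at least
    # once in three independent tests with probability 1 - 0.95**3. The method
    # does not use instance state, so a throwaway instance suffices here.
    #     >>> round(ProportionComparison(None, None)._probability_union(0.05, 3), 6)
    #     0.142625
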
def iterated_test(self, num_tests, coverage_alpha, improvement_only=False):
"""
Compute a p-value testing null hypothesis H0: p_baseline == p_variation against alternative
hypothesis H1: p_baseline != p_variation by summing p-values conditioned on individual
baseline success counts. This provides a more accurate correction for multiple testing but
scales like O(sqrt(self.baseline.num_trials)), so can eventually get slow for very large
values.
        A lower coverage_alpha increases accuracy at the cost of longer runtime. Roughly, the
        result will be accurate to within coverage_alpha (though this ignores numerical error,
        so it is not strictly guaranteed).
If improvement_only=True, computes p-value for alternative hypothesis
H1: p_baseline < p_variation instead.
"""
observed_delta = self.variation.p_estimate().value - self.baseline.p_estimate().value
if observed_delta == 0 and not improvement_only:
# a trivial case that the code below does not handle well
return 1
pooled_proportion = (
(self.baseline.num_successes + self.variation.num_successes)
/ float(self.baseline.num_trials + self.variation.num_trials)
)
variation_distribution = BinomialDistribution(self.variation.num_trials, pooled_proportion)
baseline_distribution = BinomialDistribution(self.baseline.num_trials, pooled_proportion)
baseline_limits = self._binomial_coverage_interval(baseline_distribution, coverage_alpha)
p_value = 0
for baseline_successes in range(baseline_limits[0], baseline_limits[1] + 1):
baseline_proportion = 1.0 * baseline_successes / self.baseline.num_trials
if improvement_only:
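                # cdf(-1) == 0, so the one-sided test has no lower tail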
lower_trial_count = -1
upper_trial_count = math.ceil(
(baseline_proportion + observed_delta) * self.variation.num_trials
)
else:
observed_absolute_delta = abs(observed_delta)
lower_trial_count = math.floor(
(baseline_proportion - observed_absolute_delta) * self.variation.num_trials
)
upper_trial_count = math.ceil(
(baseline_proportion + observed_absolute_delta) * self.variation.num_trials
)
# p-value of variation success counts "at least as extreme" for this particular
# baseline success count
p_value_at_baseline = (
variation_distribution.cdf(lower_trial_count)
+ variation_distribution.survival(upper_trial_count - 1)
)
# this is exact because we're conditioning on the baseline count, so the multiple
# tests are independent.
adjusted_p_value = self._probability_union(p_value_at_baseline, num_tests)
baseline_probability = baseline_distribution.mass(baseline_successes)
p_value += baseline_probability * adjusted_p_value
# the remaining baseline values we didn't cover contribute less than coverage_alpha to the
# sum, so adding that amount gives us a conservative upper bound.
return p_value + coverage_alpha
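
# Example usage (illustrative): a two-tailed conditional test for 100/1000 vs.
# 130/1000 with a single comparison. The result is a conservative upper bound
# on the p-value, accurate to within roughly coverage_alpha.
#     >>> comparison = ProportionComparison(Proportion(100, 1000), Proportion(130, 1000))
#     >>> p = comparison.iterated_test(num_tests=1, coverage_alpha=1e-5)
#     >>> bool(0 < p < 0.05)
#     True
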
Results = collections.namedtuple(
'Results',
(
'num_successes',
'num_trials',
'proportion', # ValueWithInterval
'improvement', # ValueWithInterval
'relative_improvement', # ValueWithInterval
'two_tailed_p_value', # two-tailed p-value for trial != baseline
'improvement_one_tailed_p_value', # one-tailed p-value for trial > baseline
),
)
class Experiment(object):
P_VALUE_PRECISION = 1e-5
def __init__(self, num_trials, baseline_num_successes, baseline_num_trials,
confidence_level=0.95):
"""
num_trials: number of trials to be compared to the baseline
confidence_level: used for all confidence intervals generated
"""
self.num_comparisons = max(1, num_trials)
self._baseline = Proportion(baseline_num_successes, baseline_num_trials)
        alpha = (1 - confidence_level) / self.num_comparisons  # Bonferroni correction
self._z_critical_value = get_z_critical_value(alpha)
def get_baseline_proportion(self):
return self._baseline.mixed_estimate(self._z_critical_value)
def get_results(self, num_successes, num_trials):
trial = Proportion(num_successes, num_trials)
comparison = ProportionComparison(self._baseline, trial)
return Results(
num_successes=num_successes,
num_trials=num_trials,
proportion=trial.mixed_estimate(self._z_critical_value),
improvement=comparison.difference_estimate(self._z_critical_value)
.value_with_interval(
self._z_critical_value,
estimated_value=comparison.difference_estimate(0).value,
),
relative_improvement=comparison.difference_ratio(self._z_critical_value)
.value_with_interval(
self._z_critical_value,
estimated_value=comparison.difference_ratio(0).value,
),
two_tailed_p_value=comparison.iterated_test(
self.num_comparisons,
self.P_VALUE_PRECISION,
),
improvement_one_tailed_p_value=comparison.iterated_test(
self.num_comparisons,
self.P_VALUE_PRECISION,
improvement_only=True,
),
)
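
# Example usage (illustrative): one variation measured against a baseline of
# 100 successes in 1000 trials, at the default 95% confidence level.
#     >>> experiment = Experiment(1, 100, 1000)
#     >>> experiment.get_baseline_proportion().value
#     0.1
#     >>> results = experiment.get_results(130, 1000)
#     >>> round(results.improvement.value, 3)
#     0.03
#     >>> bool(results.two_tailed_p_value < 0.05)
#     True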