# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import numpy as np
import pandas as pd
import mozanalysis.bayesian_stats as mabs
from mozanalysis.utils import filter_outliers
[docs]
def bb_mean(values, prob_weights):
"""Calculate the mean of a bootstrap replicate.
Args:
values (pd.Series, ndarray): One dimensional array
of observed values
prob_weights (pd.Series, ndarray): Equally shaped
array of the probability weight associated with
each value.
Returns:
The mean as a np.float.
"""
return np.dot(values, prob_weights)
[docs]
def make_bb_quantile_closure(quantiles):
"""Return a function to calculate quantiles for a bootstrap replicate.
Args:
quantiles (float, list of floats): Quantiles to compute
Returns a function that calculates quantiles for a bootstrap replicate:
Args:
values (pd.Series, ndarray):
One dimensional array of observed values
prob_weights (pd.Series, ndarray):
Equally shaped array of the probability weight associated with
each value.
Returns:
* A quantile as a np.float, or
* several quantiles as a dict keyed by the quantiles
"""
# If https://github.com/numpy/numpy/pull/9211/ is ever merged then
# we can just use that instead.
def get_value_at_quantile(values, cdf, quantile):
"""Return the value at a quantile.
Does no interpolation because our Bayesian bootstrap
implementation calls `np.unique` to tally the values:
if it did not take this shortcut then regardless of whether
we interpolate when returning quantiles, the vast majority
of quantiles would coincide with a value. But since we take
this shortcut, interpolation mostly returns values not in the
dataset. Ergh.
"""
# Add a tolerance of 1e-6 to account for numerical error when
# computing the cdf
arg = np.nonzero(quantile < cdf + 1e-6)[0][0]
return values[arg]
def bb_quantile(values, prob_weights):
"""Calculate quantiles for a bootstrap replicate.
Args:
values (pd.Series, ndarray): One dimensional array
of observed values
prob_weights (pd.Series, ndarray): Equally shaped
array of the probability weight associated with
each value.
Returns:
* A quantile as a np.float, or
* several quantiles as a dict keyed by the quantiles
"""
# assume values is previously sorted, as per np.unique()
cdf = np.cumsum(prob_weights)
if np.isscalar(quantiles):
return get_value_at_quantile(values, cdf, quantiles)
else:
return {q: get_value_at_quantile(values, cdf, q) for q in quantiles}
return bb_quantile
[docs]
def compare_branches(
df,
col_label,
ref_branch_label="control",
stat_fn=bb_mean,
num_samples=10000,
threshold_quantile=None,
individual_summary_quantiles=mabs.DEFAULT_QUANTILES,
comparative_summary_quantiles=mabs.DEFAULT_QUANTILES,
):
"""Jointly sample bootstrapped statistics then compare them.
Args:
df: a pandas DataFrame of queried experiment data in the
standard format (see `mozanalysis.experiment`).
col_label (str): Label for the df column contaning the metric
to be analyzed.
ref_branch_label (str, optional): String in ``df['branch']``
that identifies the branch with respect to which we want to
calculate uplifts - usually the control branch.
stat_fn (callable, optional): A function that either:
* Aggregates each resampled population to a scalar (e.g.
the default, ``bb_mean``), or
* Aggregates each resampled population to a dict of
scalars (e.g. the func returned by
``make_bb_quantile_closure`` when given multiple
quantiles.
In both cases, this function must accept two parameters:
* a one-dimensional ndarray or pandas Series of values,
* an identically shaped object of weights for these values
num_samples (int, optional): The number of bootstrap iterations
to perform.
threshold_quantile (float, optional): An optional threshold
quantile, above which to discard outliers. E.g. `0.9999`.
individual_summary_quantiles (list, optional): Quantiles to
determine the credible intervals on individual branch
statistics. Change these when making Bonferroni corrections.
comparative_summary_quantiles (list, optional): Quantiles to
determine the credible intervals on comparative branch
statistics (i.e. the change relative to the reference
branch, probably the control). Change these when making
Bonferroni corrections.
Returns:
If ``stat_fn`` returns a scalar (this is the default), then
this function returns a dictionary has the following keys and
values:
* 'individual': dictionary mapping each branch name to a pandas
Series that holds the expected value for the bootstrapped
``stat_fn``, and credible intervals.
* 'comparative': dictionary mapping each branch name to a pandas
Series of summary statistics for the possible uplifts of
the bootstrapped ``stat_fn`` relative to the reference branch.
Otherwise, when ``stat_fn`` returns a dict, then this function
returns a similar dictionary, except the Series are replaced with
DataFrames. Each row in each DataFrame corresponds to one output
of ``stat_fn``, and is the Series that would be returned if
``stat_fn`` computed only this statistic.
"""
branch_list = df.branch.unique()
if ref_branch_label not in branch_list:
raise ValueError(
f"Branch label '{ref_branch_label}' not in branch list '{branch_list}"
)
samples = {
# TODO: do we need to control seed_start? If so then we must be careful here
b: get_bootstrap_samples(
df[col_label][df.branch == b],
stat_fn,
num_samples,
threshold_quantile=threshold_quantile,
)
for b in branch_list
}
return mabs.compare_samples(
samples,
ref_branch_label,
individual_summary_quantiles,
comparative_summary_quantiles,
)
[docs]
def bootstrap_one_branch(
data,
stat_fn=bb_mean,
num_samples=10000,
seed_start=None,
threshold_quantile=None,
summary_quantiles=mabs.DEFAULT_QUANTILES,
):
"""Bootstrap ``stat_fn`` for one branch on its own.
Computes ``stat_fn`` for ``num_samples`` resamples of ``data``,
then returns summary statistics for the results.
Args:
data: The data as a list, 1D numpy array, or pandas Series
stat_fn (callable, optional): A function that either:
* Aggregates each resampled population to a scalar (e.g.
the default, ``bb_mean``), or
* Aggregates each resampled population to a dict of
scalars (e.g. the func returned by
``make_bb_quantile_closure`` when given multiple
quantiles.
In both cases, this function must accept two parameters:
* a one-dimensional ndarray or pandas Series of values,
* an identically shaped object of weights for these values
num_samples: The number of bootstrap iterations to perform
seed_start: An int with which to seed numpy's RNG. It must
be unique within this set of calculations.
threshold_quantile (float, optional): An optional threshold
quantile, above which to discard outliers. E.g. ``0.9999``.
summary_quantiles (list, optional): Quantiles to determine the
confidence bands on the branch statistics. Change these
when making Bonferroni corrections.
"""
samples = get_bootstrap_samples(
data, stat_fn, num_samples, seed_start, threshold_quantile
)
return mabs.summarize_one_branch_samples(samples, summary_quantiles)
[docs]
def get_bootstrap_samples(
data,
stat_fn=bb_mean,
num_samples=10000,
seed_start=None,
threshold_quantile=None,
):
"""Return ``stat_fn`` evaluated on resampled data.
Args:
data: The data as a list, 1D numpy array, or pandas series
stat_fn (callable, optional): A function that either:
* Aggregates each resampled population to a scalar (e.g.
the default, ``bb_mean``), or
* Aggregates each resampled population to a dict of
scalars (e.g. the func returned by
``make_bb_quantile_closure`` when given multiple
quantiles.
In both cases, this function must accept two parameters:
* a one-dimensional ndarray or pandas Series of values,
* an identically shaped object of weights for these values
num_samples: The number of samples to return
seed_start: A seed for the random number generator; this
function will use seeds in the range::
[seed_start, seed_start + num_samples)
and these particular seeds must not be used elsewhere
in this calculation. By default, use a random seed.
threshold_quantile (float, optional): An optional threshold
quantile, above which to discard outliers. E.g. ``0.9999``.
Returns:
A Series or DataFrame with one row per sample and one column
per output of ``stat_fn``.
References:
Rubin, Donald B. The Bayesian Bootstrap. Ann. Statist. 9 (1981),
no. 1, 130--134. https://dx.doi.org/10.1214/aos/1176345338
"""
if type(data) is not np.ndarray:
data = np.array(data.to_numpy(dtype="float", na_value=np.nan))
if np.isnan(data).any():
raise ValueError("'data' contains null values")
if threshold_quantile:
data = filter_outliers(data, threshold_quantile)
# For computational efficiency, tally the data. If you are careful
# with the resulting draws from the dirichlet then this should be
# equivalent to not doing this step (and passing np.ones() as the
# counts)
data_values, data_counts = np.unique(data, return_counts=True)
if seed_start is None:
seed_start = np.random.randint(np.iinfo(np.uint32).max)
random_state = np.random.RandomState(seed_start)
summary_stat_samples = [
_resample_and_agg_once(data_values, data_counts, stat_fn, random_state)
for _ in range(0, num_samples)
]
summary_df = pd.DataFrame(summary_stat_samples)
if len(summary_df.columns) == 1:
# Return a Series if stat_fn returns a scalar
return summary_df.iloc[:, 0]
# Else return a DataFrame if stat_fn returns a dict
return summary_df
def _resample_and_agg_once(data_values, data_counts, stat_fn, random_state=None):
if random_state is None:
random_state = np.random.RandomState(None)
prob_weights = random_state.dirichlet(data_counts)
return stat_fn(data_values, prob_weights)