Source code for mozanalysis.bayesian_stats.bayesian_bootstrap

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import numpy as np
import pandas as pd

import mozanalysis.bayesian_stats as mabs
from mozanalysis.utils import filter_outliers


[docs] def bb_mean(values, prob_weights): """Calculate the mean of a bootstrap replicate. Args: values (pd.Series, ndarray): One dimensional array of observed values prob_weights (pd.Series, ndarray): Equally shaped array of the probability weight associated with each value. Returns: The mean as a np.float. """ return np.dot(values, prob_weights)
[docs] def make_bb_quantile_closure(quantiles): """Return a function to calculate quantiles for a bootstrap replicate. Args: quantiles (float, list of floats): Quantiles to compute Returns a function that calculates quantiles for a bootstrap replicate: Args: values (pd.Series, ndarray): One dimensional array of observed values prob_weights (pd.Series, ndarray): Equally shaped array of the probability weight associated with each value. Returns: * A quantile as a np.float, or * several quantiles as a dict keyed by the quantiles """ # If https://github.com/numpy/numpy/pull/9211/ is ever merged then # we can just use that instead. def get_value_at_quantile(values, cdf, quantile): """Return the value at a quantile. Does no interpolation because our Bayesian bootstrap implementation calls `np.unique` to tally the values: if it did not take this shortcut then regardless of whether we interpolate when returning quantiles, the vast majority of quantiles would coincide with a value. But since we take this shortcut, interpolation mostly returns values not in the dataset. Ergh. """ # Add a tolerance of 1e-6 to account for numerical error when # computing the cdf arg = np.nonzero(quantile < cdf + 1e-6)[0][0] return values[arg] def bb_quantile(values, prob_weights): """Calculate quantiles for a bootstrap replicate. Args: values (pd.Series, ndarray): One dimensional array of observed values prob_weights (pd.Series, ndarray): Equally shaped array of the probability weight associated with each value. Returns: * A quantile as a np.float, or * several quantiles as a dict keyed by the quantiles """ # assume values is previously sorted, as per np.unique() cdf = np.cumsum(prob_weights) if np.isscalar(quantiles): return get_value_at_quantile(values, cdf, quantiles) else: return {q: get_value_at_quantile(values, cdf, q) for q in quantiles} return bb_quantile
[docs] def compare_branches( df, col_label, ref_branch_label="control", stat_fn=bb_mean, num_samples=10000, threshold_quantile=None, individual_summary_quantiles=mabs.DEFAULT_QUANTILES, comparative_summary_quantiles=mabs.DEFAULT_QUANTILES, ): """Jointly sample bootstrapped statistics then compare them. Args: df: a pandas DataFrame of queried experiment data in the standard format (see `mozanalysis.experiment`). col_label (str): Label for the df column contaning the metric to be analyzed. ref_branch_label (str, optional): String in ``df['branch']`` that identifies the branch with respect to which we want to calculate uplifts - usually the control branch. stat_fn (callable, optional): A function that either: * Aggregates each resampled population to a scalar (e.g. the default, ``bb_mean``), or * Aggregates each resampled population to a dict of scalars (e.g. the func returned by ``make_bb_quantile_closure`` when given multiple quantiles. In both cases, this function must accept two parameters: * a one-dimensional ndarray or pandas Series of values, * an identically shaped object of weights for these values num_samples (int, optional): The number of bootstrap iterations to perform. threshold_quantile (float, optional): An optional threshold quantile, above which to discard outliers. E.g. `0.9999`. individual_summary_quantiles (list, optional): Quantiles to determine the credible intervals on individual branch statistics. Change these when making Bonferroni corrections. comparative_summary_quantiles (list, optional): Quantiles to determine the credible intervals on comparative branch statistics (i.e. the change relative to the reference branch, probably the control). Change these when making Bonferroni corrections. Returns: If ``stat_fn`` returns a scalar (this is the default), then this function returns a dictionary has the following keys and values: * 'individual': dictionary mapping each branch name to a pandas Series that holds the expected value for the bootstrapped ``stat_fn``, and credible intervals. * 'comparative': dictionary mapping each branch name to a pandas Series of summary statistics for the possible uplifts of the bootstrapped ``stat_fn`` relative to the reference branch. Otherwise, when ``stat_fn`` returns a dict, then this function returns a similar dictionary, except the Series are replaced with DataFrames. Each row in each DataFrame corresponds to one output of ``stat_fn``, and is the Series that would be returned if ``stat_fn`` computed only this statistic. """ branch_list = df.branch.unique() if ref_branch_label not in branch_list: raise ValueError( f"Branch label '{ref_branch_label}' not in branch list '{branch_list}" ) samples = { # TODO: do we need to control seed_start? If so then we must be careful here b: get_bootstrap_samples( df[col_label][df.branch == b], stat_fn, num_samples, threshold_quantile=threshold_quantile, ) for b in branch_list } return mabs.compare_samples( samples, ref_branch_label, individual_summary_quantiles, comparative_summary_quantiles, )
[docs] def bootstrap_one_branch( data, stat_fn=bb_mean, num_samples=10000, seed_start=None, threshold_quantile=None, summary_quantiles=mabs.DEFAULT_QUANTILES, ): """Bootstrap ``stat_fn`` for one branch on its own. Computes ``stat_fn`` for ``num_samples`` resamples of ``data``, then returns summary statistics for the results. Args: data: The data as a list, 1D numpy array, or pandas Series stat_fn (callable, optional): A function that either: * Aggregates each resampled population to a scalar (e.g. the default, ``bb_mean``), or * Aggregates each resampled population to a dict of scalars (e.g. the func returned by ``make_bb_quantile_closure`` when given multiple quantiles. In both cases, this function must accept two parameters: * a one-dimensional ndarray or pandas Series of values, * an identically shaped object of weights for these values num_samples: The number of bootstrap iterations to perform seed_start: An int with which to seed numpy's RNG. It must be unique within this set of calculations. threshold_quantile (float, optional): An optional threshold quantile, above which to discard outliers. E.g. ``0.9999``. summary_quantiles (list, optional): Quantiles to determine the confidence bands on the branch statistics. Change these when making Bonferroni corrections. """ samples = get_bootstrap_samples( data, stat_fn, num_samples, seed_start, threshold_quantile ) return mabs.summarize_one_branch_samples(samples, summary_quantiles)
[docs] def get_bootstrap_samples( data, stat_fn=bb_mean, num_samples=10000, seed_start=None, threshold_quantile=None, ): """Return ``stat_fn`` evaluated on resampled data. Args: data: The data as a list, 1D numpy array, or pandas series stat_fn (callable, optional): A function that either: * Aggregates each resampled population to a scalar (e.g. the default, ``bb_mean``), or * Aggregates each resampled population to a dict of scalars (e.g. the func returned by ``make_bb_quantile_closure`` when given multiple quantiles. In both cases, this function must accept two parameters: * a one-dimensional ndarray or pandas Series of values, * an identically shaped object of weights for these values num_samples: The number of samples to return seed_start: A seed for the random number generator; this function will use seeds in the range:: [seed_start, seed_start + num_samples) and these particular seeds must not be used elsewhere in this calculation. By default, use a random seed. threshold_quantile (float, optional): An optional threshold quantile, above which to discard outliers. E.g. ``0.9999``. Returns: A Series or DataFrame with one row per sample and one column per output of ``stat_fn``. References: Rubin, Donald B. The Bayesian Bootstrap. Ann. Statist. 9 (1981), no. 1, 130--134. https://dx.doi.org/10.1214/aos/1176345338 """ if type(data) is not np.ndarray: data = np.array(data.to_numpy(dtype="float", na_value=np.nan)) if np.isnan(data).any(): raise ValueError("'data' contains null values") if threshold_quantile: data = filter_outliers(data, threshold_quantile) # For computational efficiency, tally the data. If you are careful # with the resulting draws from the dirichlet then this should be # equivalent to not doing this step (and passing np.ones() as the # counts) data_values, data_counts = np.unique(data, return_counts=True) if seed_start is None: seed_start = np.random.randint(np.iinfo(np.uint32).max) random_state = np.random.RandomState(seed_start) summary_stat_samples = [ _resample_and_agg_once(data_values, data_counts, stat_fn, random_state) for _ in range(0, num_samples) ] summary_df = pd.DataFrame(summary_stat_samples) if len(summary_df.columns) == 1: # Return a Series if stat_fn returns a scalar return summary_df.iloc[:, 0] # Else return a DataFrame if stat_fn returns a dict return summary_df
def _resample_and_agg_once(data_values, data_counts, stat_fn, random_state=None): if random_state is None: random_state = np.random.RandomState(None) prob_weights = random_state.dirichlet(data_counts) return stat_fn(data_values, prob_weights)