Source code for mozanalysis.bayesian_stats.binary

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import numpy as np
import pandas as pd
import scipy.stats as st

import mozanalysis.bayesian_stats as mabs


def compare_branches(
    df,
    col_label,
    ref_branch_label="control",
    num_samples=10000,
    individual_summary_quantiles=mabs.DEFAULT_QUANTILES,
    comparative_summary_quantiles=mabs.DEFAULT_QUANTILES,
):
    """Jointly sample conversion rates for branches then compare them.

    Aggregates ``df`` per branch with `aggregate_col`, then delegates to
    `compare_branches_from_agg`; see that function for more details.

    Args:
        df (pd.DataFrame): Queried experiment data in the standard
            format.
        col_label (str): Label for the df column containing the metric
            to be analyzed.
        ref_branch_label (str, optional): String in ``df['branch']``
            that identifies the branch with respect to which we want to
            calculate uplifts - usually the control branch.
        num_samples (int, optional): The number of samples to compute.
        individual_summary_quantiles (list, optional): Quantiles to
            determine the confidence bands on individual branch
            statistics. Change these when making Bonferroni
            corrections.
        comparative_summary_quantiles (list, optional): Quantiles to
            determine the confidence bands on comparative branch
            statistics (i.e. the change relative to the reference
            branch, probably the control). Change these when making
            Bonferroni corrections.

    Returns a dictionary:

        * 'individual': dictionary mapping branch names to a pandas
          Series of summary stats for the posterior distribution over
          the branch's conversion rate.
        * 'comparative': dictionary mapping branch names to a pandas
          Series of summary statistics for the possible uplifts of the
          conversion rate relative to the reference branch - see docs
          for
          :meth:`mozanalysis.bayesian_stats.summarize_samples.summarize_joint_samples`.

    Raises:
        ValueError: If ``df[col_label]`` holds values other than 0 or 1
            (raised by `aggregate_col`).
    """
    agg_col = aggregate_col(df, col_label)

    # Forward the caller's quantiles. Passing ``mabs.DEFAULT_QUANTILES``
    # here would silently discard any custom (e.g. Bonferroni-corrected)
    # quantiles the caller asked for.
    return compare_branches_from_agg(
        agg_col,
        ref_branch_label=ref_branch_label,
        num_samples=num_samples,
        individual_summary_quantiles=individual_summary_quantiles,
        comparative_summary_quantiles=comparative_summary_quantiles,
    )
def aggregate_col(df, col_label):
    """Count enrollments and conversions for each branch.

    Args:
        df (pd.DataFrame): Queried experiment data in the standard
            format. Must have a ``'branch'`` column.
        col_label (str): Label for the df column containing the metric
            to be analyzed. Every value must be 0 or 1.

    Returns:
        A DataFrame indexed by branch, with the following columns:

        * num_enrollments: The number of rows (experiment subjects) in
          this branch with a value for the metric.
        * num_conversions: The sum of the metric over those rows, i.e.
          the number of subjects whose value is 1.

    Raises:
        ValueError: If any value in ``df[col_label]`` is neither 0
            nor 1.
    """
    metric = df[col_label]

    # Two equality masks rather than `isin`: the original author
    # measured `isin` as roughly 100x slower for this check.
    is_binary = (metric == 0) | (metric == 1)
    if not is_binary.all():
        raise ValueError(f"All values in column '{col_label}' must be 0 or 1.")

    per_branch = df.groupby("branch")[col_label].agg(["count", "sum"])
    return per_branch.rename(
        columns={"count": "num_enrollments", "sum": "num_conversions"}
    )
def summarize_one_branch_from_agg(
    s,
    num_enrollments_label="num_enrollments",
    num_conversions_label="num_conversions",
    quantiles=mabs.DEFAULT_QUANTILES,
):
    """Summarize the posterior over one branch's conversion rate.

    The posterior is the Beta distribution
    ``Beta(conversions + 1, enrollments - conversions + 1)``.

    Args:
        s (pd.Series): Holds the number of enrollments and number of
            conversions for this branch and metric.
        num_enrollments_label (str, optional): The label in this
            Series for the number of enrollments.
        num_conversions_label (str, optional): The label in this
            Series for the number of conversions.
        quantiles (list, optional): The quantiles to return as summary
            statistics.

    Returns:
        A pandas Series of floats; the index contains the stringified
        ``quantiles`` plus ``'mean'``.
    """
    conversions = s.loc[num_conversions_label]
    non_conversions = s.loc[num_enrollments_label] - conversions
    posterior = st.beta(conversions + 1, non_conversions + 1)

    quantile_labels = list(map(str, quantiles))

    # Preallocate so the output order is fixed: quantiles, then mean.
    summary = pd.Series(index=[*quantile_labels, "mean"], dtype=float)
    summary[quantile_labels] = posterior.ppf(quantiles)
    summary["mean"] = posterior.mean()
    return summary
def compare_branches_from_agg(
    df,
    ref_branch_label="control",
    num_enrollments_label="num_enrollments",
    num_conversions_label="num_conversions",
    num_samples=10000,
    individual_summary_quantiles=mabs.DEFAULT_QUANTILES,
    comparative_summary_quantiles=mabs.DEFAULT_QUANTILES,
):
    """Jointly sample conversion rates for branches then compare them.

    Calculates various quantiles on the uplift of each non-reference
    branch's sampled conversion rates with respect to the reference
    branch's sampled conversion rates.

    The data in `df` is modelled as being generated binomially, with a
    Beta(1, 1) (uniform) prior over the conversion rate parameter.

    Args:
        df: A pandas dataframe of integers.

            * ``df.index`` lists the experiment branches
            * ``df.columns`` is
              ``[num_enrollments_label, num_conversions_label]``
        ref_branch_label (str, optional): Label for the df row
            containing data for the reference (usually control) branch.
        num_enrollments_label: Label for the df column containing the
            number of enrollments in each branch.
        num_conversions_label: Label for the df column containing the
            number of conversions in each branch.
        num_samples: The number of samples to compute.
        individual_summary_quantiles (list, optional): Quantiles
            reported for each branch's own conversion-rate posterior.
        comparative_summary_quantiles (list, optional): Quantiles
            reported for each branch's comparison against the reference
            branch.

    Returns a dictionary:

        * 'individual': dictionary mapping branch names to a pandas
          Series of summary stats for the posterior distribution over
          the branch's conversion rate.
        * 'comparative': dictionary mapping each non-reference branch
          name to a pandas Series of summary statistics for the
          possible uplifts of the conversion rate relative to the
          reference branch - see docs for
          :meth:`mozanalysis.bayesian_stats.summarize_joint_samples`.

    Raises:
        AssertionError: If ``ref_branch_label`` is not in ``df.index``.
    """
    # Raised explicitly rather than via an `assert` statement so the
    # check still runs under `python -O`. The exception type and message
    # are kept the same for backward compatibility.
    if ref_branch_label not in df.index:
        raise AssertionError("What's the reference branch?")

    samples = get_samples(df, num_enrollments_label, num_conversions_label, num_samples)

    return {
        "individual": {
            b: summarize_one_branch_from_agg(
                df.loc[b],
                num_enrollments_label,
                num_conversions_label,
                quantiles=individual_summary_quantiles,
            )
            for b in df.index
        },
        "comparative": {
            b: mabs.summarize_joint_samples(
                samples[b],
                samples[ref_branch_label],
                quantiles=comparative_summary_quantiles,
            )
            # The reference branch is not compared against itself.
            for b in df.index.drop(ref_branch_label)
        },
    }
def get_samples(df, num_enrollments_label, num_conversions_label, num_samples):
    """Draw conversion-rate samples from each branch's Beta posterior.

    Assumes a Beta(1, 1) prior, so each branch is sampled from
    ``Beta(conversions + 1, enrollments - conversions + 1)``.

    Args:
        df: A pandas dataframe of integers:

            * ``df.index`` lists the experiment branches
            * ``df.columns`` is
              ``(num_enrollments_label, num_conversions_label)``
        num_enrollments_label: Label for the df column containing the
            number of enrollments in each branch.
        num_conversions_label: Label for the df column containing the
            number of conversions in each branch.
        num_samples: The number of samples to compute.

    Returns a pandas.DataFrame of sampled conversion rates

        * columns: list of branches
        * index: enumeration of samples
    """
    draws = pd.DataFrame(index=np.arange(num_samples), columns=df.index)

    # Branches are sampled in row order from numpy's global RNG, so a
    # seeded run is reproducible.
    for branch, counts in df.iterrows():
        successes = counts.loc[num_conversions_label]
        failures = counts.loc[num_enrollments_label] - successes
        # The +1 on each shape parameter is the Beta(1, 1) prior.
        draws[branch] = np.random.beta(successes + 1, failures + 1, size=num_samples)

    return draws