# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import numpy as np
import pandas as pd
import scipy.stats as st
import mozanalysis.bayesian_stats as mabs
[docs]
def compare_branches(
df,
col_label,
ref_branch_label="control",
num_samples=10000,
individual_summary_quantiles=mabs.DEFAULT_QUANTILES,
comparative_summary_quantiles=mabs.DEFAULT_QUANTILES,
):
"""Jointly sample conversion rates for branches then compare them.
See `compare_branches_from_agg` for more details.
Args:
df (pd.DataFrame): Queried experiment data in the standard
format.
col_label (str): Label for the df column contaning the metric
to be analyzed.
ref_branch_label (str, optional): String in ``df['branch']``
that identifies the the branch with respect to which we
want to calculate uplifts - usually the control branch.
num_samples (int, optional): The number of samples to compute.
individual_summary_quantiles (list, optional): Quantiles to
determine the confidence bands on individual branch
statistics. Change these when making Bonferroni corrections.
comparative_summary_quantiles (list, optional): Quantiles to
determine the confidence bands on comparative branch
statistics (i.e. the change relative to the reference
branch, probably the control). Change these when making
Bonferroni corrections.
Returns a dictionary:
* 'individual': dictionary mapping branch names to a pandas
Series of summary stats for the posterior distribution over
the branch's conversion rate.
* 'comparative': dictionary mapping branch names to a pandas
Series of summary statistics for the possible uplifts of the
conversion rate relative to the reference branch - see docs
for
:meth:`mozanalysis.bayesian_stats.summarize_samples.summarize_joint_samples`.
"""
agg_col = aggregate_col(df, col_label)
return compare_branches_from_agg(
agg_col,
ref_branch_label=ref_branch_label,
num_samples=num_samples,
individual_summary_quantiles=mabs.DEFAULT_QUANTILES,
comparative_summary_quantiles=mabs.DEFAULT_QUANTILES,
)
[docs]
def aggregate_col(df, col_label):
"""Return the number of enrollments and conversions per branch.
Args:
df (pd.DataFrame): Queried experiment data in the standard
format.
col_label (str): Label for the df column contaning the metric
to be analyzed.
Returns:
A DataFrame. The index is the list of branches. It has the
following columns:
* num_enrollments: The number of experiment subjects enrolled in
this branch who were eligible for the metric.
* num_conversions: The number of these enrolled experiment subjects
who met the metric's conversion criteria.
"""
# I would have used `isin` but it seems to be ~100x slower?
if not ((df[col_label] == 0) | (df[col_label] == 1)).all():
raise ValueError(f"All values in column '{col_label}' must be 0 or 1.")
return (
df.groupby("branch")[col_label]
.agg(["count", "sum"])
.rename(columns={"count": "num_enrollments", "sum": "num_conversions"})
)
[docs]
def summarize_one_branch_from_agg(
s,
num_enrollments_label="num_enrollments",
num_conversions_label="num_conversions",
quantiles=mabs.DEFAULT_QUANTILES,
):
"""Return stats about a branch's conversion rate.
Calculate and return a Series of summary stats for the posterior
distribution over the branch's conversion rate.
Args:
s (pd.Series): Holds the number of enrollments and number of
conversions for this branch and metric.
num_enrollments_label (str, optional): The label in this Series
for the number of enrollments
num_conversions_label (str, optional): The label in this Series
for the number of conversions
quantiles (list, optional): The quantiles to return as summary
statistics.
Returns:
A pandas Series; the index contains the stringified
``quantiles`` plus ``'mean'``.
"""
beta = st.beta(
s.loc[num_conversions_label] + 1,
s.loc[num_enrollments_label] - s.loc[num_conversions_label] + 1,
)
q_index = [str(v) for v in quantiles]
res = pd.Series(index=q_index + ["mean"], dtype=float)
res[q_index] = beta.ppf(quantiles)
res["mean"] = beta.mean()
return res
[docs]
def compare_branches_from_agg(
df,
ref_branch_label="control",
num_enrollments_label="num_enrollments",
num_conversions_label="num_conversions",
num_samples=10000,
individual_summary_quantiles=mabs.DEFAULT_QUANTILES,
comparative_summary_quantiles=mabs.DEFAULT_QUANTILES,
):
"""Jointly sample conversion rates for two branches then compare them.
Calculates various quantiles on the uplift of the non-control
branch's sampled conversion rates with respect to the control
branch's sampled conversion rates.
The data in `df` is modelled as being generated binomially, with a
Beta(1, 1) (uniform) prior over the conversion rate parameter.
Args:
df: A pandas dataframe of integers.
* ``df.index`` lists the experiment branches
* ``df.columns`` is
``[num_enrollments_label, num_conversions_label]``
ref_branch_label (str, optional): Label for the df row
containing data for the control branch
num_enrollments_label: Label for the df column containing the
number of enrollments in each branch.
num_conversions_label: Label for the df column containing the
number of conversions in each branch.
num_samples: The number of samples to compute
Returns a dictionary:
* 'individual': dictionary mapping branch names to a pandas
Series of summary stats for the posterior distribution over
the branch's conversion rate.
* 'comparative': dictionary mapping branch names to a pandas
Series of summary statistics for the possible uplifts of the
conversion rate relative to the reference branch - see docs
for
:meth:`mozanalysis.stats.summarize_samples.summarize_joint_samples`.
"""
assert ref_branch_label in df.index, "What's the reference branch?"
samples = get_samples(df, num_enrollments_label, num_conversions_label, num_samples)
return {
"individual": {
b: summarize_one_branch_from_agg(
df.loc[b],
num_enrollments_label,
num_conversions_label,
quantiles=individual_summary_quantiles,
)
for b in df.index
},
"comparative": {
b: mabs.summarize_joint_samples(
samples[b],
samples[ref_branch_label],
quantiles=comparative_summary_quantiles,
)
for b in df.index.drop(ref_branch_label)
},
}
[docs]
def get_samples(df, num_enrollments_label, num_conversions_label, num_samples):
"""Return samples from Beta distributions.
Assumes a Beta(1, 1) prior.
Args:
df: A pandas dataframe of integers:
* ``df.index`` lists the experiment branches
* ``df.columns`` is
``(num_enrollments_label, num_conversions_label)``
num_enrollments_label: Label for the df column containing the
number of enrollments in each branch.
num_conversions_label: Label for the df column containing the
number of conversions in each branch.
num_samples: The number of samples to compute
Returns a pandas.DataFrame of sampled conversion rates
* columns: list of branches
* index: enumeration of samples
"""
samples = pd.DataFrame(index=np.arange(num_samples), columns=df.index)
for branch_label, r in df.iterrows():
# Oh, for a better prior...
samples[branch_label] = np.random.beta(
r.loc[num_conversions_label] + 1,
r.loc[num_enrollments_label] - r.loc[num_conversions_label] + 1,
size=num_samples,
)
return samples