# Source code for mozanalysis.frequentist_stats.sample_size

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
"""module for sample size calculations"""

from collections import UserDict
from datetime import datetime
from math import pi

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.io.formats.style import Styler
from scipy.stats import norm
from statsmodels.stats.power import tt_ind_solve_power, zt_ind_solve_power
from statsmodels.stats.proportion import samplesize_proportions_2indep_onetail

from mozanalysis.bq import BigQueryContext
from mozanalysis.experiment import TimeSeriesResult
from mozanalysis.metrics import Metric
from mozanalysis.segments import Segment
from mozanalysis.sizing import HistoricalTarget
from mozanalysis.utils import get_time_intervals

[docs] class ResultsHolder(UserDict): """ Object to hold results from different methods. It extends the dictionary objects so that users can interact with it like a dictionary with the same keys/values as before, making it backward compatible. """ def __init__( self, *args, metrics: dict | None = None, params: dict | None = None, **kwargs ): """ Args: metrics (list, optional): List of metrics used to generate the results. Defaults to None. params (dict, optional): Parameters used to generated the results. Defaults to None. """ super().__init__(*args, **kwargs) self._metrics = metrics self._params = params # create this attribute to hold only the results data # this dict is what was returned from the sample size # methods historically = { k: v for k, v in if k not in ["metrics", "params"] } @property def metrics(self): """List of metrics used to generate the results. Defaults to None""" return self._metrics @property def params(self): """Parameters used to generated the results. Defaults to None""" return self._params
[docs] @staticmethod def make_friendly_name(ugly_name: str) -> str: """Turns a name into a friendly name by replacing underscores with spaces and capitlizing words other than ones like "per", "of", etc Args: ugly_name (str): name to make pretty Returns: pretty_name (str): reformatted name """ keep_all_lowercase = ["per", "of"] split_name = ugly_name.split("_") split_name = [ el[0].upper() + el[1:] if el not in keep_all_lowercase else el for el in split_name ] pretty_name = " ".join(split_name) return pretty_name
class SampleSizeResultsHolder(ResultsHolder):
    """
    Object to hold results from different methods.

    It extends the dictionary objects so that users can interact with it
    like a dictionary with the same keys/values as before, making it
    backward compatible.

    The dictionary functionality is extended to include additional
    attributes to hold metadata, a method for plotting results and a
    method for returning results as a dataframe.
    """

    @property
    def dataframe(self) -> pd.DataFrame:
        """dataframe property returns data as a dataframe rather than a dict

        The results dict is keyed by metric, so the frame is transposed to
        get one row per metric and one column per result value.
        """
        return pd.DataFrame(self.data).transpose()

    def plot_results(self, result_name: str = "sample_size_per_branch"):
        """plots the outputs of the sampling methods

        Args:
            result_name (str): sample size method output to plot.
                Defaults to sample_size_per_branch
        """
        # Map each metric's name to a display label: its own friendly name
        # when it has a non-empty one, otherwise one derived from the name.
        nice_metric_map = {
            el.name: (
                getattr(el, "friendly_name", None)
                or self.make_friendly_name(el.name)
            )
            for el in (self._metrics or [])
        }
        nice_result = self.make_friendly_name(result_name)

        df = self.dataframe.rename(index=nice_metric_map)
        df[result_name].plot(kind="bar")
        plt.ylabel(nice_result)
        plt.xlabel("Metric")
class EmpiricalEffectSizeResultsHolder(ResultsHolder):
    """ResultsHolder for empirical_effect_size_sample_size_calc"""

    def style_empirical_sizing_result(
        self, empirical_sizing_df: pd.DataFrame
    ) -> Styler:
        """Pretty-print the DF returned by `empirical_sizing()`.

        Args:
            empirical_sizing_df (pd.DataFrame): results dataframe; must
                contain the six columns selected below.

        Returns:
            Styler: a pandas Styler object.
        """
        shown_columns = [
            "relative_effect_size",
            "population_percent_per_branch",
            "sample_size_per_branch",
            "effect_size_value",
            "mean_value",
            "std_dev_value",
        ]
        short_names = {
            "relative_effect_size": "rel_effect_size",
            "population_percent_per_branch": "branch_pop_pct",
            "sample_size_per_branch": "branch_sample_size",
        }
        return (
            empirical_sizing_df[shown_columns]
            .rename(columns=short_names)
            # default format first, then per-column overrides
            .style.format(thousands=",", precision=4)
            .format(subset="branch_sample_size", thousands=",", precision=0)
            .format("{:.2f}%", subset="branch_pop_pct")
            .format("{:.2%}", subset="rel_effect_size")
        )

    @property
    def dataframe(self) -> pd.DataFrame:
        """dataframe property returns results consolidated into a dataframe

        Nested dict values are flattened into ``<key>_<subkey>`` columns,
        with the ``period_start_day`` subkey shortened to ``period``.
        The result has one row per metric.
        """
        formatted_results = {}
        for metric_name, metric_results in self.data.items():
            flat = {}
            for key, value in metric_results.items():
                if isinstance(value, dict):
                    for sub_key, sub_value in value.items():
                        suffix = "period" if sub_key == "period_start_day" else sub_key
                        flat[f"{key}_{suffix}"] = sub_value
                else:
                    flat[key] = value
            formatted_results[metric_name] = flat

        df = pd.DataFrame.from_dict(formatted_results, orient="index")

        return df[
            [
                "relative_effect_size",
                "effect_size_value",
                "mean_value",
                "std_dev_value",
                "sample_size_per_branch",
                "population_percent_per_branch",
                "effect_size_period",
                "mean_period",
                "std_dev_period",
            ]
        ]

    def get_styled_dataframe(self) -> Styler:
        """returns styled dataframe for results from
        empirical_effect_size_sample_size_calc

        Returns:
            Styler: styled dataframe for visualization
        """
        return self.style_empirical_sizing_result(self.dataframe)
class SampleSizeCurveResultHolder(ResultsHolder):
    """ResultsHolder for the output of sample_size_curves.

    On construction the per-simulated-value dataframes are concatenated
    into ``raw_df`` (indexed by metric and simulated value), and the
    dictionary data is replaced by one dataframe per metric name.
    """

    def __init__(self, *args, **kwargs):
        # uses the __init__ method for ResultsHolder
        super().__init__(*args, **kwargs)

        # need to modify results so it matches the old format
        sim_var = self._params["sim_var"]
        self.raw_df = pd.concat(
            [el.set_index(sim_var, append=True) for el in self.data.values()]
        )

        self.data = {
            el.name: self.raw_df.loc[el.name, :].reset_index()
            for el in self._metrics
        }

    def set_raw_data_stats(self, input_data: pd.DataFrame) -> pd.DataFrame:
        """Compute raw and outlier-trimmed summary stats for each metric.

        The stats are stored on ``self._raw_data_stats`` and also returned.

        Args:
            input_data (pd.DataFrame): metric data with one column per
                metric name.

        Returns:
            pd.DataFrame: one row per metric with columns mean, std,
            mean_trimmed, std_trimmed, trim_change_mean, trim_change_std.
        """
        # params store the percentile on a 0-100 scale; quantile wants 0-1
        outlier_percentile = self._params["outlier_percentile"] / 100

        overall_stats = (
            input_data[[m.name for m in self._metrics]]
            .agg(
                [
                    "mean",
                    "std",
                    lambda d: d[d <= d.quantile(outlier_percentile)].mean(),
                    lambda d: d[d <= d.quantile(outlier_percentile)].std(),
                ]
            )
            .transpose()
        )
        overall_stats.columns = ["mean", "std", "mean_trimmed", "std_trimmed"]

        # relative change in each stat caused by trimming
        overall_stats["trim_change_mean"] = (
            overall_stats["mean_trimmed"] - overall_stats["mean"]
        ).abs() / overall_stats["mean"]
        overall_stats["trim_change_std"] = (
            overall_stats["std_trimmed"] - overall_stats["std"]
        ).abs() / overall_stats["std"]

        self._raw_data_stats = overall_stats
        return overall_stats

    @property
    def dataframe(self) -> pd.DataFrame:
        """dataframe property: the concatenated results for all metrics
        and simulated values"""
        return self.raw_df

    def get_styled_dataframe(
        self,
        input_data: pd.DataFrame | None = None,
        show_population_pct: bool = True,
        simulated_values: list[float] | None = None,
        append_stats: bool = False,
        highlight_lessthan: list[float] | None = None,
        trim_highlight_threshold: float = 0.15,
    ) -> Styler:
        """
        Returns styled dataframe useful for visualization

        Args:
            input_data (pd.DataFrame, optional): Metric data used for
                summary stats. Required when `append_stats` is True.
                Defaults to None.
            show_population_pct (bool, optional): Controls whether output is
                a percent of population or a count. Defaults to True.
            simulated_values (List[float], optional): List of values that
                were varied to create curves. Defaults to None, in which
                case all simulated values are shown.
            append_stats (bool, optional): Controls whether or not to append
                summary stats to output dataframe. Defaults to False.
            highlight_lessthan (List[float], optional): list of sample size
                thresholds to highlight in the results. For each threshold,
                sample sizes lower than it (but higher than any other
                thresholds) are highlighted in a predefined colour.
                When `show_population_pct` is `True`, thresholds should be
                expressed as a percentage between 0 and 100, not a decimal
                between 0 and 1 (for example, to set a threshold for 5%,
                supply `[5]`). At most 3 different thresholds are supported:
                only the 3 lowest thresholds supplied will be used, and any
                others are silently ignored. Defaults to None.
            trim_highlight_threshold (float, optional): if summary stats are
                shown, cases for which the trimmed mean differs from the raw
                mean by more than this threshold are highlighted. These
                metrics are strongly affected by outliers. The threshold
                should be a relative difference value between 0 and 1.
                Defaults to 0.15.

        Returns:
            Styler: styled output

        Raises:
            ValueError: if `append_stats` is True but `input_data` is None.
        """
        # choose which column to output
        if show_population_pct:
            subset_col = "population_percent_per_branch"
        else:
            subset_col = "sample_size_per_branch"

        # reformat raw_df to make different simulated values into columns
        if simulated_values is None:
            simulated_values = self._params["simulated_values"]
            pretty_df = self.raw_df[subset_col].unstack()
        else:
            pretty_df = self.raw_df[subset_col].unstack()[simulated_values]

        # get stats if input_data is non-null and append_stats is True
        if append_stats:
            if input_data is None:
                raise ValueError("append_stats is true but no raw data was provided")
            self.set_raw_data_stats(input_data)
            pretty_df = pd.concat([pretty_df, self._raw_data_stats], axis="columns")

        # Round displayed simulated values to the precision of the smallest
        # one for readability; clamp at 0 so values >= 10 do not produce a
        # negative precision.
        header_precision = max(
            0, int(-np.floor(np.log10(simulated_values)).min())
        )
        disp = pretty_df.style.format(
            "{:.2f}%" if show_population_pct else "{:,.0f}",
            subset=simulated_values,
        ).format_index(axis="columns", precision=header_precision)

        if append_stats:
            large_change_format_str = "color:maroon; font-weight:bold"
            disp = disp.set_properties(
                subset=self._raw_data_stats.columns,
                **{"background-color": "tan", "color": "black"},
            ).format("{:.2%}", subset=["trim_change_mean", "trim_change_std"])
            # Styler.applymap was renamed Styler.map in newer pandas; use
            # whichever this pandas version provides.
            elementwise_style = getattr(disp, "map", None) or disp.applymap
            # highlight large changes in mean because of trimming
            disp = elementwise_style(
                lambda x: (
                    large_change_format_str if x > trim_highlight_threshold else ""
                ),
                subset=["trim_change_mean"],
            )

        # Colours chosen to work reasonably well in either light or dark mode.
        # Assigned in order from the lowest to the highest cutoff.
        highlight_colours = ["lightgreen", "skyblue", "gold"]
        if highlight_lessthan:
            # Only 3 cutoffs supported. Any others are silently dropped
            cutoffs = sorted(highlight_lessthan)[:3]
            # Apply from highest to lowest cutoff so that lower cutoffs
            # overwrite higher
            for lim, colour in reversed(list(zip(cutoffs, highlight_colours))):
                disp = disp.highlight_between(
                    subset=simulated_values,
                    right=lim,
                    props=f"background-color:{colour};color:black",
                )

        return disp
def sample_size_curves(
    df: pd.DataFrame,
    metrics_list: list,
    solver,
    effect_size: float | np.ndarray | pd.Series | list[float] = 0.01,
    power: float | np.ndarray | pd.Series | list[float] = 0.80,
    alpha: float | np.ndarray | pd.Series | list[float] = 0.05,
    **solver_kwargs,
) -> SampleSizeCurveResultHolder:
    """
    Loop over a list of different parameters to produce sample size estimates
    given those parameters. A single parameter in [effect_size, power, alpha]
    should be passed a list; the sample size curve will be calculated with this
    as the variable.

    Args:
        df: A pandas DataFrame of queried historical data.
        metrics_list (list of mozanalysis.metrics.Metric): List of metrics
            used to construct the results df from HistoricalTarget. The names
            of these metrics are used to return results for sample size
            calculation for each.
        solver (any function that returns sample size as function of
            effect_size, power, alpha): The solver being used to calculate
            sample size. Its return value must expose a `dataframe`
            attribute.
        effect_size (float or ArrayLike, default .01): For test of differences
            in proportions, the absolute difference; for tests of differences
            in mean, the percent change.
        power (float or ArrayLike, default .80): Probability of detecting an
            effect, when a significant effect exists.
        alpha (float or ArrayLike, default .05): Significance level for the
            experiment.
        **solver_kwargs (dict): Arguments necessary for the provided solver.

    Returns:
        SampleSizeCurveResultHolder: The data attribute contains a dictionary
        of pd.DataFrame objects. An item in the dictionary is created for each
        metric in metric_list, containing a DataFrame of sample size per
        branch, number of clients that satisfied targeting, and population
        proportion per branch at each value of the iterable parameter.
        Additional methods for ease of use are documented in the class.

    Raises:
        ValueError: unless exactly one of effect_size, power and alpha is a
            list, numpy array or pandas Series.
    """
    params = {"effect_size": effect_size, "power": power, "alpha": alpha}
    # isinstance (rather than an exact type match) also accepts subclasses
    # of the supported array-like types.
    array_like = (list, np.ndarray, pd.Series)
    sim_var = [name for name, value in params.items() if isinstance(value, array_like)]

    if len(sim_var) != 1:
        raise ValueError(
            "Exactly one of effect_size, power, and alpha must be ArrayLike"
        )

    sim_var = sim_var[0]
    # the remaining two entries of params are the fixed (scalar) settings
    test_vals = params.pop(sim_var)

    results = {}
    for v in test_vals:
        sample_sizes = solver(
            df, metrics_list, **{sim_var: v}, **params, **solver_kwargs
        ).dataframe
        sample_sizes[sim_var] = v

        results[v] = sample_sizes

    # add sim_var to metadata
    params["sim_var"] = sim_var
    params["simulated_values"] = test_vals

    return SampleSizeCurveResultHolder(
        results, metrics=metrics_list, params={**params, **solver_kwargs}
    )
def difference_of_proportions_sample_size_calc(
    df: pd.DataFrame,
    metrics_list: list[Metric],
    effect_size: float = 0.01,
    alpha: float = 0.05,
    power: float = 0.90,
    outlier_percentile: float = 99.5,
) -> SampleSizeResultsHolder:
    """
    Perform sample size calculation for an experiment to test for a
    difference in proportions.

    Args:
        df: A pandas DataFrame of queried historical data.
        metrics_list (list of mozanalysis.metrics.Metric): List of metrics
            used to construct the results df from HistoricalTarget. The names
            of these metrics are used to return results for sample size
            calculation for each
        effect_size (float, default .01): Difference in proportion for the
            minimum detectable effect --
            effect_size = p(event under alt) - p(event under null)
        alpha (float, default .05): Significance level for the experiment.
        power (float, default .90): Probability of detecting an effect,
            when a significant effect exists.
        outlier_percentile (float, default 99.5): Percentile, on a 0-100
            scale, at which to trim each column; values above it are
            excluded when computing the baseline mean.

    Returns:
        SampleSizeResultsHolder: The data attribute contains a dictionary.
        Keys in the dictionary are the metrics column names from
        the DataFrame; values are dictionaries with the required sample size
        per branch to achieve the desired power for that metric, that size
        as a percent of the rows in `df`, and the number of rows in `df`.
        Additional methods for ease of use are documented in the class.
    """

    def _get_sample_size_col(col):
        cutoff = np.percentile(df[col], q=[outlier_percentile])[0]
        baseline = df.loc[df[col] <= cutoff, col].mean()
        # proportion under the alternative = trimmed baseline + effect
        p2 = baseline + effect_size

        return samplesize_proportions_2indep_onetail(
            diff=effect_size, prop2=p2, power=power, ratio=1, alpha=alpha, value=0
        )

    n_clients = len(df)
    results = {}
    for col in (m.name for m in metrics_list):
        sample_size = _get_sample_size_col(col)
        results[col] = {
            "sample_size_per_branch": sample_size,
            "population_percent_per_branch": 100.0 * (sample_size / n_clients),
            "number_of_clients_targeted": n_clients,
        }

    params = {
        "effect_size": effect_size,
        "alpha": alpha,
        "power": power,
        "outlier_percentile": outlier_percentile,
    }

    return SampleSizeResultsHolder(results, metrics=metrics_list, params=params)
def z_or_t_ind_sample_size_calc(
    df: pd.DataFrame,
    metrics_list: list[Metric],
    test: str = "z",
    effect_size: float = 0.01,
    alpha: float = 0.05,
    power: float = 0.90,
    outlier_percentile: float = 99.5,
) -> SampleSizeResultsHolder:
    """
    Perform sample size calculation for an experiment based on independent
    samples t or z tests.

    Args:
        df: A pandas DataFrame of queried historical data.
        metrics_list (list of mozanalysis.metrics.Metric): List of metrics
            used to construct the results df from HistoricalTarget. The names
            of these metrics are used to return results for sample size
            calculation for each
        test (str, default `z`): `z` or `t` to indicate which solver to use
            (`normal` is accepted as an alias for `z`)
        effect_size (float, default .01): Relative change in metrics
            expected as a result of the experiment treatment (.01 is 1%)
        alpha (float, default .05): Significance level for the experiment.
        power (float, default .90): Probability of detecting an effect,
            when a significant effect exists.
        outlier_percentile (float, default 99.5): Percentile, on a 0-100
            scale, at which to trim each column; values above it are
            excluded when computing the mean and standard deviation.

    Returns:
        SampleSizeResultsHolder: The data attribute contains a dictionary.
        Keys in the dictionary are the metrics column names from
        the DataFrame; values are dictionaries with the required sample size
        per branch to achieve the desired power for that metric, that size
        as a percent of the rows in `df`, and the number of rows in `df`.
        Additional methods for ease of use are documented in the class.

    Raises:
        KeyError: if `test` is not one of `normal`, `z` or `t`.
    """
    tests = {
        "normal": zt_ind_solve_power,
        "z": zt_ind_solve_power,
        "t": tt_ind_solve_power,
    }
    solver = tests[test]

    def _get_sample_size_col(col):
        cutoff = np.percentile(df[col], q=[outlier_percentile])[0]
        trimmed = df.loc[df[col] <= cutoff, col]
        sd = trimmed.std()
        mean = trimmed.mean()
        # standardized effect size: absolute change in units of std dev
        es = (effect_size * mean) / sd

        # nobs1=None asks the solver to solve for the sample size
        return solver(effect_size=es, alpha=alpha, power=power, nobs1=None)

    n_clients = len(df)
    results = {}
    for col in (m.name for m in metrics_list):
        sample_size = _get_sample_size_col(col)
        results[col] = {
            "sample_size_per_branch": sample_size,
            "population_percent_per_branch": 100.0 * (sample_size / n_clients),
            "number_of_clients_targeted": n_clients,
        }

    params = {
        "effect_size": effect_size,
        "alpha": alpha,
        "power": power,
        "outlier_percentile": outlier_percentile,
        "solver": solver,
        "test": test,
    }

    return SampleSizeResultsHolder(results, metrics=metrics_list, params=params)
def empirical_effect_size_sample_size_calc(
    res: TimeSeriesResult,
    bq_context: BigQueryContext,
    metric_list: list,
    quantile: float = 0.90,
    power: float = 0.80,
    alpha: float = 0.05,
    parent_distribution: str = "normal",
    plot_effect_sizes: bool = False,
) -> EmpiricalEffectSizeResultsHolder:
    """
    Perform sample size calculation with empirical effect size and
    asymptotic approximation of Wilcoxen-Mann-Whitney U Test. Empirical effect
    size is estimated using a quantile of week-to-week changes over
    the course of the study, and the variance in the test statistic is
    estimated as a quantile of weekly variance in metrics. Sample
    size calculation is based on the asymptotic relative efficiency (ARE) of
    the U test to the T test (see Stapleton 2008, pg 266, or the G*Power
    manual).

    Args:
        res: A TimeSeriesResult, generated by
            mozanalysis.sizing.HistoricalTarget.get_time_series_data.
        bq_context: A mozanalysis.bq.BigQueryContext object that handles
            downloading time series data from BigQuery.
        metric_list (list of mozanalysis.metrics.Metric): List of metrics
            used to construct the results df from HistoricalTarget. The names
            of these metrics are used to return results for sample size
            calculation for each.
        quantile (float, default .90): Quantile used to calculate the effect
            size as the quantile of week-to-week metric changes and the
            variance of the mean.
        power (float, default .80): Probability of detecting an effect,
            when a significant effect exists.
        alpha (float, default .05): Significance level for the experiment.
        parent_distribution (str, default "normal"): Distribution of the
            parent data; must be normal, uniform, logistic, or laplace.
        plot_effect_sizes (bool, default False): Whether or not to plot the
            distribution of effect sizes observed in historical data.

    Returns:
        EmpiricalEffectSizeResultsHolder: The data attribute contains a
        dictionary. Keys in the dictionary are the metrics column names from
        the DataFrame; values are dictionaries containing the required sample
        size per branch to achieve the desired power for that metric, along
        with additional information. Additional methods for ease of use are
        documented in the class.

    Raises:
        ValueError: if `parent_distribution` is not a supported distribution
            (raised when the first metric is processed).
    """

    def _mann_whitney_solve_sample_size_approximation(
        effect_size, std, alpha=0.05, power=0.8, parent_distribution="normal"
    ):
        """
        Wilcoxen-Mann-Whitney rank sum test sample size calculation,
        based on asymptotic efficiency relative to the t-test.
        """
        rel_effect_size = effect_size / std
        # multiplier applied to the t-test sample size for each
        # parent distribution
        are = {
            "uniform": 1.0,
            "normal": pi / 3.0,
            "logistic": 9.0 / (pi**2),
            "laplace": 2.0 / 3.0,
        }

        if parent_distribution not in are:
            raise ValueError(f"Parent distribution must be in {are.keys()}")

        t_sample_size = tt_ind_solve_power(
            effect_size=rel_effect_size, power=power, alpha=alpha
        )

        return t_sample_size * are[parent_distribution]

    res_mean, pop_size = res.get_aggregated_data(
        bq_context=bq_context, metric_list=metric_list, aggregate_function="AVG"
    )
    res_mean.sort_values(by="analysis_window_start", ascending=True, inplace=True)

    res_std, _ = res.get_aggregated_data(
        bq_context=bq_context, metric_list=metric_list, aggregate_function="STDDEV"
    )

    size_dict = {}

    for m in metric_list:
        # absolute change in the metric mean between consecutive windows
        res_mean["diff"] = res_mean[m.name].diff().abs()
        if plot_effect_sizes:
            print(f"{m.name}: plotting effect sizes observed in historical data")
            print("Summary statistics")
            print(res_mean["diff"].describe())
            print("Histogram of effect sizes")
            plt.hist(res_mean["diff"], bins=20)

        # interpolation="nearest" returns an observed value, so the equality
        # lookups below can find the window it came from
        m_quantile = res_mean["diff"].quantile(q=quantile, interpolation="nearest")
        m_std = res_std[m.name].quantile(q=quantile, interpolation="nearest")

        effect_size = {
            "value": m_quantile,
            "period_start_day": res_mean.loc[
                res_mean["diff"] == m_quantile, "analysis_window_start"
            ].values[0],
        }
        # the change was measured against the window starting 7 days earlier
        effect_size_base_period = effect_size["period_start_day"] - 7
        metric_value = {
            "value": res_mean.loc[
                res_mean["analysis_window_start"] == effect_size_base_period,
                m.name,
            ].values[0],
            "period_start_day": effect_size_base_period,
        }
        std = {
            "value": m_std,
            "period_start_day": res_std.loc[
                res_std[m.name] == m_std, "analysis_window_start"
            ].values[0],
        }
        sample_size = _mann_whitney_solve_sample_size_approximation(
            effect_size=effect_size["value"],
            std=std["value"],
            power=power,
            alpha=alpha,
            parent_distribution=parent_distribution,
        )
        size_dict[m.name] = {
            "effect_size": effect_size,
            "mean": metric_value,
            "std_dev": std,
            "relative_effect_size": effect_size["value"] / metric_value["value"],
            "sample_size_per_branch": sample_size,
            "number_of_clients_targeted": pop_size,
            "population_percent_per_branch": 100.0 * (sample_size / pop_size),
        }

    params = {
        "quantile": quantile,
        "power": power,
        "alpha": alpha,
        "parent_distribution": parent_distribution,
    }

    return EmpiricalEffectSizeResultsHolder(
        size_dict, metrics=metric_list, params=params
    )
def poisson_diff_solve_sample_size(
    df: pd.DataFrame,
    metrics_list: list[Metric],
    effect_size: float = 0.01,
    alpha: float = 0.05,
    power: float = 0.90,
    outlier_percentile: float = 99.5,
) -> SampleSizeResultsHolder:
    """
    Sample size for test of difference of Poisson rates,
    based on Poisson rate's asymptotic normality.

    Args:
        df: A pandas DataFrame of queried historical data.
        metrics_list (list of mozanalysis.metrics.Metric): List of metrics
            used to construct the results df from HistoricalTarget. The names
            of these metrics are used to return results for sample size
            calculation for each
        effect_size (float, default .01): Relative change in metrics
            expected as a result of the experiment treatment (.01 is 1%)
        alpha (float, default .05): Significance level for the experiment.
        power (float, default .90): Probability of detecting an effect,
            when a significant effect exists.
        outlier_percentile (float, default 99.5): Percentile, on a 0-100
            scale, at which to trim each column; values above it are
            excluded when computing the mean and standard deviation.

    Returns:
        SampleSizeResultsHolder: The data attribute contains a dictionary.
        Keys in the dictionary are the metrics column names from
        the DataFrame; values are dictionaries with the required sample size
        per branch to achieve the desired power for that metric, that size
        as a percent of the rows in `df`, and the number of rows in `df`.
        Additional methods for ease of use are documented in the class.
    """

    def _get_sample_size_col(col):
        cutoff = np.percentile(df[col], q=[outlier_percentile])[0]
        trimmed = df.loc[df[col] <= cutoff, col]
        sd = trimmed.std()
        mean = trimmed.mean()
        # standardized effect size: absolute change in units of std dev
        es = (effect_size * mean) / sd

        # two-sided critical value and the quantile for the target power
        z_alpha = norm.ppf(1 - alpha / 2)
        z_power = norm.ppf(power)

        denom = (es / (z_alpha + z_power)) ** 2
        return (mean + es) / denom

    n_clients = len(df)
    results = {}
    for col in (m.name for m in metrics_list):
        sample_size = _get_sample_size_col(col)
        results[col] = {
            "sample_size_per_branch": sample_size,
            "population_percent_per_branch": 100.0 * (sample_size / n_clients),
            "number_of_clients_targeted": n_clients,
        }

    params = {
        "effect_size": effect_size,
        "alpha": alpha,
        "power": power,
        "outlier_percentile": outlier_percentile,
    }

    return SampleSizeResultsHolder(results, metrics=metrics_list, params=params)
def variable_enrollment_length_sample_size_calc(
    bq_context: BigQueryContext,
    start_date: str | datetime,
    max_enrollment_days: int,
    analysis_length: int,
    metric_list: list[Metric],
    target_list: list[Segment],
    variable_window_length: int = 7,
    experiment_name: str | None = "",
    app_id: str | None = "",
    to_pandas: bool = True,
    **sizing_kwargs,
) -> dict[str, dict[str, int] | pd.DataFrame]:
    """
    Sample size calculation over a variable enrollment window. This function
    will fetch a DataFrame with metrics defined in metric_list for a target
    population defined in the target_list over an enrollment window of length
    max_enrollment_days. Sample size calculation is performed
    using clients enrolled in the first variable_window_length dates in that
    max enrollment window; that window is incrementally widened by the variable
    window length and sample size calculation performed again, until the last
    enrollment date is reached.

    Args:
        bq_context: A mozanalysis.bq.BigQueryContext object that handles
            downloading data from BigQuery.
        start_date (str or datetime in %Y-%m-%d format): First date of
            enrollment for sizing job.
        max_enrollment_days (int): Maximum number of dates to consider for the
            enrollment period for the experiment in question.
        analysis_length (int): Number of days to record metrics for each client
            in the experiment in question.
        metric_list (list of mozanalysis.metrics.Metric): List of metrics
            used to construct the results df from HistoricalTarget. The names
            of these metrics are used to return results for sample size
            calculation for each.
        target_list (list of mozanalysis.segments.Segment): List of segments
            used to identify clients to include in the study.
        variable_window_length (int): Length of the intervals used to extend
            the enrollment period incrementally. Sample sizes are recalculated
            over each variable enrollment period.
        experiment_name (str): Optional name used to name the target and metric
            tables in BigQuery.
        app_id (str): Application that experiment will be run on.
        to_pandas (bool): Currently unused by this function; kept for
            backward compatibility of the signature.
        **sizing_kwargs: Arguments to pass to z_or_t_ind_sample_size_calc

    Returns:
        A dictionary. Keys in the dictionary are the metric names; values
        are DataFrames with one row per enrollment window, holding the
        window's enrollment end date and the sample size results for that
        metric.

    Raises:
        ValueError: if variable_window_length exceeds max_enrollment_days.
    """

    if variable_window_length > max_enrollment_days:
        raise ValueError(
            "Enrollment window length is larger than the max enrollment length."
        )

    ht = HistoricalTarget(
        start_date=start_date,
        analysis_length=analysis_length,
        num_dates_enrollment=max_enrollment_days,
        experiment_name=experiment_name,
        app_id=app_id,
    )

    df = ht.get_single_window_data(
        bq_context=bq_context, metric_list=metric_list, target_list=target_list
    )

    interval_end_dates = get_time_intervals(
        start_date,
        variable_window_length,
        max_enrollment_days,
    )

    def _sample_size_for_window(end_date):
        """Size the experiment using only clients enrolled before end_date."""
        df_interval = df.loc[df["enrollment_date"] < end_date]
        res = z_or_t_ind_sample_size_calc(
            df=df_interval, metrics_list=metric_list, test="t", **sizing_kwargs
        )
        return {
            key: {"enrollment_end_date": end_date, **res[key]} for key in res
        }

    # one list of per-window result rows for each metric
    rows_by_metric = {m.name: [] for m in metric_list}

    for end_date in interval_end_dates:
        window_res = _sample_size_for_window(end_date)
        for m in metric_list:
            rows_by_metric[m.name].append(window_res[m.name])

    return {name: pd.DataFrame(rows) for name, rows in rows_by_metric.items()}