from presc.evaluations.utils import is_discrete
from presc.utils import include_exclude_list
from presc.configuration import PrescConfig
from presc import global_config
from numpy import histogram, histogram_bin_edges
from pandas import Series, MultiIndex
import matplotlib.pyplot as plt
from confuse import ConfigError
def _histogram_bin_labels(bin_edges):
    """Build interval labels for histogram bins.

    Parameters
    ----------
    bin_edges : sequence of numbers
        Bin edges, eg. as generated by `numpy.histogram_bin_edges`. A
        sequence of `k` edges describes `k - 1` bins.

    Returns
    -------
    Series
        One string label per bin, formatted as `"[lower, upper)"` using the
        `g` number format. The last bin is closed on the right
        (`"[lower, upper]"`). If fewer than two edges are supplied there are
        no bins, and an empty Series is returned.
    """
    edges = list(bin_edges)
    labs = [f"[{lower:g}, {upper:g})" for lower, upper in zip(edges, edges[1:])]
    if not labs:
        # No bins to label: avoid indexing into an empty list below.
        return Series([], dtype=object)
    # The final bin includes its right endpoint.
    labs[-1] = labs[-1][:-1] + "]"
    return Series(labs)
def compute_conditional_distribution(
    data_col, true_labs, pred_labs, as_categorical=False, binning="fd", common_bins=True
):
    """Compute a distributional summary of a data column.

    The values of the data column are split into groups according to each
    combination of (true label, predicted label). Within each group, the
    distribution is summarized as counts over the column's unique values
    (categorical) or over histogram bins partitioning its range (numeric).

    Parameters
    ----------
    data_col : Series
        A column of data from a test dataset.
    true_labs : Series
        A series of true labels for the test dataset.
    pred_labs : Series
        A series of labels predicted by a model for the test dataset.
    as_categorical : bool
        Should the data column be treated as categorical, ie. binned
        on its unique values? This is forced to `True` whenever
        `is_discrete(data_col)` holds.
    binning : int or str
        Binning scheme to use for a numerical column, passed to
        `numpy.histogram` / `numpy.histogram_bin_edges` as `bins`.
        Can be a fixed number of bins or a string indicating a binning scheme.
    common_bins : bool
        Should the bins be computed over the entire column and shared
        across groups (`True`) or computed within each group (`False`)?

    Returns
    -------
    ConditionalDistributionResult
        `vals` holds the counts indexed by (true label, predicted label, bin).
        `bins` holds the bin values/endpoints: a flat Series when
        `common_bins` is `True`, otherwise a Series indexed by
        (true label, predicted label) listing the values/endpoints of each
        group.
    """
    grouping = [true_labs, pred_labs]
    if is_discrete(data_col):
        as_categorical = True

    if as_categorical:
        # Count occurrences of each data value within each label group.
        grouping.append(data_col)
        distribs = data_col.groupby(grouping).size()
        if common_bins:
            # Extend the index in each label group to include all data values,
            # filling in a count of 0 for values absent from a group.
            data_vals = distribs.index.get_level_values(-1).unique()
            y_vals = distribs.index.droplevel(-1).unique()
            full_ind = MultiIndex.from_tuples(
                [(yt, yp, x) for yt, yp in y_vals.values for x in data_vals],
                names=distribs.index.names,
            )
            distribs = distribs.reindex(index=full_ind, fill_value=0)
            bin_edges = Series(data_vals)
        else:
            # Convert the innermost index level to a Series of bin values,
            # indexed by the remaining (true label, predicted label) levels.
            bin_edges = distribs.rename(None).reset_index(level=-1).iloc[:, 0]
    else:
        if common_bins:
            # Fix the bin edges once over the full column so that every group
            # is histogrammed against the same bins.
            bins = histogram_bin_edges(data_col, bins=binning)
        else:
            bins = binning
        # distribs will be a series with values (<hist_values>, <bin_edges>)
        distribs = data_col.groupby(grouping).apply(lambda x: histogram(x, bins=bins))
        bin_edges = distribs.map(lambda x: x[1])

        # Build a 3-level index (true label, predicted label, bin label)
        # matching the flattened histogram counts.
        bin_ind_tuples = []
        for y in distribs.index:
            bin_ind_tuples.extend(
                [(y[0], y[1], x) for x in _histogram_bin_labels(bin_edges.loc[y])]
            )
        index_with_bins = MultiIndex.from_tuples(
            bin_ind_tuples, names=distribs.index.names + [None]
        )
        distribs = Series(
            distribs.map(lambda x: x[0]).explode().values, index=index_with_bins
        )

        if common_bins:
            # Retain the unique bin edges as an array
            bin_edges = Series(bin_edges.iloc[0])
        else:
            # Flatten the per-group arrays so that the endpoints of each group
            # are listed under its (true label, predicted label) index entry,
            # rather than stored as a single array-valued cell per group.
            bin_edges = bin_edges.explode().astype(float)

    return ConditionalDistributionResult(
        vals=distribs,
        bins=Series(bin_edges),
        categorical=as_categorical,
        binning=binning,
        common_bins=common_bins,
    )
class ConditionalDistributionResult:
    """Result of the conditional distribution computation for a single column of data.

    Attributes
    ----------
    vals : Series
        A Series listing the bin counts for each group, indexed by (<true_label>,
        <predicted_label>, <bin_label>).
    bins: Series
        A Series listing the bin endpoints. If `common_bins` is `False`,
        this should be indexed by (<true_label>, <predicted_label>) and list
        the endpoints for each group. If the data was treated as numeric, this
        will have length `len(vals)+1` (within each group), otherwise
        `len(vals)`.
    categorical : bool
        Was the feature treated as categorical?
    binning : str
        The binning scheme used
    common_bins : bool
        Were common bins used across all groups?
    """

    def __init__(self, vals, bins, categorical, binning, common_bins):
        self.vals = vals
        self.bins = bins
        self.categorical = categorical
        self.binning = binning
        self.common_bins = common_bins

    def _group_bins(self, y_true, y_pred):
        """Return the bins applying to one (true, predicted) group as a Series.

        When `bins` is not indexed by a MultiIndex, it is shared by all
        groups and returned as is. Otherwise the entries for the requested
        group are looked up and always returned as a Series.
        """
        if not isinstance(self.bins.index, MultiIndex):
            return self.bins
        group_bins = self.bins.loc[(y_true, y_pred)]
        if isinstance(group_bins, Series):
            return group_bins
        # A full-key lookup on a unique MultiIndex yields the bare cell value
        # rather than a Series.
        if isinstance(group_bins, str) or not hasattr(group_bins, "__iter__"):
            # Single bin value for this group: wrap it.
            return Series([group_bins])
        # Array-like cell holding all of the group's endpoints: expand it.
        return Series(group_bins)

    def display_result(self, xlab):
        """Display the distributions for the given data column.

        One plot is shown per (true label, predicted label) group: a bar
        chart for categorical data, otherwise a histogram drawn from the
        precomputed bin counts.

        Parameters
        ----------
        xlab : str
            Label to display on the x-axis.
        """
        for y_true, y_pred in self.vals.index.droplevel(-1).unique():
            counts = self.vals.loc[(y_true, y_pred)]
            bins = self._group_bins(y_true, y_pred)

            if self.categorical:
                plt.bar(
                    bins.astype("str"),
                    counts,
                )
            else:
                edges = bins.values
                # Reproduce the precomputed histogram: place one point at the
                # midpoint of each bin, weighted by that bin's count.
                plt.hist(
                    (edges[:-1] + edges[1:]) / 2,
                    bins=len(counts),
                    weights=counts,
                    range=(edges.min(), edges.max()),
                )

            plt.xlabel(xlab)
            plt.ylabel("Frequency")
            plt.title(f"Group: {y_true}_predicted_as_{y_pred}")
            plt.show(block=False)
class ConditionalDistribution:
    """Evaluation of feature distributions conditional on prediction outcomes.

    Attributes
    ----------
    model :
        The ClassificationModel to run the evaluation for.
    test_dataset : presc.dataset.Dataset
        A Dataset to use for evaluation.
    settings : dict
        An optional dict of option values under
        `evaluations.conditional_distribution`, eg. `{"computation.binning": 5}`.
        These apply to this instance only and do not change the global config.
    config : presc.configuration.PrescConfig
        An optional PrescConfig instance to read options from. Values given in
        `settings` take precedence over it.
    """

    def __init__(self, model, test_dataset, settings=None, config=None):
        # Fall back to the global config when none is supplied, and wrap it in
        # a new PrescConfig on which the instance-level settings are applied.
        base_config = config or global_config
        self._config = PrescConfig(base_config)
        if settings:
            self._config.set({"evaluations": {"conditional_distribution": settings}})

        self._model = model
        self._test_dataset = test_dataset
        # Predictions for the test set are computed once, up front.
        predictions = self._model.predict_labels(test_dataset)
        self._test_pred = predictions.rename("predicted")

    def compute_for_column(self, colname, **kwargs):
        """Compute the evaluation for the given dataset column.

        Option values are resolved in increasing order of precedence: the
        instance config, any column-specific overrides found in the config
        under `computation.columns.<colname>`, then `kwargs`.

        Parameters
        ----------
        colname : str
            A column in the dataset to compute distributions for.
        kwargs:
            On-the-fly overrides to the config option values for the computation.

        Returns
        -------
        ConditionalDistributionResult
        """
        # Overrides are applied to a fresh PrescConfig built from the instance
        # config rather than to `self._config` directly.
        eval_view = PrescConfig(self._config)["evaluations"]["conditional_distribution"]
        options = eval_view["computation"]

        column_view = options["columns"][colname]
        try:
            column_settings = column_view.get()
        except ConfigError:
            # No usable column-specific settings for this column.
            column_settings = None

        # Column-level overrides first, then explicit keyword overrides.
        for overrides in (column_settings, kwargs):
            if overrides:
                options.set(overrides)

        dataset = self._test_dataset
        return compute_conditional_distribution(
            data_col=dataset.df[colname],
            true_labs=dataset.labels,
            pred_labs=self._test_pred,
            as_categorical=options["as_categorical"].get(bool),
            binning=options["binning"].get(),
            common_bins=options["common_bins"].get(bool),
        )

    def display(self, colnames=None):
        """Compute and display the conditional distribution result for each
        specified column.

        Parameters
        ----------
        colnames : list of str
            A list of column names to run the evaluation over, creating a plot
            for each. If not supplied, defaults to the columns specified in the
            config (`columns_include` / `columns_exclude`).
        """
        if colnames:
            included, excluded = colnames, None
        else:
            selection = self._config["evaluations"]["conditional_distribution"]
            included = selection["columns_include"].get()
            excluded = selection["columns_exclude"].get()

        selected = include_exclude_list(
            self._test_dataset.column_names, included=included, excluded=excluded
        )
        for column in selected:
            self.compute_for_column(column).display_result(xlab=column)