Source code for presc.dataset

[docs]class Dataset: """ Convenience API for a dataset used with classification model. Wraps a pandas DataFrame and provides shortcuts to access feature and label columns. It also allows for other columns, eg. computed columns, to be included or added later. Attributes ---------- df : Pandas DataFrame label_col : str The name of the column containing the labels feature_cols : Array of str An array-like of column names corresponding to model features. If not specified, all columns aside from the label column will be assumed to be features. """ def __init__(self, df, label_col, feature_cols=None): self._df = df self._label_col = label_col if feature_cols is None: feature_cols = [c for c in df.columns if c != label_col] self._feature_cols = feature_cols @property def size(self): return self._df.shape[0] @property def feature_names(self): """Returns the feature names as a list.""" return list(self._feature_cols) @property def features(self): """Returns the dataset feature columns.""" return self._df[self._feature_cols] @property def labels(self): """Returns the dataset label column.""" return self._df[self._label_col] @property def other_cols(self): """Returns the dataset columns other than features and label.""" other_cols = [ c for c in self._df.columns if c != self._label_col and c not in self._feature_cols ] return self._df[other_cols] @property def column_names(self): """Returns feature and other column names.""" other_colnames = list(self.other_cols.columns) return self.feature_names + other_colnames @property def df(self): """Returns the underlying DataFrame.""" return self._df
[docs] def subset(self, subset_rows, by_position=False): """ Returns a Dataset corresponding to a subset of this one. Parameters ---------- subset_rows : Selector for the rows to include in the subset (that can be passed to `.loc` or `.iloc`). by_position : bool If `True`, `subset_rows` is interpeted as row numbers (used with `.iloc`). Otherwise, `subset_rows` is used with `.loc`. """ indexer = "iloc" if by_position else "loc" subset_df = getattr(self._df, indexer)[subset_rows] return self.__class__( subset_df, label_col=self._label_col, feature_cols=self._feature_cols )