Source code for presc.copies.copying

from presc.dataset import Dataset
from presc.copies.sampling import mixed_data_sampling, labeling, sampling_balancer
from presc.copies.evaluations import (
    empirical_fidelity_error,
    replacement_capability,
    summary_metrics,
)


class ClassifierCopy:
    """Represents a classifier copy and its associated sampling method of choice.

    Each instance wraps the original ML classifier with an ML classifier copy
    and the sampling method to carry out the copy. Methods allow copying the
    original classifier, evaluating the quality of the copy, and generating
    additional data using the original classifier with the sampling method
    specified on instantiation.

    Attributes
    ----------
    original : sklearn-type classifier
        Original ML classifier to be copied.
    copy : sklearn-type classifier
        ML classifier that will be used for the copy.
    numerical_sampling : function
        Any of the numerical sampling functions defined in PRESC:
        `grid_sampling`, `uniform_sampling`, `normal_sampling`... The
        balancing sampler can only be used if the feature space does not
        contain any categorical variable.
    post_sampling_labeling : bool
        Whether the generated data must be labeled after sampling or not. If
        the chosen sampling function already does class labeling (such as the
        balancing samplers) then it should be set to `False`. If the
        `enforce_balance` parameter is set to `True` then this parameter does
        not have any effect.
    enforce_balance : bool
        Force class balancing for sampling functions that do not normally
        carry it out intrinsically.
    label_col : str
        Name of the label column.
    **k_sampling_parameters :
        Parameters needed for the `numerical_sampling` function.
    """

    def __init__(
        self,
        original,
        copy,
        numerical_sampling,
        post_sampling_labeling=True,
        enforce_balance=False,
        label_col="class",
        **k_sampling_parameters
    ):
        self.original = original
        self.copy = copy
        self.numerical_sampling = numerical_sampling
        self.post_sampling_labeling = post_sampling_labeling
        if enforce_balance:
            self.post_sampling_labeling = False
        self.enforce_balance = enforce_balance
        self.label_col = label_col
        self.k_sampling_parameters = k_sampling_parameters
        if "random_state" in self.k_sampling_parameters:
            self.random_state = self.k_sampling_parameters["random_state"]
        else:
            self.random_state = None
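    # Usage sketch (illustrative only): the sklearn estimators, the
    # `feature_parameters`/`nsamples` values, and the import path assumed for
    # `uniform_sampling` below are assumptions for the example, not taken from
    # this module.
    #
    #   from sklearn.svm import SVC
    #   from sklearn.tree import DecisionTreeClassifier
    #   from presc.copies.sampling import uniform_sampling
    #
    #   copier = ClassifierCopy(
    #       original=SVC().fit(X_train, y_train),
    #       copy=DecisionTreeClassifier(),
    #       numerical_sampling=uniform_sampling,
    #       feature_parameters=feature_parameters,
    #       nsamples=20000,
    #       random_state=42,
    #   )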
    def copy_classifier(self, get_training_data=False, **k_mod_sampling_parameters):
        """Copies the classifier using data generated with the original model.

        Generates synthetic data using only basic information of the features
        (dynamic range, mean and sigma), labels it using the original model,
        and trains the copy model with this synthetic data. It can also return
        the generated synthetic data used for training.

        Parameters
        ----------
        get_training_data : bool
            If `True` this method returns the synthetic data generated from
            the original classifier that was used to train the copy.
        **k_mod_sampling_parameters :
            If the "nsamples" and/or "random_state" parameters of the sampling
            function have to be changed in order to obtain a different set of
            synthetic data, they can be specified here.

        Returns
        -------
        presc.dataset.Dataset
            Outputs a PRESC Dataset with the training samples and their labels
            (if `get_training_data` set to `True`).
        """
        # Generate synthetic data
        df_generated = self.generate_synthetic_data(**k_mod_sampling_parameters)

        # Copy the classifier
        self.copy.fit(df_generated.features, df_generated.labels)

        if get_training_data:
            return df_generated
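    # Usage sketch (illustrative, continuing the hypothetical `copier` from
    # the comment above): train the copy and keep the synthetic training set.
    #
    #   synthetic_train = copier.copy_classifier(get_training_data=True)
    #   synthetic_train.features  # generated samples
    #   synthetic_train.labels    # labels assigned by the original classifier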
    def generate_synthetic_data(self, **k_mod_sampling_parameters):
        """Generates synthetic data using the original model.

        Generates samples following the sampling strategy specified on
        instantiation for the numerical features and a discrete distribution
        for the categorical features, and then labels them using the original
        model. If the same data needs to be generated then simply use a
        specific random seed.

        Parameters
        ----------
        **k_mod_sampling_parameters :
            If the "nsamples" and/or "random_state" parameters of the sampling
            function have to be changed in order to obtain a different set of
            synthetic data, they can be specified here.

        Returns
        -------
        presc.dataset.Dataset
            Outputs a PRESC Dataset with the generated samples and their
            labels.
        """
        # Random state needs to be fixed to obtain the same training data
        k_sampling_parameters_gen = self.k_sampling_parameters.copy()
        # Update sampling parameters which have been specified on calling the method
        k_sampling_parameters_gen.update(k_mod_sampling_parameters)

        if self.enforce_balance:
            # Call balancer generating function with sampling parameters
            # (sampling_balancer returns a pandas dataframe)
            X_generated = sampling_balancer(
                original_classifier=self.original, **k_sampling_parameters_gen
            )
        else:
            # Call generating function with sampling parameters
            # (mixed_data_sampling returns a pandas dataframe)
            X_generated = mixed_data_sampling(
                feature_parameters=k_sampling_parameters_gen.pop("feature_parameters"),
                numerical_sampling=self.numerical_sampling,
                **k_sampling_parameters_gen
            )

        # If the type of sampling function attempts to balance the synthetic
        # dataset, it returns the features AND the labels. Otherwise, it
        # returns only the features, and the labeling function must be called.
        if self.post_sampling_labeling:
            df_generated = labeling(
                X_generated, self.original, label_col=self.label_col
            )
        else:
            df_generated = Dataset(X_generated, label_col=self.label_col)

        return df_generated
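    # Usage sketch (illustrative): generate an independent synthetic test set
    # by overriding the sampling parameters fixed on instantiation. The
    # parameter values are arbitrary examples.
    #
    #   synthetic_test = copier.generate_synthetic_data(
    #       nsamples=5000, random_state=999
    #   )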
    def compute_fidelity_error(self, test_data):
        """Computes the empirical fidelity error of the classifier copy.

        Quantifies the resemblance of the copy to the original classifier.
        This value is zero when the copy makes exactly the same predictions
        as the original classifier (including misclassifications).

        Parameters
        ----------
        test_data : array-like
            Dataset with the unlabeled samples to evaluate the resemblance of
            the copy to the original classifier.

        Returns
        -------
        float
            The numerical value of the empirical fidelity error of the copy
            with this dataset.
        """
        y_pred_original = self.original.predict(test_data)
        y_pred_copy = self.copy.predict(test_data)
        return empirical_fidelity_error(y_pred_original, y_pred_copy)
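    # Illustrative sketch of the metric (an assumption about what
    # `empirical_fidelity_error` computes, not taken from its source): the
    # error can be read as the disagreement rate between the two prediction
    # vectors,
    #
    #   import numpy as np
    #   efe = np.mean(np.asarray(y_pred_original) != np.asarray(y_pred_copy))
    #
    # which is zero when the copy predicts exactly like the original on
    # `test_data`.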
    def replacement_capability(self, test_data):
        """Computes the replacement capability of a classifier copy.

        Quantifies the ability of the copy model to substitute the original
        model, i.e. to maintain the same accuracy in its predictions. This
        value is one when the accuracy of the copy model is the same as the
        original model, although the individual predictions may be different,
        approaching zero if the accuracy of the copy is much smaller than the
        original, and it can even take values larger than one if the copy
        model is better than the original.

        Parameters
        ----------
        test_data : presc.dataset.Dataset
            Subset of the original data reserved to evaluate the resemblance
            of the copy to the original classifier, or synthetic data
            generated from the original model with the same purpose.

        Returns
        -------
        float
            The numerical value of the replacement capability.
        """
        y_pred_original = self.original.predict(test_data.features)
        y_pred_copy = self.copy.predict(test_data.features)
        return replacement_capability(test_data.labels, y_pred_original, y_pred_copy)
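    # Illustrative sketch of the metric (an assumption about what
    # `replacement_capability` computes, not taken from its source): with the
    # true labels available, it can be expressed as the ratio of the copy's
    # accuracy to the original's accuracy,
    #
    #   import numpy as np
    #   y_true = np.asarray(test_data.labels)
    #   rc = np.mean(y_pred_copy == y_true) / np.mean(y_pred_original == y_true)
    #
    # which equals one when both accuracies match and exceeds one when the
    # copy outperforms the original.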
    def evaluation_summary(self, test_data=None, synthetic_data=None):
        """Computes several metrics to evaluate the classifier copy.

        Summary of metrics that evaluate the quality of a classifier copy,
        not only to assess its performance as a classifier but to quantify
        its resemblance to the original classifier: the accuracy of the
        original and the copy models (using the original test data), and the
        empirical fidelity error and replacement capability of the copy
        (using the original test data and/or the generated synthetic data).
        This is a wrapper of the `summary_metrics` function applied to the
        copy and original models in this instance.

        Parameters
        ----------
        test_data : presc.dataset.Dataset
            Subset of the original data reserved for testing.
        synthetic_data : presc.dataset.Dataset
            Synthetic data generated using the original model.

        Returns
        -------
        dict
            The values of all metrics.
        """
        results = summary_metrics(
            original_model=self.original,
            copy_model=self,
            test_data=test_data,
            synthetic_data=synthetic_data,
            show_results=True,
        )
        return results
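# Usage sketch (illustrative, with the hypothetical objects from the comments
# above; `test_dataset` is assumed to be a presc.dataset.Dataset reserved from
# the original data):
#
#   copier.compute_fidelity_error(test_dataset.features)
#   copier.replacement_capability(test_dataset)
#   metrics = copier.evaluation_summary(
#       test_data=test_dataset, synthetic_data=synthetic_test
#   )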