from presc.dataset import Dataset
from presc.copies.sampling import mixed_data_sampling, labeling, sampling_balancer
from presc.copies.evaluations import (
empirical_fidelity_error,
replacement_capability,
summary_metrics,
)
class ClassifierCopy:
    """Represents a classifier copy and its associated sampling method of choice.

    Each instance wraps the original ML classifier together with the ML
    classifier copy and the sampling method used to carry out the copy.
    Methods allow copying the original classifier, evaluating the quality of
    the copy, and generating additional data using the original classifier
    with the sampling method specified on instantiation.

Attributes
----------
original : sklearn-type classifier
Original ML classifier to be copied.
copy : sklearn-type classifier
ML classifier that will be used for the copy.
    numerical_sampling : function
        Any of the numerical sampling functions defined in PRESC:
        `grid_sampling`, `uniform_sampling`, `normal_sampling`... The
        balancing sampler (`sampling_balancer`) can only be used if the
        feature space does not contain any categorical variables.
    post_sampling_labeling : bool
        Whether the generated data must be labeled after sampling. If the
        chosen sampling function already performs class labeling (as the
        balancing samplers do), this should be set to `False`. If
        `enforce_balance` is set to `True`, this parameter has no effect.
    enforce_balance : bool
        Force class balancing for sampling functions that do not carry it
        out intrinsically.
label_col : str
Name of the label column.
**k_sampling_parameters :
Parameters needed for the `numerical_sampling` function.
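
    Examples
    --------
    A minimal sketch, assuming PRESC's `uniform_sampling` and an
    illustrative `feature_parameters` schema (the exact schema expected
    depends on the chosen sampling function):

    >>> import pandas as pd
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.svm import SVC
    >>> from sklearn.tree import DecisionTreeClassifier
    >>> from presc.copies.sampling import uniform_sampling
    >>> X, y = make_classification(n_features=2, n_redundant=0, random_state=0)
    >>> original = SVC().fit(pd.DataFrame(X, columns=["feat_1", "feat_2"]), y)
    >>> copier = ClassifierCopy(
    ...     original,
    ...     DecisionTreeClassifier(),
    ...     uniform_sampling,
    ...     feature_parameters={
    ...         "feat_1": {"min": -3.0, "max": 3.0},
    ...         "feat_2": {"min": -3.0, "max": 3.0},
    ...     },
    ...     nsamples=1000,
    ...     random_state=42,
    ... )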
"""
def __init__(
self,
original,
copy,
numerical_sampling,
post_sampling_labeling=True,
enforce_balance=False,
label_col="class",
**k_sampling_parameters
):
self.original = original
self.copy = copy
self.numerical_sampling = numerical_sampling
self.post_sampling_labeling = post_sampling_labeling
if enforce_balance:
self.post_sampling_labeling = False
self.enforce_balance = enforce_balance
self.label_col = label_col
self.k_sampling_parameters = k_sampling_parameters
if "random_state" in self.k_sampling_parameters.keys():
self.random_state = self.k_sampling_parameters["random_state"]
else:
self.random_state = None
    def copy_classifier(self, get_training_data=False, **k_mod_sampling_parameters):
        """Copies the classifier using data generated with the original model.

        Generates synthetic data using only basic information about the
        features (dynamic range, mean and sigma), labels it using the
        original model, and trains the copy model with this synthetic data.
        It can optionally return the synthetic data used for training.

Parameters
----------
get_training_data : bool
If `True` this method returns the synthetic data generated from the
original classifier that was used to train the copy.
**k_mod_sampling_parameters :
If the "nsamples" and/or "random_state" parameters of the sampling
function have to be changed in order to obtain a different set of
synthetic data, they can be specified here.
        Returns
        -------
        presc.dataset.Dataset
            A PRESC Dataset with the training samples and their labels
            (only returned if `get_training_data` is set to `True`).
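
        Examples
        --------
        A sketch assuming the `copier` instance from the class docstring
        example:

        >>> train_data = copier.copy_classifier(get_training_data=True)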
"""
# Generate synthetic data
df_generated = self.generate_synthetic_data(**k_mod_sampling_parameters)
# Copy the classifier
self.copy.fit(df_generated.features, df_generated.labels)
if get_training_data:
return df_generated
    def generate_synthetic_data(self, **k_mod_sampling_parameters):
        """Generates synthetic data using the original model.

        Generates samples following the sampling strategy specified on
        instantiation for the numerical features and a discrete distribution
        for the categorical features, and then labels them using the
        original model. To obtain the same data again, simply fix the
        random seed.

Parameters
----------
**k_mod_sampling_parameters :
If the "nsamples" and/or "random_state" parameters of the sampling
function have to be changed in order to obtain a different set of
synthetic data, they can be specified here.
Returns
-------
presc.dataset.Dataset
Outputs a PRESC Dataset with the generated samples and their labels.
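
        Examples
        --------
        A sketch assuming the `copier` instance from the class docstring
        example; a different sample size and seed yield a new dataset:

        >>> new_data = copier.generate_synthetic_data(nsamples=500, random_state=0)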
"""
# Random state needs to be fixed to obtain the same training data
k_sampling_parameters_gen = self.k_sampling_parameters.copy()
# Update sampling parameters which have been specified on calling the method
k_sampling_parameters_gen.update(k_mod_sampling_parameters)
if self.enforce_balance:
# Call balancer generating function with sampling parameters
# (sampling_balancer returns a pandas dataframe)
X_generated = sampling_balancer(
original_classifier=self.original, **k_sampling_parameters_gen
)
else:
# Call generating function with sampling parameters
# (mixed_data_sampling returns a pandas dataframe)
X_generated = mixed_data_sampling(
feature_parameters=k_sampling_parameters_gen.pop("feature_parameters"),
numerical_sampling=self.numerical_sampling,
**k_sampling_parameters_gen
)
# If the type of sampling function attempts to balance the synthetic
# dataset, it returns the features AND the labels. Otherwise, it returns
# only the features, and the labeling function must be called.
if self.post_sampling_labeling:
df_generated = labeling(
X_generated, self.original, label_col=self.label_col
)
else:
df_generated = Dataset(X_generated, label_col=self.label_col)
return df_generated
    def compute_fidelity_error(self, test_data):
        """Computes the empirical fidelity error of the classifier copy.

        Quantifies the resemblance of the copy to the original classifier.
        This value is zero when the copy makes exactly the same predictions
        as the original classifier (including misclassifications).

Parameters
----------
        test_data : array-like
            Unlabeled samples used to evaluate the resemblance of the copy
            to the original classifier.
Returns
-------
float
The numerical value of the empirical fidelity error of the copy with
this dataset.
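
        Examples
        --------
        A sketch assuming a trained `copier` and an (assumed) array of
        unlabeled test samples `X_test`:

        >>> error = copier.compute_fidelity_error(X_test)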
"""
y_pred_original = self.original.predict(test_data)
y_pred_copy = self.copy.predict(test_data)
return empirical_fidelity_error(y_pred_original, y_pred_copy)
    def replacement_capability(self, test_data):
        """Computes the replacement capability of a classifier copy.

        Quantifies the ability of the copy model to substitute for the
        original model, i.e. to maintain the same accuracy in its
        predictions. This value is one when the accuracy of the copy model
        is the same as that of the original model, even if the individual
        predictions differ; it approaches zero when the accuracy of the copy
        is much lower than the original's, and it can even take values
        larger than one if the copy model performs better than the original.

Parameters
----------
        test_data : presc.dataset.Dataset
            Subset of the original data reserved to evaluate the resemblance
            of the copy to the original classifier, or synthetic data
            generated from the original model for the same purpose.
Returns
-------
float
The numerical value of the replacement capability.
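
        Examples
        --------
        A sketch assuming a trained `copier` and an (assumed) labeled test
        dataframe `test_df`:

        >>> from presc.dataset import Dataset
        >>> test_dataset = Dataset(test_df, label_col="class")
        >>> rc = copier.replacement_capability(test_dataset)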
"""
y_pred_original = self.original.predict(test_data.features)
y_pred_copy = self.copy.predict(test_data.features)
return replacement_capability(test_data.labels, y_pred_original, y_pred_copy)
    def evaluation_summary(self, test_data=None, synthetic_data=None):
        """Computes several metrics to evaluate the classifier copy.

        Summary of metrics that evaluate the quality of a classifier copy,
        not only assessing its performance as a classifier but also
        quantifying its resemblance to the original classifier. It reports
        the accuracy of the original and the copy models (using the original
        test data), as well as the empirical fidelity error and replacement
        capability of the copy (using the original test data and/or the
        generated synthetic data). This is a wrapper of the
        `summary_metrics` function applied to the copy and original models
        in this instance, with `show_results` set to `True` so that the
        metrics are also printed.

        Parameters
        ----------
        test_data : presc.dataset.Dataset
            Subset of the original data reserved for testing.
        synthetic_data : presc.dataset.Dataset
            Synthetic data generated using the original model.

Returns
-------
dict
The values of all metrics.
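
        Examples
        --------
        A sketch assuming a trained `copier`, a held-out `test_dataset`, and
        the synthetic `train_data` returned by `copy_classifier`:

        >>> metrics = copier.evaluation_summary(
        ...     test_data=test_dataset, synthetic_data=train_data
        ... )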
"""
results = summary_metrics(
original_model=self.original,
copy_model=self,
test_data=test_data,
synthetic_data=synthetic_data,
show_results=True,
)
return results