import copy
import numpy as np
import pandas as pd
from presc.dataset import Dataset
from presc.evaluations.utils import is_discrete
def dynamical_range(df, verbose=False):
"""Returns the dynamic range, mean, and sigma of the dataset features.
Parameters
----------
df : pandas DataFrame
The dataset with all the numerical features to analyze.
verbose : bool
If set to True the feature parameters are printed.
Returns
-------
dict of dicts
A dictionary with an entry per dataset feature (dictionary keys are the
column names), where each feature entry contains a nested dictionary
with the values of the minimum and maximum values of the dynamic range
of the dataset, as well as the mean and sigma of the distribution
(nested dictionary keys are "min", "max", "mean" and "sigma").
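Examples
--------
A minimal sketch with a small, hypothetical numerical DataFrame:
>>> df = pd.DataFrame({"age": [20.0, 30.0, 40.0]})
>>> params = dynamical_range(df)
>>> sorted(params["age"])
['max', 'mean', 'min', 'sigma']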
"""
range_dict = {}
for feature in df:
range_dict[feature] = {
"min": df[feature].min(),
"max": df[feature].max(),
"mean": df[feature].mean(),
"sigma": df[feature].std(),
}
if verbose:
print(
f"\n{feature}"
f"\n min: {range_dict[feature]['min']:.4f}"
f"\n max: {range_dict[feature]['max']:.4f}"
f"\n (interval: "
f"{range_dict[feature]['max'] - range_dict[feature]['min']:.4f})"
f"\n mean: {range_dict[feature]['mean']:.4f}"
f"\n sigma: {range_dict[feature]['sigma']:.4f}"
)
return range_dict
def reduce_feature_space(feature_parameters, sigmas=1):
"""Force feature minimum/maximum values to x times the standard deviation.
This function will adjust the minimum and maximum values of each feature to
the range determined by taking the feature's mean value and substracting and
adding to it the specified number of standard deviations. But only for the
features that have the mean and standard deviation specified.
Normally this will reduce the feature space by leaving out the range of most
extreme values and will facilitate that any sampling based on the feature
minimum and maximum values becomes more efficient. This is a more notorious
problem when the dictionary describing the features has ben extracted
automatically from an original dataset which contains outliers.
Parameters
----------
feature_parameters: dict of dicts
A dictionary with an entry per dataset feature (dictionary keys are the
column names), where each feature entry contains a nested dictionary
with the values of the minimum and maximum values of the dynamic range
of the dataset, as well as the mean and sigma of the distribution
(nested dictionary keys are "min", "max", "mean" and "sigma").
sigmas : float
The factor by which the standard deviation will be multiplied in order
to define the symmetric interval around the mean.
Returns
-------
dict of dicts
A dictionary with an entry per dataset feature (dictionary keys are the
column names), where each feature entry contains a nested dictionary
with the values of the minimum and maximum values of the dynamic range
of the dataset, as well as the mean and sigma of the distribution
(nested dictionary keys are "min", "max", "mean" and "sigma").
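Examples
--------
A minimal sketch: with mean 10, sigma 2 and `sigmas=1`, the range is
clipped to [8, 12]:
>>> params = {"age": {"min": 0, "max": 100, "mean": 10, "sigma": 2}}
>>> reduced = reduce_feature_space(params, sigmas=1)
>>> reduced["age"]["min"], reduced["age"]["max"]
(8, 12)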
"""
modified_feature_space = copy.deepcopy(feature_parameters)
for feature in feature_parameters:
if (
{"mean", "sigma"}.issubset(set(feature_parameters[feature].keys()))
and feature_parameters[feature]["mean"]
and feature_parameters[feature]["sigma"]
):
modified_feature_space[feature]["min"] = (
feature_parameters[feature]["mean"]
- sigmas * feature_parameters[feature]["sigma"]
)
modified_feature_space[feature]["max"] = (
feature_parameters[feature]["mean"]
+ sigmas * feature_parameters[feature]["sigma"]
)
return modified_feature_space
def find_categories(df, add_nans=False):
"""Returns the categories of the dataset features.
Parameters
----------
df : pandas DataFrame
The dataset with all the categorical features to analyze.
add_nans : bool
If True, a "NaNs" category is added for any feature that has null
values, and the appropriate fraction is assigned to it.
Returns
-------
dict of dicts
A dictionary with an entry per dataset feature (dictionary keys are the
column names), where each feature entry contains a nested dictionary
with its categories and the fraction of each category present in the
analyzed dataset (the nested dictionary key for this information is
"categories", which is also a dictionary with one entry per category).
"""
categories_dict = {}
for feature in df:
if is_discrete(df[feature]):
# Remove NaN values from selection
df_no_nans = df[df[feature].notnull()]
# Log fraction of NaN values if required
if add_nans:
nan_fraction = df[feature].isnull().sum() / len(df)
total_length = len(df)
else:
nan_fraction = 0
total_length = len(df_no_nans)
categories_dict[feature] = {
"categories": {
key: None for key in df_no_nans[feature].unique().tolist()
}
}
for category in categories_dict[feature]["categories"].keys():
categories_dict[feature]["categories"][category] = (
df_no_nans[feature].value_counts()[category] / total_length
)
if add_nans and nan_fraction != 0:
categories_dict[feature]["categories"]["NaNs"] = nan_fraction
return categories_dict
def build_equal_category_dict(feature_categories):
"""Assigns equal probability to all categories of each feature.
Parameters
----------
feature_categories : dict of lists
A dictionary with an entry per feature, with the list of categories
that each feature has.
Returns
-------
dict of dicts
A dictionary with an entry per dataset feature (dictionary keys are the
column names), where each feature entry contains a nested dictionary
with its categories and the identical fraction for all categories from
the same feature (the nested dictionary key for this information is
"categories", which is also a dictionary with one entry per category).
"""
categories_dict = {}
for feature, categories in feature_categories.items():
categories_dict[feature] = {
"categories": {key: 1 / len(categories) for key in categories}
}
return categories_dict
def mixed_data_features(df, add_nans=False):
"""Extracts the numerical/categorical feature parameters from a dataset.
Parameters
----------
df : pandas DataFrame
The dataset with all the features to analyze (both numerical and
categorical).
add_nans : bool
If True, a "NaNs" category is added for any categorical feature that
has null values, and the appropriate fraction is assigned to it.
Returns
-------
dict of dicts
A dictionary with an entry per dataset feature (dictionary keys are the
column names), where each numerical feature entry contains a nested
dictionary with the values of the minimum and maximum values of the
dynamic range of the dataset, as well as the mean and sigma of the
distribution, and each categorical feature entry contains a nested
dictionary with its categories and the fraction of each category present
in the analyzed dataset (nested dictionary keys are "min", "max",
"mean", "sigma", and "categories", which is also a dictionary with one
entry per category).
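Examples
--------
A minimal sketch with one numerical and one categorical column (assuming
`is_discrete` flags only the object column as categorical):
>>> df = pd.DataFrame({"age": [20.0, 30.0], "color": ["red", "blue"]})
>>> features = mixed_data_features(df)
>>> sorted(features["age"])
['max', 'mean', 'min', 'sigma']
>>> list(features["color"])
['categories']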
"""
features_dict = {}
for feature in df:
df_feature = df[[feature]]
if is_discrete(df[feature]):
single_feature_parameters = find_categories(df_feature, add_nans=add_nans)
else:
single_feature_parameters = dynamical_range(df_feature)
features_dict[feature] = single_feature_parameters[feature]
return features_dict
def grid_sampling(feature_parameters, nsamples=500, random_state=None):
"""Sample the classifier with a grid-like sampling.
Generates synthetic samples with a regular grid-like distribution within the
feature space described in `feature_parameters`. Computes the grid spacing
so that all features have the same number of different values.
Parameters
----------
feature_parameters : dict of dicts
A dictionary with an entry per dataset feature (dictionary keys should
be the feature names), and where each feature entry must contain a
nested dictionary with at least the entries corresponding to the minimum
and maximum values of the dynamic range. Dictionary keys for these
values should be "min" and "max", respectively.
nsamples : int
Maximum number of samples to generate. The exact number will depend on
the parameter space.
random_state : int
Parameter not used in `grid_sampling`.
Returns
-------
pandas DataFrame
Dataset with a regular grid-like generated sampling of the feature space
characterized by the `feature_parameters`.
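Examples
--------
A minimal sketch with two hypothetical features; 25 requested samples
yield a 5x5 grid:
>>> params = {"x": {"min": 0, "max": 1}, "y": {"min": 0, "max": 10}}
>>> X = grid_sampling(params, nsamples=25)
>>> X.shape
(25, 2)
"""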
# Compute number of points per feature (assuming same number of points)
nfeatures = len(feature_parameters)
npoints = int(nsamples ** (1 / nfeatures))
# Generate grid
feature_list = []
feature_names = []
for key in feature_parameters:
feature_list.append(
np.linspace(
feature_parameters[key]["min"], feature_parameters[key]["max"], npoints
)
)
feature_names.append(key)
X_generated = pd.DataFrame()
for index, item in enumerate(np.meshgrid(*feature_list)):
X_generated[index] = item.ravel()
X_generated.columns = feature_names
return X_generated
def normal_sampling(
feature_parameters,
nsamples=500,
random_state=None,
):
"""Sample the classifier with a normal distribution sampling.
Generates synthetic samples with a normal distribution according to the
feature space described by `feature_parameters`. Features are assumed to be
independent (not correlated).
Parameters
----------
feature_parameters : dict of dicts
A dictionary with an entry per dataset feature (dictionary keys should
be the feature names), and where each feature entry must contain a
nested dictionary with at least the entries corresponding to the mean
and standard deviation values of the dataset. Dictionary keys for these
values should be "mean" and "sigma", respectively.
nsamples : int
Number of samples to generate.
random_state : int
Random seed used to generate the sampling data.
Returns
-------
pandas DataFrame
Dataset with a generated sampling following a normal distribution of
the feature space characterized by the `feature_parameters`.
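Examples
--------
A minimal sketch with two hypothetical independent features:
>>> params = {"x": {"mean": 0, "sigma": 1}, "y": {"mean": 5, "sigma": 2}}
>>> X = normal_sampling(params, nsamples=100, random_state=0)
>>> X.shape
(100, 2)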
"""
if random_state is not None:
np.random.seed(seed=random_state)
# Compute number of features
nfeatures = len(feature_parameters)
# Collect feature names, means and standard deviations
feature_names = []
mus = []
sigmas = []
for key in feature_parameters:
feature_names.append(key)
mus.append(feature_parameters[key]["mean"])
sigmas.append(feature_parameters[key]["sigma"])
mus = np.array(mus)
covariance_matrix = np.eye(nfeatures, nfeatures) * (np.array(sigmas)) ** 2
# Generate normal distribution data
X_generated = pd.DataFrame(
np.random.multivariate_normal(mus, covariance_matrix, size=nsamples)
)
# Rename columns
X_generated.columns = feature_names
return X_generated
def spherical_balancer_sampling(
nsamples=1000,
nfeatures=30,
original_classifier=None,
max_iter=10,
nbatch=10000,
radius_min=0,
radius_max=1,
label_col="class",
random_state=None,
verbose=False,
):
"""Sample the classifier with a balancer spherical distribution sampling.
Generates synthetic samples with a spherical (shell) distribution between
minimum and maximum radius values and then labels them using the original
classifier. This function will attempt to obtain a balanced dataset by
generating the same number of samples for all classes (`nsamplesxclass`),
unless it reaches the maximum number of iterations. When used within the
ClassifierCopy class, the `balancing_sampler` parameter must be set to True.
This sampler works better when features have standardized values.
Parameters
----------
nsamples : int
Number of samples to generate.
nfeatures : int
Number of features of the generated samples.
original_classifier : sklearn-type classifier
Original ML classifier used to generate the synthetic data.
max_iter : int
The maximum number of iterations generating batches to attempt to obtain
the samples per class specified in `nsamplesxclass`.
nbatch : int
Number of tentative samples to generate in each batch.
radius_min : float
Minimum radius of the spherical shell distribution. It will be a
spherical distribution if this value is set to zero.
radius_max : float
Maximum radius of the spherical (shell) distribution.
label_col : str
Name of the label column.
random_state : int
Random seed used to generate the sampling data.
verbose : bool
If True the sampler prints information about each batch.
Returns
-------
pandas DataFrame
Dataset with a generated sampling following a spherical distribution of
the feature space, with features and labels.
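Examples
--------
A minimal sketch, assuming `model` is a fitted scikit-learn classifier
that takes two standardized features:
>>> df = spherical_balancer_sampling(
...     nsamples=100, nfeatures=2, original_classifier=model,
...     radius_max=2, random_state=0)  # doctest: +SKIP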
"""
if random_state is not None:
np.random.seed(seed=random_state)
if verbose:
print(
f"\nGenerating samples between => min: {radius_min} and max: {radius_max}"
)
df_generated = pd.DataFrame()
for iteration in range(max_iter):
if verbose:
print("Generating batch", iteration + 1)
# Generate `nbatch` normalised vectors in random directions
v = np.random.multivariate_normal(
np.zeros((nfeatures,)), np.eye(nfeatures, nfeatures), size=nbatch
)
v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]
# Scale the vectors between `radius_min` and `radius_max`
alpha = np.random.uniform(low=radius_min, high=radius_max, size=nbatch)
df_batch = pd.DataFrame(
np.dot(alpha[:, np.newaxis], np.ones((1, nfeatures))) * v
)
# Label synthetic data with original classifier
df_batch[label_col] = original_classifier.predict(df_batch)
# Temporarily, add samples from the new batch to the old dataframe
df_temp = pd.concat([df_generated, df_batch])
# Keep a maximum of `nsamplesxclass` samples from each class
detected_classes = df_temp[label_col].value_counts()
nsamplesxclass = int(nsamples / len(detected_classes))
df_generated = pd.DataFrame()
for class_name in detected_classes.index.tolist():
df_generated = pd.concat(
[
df_generated,
df_temp[df_temp[label_col] == class_name].iloc[:nsamplesxclass],
]
)
# If no class is incomplete we are done, otherwise report the incomplete classes
incomplete_classes = detected_classes[
detected_classes < nsamplesxclass
].sort_index()
if len(incomplete_classes) == 0:
return df_generated
elif verbose:
print("\nClasses:", incomplete_classes.index.tolist())
print("Samples:", incomplete_classes.tolist())
return df_generated
def categorical_sampling(feature_parameters, nsamples=500, random_state=None):
"""Sample the classifier with a discrete distribution sampling.
Generates synthetic samples with a discrete distribution according to the
probabilities described by `feature_parameters`. Features are assumed to be
independent (not correlated).
Parameters
----------
feature_parameters : dict of dicts
A dictionary with an entry per dataset feature (dictionary keys should
be the feature names), where each feature entry must contain a
nested dictionary with its categories and their fraction. The key for
the nested dictionary of categories should be "categories", and the keys
for the fractions should be the category name.
nsamples : int
Number of samples to generate.
random_state : int
Random seed used to generate the sampling data.
Returns
-------
pandas DataFrame
Dataset with a generated sampling following the discrete distribution of
the feature space characterized by the `feature_parameters`.
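Examples
--------
A minimal sketch with one hypothetical feature:
>>> params = {"color": {"categories": {"red": 0.5, "blue": 0.5}}}
>>> X = categorical_sampling(params, nsamples=10, random_state=0)
>>> X.shape
(10, 1)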
"""
if random_state is not None:
np.random.seed(seed=random_state)
# Generate random data with the specified probabilities
X_generated = pd.DataFrame()
for feature in feature_parameters:
categories = list(feature_parameters[feature]["categories"].keys())
category_probabilities = [
feature_parameters[feature]["categories"][category]
for category in categories
]
X_generated[feature] = pd.Series(
np.random.choice(categories, p=category_probabilities, size=nsamples)
).astype("category")
return X_generated
def mixed_data_sampling(
feature_parameters,
numerical_sampling,
nsamples=500,
random_state=None,
**remaining_parameters,
):
"""Sample the classifier with a mix of a numerical and categorical sampler.
Generates synthetic samples with the specified distribution for the
numerical features and with a discrete distribution for the categorical
features. The parameters describing the feature space needed to compute the
distributions are described in the `feature_parameters` dictionary. Features
are assumed to be independent (not correlated).
Parameters
----------
feature_parameters : dict of dicts
A dictionary with an entry per dataset feature (dictionary keys should
be the feature names). Each numerical feature entry must contain a
nested dictionary with the parameters required by the chosen
`numerical_sampling` function (such as "min"/"max" or "mean"/"sigma"),
and each categorical feature entry must contain a nested dictionary
with its categories and their probability. The key for the nested
dictionary of categories should be "categories", and the keys for the
probabilities should be the category names.
numerical_sampling : function
Any of the non-balancing numerical sampling functions defined in PRESC:
`grid_sampling`, `uniform_sampling`, `normal_sampling`...
nsamples : int
Number of samples to generate.
random_state : int
Random seed used to generate the sampling data.
Returns
-------
pandas DataFrame
Dataset with a generated sampling following the specified numerical
sampling distribution for the numerical features and the discrete
distribution for the categorical features, following the feature space
characterized by the `feature_parameters`.
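Examples
--------
A minimal sketch combining a hypothetical numerical feature (sampled
here with `normal_sampling`) and a categorical one:
>>> params = {
...     "age": {"mean": 50, "sigma": 10},
...     "color": {"categories": {"red": 0.5, "blue": 0.5}},
... }
>>> X = mixed_data_sampling(params, normal_sampling, nsamples=10,
...     random_state=0)
>>> list(X.columns)
['age', 'color']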
"""
if random_state is not None:
np.random.seed(seed=random_state)
# Generate the lists of numerical and categorical data
features_numerical = []
features_categorical = []
for feature in feature_parameters:
if "categories" in feature_parameters[feature]:
features_categorical.append(feature)
else:
features_numerical.append(feature)
# Generate feature parameter dictionaries for the numerical/categorical samplers
feature_parameters_numerical = {
feature: feature_parameters[feature] for feature in features_numerical
}
feature_parameters_categorical = {
feature: feature_parameters[feature] for feature in features_categorical
}
# Generate the numerical/categorical features of each sample separately
X_generated_numerical = numerical_sampling(
nsamples=nsamples,
random_state=random_state,
feature_parameters=feature_parameters_numerical,
)
X_generated_categorical = categorical_sampling(
nsamples=nsamples,
random_state=random_state,
feature_parameters=feature_parameters_categorical,
)
# Combine the numerical/categorical features respecting the structure of the
# original dataset
X_generated = pd.concat(
[X_generated_numerical, X_generated_categorical], axis="columns"
)
X_generated = X_generated[feature_parameters.keys()]
return X_generated
def image_random_sampling(
feature_parameters={
"images": {"x_pixels": 28, "y_pixels": 28, "min": 0, "max": 253}
},
nsamples=500,
random_state=None,
):
"""Sample the feature space of images using random pixels.
Generates synthetic samples using a random uniform distribution to
establish the value for each image pixel. Hence, they are images of noise.
It only generates one channel (that is, black and white images).
For most image datasets, which are not random and have structure, this is a
very inefficient sampling method to generate synthetic image samples and
explore the feature space. It is provided here for illustrating purposes
only.
The default generates 28x28 images with pixel values between 0 and 253.
Parameters
----------
feature_parameters : dict of dicts
A dictionary which specifies the characteristics of the feature space
of the images. It should have one entry 'images' with a nested
dictionary with the entries 'x_pixels', 'y_pixels', 'min' and 'max',
which specify the number of pixels of the image in each dimension, and
the minimum and maximum possible values of the pixels. The values in the
default dictionary are:
feature_parameters = {"images": {"x_pixels": 28, "y_pixels": 28,
"min": 0, "max": 253}}
nsamples : int
Number of image samples to generate.
random_state : int
Random seed used to generate the sampling data.
Returns
-------
pandas DataFrame
Dataset with a list of images that have the value of their pixels
generated with a random uniform sampling of the feature space as
specified in the `feature_parameters`.
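Examples
--------
A minimal sketch using the default 28x28 feature space:
>>> X = image_random_sampling(nsamples=3, random_state=0)
>>> X["images"][0].shape
(28, 28)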
"""
if random_state is not None:
np.random.seed(seed=random_state)
images = [None] * nsamples
for image_index in range(nsamples):
# Generate random image
images[image_index] = np.random.randint(
low=feature_parameters["images"]["min"],
high=feature_parameters["images"]["max"] + 1,
size=(
feature_parameters["images"]["x_pixels"],
feature_parameters["images"]["y_pixels"],
),
)
X_generated_images = pd.DataFrame({"images": images})
return X_generated_images
def image_vae_sampling(
feature_parameters={
"images": {
"min": 0,
"max": 254,
"autoencoder": None,
"autoencoder_latent_dim": 2,
"autoencoder_edge_factor": 5,
}
},
nsamples=500,
random_state=None,
):
"""Sample the feature space of images using a variational autoencoder.
Generates synthetic samples from the same manifold as the variational
autoencoder training data by sampling its latent space, which represents
images with a Gaussian distribution for each latent dimension.
For image datasets, which are not random and have structure, this is an
efficient sampling method to generate relevant synthetic image samples and
explore the feature space.
Parameters
----------
feature_parameters : dict of dicts
A dictionary which specifies the characteristics of the feature space
of the images. It should have one entry 'images' with a nested
dictionary with the entries 'min', 'max', 'autoencoder',
'autoencoder_latent_dim' and 'autoencoder_edge_factor', which specify
the minimum and maximum possible values of the pixels, the trained
variational autoencoder, the number of dimensions of its latent space,
and the factor that widens the sampled region of the latent space. The
values in the default dictionary are:
feature_parameters = {"images": {"min": 0, "max": 254,
"autoencoder": None,
"autoencoder_latent_dim": 2,
"autoencoder_edge_factor": 5}}
It is necessary to specify the autoencoder for the function to work.
nsamples : int
Number of image samples to generate.
random_state : int
Random seed used to generate the sampling data.
Returns
-------
pandas DataFrame
Dataset with a list of images that have been generated sampling randomly
the latent space of the variational autoencoder, as specified in the
`feature_parameters`.
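Examples
--------
A minimal sketch, assuming `vae` is a trained variational autoencoder
whose `decoder.predict` returns arrays with values scaled to [0, 1]:
>>> params = {"images": {"min": 0, "max": 254, "autoencoder": vae,
...     "autoencoder_latent_dim": 2, "autoencoder_edge_factor": 5}}
>>> X = image_vae_sampling(params, nsamples=3, random_state=0)  # doctest: +SKIP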
"""
if random_state is not None:
np.random.seed(seed=random_state)
autoencoder = feature_parameters["images"]["autoencoder"]
maximum_1 = feature_parameters["images"]["max"] + 1
latent_dim = feature_parameters["images"]["autoencoder_latent_dim"]
edge_factor = feature_parameters["images"]["autoencoder_edge_factor"]
images = [None] * nsamples
for image_index in range(nsamples):
# Generate random image using autoencoder
z_sample = (np.random.rand(1, latent_dim) - 0.5) * edge_factor
images[image_index] = (
(autoencoder.decoder.predict(z_sample)[:, :, :, 0][0]) * (maximum_1)
).astype(int)
X_generated_images = pd.DataFrame({"images": images})
return X_generated_images
def labeling(X, original_classifier, label_col="class"):
"""Labels the samples from a dataset according to a classifier.
Parameters
----------
X : pandas DataFrame
Dataset with the features but not the labels.
original_classifier : sklearn-type classifier
Classifier to use for the labeling of the samples.
label_col : str
Name of the label column.
Returns
-------
presc.dataset.Dataset
Outputs a PRESC Dataset with the samples and their labels.
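Examples
--------
A minimal sketch, assuming `model` is a fitted scikit-learn classifier
trained on the features "x" and "y":
>>> X = pd.DataFrame({"x": [0.1, 0.9], "y": [0.3, 0.7]})
>>> dataset = labeling(X, model)  # doctest: +SKIP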
"""
df_labeled = X.copy()
# Label synthetic data with original classifier
df_labeled[label_col] = original_classifier.predict(df_labeled)
df_labeled[label_col] = df_labeled[label_col].astype("category")
# Instantiate dataset wrapper
df_labeled = Dataset(df_labeled, label_col=label_col)
return df_labeled
def sampling_balancer(
feature_parameters,
numerical_sampling,
original_classifier,
nsamples=1000,
max_iter=10,
nbatch=1000,
label_col="class",
random_state=None,
verbose=False,
**remaining_parameters,
):
"""Generate balanced synthetic data using any sampling function.
This function will attempt to obtain a balanced dataset with non-balancing
samplers by generating the same number of samples for all classes,
unless it reaches the maximum number of iterations. To use it within the
ClassifierCopy class, the `enforce_balance` parameter must be set to True.
Note that the algorithm needs to find at least one sample of a different
class in order to detect that class and keep iterating through the batch
generation of samples to try to collect them all. Therefore, it is not
guaranteed to find all the classes and successfully balance the synthetic
dataset in extreme cases of imbalance. However, the batch size `nbatch` can
be set to a larger number if we suspect that is the case, which will
increase the probability of finding at least one sample of a different
class in the first round. Thereafter, once the algorithm is iterating to
find a minority class, it is more likely that other classes that occupy a
very small hypervolume will show up as well.
Parameters
----------
feature_parameters : dict of dicts
A dictionary with an entry per dataset feature (dictionary keys should
be the feature names). Each numerical feature entry must contain a
nested dictionary with the parameters required by the chosen
`numerical_sampling` function (such as "min"/"max" or "mean"/"sigma"),
and each categorical feature entry must contain a nested dictionary
with its categories and their fraction. The key for the nested
dictionary of categories should be "categories", and the keys for the
fractions should be the category names.
numerical_sampling : function
Any of the non-balancing numerical sampling functions defined in PRESC:
`grid_sampling`, `uniform_sampling`, `normal_sampling`...
original_classifier : sklearn-type classifier
Original ML classifier used to generate the synthetic data.
nsamples : int
Number of samples to generate.
max_iter : int
The maximum number of iterations generating batches to attempt to obtain
the samples per class specified in `nsamplesxclass`.
nbatch : int
Number of tentative samples to generate in each batch.
label_col : str
Name of the label column.
random_state : int
Random seed used to generate the sampling data.
verbose : bool
If True the sampler prints information about each batch.
Returns
-------
pandas DataFrame
Dataset with a generated sampling following the specified numerical
sampling distribution for the numerical features and the discrete
distribution for the categorical features, following the feature space
characterized by the `feature_parameters`, where the function has
tried to balance the samples for each class.
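Examples
--------
A minimal sketch, assuming `model` is a fitted scikit-learn classifier
trained on a single feature "x":
>>> params = {"x": {"mean": 0.5, "sigma": 0.2}}
>>> df = sampling_balancer(params, normal_sampling, model,
...     nsamples=100, random_state=0)  # doctest: +SKIP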
"""
if random_state is not None:
np.random.seed(seed=random_state)
df_generated = pd.DataFrame()
for iteration in range(max_iter):
if verbose:
print("Generating batch", iteration + 1)
# Generate `nbatch` samples
df_batch = mixed_data_sampling(
feature_parameters, numerical_sampling, nsamples=nbatch, random_state=None
)
# Label synthetic data with original classifier
df_batch[label_col] = original_classifier.predict(df_batch)
# Temporarily, add samples from the new batch to the old dataframe
df_temp = pd.concat([df_generated, df_batch])
# Keep a maximum of `nsamplesxclass` samples from each class
detected_classes = df_temp[label_col].value_counts()
nsamplesxclass = int(nsamples / len(detected_classes))
df_generated = pd.DataFrame()
for class_name in detected_classes.index.tolist():
df_generated = pd.concat(
[
df_generated,
df_temp[df_temp[label_col] == class_name].iloc[:nsamplesxclass],
]
)
# If no class is incomplete we are done, otherwise report the incomplete classes
incomplete_classes = detected_classes[
detected_classes < nsamplesxclass
].sort_index()
if len(incomplete_classes) == 0:
return df_generated
elif verbose:
print("\nIncomplete classes:", incomplete_classes.index.tolist())
print("Samples per class:", incomplete_classes.tolist(), "\n")
return df_generated