Source code for skactiveml.pool._information_gain_maximization

import numpy as np
from sklearn import clone

from skactiveml.base import (
    SingleAnnotatorPoolQueryStrategy,
    ProbabilisticRegressor,
)

from skactiveml.pool.utils import (
    _update_reg,
    conditional_expect,
    _cross_entropy,
)
from skactiveml.utils import (
    check_type,
    simple_batch,
    MISSING_LABEL,
    is_unlabeled,
)


class KLDivergenceMaximization(SingleAnnotatorPoolQueryStrategy):
    """Regression-based Kullback-Leibler Divergence Maximization

    This class implements a query strategy [1]_ that selects the samples
    maximizing the expected Kullback-Leibler divergence from the new model to
    the old model, where the new model results from adding the samples to the
    training set and the expectation is taken over the model parameters.

    Parameters
    ----------
    integration_dict_target_val : dict, default=None
        Dictionary for integration arguments, i.e., `integration method` etc.,
        used for calculating the expected `y` value for the candidate samples.
        For details see method `skactiveml.pool.utils.conditional_expect`.
    integration_dict_cross_entropy : dict, default=None
        Dictionary for integration arguments, i.e., `integration method` etc.,
        used for calculating the cross entropy between the conditional
        estimator updated by the `X_cand` value and the old conditional
        estimator. For details see method
        `skactiveml.pool.utils.conditional_expect`.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or RandomState instance, default=None
        Random state for candidate selection.

    References
    ----------
    .. [1] D. Elreedy, A. F. Atiya, and S. I. Shaheen. A Novel Active Learning
       Regression Framework for Balancing the Exploration-Exploitation
       Trade-Off. Entropy, 21(7):651, 2019.
    """

    def __init__(
        self,
        integration_dict_target_val=None,
        integration_dict_cross_entropy=None,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            random_state=random_state, missing_label=missing_label
        )
        self.integration_dict_target_val = integration_dict_target_val
        self.integration_dict_cross_entropy = integration_dict_cross_entropy
    def query(
        self,
        X,
        y,
        reg,
        fit_reg=True,
        sample_weight=None,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`).
        reg : skactiveml.base.ProbabilisticRegressor
            Estimates the entropy, the cross entropy, and the potential
            y-values for the candidate samples.
        fit_reg : bool, default=True
            Defines whether the regressor should be fitted on `X`, `y`, and
            `sample_weight`.
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        candidates : None or array-like of shape (n_candidates), dtype=int or \
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from `(X, y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type `int`,
              `candidates` is considered as the indices of the samples in
              `(X, y)`.
            - If `candidates` is of shape `(n_candidates, ...)`, the candidate
              samples are directly given in `candidates` (not necessarily
              contained in `X`). This is not supported by all query
              strategies.
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If `True`, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size,)
            The query indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples)
            The utilities of samples after each selected sample of the batch,
            e.g., `utilities[0]` indicates the utilities used for selecting
            the first sample (with index `query_indices[0]`) of the batch.
            Utilities for labeled samples will be set to np.nan.

            - If `candidates` is `None`, the indexing refers to the samples
              in `X`.
            - If `candidates` is of shape `(n_candidates,)` and of type `int`,
              `utilities` refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, ...)`, `utilities`
              refers to the indexing in `candidates`.
        """
        X, y, candidates, batch_size, return_utilities = self._validate_data(
            X, y, candidates, batch_size, return_utilities, reset=True
        )

        check_type(reg, "reg", ProbabilisticRegressor)
        check_type(fit_reg, "fit_reg", bool)

        X_eval = X[is_unlabeled(y, missing_label=self.missing_label_)]
        if len(X_eval) == 0:
            raise ValueError(
                "The training data contains no unlabeled data."
            )

        if self.integration_dict_target_val is None:
            self.integration_dict_target_val = {"method": "assume_linear"}
        if self.integration_dict_cross_entropy is None:
            self.integration_dict_cross_entropy = {
                "method": "gauss_hermite",
                "n_integration_samples": 10,
            }
        check_type(
            self.integration_dict_target_val, "self.integration_dict", dict
        )
        check_type(
            self.integration_dict_cross_entropy, "self.integration_dict", dict
        )

        X_cand, mapping = self._transform_candidates(candidates, X, y)

        if fit_reg:
            if sample_weight is None:
                reg = clone(reg).fit(X, y)
            else:
                reg = clone(reg).fit(X, y, sample_weight)

        utilities_cand = self._kullback_leibler_divergence(
            X_eval, X_cand, mapping, reg, X, y, sample_weight=sample_weight
        )

        if mapping is None:
            utilities = utilities_cand
        else:
            utilities = np.full(len(X), np.nan)
            utilities[mapping] = utilities_cand

        return simple_batch(
            utilities,
            self.random_state_,
            batch_size=batch_size,
            return_utilities=return_utilities,
        )
    def _kullback_leibler_divergence(
        self, X_eval, X_cand, mapping, reg, X, y, sample_weight=None
    ):
        """Calculates the expected Kullback-Leibler divergence over the
        evaluation set if each candidate sample were to be labeled.

        Parameters
        ----------
        X_eval : array-like of shape (n_samples, n_features)
            The samples where the information gain should be evaluated.
        X_cand : array-like of shape (n_candidate_samples, n_features)
            The candidate samples that determine the information gain.
        mapping : array-like of shape (n_candidate_samples,) or None
            A mapping between `X_cand` and `X`, if it exists.
        reg : ProbabilisticRegressor
            Estimates the entropy and predicts the target values.
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.

        Returns
        -------
        kl_div : numpy.ndarray of shape (n_candidate_samples,)
            The calculated expected Kullback-Leibler divergence.
        """

        def new_kl_divergence(idx, x_cand, y_pot):
            # Update the regressor as if the candidate `x_cand` had received
            # the potential label `y_pot`.
            reg_new = _update_reg(
                reg,
                X,
                y,
                sample_weight=sample_weight,
                y_update=y_pot,
                idx_update=idx,
                X_update=x_cand,
                mapping=mapping,
            )
            # KL(new || old) = H(new, old) - H(new): the cross entropy between
            # the updated and the old estimator minus the entropy of the
            # updated estimator, summed over the evaluation samples.
            entropy_post = np.sum(
                reg_new.predict(X_eval, return_entropy=True)[1]
            )
            cross_ent = np.sum(
                _cross_entropy(
                    X_eval,
                    reg_new,
                    reg,
                    integration_dict=self.integration_dict_cross_entropy,
                    random_state=self.random_state_,
                )
            )
            return cross_ent - entropy_post

        # Take the expectation over the potential labels `y_pot` of each
        # candidate, as predicted by the (old) regressor.
        kl_div = conditional_expect(
            X_cand,
            new_kl_divergence,
            reg,
            random_state=self.random_state_,
            **self.integration_dict_target_val,
        )

        return kl_div
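

# The following usage sketch is not part of the original module; it is a
# minimal, hedged example of how the strategy above can be applied. It assumes
# that `skactiveml.regressor.NICKernelRegressor` is available as a
# `ProbabilisticRegressor`; any other probabilistic regressor could be
# substituted.
if __name__ == "__main__":
    from skactiveml.regressor import NICKernelRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(-3, 3, size=(30, 1))
    y_true = np.sin(X.ravel()) + rng.normal(scale=0.1, size=30)

    # Start with five labeled samples; all other labels are missing.
    y = np.full(30, MISSING_LABEL)
    y[:5] = y_true[:5]

    reg = NICKernelRegressor()
    qs = KLDivergenceMaximization(random_state=0)

    # Select the next sample to be labeled and inspect the utilities.
    query_idx, utilities = qs.query(
        X, y, reg=reg, batch_size=1, return_utilities=True
    )
    print(query_idx, utilities.shape)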