import numpy as np
from sklearn import clone
from skactiveml.base import (
SingleAnnotatorPoolQueryStrategy,
ProbabilisticRegressor,
)
from skactiveml.pool.utils import (
_update_reg,
_conditional_expect,
_cross_entropy,
)
from skactiveml.utils import (
check_type,
simple_batch,
MISSING_LABEL,
is_unlabeled,
)


class KLDivergenceMaximization(SingleAnnotatorPoolQueryStrategy):
"""Regression based Kullback-Leibler Divergence Maximization
This class implements a query [1]_, which selects those samples
that maximize the expected Kullback-Leibler divergence, where it is assumed
that the target probabilities for different samples are independent.
Parameters
----------
    integration_dict_target_val : dict, default=None
        Dictionary for integration arguments, e.g., the integration method,
        used for calculating the expected `y` value for the candidate samples.
        For details see method `skactiveml.pool.utils._conditional_expect`.
    integration_dict_cross_entropy : dict, default=None
        Dictionary for integration arguments, e.g., the integration method,
        used for calculating the cross entropy between the conditional
        estimator updated by the `X_cand` value and the current conditional
        estimator. For details see method
        `skactiveml.pool.utils._conditional_expect`.
missing_label : scalar or string or np.nan or None, default=np.nan
Value to represent a missing label.
random_state : int or RandomState instance, default=None
        Random state for candidate selection.

    References
----------
.. [1] D. Elreedy, A. F. Atiya, and S. I. Shaheen. A Novel Active Learning
Regression Framework for Balancing the Exploration-Exploitation
Trade-Off. Entropy, 21(7):651, 2019.
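
    Examples
    --------
    A minimal usage sketch; `skactiveml.regressor.NICKernelRegressor` is
    assumed here to be an available probabilistic regressor.

    >>> import numpy as np
    >>> from skactiveml.pool import KLDivergenceMaximization
    >>> from skactiveml.regressor import NICKernelRegressor
    >>> X = np.array([[1, 2], [5, 8], [8, 4], [5, 4]])
    >>> y = np.array([0.3, np.nan, np.nan, 1.2])
    >>> qs = KLDivergenceMaximization(
    ...     integration_dict_target_val={"method": "assume_linear"},
    ...     random_state=0,
    ... )
    >>> query_idx = qs.query(X, y, reg=NICKernelRegressor())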
"""

    def __init__(
self,
integration_dict_target_val=None,
integration_dict_cross_entropy=None,
missing_label=MISSING_LABEL,
random_state=None,
):
super().__init__(
random_state=random_state, missing_label=missing_label
)
self.integration_dict_target_val = integration_dict_target_val
self.integration_dict_cross_entropy = integration_dict_cross_entropy

    def query(
self,
X,
y,
reg,
fit_reg=True,
sample_weight=None,
candidates=None,
batch_size=1,
return_utilities=False,
):
"""Determines for which candidate samples labels are to be queried.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training data set, usually complete, i.e. including the labeled and
unlabeled samples.
        y : array-like of shape (n_samples,)
Labels of the training data set (possibly including unlabeled ones
indicated by `self.missing_label`).
reg : skactiveml.base.ProbabilisticRegressor
            Predicts the entropy, the cross entropy, and the potential
            y-values for the candidate samples.
fit_reg : bool, default=True
Defines whether the regressor should be fitted on `X`, `y`, and
`sample_weight`.
sample_weight : array-like of shape (n_samples,), default=None
Weights of training samples in `X`.
candidates : None or array-like of shape (n_candidates), dtype=int or \
array-like of shape (n_candidates, n_features), default=None
- If `candidates` is `None`, the unlabeled samples from
`(X,y)` are considered as `candidates`.
- If `candidates` is of shape `(n_candidates,)` and of type
`int`, `candidates` is considered as the indices of the
samples in `(X,y)`.
- If `candidates` is of shape `(n_candidates, *)`, the
candidate samples are directly given in `candidates` (not
necessarily contained in `X`). This is not supported by all
query strategies.
batch_size : int, default=1
The number of samples to be selected in one AL cycle.
return_utilities : bool, default=False
            If `True`, also return the utilities based on the query strategy.

        Returns
-------
        query_indices : numpy.ndarray of shape (batch_size,)
The query indices indicate for which candidate sample a label is to
be queried, e.g., `query_indices[0]` indicates the first selected
sample.
- If `candidates` is `None` or of shape
`(n_candidates,)`, the indexing refers to the samples in
`X`.
- If `candidates` is of shape `(n_candidates, n_features)`,
the indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or \
                numpy.ndarray of shape (batch_size, n_candidates)
The utilities of samples after each selected sample of the batch,
e.g., `utilities[0]` indicates the utilities used for selecting
the first sample (with index `query_indices[0]`) of the batch.
Utilities for labeled samples will be set to np.nan.
- If `candidates` is `None`, the indexing refers to the samples
in `X`.
- If `candidates` is of shape `(n_candidates,)` and of type
`int`, `utilities` refers to the samples in `X`.
- If `candidates` is of shape `(n_candidates, *)`, `utilities`
refers to the indexing in `candidates`.
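
        Examples
        --------
        A brief sketch with explicit candidate indices; as in the class
        docstring, `NICKernelRegressor` is only assumed to be an available
        probabilistic regressor.

        >>> import numpy as np
        >>> from skactiveml.regressor import NICKernelRegressor
        >>> X = np.array([[1.0], [2.0], [3.0], [4.0]])
        >>> y = np.array([0.1, np.nan, np.nan, 0.8])
        >>> qs = KLDivergenceMaximization(random_state=0)
        >>> query_idx, utilities = qs.query(
        ...     X, y, NICKernelRegressor(), candidates=[1, 2],
        ...     return_utilities=True,
        ... )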
"""
X, y, candidates, batch_size, return_utilities = self._validate_data(
X, y, candidates, batch_size, return_utilities, reset=True
)
check_type(reg, "reg", ProbabilisticRegressor)
check_type(fit_reg, "fit_reg", bool)
X_eval = X[is_unlabeled(y, missing_label=self.missing_label_)]
if len(X_eval) == 0:
            raise ValueError(
                "The training data contains no unlabeled data."
            )
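        # Fall back to the default integration settings if none were given.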
if self.integration_dict_target_val is None:
self.integration_dict_target_val = {"method": "assume_linear"}
if self.integration_dict_cross_entropy is None:
self.integration_dict_cross_entropy = {
"method": "gauss_hermite",
"n_integration_samples": 10,
}
        check_type(
            self.integration_dict_target_val,
            "self.integration_dict_target_val",
            dict,
        )
        check_type(
            self.integration_dict_cross_entropy,
            "self.integration_dict_cross_entropy",
            dict,
        )
X_cand, mapping = self._transform_candidates(candidates, X, y)
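        # Fit a copy of the regressor on the training data if requested.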
if fit_reg:
if sample_weight is None:
reg = clone(reg).fit(X, y)
else:
reg = clone(reg).fit(X, y, sample_weight)
utilities_cand = self._kullback_leibler_divergence(
X_eval, X_cand, mapping, reg, X, y, sample_weight=sample_weight
)
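        # If the candidates are indices into `X`, scatter their utilities back
        # into an array over all samples (np.nan for non-candidates).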
if mapping is None:
utilities = utilities_cand
else:
utilities = np.full(len(X), np.nan)
utilities[mapping] = utilities_cand
return simple_batch(
utilities,
self.random_state_,
batch_size=batch_size,
return_utilities=return_utilities,
)

    def _kullback_leibler_divergence(
self, X_eval, X_cand, mapping, reg, X, y, sample_weight=None
):
"""Calculates the expected Kullback-Leibler divergence over the
evaluation set if each candidate sample where to be labeled.
Parameters
----------
X_eval : array-like of shape (n_samples, n_features)
The samples where the information gain should be evaluated.
X_cand : array-like of shape (n_candidate_samples, n_features)
The candidate samples that determine the information gain.
mapping : array-like of shape (n_candidate_samples,) or None
A mapping between `X_cand` and `X` if it exists.
        reg : ProbabilisticRegressor
            Predicts the entropy and the target values.
X : array-like of shape (n_samples, n_features)
Training data set, usually complete, i.e., including the labeled
and unlabeled samples.
y : array-like of shape (n_samples,)
Labels of the training data set (possibly including unlabeled ones
indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.

        Returns
-------
kl_div : numpy.ndarray of shape (n_candidate_samples,)
The calculated expected Kullback-Leibler divergence.
"""
def new_kl_divergence(idx, x_cand, y_pot):
reg_new = _update_reg(
reg,
X,
y,
sample_weight=sample_weight,
y_update=y_pot,
idx_update=idx,
X_update=x_cand,
mapping=mapping,
)
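            # Entropy of the updated regressor's predictive distribution,
            # summed over the evaluation samples.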
entropy_post = np.sum(
reg_new.predict(X_eval, return_entropy=True)[1]
)
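            # Cross entropy between the updated and the current predictive
            # distributions over the evaluation samples.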
cross_ent = np.sum(
_cross_entropy(
X_eval,
reg_new,
reg,
integration_dict=self.integration_dict_cross_entropy,
random_state=self.random_state_,
)
)
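            # KL(updated || current) = H(updated, current) - H(updated),
            # i.e., cross entropy minus entropy of the updated distribution.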
return cross_ent - entropy_post
kl_div = _conditional_expect(
X_cand,
new_kl_divergence,
reg,
random_state=self.random_state_,
**self.integration_dict_target_val,
)
return kl_div