import numpy as np
from sklearn import clone
from sklearn.utils import check_array
from skactiveml.base import (
ProbabilisticRegressor,
SingleAnnotatorPoolQueryStrategy,
)
from skactiveml.utils import check_type, simple_batch, MISSING_LABEL
from skactiveml.pool.utils import _update_reg, _conditional_expect
[docs]class ExpectedModelVarianceReduction(SingleAnnotatorPoolQueryStrategy):
"""Expected Model Variance Reduction.
This class implements the active learning strategy expected model variance
minimization, which tries to select the sample that minimizes the expected
model variance.
Parameters
----------
integration_dict : dict, optional (default=None)
Dictionary for integration arguments, i.e. `integration method` etc.,
used for calculating the expected `y` value for the candidate samples.
For details see method `skactiveml.pool.utils._conditional_expect`.
missing_label : scalar or string or np.nan or None,
(default=skactiveml.utils.MISSING_LABEL)
Value to represent a missing label.
random_state : int | np.random.RandomState, optional (default=None)
Random state for candidate selection.
References
----------
[1] Cohn, David A and Ghahramani, Zoubin and Jordan, Michael I. Active
learning with statistical models, pages 129--145, 1996.
"""
def __init__(
self,
integration_dict=None,
missing_label=MISSING_LABEL,
random_state=None,
):
super().__init__(
random_state=random_state, missing_label=missing_label
)
self.integration_dict = integration_dict
[docs] def query(
self,
X,
y,
reg,
fit_reg=True,
sample_weight=None,
candidates=None,
X_eval=None,
batch_size=1,
return_utilities=False,
):
"""Determines for which candidate samples labels are to be queried.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training data set, usually complete, i.e. including the labeled and
unlabeled samples.
y : array-like of shape (n_samples)
Labels of the training data set (possibly including unlabeled ones
indicated by `self.missing_label`).
reg : ProbabilisticRegressor
Predicts the output and the conditional distribution.
fit_reg : bool, optional (default=True)
Defines whether the regressor should be fitted on `X`, `y`, and
`sample_weight`.
sample_weight : array-like of shape (n_samples), optional
(default=None)
Weights of training samples in `X`.
candidates : None or array-like of shape (n_candidates), dtype=int or
array-like of shape (n_candidates, n_features),
optional (default=None)
If candidates is None, the unlabeled samples from (X,y) are
considered as candidates.
If candidates is of shape (n_candidates) and of type int,
candidates is considered as the indices of the samples in (X,y).
If candidates is of shape (n_candidates, n_features), the
candidates are directly given in candidates (not necessarily
contained in X).
X_eval : array-like of shape (n_eval_samples, n_features),
optional (default=None)
Evaluation data set that is used for estimating the probability
distribution of the feature space.
batch_size : int, optional (default=1)
The number of samples to be selected in one AL cycle.
return_utilities : bool, optional (default=False)
If true, also return the utilities based on the query strategy.
Returns
-------
query_indices : numpy.ndarray of shape (batch_size)
The query_indices indicate for which candidate sample a label is
to queried, e.g., `query_indices[0]` indicates the first selected
sample.
If candidates is None or of shape (n_candidates), the indexing
refers to samples in X.
If candidates is of shape (n_candidates, n_features), the indexing
refers to samples in candidates.
utilities : numpy.ndarray of shape (batch_size, n_samples) or
numpy.ndarray of shape (batch_size, n_candidates)
The utilities of samples after each selected sample of the batch,
e.g., `utilities[0]` indicates the utilities used for selecting
the first sample (with index `query_indices[0]`) of the batch.
Utilities for labeled samples will be set to np.nan.
If candidates is None or of shape (n_candidates), the indexing
refers to samples in X.
If candidates is of shape (n_candidates, n_features), the indexing
refers to samples in candidates.
"""
X, y, candidates, batch_size, return_utilities = self._validate_data(
X, y, candidates, batch_size, return_utilities, reset=True
)
check_type(reg, "reg", ProbabilisticRegressor)
check_type(fit_reg, "fit_reg", bool)
if X_eval is None:
X_eval = X
else:
X_eval = check_array(X_eval)
self._check_n_features(X_eval, reset=False)
if self.integration_dict is None:
self.integration_dict = {"method": "assume_linear"}
check_type(self.integration_dict, "self.integration_dict", dict)
X_cand, mapping = self._transform_candidates(candidates, X, y)
if fit_reg:
if sample_weight is None:
reg = clone(reg).fit(X, y)
else:
reg = clone(reg).fit(X, y, sample_weight)
old_model_variance = np.average(
reg.predict(X_eval, return_std=True)[1] ** 2
)
def new_model_variance(idx, x_cand, y_pot):
reg_new = _update_reg(
reg,
X,
y,
sample_weight=sample_weight,
y_update=y_pot,
idx_update=idx,
X_update=x_cand,
mapping=mapping,
)
_, new_model_std = reg_new.predict(X_eval, return_std=True)
return np.average(new_model_std**2)
ex_model_variance = _conditional_expect(
X_cand,
new_model_variance,
reg,
random_state=self.random_state_,
**self.integration_dict
)
utilities_cand = old_model_variance - ex_model_variance
if mapping is None:
utilities = utilities_cand
else:
utilities = np.full(len(X), np.nan)
utilities[mapping] = utilities_cand
return simple_batch(
utilities,
batch_size=batch_size,
random_state=self.random_state_,
return_utilities=return_utilities,
)