"""
Module implementing `ContrastiveAL`, which is a deep active learning strategy
selecting contrastive samples.
"""
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.base import clone
from ..base import SingleAnnotatorPoolQueryStrategy, SkactivemlClassifier
from ..utils import (
    MISSING_LABEL,
    is_labeled,
    simple_batch,
    check_scalar,
    check_type,
    check_equal_missing_label,
)


class ContrastiveAL(SingleAnnotatorPoolQueryStrategy):
    """Contrastive Active Learning (ContrastiveAL)

    This class implements the Contrastive Active Learning (ContrastiveAL)
    query strategy [1], which selects samples that are similar in the
    (classifier's learned) feature space while the classifier predicts
    maximally different class-membership probabilities for them.

    Parameters
    ----------
    nearest_neighbors_dict : dict, default=None
        The parameters passed to the nearest neighbors algorithm
        `sklearn.neighbors.NearestNeighbors`.
    clf_embedding_flag_name : str or None, default=None
        Name of the flag that is passed to the `predict_proba` method of
        `clf` to obtain the (learned) sample representations. If
        `clf_embedding_flag_name=None` and `predict_proba` returns only one
        output, the input samples `X` are used. If `predict_proba` returns
        two outputs or `clf_embedding_flag_name` is not `None`,
        `(proba, embeddings)` are expected as outputs.
    eps : float > 0, default=1e-7
        Minimum probability threshold used when computing log-probabilities.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : None or int or np.random.RandomState, default=None
        The random state to use.

    References
    ----------
    .. [1] Margatina, Katerina, Giorgos Vernikos, Loïc Barrault, and Nikolaos
       Aletras. "Active Learning by Acquiring Contrastive Examples." In
       EMNLP, pp. 650-663. 2021.
"""

    def __init__(
        self,
        nearest_neighbors_dict=None,
        clf_embedding_flag_name=None,
        eps=1e-7,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )
        self.nearest_neighbors_dict = nearest_neighbors_dict
        self.clf_embedding_flag_name = clf_embedding_flag_name
        self.eps = eps

    def query(
        self,
        X,
        y,
        clf,
        fit_clf=True,
        sample_weight=None,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
"""Query the next samples to be labeled.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training data set, usually complete, i.e. including the labeled and
unlabeled samples.
y : array-like of shape (n_samples,)
Labels of the training data set (possibly including unlabeled ones
indicated by self.missing_label).
clf : skactiveml.base.SkactivemlClassifier
Model implementing the methods `fit` and `predict_proba`.
fit_clf : bool, default=True
Defines whether the classifier should be fitted on `X`, `y`, and
`sample_weight`.
sample_weight: array-like of shape (n_samples,), default=None
Weights of training samples in `X`.
candidates : None or array-like of shape (n_candidates) with \
dtype=int or array-like of shape (n_candidates, n_features), \
default=None
If `candidates` is `None`, the unlabeled samples from `(X, y)`
are considered as candidates.
If `candidates` is of shape `(n_candidates)` and of type `int`,
`candidates` is considered as a list of the indices of the samples
in `(X, y)`.
If `candidates` is of shape `(n_candidates, n_features)`, the
candidate samples are directly given in `candidates` (not
necessarily contained in `X`).
batch_size : int, default=1
The number of samples to be selected in one AL cycle.
return_utilities : bool, default=False
If `True`, also return the utilities based on the query strategy.
Returns
----------
query_indices : numpy.ndarray of shape (batch_size,)
The `query_indices` indicate for which candidate sample a label is
to be queried, e.g., `query_indices[0]` indicates the first
selected sample.
If `candidates` is `None` or of shape `(n_candidates,)`, the
indexing refers to samples in X.
utilities : numpy.ndarray of shape (batch_size, n_samples)
The utilities of samples for selecting each sample of the batch.
Here, utilities refers to the Kullback-Leibler divergence between
the sample's own and its labeled nearest neighbors' predicted
class-membership probabilities. If `candidates` is `None` or of
shape `(n_candidates,)`, the indexing refers to the samples in `X`.
"""
        # Check parameters.
        X, y, candidates, batch_size, return_utilities = self._validate_data(
            X, y, candidates, batch_size, return_utilities, reset=True
        )
        X_cand, mapping = self._transform_candidates(candidates, X, y)
        X_labeled = X[is_labeled(y, self.missing_label_)]

        if not (
            isinstance(self.nearest_neighbors_dict, dict)
            or self.nearest_neighbors_dict is None
        ):
            raise TypeError(
                "Pass a dictionary with corresponding parameter names and "
                "values according to the `init` function of "
                "`sklearn.neighbors.NearestNeighbors`."
            )
        nearest_neighbors_dict = (
            {}
            if self.nearest_neighbors_dict is None
            else self.nearest_neighbors_dict.copy()
        )
        check_scalar(
            self.eps,
            "eps",
            min_val=0,
            max_val=0.1,
            target_type=(float, int),
            min_inclusive=False,
        )
        check_type(clf, "clf", SkactivemlClassifier)
        check_equal_missing_label(clf.missing_label, self.missing_label_)
        check_scalar(fit_clf, "fit_clf", bool)
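
        # Fit a clone of the classifier so that the `clf` instance passed in
        # by the caller is left unchanged.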
        if fit_clf:
            if sample_weight is None:
                clf = clone(clf).fit(X, y)
            else:
                clf = clone(clf).fit(X, y, sample_weight)

        if len(X_labeled) > 0:
            # Obtain classifier predictions and optionally learned feature
            # embeddings (cf. line 3 and 4 in [1]).
            predict_proba_kwargs = {}
            if self.clf_embedding_flag_name is not None:
                predict_proba_kwargs = {self.clf_embedding_flag_name: True}
            P_labeled = clf.predict_proba(X_labeled, **predict_proba_kwargs)
            P_cand = clf.predict_proba(X_cand, **predict_proba_kwargs)
            if isinstance(P_labeled, tuple):
                P_labeled, X_labeled = P_labeled
            if isinstance(P_cand, tuple):
                P_cand, X_cand = P_cand
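            # If the classifier returned `(proba, embedding)` tuples, the
            # learned embeddings now replace the raw inputs `X_labeled` and
            # `X_cand` as sample representations for the neighbor search
            # below.
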
            # Clip probabilities to avoid zeros.
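            # Renormalizing afterwards keeps each row a valid probability
            # distribution that sums to one.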
            np.clip(P_labeled, a_min=self.eps, a_max=1, out=P_labeled)
            P_labeled /= P_labeled.sum(axis=1, keepdims=True)
            np.clip(P_cand, a_min=self.eps, a_max=1, out=P_cand)
            P_cand /= P_cand.sum(axis=1, keepdims=True)

            # Find nearest labeled samples of candidate samples
            # (cf. line 2 in [1]).
            nn = NearestNeighbors(**nearest_neighbors_dict).fit(X_labeled)
            max_n_neighbors = min(nn.n_neighbors, len(X_labeled))
            nn_indices = nn.kneighbors(
                X_cand, n_neighbors=max_n_neighbors, return_distance=False
            )

            # Compute KL divergences between class-membership probabilities
            # of candidates and their respective neighbors
            # (cf. line 5 and 6 in [1]).
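            # For a candidate x with labeled neighbors x_1, ..., x_k, the
            # utility is mean_j sum_c P(c|x_j) * log(P(c|x_j) / P(c|x)),
            # i.e., the average divergence KL(P(.|x_j) || P(.|x)).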
            P_labeled = P_labeled[nn_indices]
            utilities_cand = P_labeled * np.log(P_labeled / P_cand[:, None, :])
            utilities_cand = utilities_cand.sum(axis=-1).mean(axis=-1)
        else:
            # Fall back to random sampling if there are no labeled samples.
            utilities_cand = np.zeros(len(X_cand))

        if mapping is None:
            utilities = utilities_cand
        else:
            utilities = np.full(len(X), np.nan)
            utilities[mapping] = utilities_cand

        return simple_batch(
            utilities,
            self.random_state_,
            batch_size=batch_size,
            return_utilities=return_utilities,
        )