Source code for skactiveml.pool._contrastive_al

"""
Module implementing `ContrastiveAL`, a deep active learning strategy that
selects contrastive samples.
"""

import numpy as np

from sklearn.neighbors import NearestNeighbors
from sklearn.base import clone

from ..base import SingleAnnotatorPoolQueryStrategy, SkactivemlClassifier
from ..utils import (
    MISSING_LABEL,
    is_labeled,
    simple_batch,
    check_scalar,
    check_type,
    check_equal_missing_label,
)


class ContrastiveAL(SingleAnnotatorPoolQueryStrategy):
    """Contrastive Active Learning (ContrastiveAL)

    This class implements the Contrastive Active Learning (ContrastiveAL)
    query strategy [1], which selects samples that are similar in the
    (classifier's learned) feature space, while the classifier predicts
    maximally different class-membership probabilities for them.

    Parameters
    ----------
    nearest_neighbors_dict : dict, default=None
        The parameters passed to the nearest neighbors algorithm
        `sklearn.neighbors.NearestNeighbors`.
    clf_embedding_flag_name : str or None, default=None
        Name of the flag that is passed to the `predict_proba` method for
        getting the (learned) sample representations. If
        `clf_embedding_flag_name=None` and `predict_proba` returns only one
        output, the input samples `X` are used. If `predict_proba` returns
        two outputs or `clf_embedding_flag_name` is not `None`,
        `(proba, embeddings)` are expected as outputs.
    eps : float > 0, default=1e-7
        Minimum probability threshold to compute log-probabilities.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : None or int or np.random.RandomState, default=None
        The random state to use.

    References
    ----------
    .. [1] Margatina, Katerina, Giorgos Vernikos, Loïc Barrault, and
       Nikolaos Aletras. "Active Learning by Acquiring Contrastive
       Examples." In EMNLP, pp. 650-663. 2021.
    """

    def __init__(
        self,
        nearest_neighbors_dict=None,
        clf_embedding_flag_name=None,
        eps=1e-7,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )
        self.nearest_neighbors_dict = nearest_neighbors_dict
        self.clf_embedding_flag_name = clf_embedding_flag_name
        self.eps = eps
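
    # For a candidate `x` with labeled nearest neighbors `x_1, ..., x_k` in
    # the (learned) feature space, `query` below scores `x` by the mean
    # Kullback-Leibler divergence
    #
    #     s(x) = (1 / k) * sum_{j=1}^{k} KL(p(y | x_j) || p(y | x)),
    #
    # so candidates whose predicted class-membership probabilities disagree
    # most with those of their labeled neighbors obtain the highest
    # utilities (cf. lines 5 and 6 in [1]).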
    def query(
        self,
        X,
        y,
        clf,
        fit_clf=True,
        sample_weight=None,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Query the next samples to be labeled.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        fit_clf : bool, default=True
            Defines whether the classifier `clf` should be fitted on `X`,
            `y`, and `sample_weight`.
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        candidates : None or array-like of shape (n_candidates,) with \
                dtype=int or array-like of shape (n_candidates, n_features), \
                default=None
            If `candidates` is `None`, the unlabeled samples from `(X, y)`
            are considered as candidates.
            If `candidates` is of shape `(n_candidates,)` and of type `int`,
            `candidates` is considered as the list of indices of the samples
            in `(X, y)`.
            If `candidates` is of shape `(n_candidates, n_features)`, the
            candidate samples are directly given in `candidates` (not
            necessarily contained in `X`).
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If `True`, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size,)
            The `query_indices` indicate for which candidate sample a label
            is to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.
            If `candidates` is `None` or of shape `(n_candidates,)`, the
            indexing refers to the samples in `X`.
        utilities : numpy.ndarray of shape (batch_size, n_samples)
            The utilities of samples for selecting each sample of the batch.
            Here, a sample's utility is the Kullback-Leibler divergence
            between its own and its labeled nearest neighbors' predicted
            class-membership probabilities.
            If `candidates` is `None` or of shape `(n_candidates,)`, the
            indexing refers to the samples in `X`.
        """
        # Check parameters.
        X, y, candidates, batch_size, return_utilities = self._validate_data(
            X, y, candidates, batch_size, return_utilities, reset=True
        )
        X_cand, mapping = self._transform_candidates(candidates, X, y)
        X_labeled = X[is_labeled(y, self.missing_label_)]
        if not (
            isinstance(self.nearest_neighbors_dict, dict)
            or self.nearest_neighbors_dict is None
        ):
            raise TypeError(
                "Pass a dictionary with corresponding parameter names and "
                "values according to the `__init__` function of "
                "`sklearn.neighbors.NearestNeighbors`."
            )
        nearest_neighbors_dict = (
            {}
            if self.nearest_neighbors_dict is None
            else self.nearest_neighbors_dict.copy()
        )
        check_scalar(
            self.eps,
            "eps",
            min_val=0,
            max_val=0.1,
            target_type=(float, int),
            min_inclusive=False,
        )
        check_type(clf, "clf", SkactivemlClassifier)
        check_equal_missing_label(clf.missing_label, self.missing_label_)
        check_scalar(fit_clf, "fit_clf", bool)

        # Optionally fit the classifier on the given training data.
        if fit_clf:
            if sample_weight is None:
                clf = clone(clf).fit(X, y)
            else:
                clf = clone(clf).fit(X, y, sample_weight)

        if len(X_labeled) > 0:
            # Obtain classifier predictions and optionally learned feature
            # embeddings (cf. lines 3 and 4 in [1]).
            predict_proba_kwargs = {}
            if self.clf_embedding_flag_name is not None:
                predict_proba_kwargs = {self.clf_embedding_flag_name: True}
            P_labeled = clf.predict_proba(X_labeled, **predict_proba_kwargs)
            P_cand = clf.predict_proba(X_cand, **predict_proba_kwargs)
            if isinstance(P_labeled, tuple):
                P_labeled, X_labeled = P_labeled
            if isinstance(P_cand, tuple):
                P_cand, X_cand = P_cand

            # Clip the probabilities to avoid zeros and renormalize them.
            np.clip(P_labeled, a_min=self.eps, a_max=1, out=P_labeled)
            P_labeled /= P_labeled.sum(axis=1, keepdims=True)
            np.clip(P_cand, a_min=self.eps, a_max=1, out=P_cand)
            P_cand /= P_cand.sum(axis=1, keepdims=True)

            # Find the nearest labeled samples of the candidate samples
            # (cf. line 2 in [1]).
            nn = NearestNeighbors(**nearest_neighbors_dict).fit(X_labeled)
            max_n_neighbors = min(nn.n_neighbors, len(X_labeled))
            nn_indices = nn.kneighbors(
                X_cand, n_neighbors=max_n_neighbors, return_distance=False
            )

            # Compute the KL divergences between the class-membership
            # probabilities of the candidates and those of their respective
            # labeled neighbors (cf. lines 5 and 6 in [1]).
            P_labeled = P_labeled[nn_indices]
            utilities_cand = P_labeled * np.log(P_labeled / P_cand[:, None, :])
            utilities_cand = utilities_cand.sum(axis=-1).mean(axis=-1)
        else:
            # Fall back to random sampling if there are no labeled samples.
            utilities_cand = np.zeros(len(X_cand))

        if mapping is None:
            utilities = utilities_cand
        else:
            utilities = np.full(len(X), np.nan)
            utilities[mapping] = utilities_cand

        return simple_batch(
            utilities,
            self.random_state_,
            batch_size=batch_size,
            return_utilities=return_utilities,
        )
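
# A minimal usage sketch, intended to be run as a standalone script: it
# queries a batch of contrastive samples from a synthetic pool. The data set
# and the choice of `ParzenWindowClassifier` as the model are illustrative
# assumptions, not part of this module.
if __name__ == "__main__":
    import numpy as np
    from sklearn.datasets import make_classification

    from skactiveml.classifier import ParzenWindowClassifier
    from skactiveml.pool import ContrastiveAL
    from skactiveml.utils import MISSING_LABEL

    # Pool of 100 samples; only the first 10 labels are known initially.
    X, y_true = make_classification(n_samples=100, random_state=0)
    y = np.full(shape=y_true.shape, fill_value=MISSING_LABEL, dtype=float)
    y[:10] = y_true[:10]

    clf = ParzenWindowClassifier(classes=[0, 1], random_state=0)
    qs = ContrastiveAL(random_state=0)

    # `query` fits `clf` internally (`fit_clf=True`) and returns the indices
    # of the five most contrastive unlabeled samples in `X`.
    query_idx = qs.query(X, y, clf=clf, batch_size=5)
    print(query_idx)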