Source code for skactiveml.pool._contrastive_al

"""
Module implementing `ContrastiveAL`, which is a deep active learning strategy
selecting contrastive samples.
"""

import numpy as np

from sklearn.neighbors import NearestNeighbors
from sklearn.base import clone

from ..base import SingleAnnotatorPoolQueryStrategy, SkactivemlClassifier
from ..utils import (
    MISSING_LABEL,
    is_labeled,
    simple_batch,
    check_scalar,
    check_type,
    check_equal_missing_label,
)


class ContrastiveAL(SingleAnnotatorPoolQueryStrategy):
    """Contrastive Active Learning (ContrastiveAL)

    This class implements the Contrastive Active Learning (ContrastiveAL)
    query strategy [1]_, which selects samples that are similar in the
    (classifier's learned) feature space, while the classifier predicts
    maximally different class-membership probabilities for them.

    Parameters
    ----------
    nearest_neighbors_dict : dict, default=None
        The parameters passed to the nearest neighbors algorithm
        `sklearn.neighbors.NearestNeighbors`.
    clf_embedding_flag_name : str or None, default=None
        Name of the flag that is passed to the `predict_proba` method for
        getting the (learned) sample representations.

        - If `clf_embedding_flag_name=None` and `predict_proba` returns
          only one output, the input samples `X` are used.
        - If `predict_proba` returns two outputs or `clf_embedding_flag_name`
          is not `None`, `(proba, embeddings)` are expected as outputs.
    eps : float > 0, default=1e-7
        Minimum probability threshold to compute log-probabilities.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : None or int or np.random.RandomState, default=None
        The random state to use.

    References
    ----------
    .. [1] K. Margatina, G. Vernikos, L. Barrault, and N. Aletras. Active
       Learning by Acquiring Contrastive Examples. In Conf. Empir. Methods
       Nat. Lang. Process., pages 650–663, 2021.
    """

    def __init__(
        self,
        nearest_neighbors_dict=None,
        clf_embedding_flag_name=None,
        eps=1e-7,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )
        self.nearest_neighbors_dict = nearest_neighbors_dict
        self.clf_embedding_flag_name = clf_embedding_flag_name
        self.eps = eps
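
    # For a candidate `x` with predicted class-membership probabilities
    # `p(y|x)` and its `k` nearest labeled samples `x_l` (searched in the
    # learned feature space if embeddings are available), `query` computes
    # the utility
    #     s(x) = (1 / k) * sum_l KL(p(y|x_l) || p(y|x)),
    # so candidates disagreeing most with their similar labeled samples are
    # scored highest.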
    def query(
        self,
        X,
        y,
        clf,
        fit_clf=True,
        sample_weight=None,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        clf : skactiveml.base.SkactivemlClassifier
            Classifier implementing the methods `fit` and `predict_proba`.
        fit_clf : bool, default=True
            Defines whether the classifier `clf` should be fitted on `X`,
            `y`, and `sample_weight`.
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        candidates : None or array-like of shape (n_candidates,), dtype=int \
                or array-like of shape (n_candidates, n_features), \
                default=None
            - If `candidates` is `None`, the unlabeled samples from `(X, y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X, y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the candidate
              samples are directly given in `candidates` (not necessarily
              contained in `X`). This is not supported by all query
              strategies.
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If `True`, also return the utilities based on the query
            strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size,)
            The query indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or \
                numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of samples after each selected sample of the
            batch, e.g., `utilities[0]` indicates the utilities used for
            selecting the first sample (with index `query_indices[0]`) of
            the batch. Utilities for labeled samples will be set to
            `np.nan`.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        """
        # Check parameters.
        X, y, candidates, batch_size, return_utilities = self._validate_data(
            X, y, candidates, batch_size, return_utilities, reset=True
        )
        X_cand, mapping = self._transform_candidates(candidates, X, y)
        X_labeled = X[is_labeled(y, self.missing_label_)]
        if not (
            isinstance(self.nearest_neighbors_dict, dict)
            or self.nearest_neighbors_dict is None
        ):
            raise TypeError(
                "Pass a dictionary with corresponding parameter names and "
                "values according to the `__init__` function of "
                "`sklearn.neighbors.NearestNeighbors`."
            )
        nearest_neighbors_dict = (
            {}
            if self.nearest_neighbors_dict is None
            else self.nearest_neighbors_dict.copy()
        )
        check_scalar(
            self.eps,
            "eps",
            min_val=0,
            max_val=0.1,
            target_type=(float, int),
            min_inclusive=False,
        )
        check_type(clf, "clf", SkactivemlClassifier)
        check_equal_missing_label(clf.missing_label, self.missing_label_)
        check_scalar(fit_clf, "fit_clf", bool)

        if fit_clf:
            if sample_weight is None:
                clf = clone(clf).fit(X, y)
            else:
                clf = clone(clf).fit(X, y, sample_weight)

        if len(X_labeled) > 0:
            # Obtain classifier predictions and optionally learned feature
            # embeddings (cf. lines 3 and 4 in [1]).
            predict_proba_kwargs = {}
            if self.clf_embedding_flag_name is not None:
                predict_proba_kwargs = {self.clf_embedding_flag_name: True}
            P_labeled = clf.predict_proba(X_labeled, **predict_proba_kwargs)
            P_cand = clf.predict_proba(X_cand, **predict_proba_kwargs)
            if isinstance(P_labeled, tuple):
                P_labeled, X_labeled = P_labeled
            if isinstance(P_cand, tuple):
                P_cand, X_cand = P_cand

            # Clip probabilities to avoid zeros and renormalize.
            np.clip(P_labeled, a_min=self.eps, a_max=1, out=P_labeled)
            P_labeled /= P_labeled.sum(axis=1, keepdims=True)
            np.clip(P_cand, a_min=self.eps, a_max=1, out=P_cand)
            P_cand /= P_cand.sum(axis=1, keepdims=True)

            # Find the nearest labeled samples of the candidate samples
            # (cf. line 2 in [1]).
            nn = NearestNeighbors(**nearest_neighbors_dict).fit(X_labeled)
            max_n_neighbors = min(nn.n_neighbors, len(X_labeled))
            nn_indices = nn.kneighbors(
                X_cand, n_neighbors=max_n_neighbors, return_distance=False
            )

            # Compute the mean KL divergence between the class-membership
            # probabilities of each candidate and those of its labeled
            # neighbors (cf. lines 5 and 6 in [1]).
            P_labeled = P_labeled[nn_indices]
            utilities_cand = P_labeled * np.log(
                P_labeled / P_cand[:, None, :]
            )
            utilities_cand = utilities_cand.sum(axis=-1).mean(axis=-1)
        else:
            # Fall back to random sampling if there are no labeled samples.
            utilities_cand = np.zeros(len(X_cand))

        if mapping is None:
            utilities = utilities_cand
        else:
            utilities = np.full(len(X), np.nan)
            utilities[mapping] = utilities_cand

        return simple_batch(
            utilities,
            self.random_state_,
            batch_size=batch_size,
            return_utilities=return_utilities,
        )
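

# Minimal usage sketch: the synthetic data set and the
# `ParzenWindowClassifier` from `skactiveml.classifier` are illustrative
# choices only; any `SkactivemlClassifier` implementing `fit` and
# `predict_proba` works as well.
if __name__ == "__main__":
    import numpy as np

    from sklearn.datasets import make_classification

    from skactiveml.classifier import ParzenWindowClassifier
    from skactiveml.pool import ContrastiveAL
    from skactiveml.utils import MISSING_LABEL

    # Synthetic binary classification data with ten initially labeled
    # samples; all remaining labels are marked as missing.
    X, y_true = make_classification(n_samples=100, random_state=0)
    y = np.full(shape=y_true.shape, fill_value=MISSING_LABEL)
    y[:10] = y_true[:10]

    # Select the two candidates whose labeled neighborhoods show the
    # largest mean KL divergence to their predicted probabilities.
    clf = ParzenWindowClassifier(classes=[0, 1], random_state=0)
    qs = ContrastiveAL(random_state=0)
    query_indices = qs.query(X, y, clf, batch_size=2)
    print(query_indices)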