# Source code for skactiveml.pool._badge

"""
Module implementing the pool-based query strategy Batch Active Learning by
Diverse Gradient Embedding (BADGE).
"""

import numpy as np
from sklearn import clone
from sklearn.metrics import pairwise_distances_argmin_min

from ..base import SingleAnnotatorPoolQueryStrategy, SkactivemlClassifier
from ..utils import (
    MISSING_LABEL,
    check_type,
    check_equal_missing_label,
    unlabeled_indices,
    check_scalar,
)


class Badge(SingleAnnotatorPoolQueryStrategy):
    """Batch Active Learning by Diverse Gradient Embedding (BADGE)

    This class implements the BADGE algorithm [1]_, which is designed to
    incorporate both predictive uncertainty and sample diversity into every
    selected batch.

    Parameters
    ----------
    clf_embedding_flag_name : str or None, default=None
        Name of the flag, which is passed to the `predict_proba` method for
        getting the (learned) sample representations.

        - If `clf_embedding_flag_name=None` and `predict_proba` returns
          only one output, the input samples `X` are used.
        - If `predict_proba` returns two outputs or `clf_embedding_name` is
          not `None`, `(proba, embeddings)` are expected as outputs.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : None or int or np.random.RandomState, default=None
        The random state to use.

    References
    ----------
    .. [1] J. T. Ash, C. Zhang, A. Krishnamurthy, J. Langford, and A.
       Agarwal. Deep Batch Active Learning by Diverse, Uncertain Gradient
       Lower Bounds. In Int. Conf. Learn. Represent., 2020.
    """

    def __init__(
        self,
        clf_embedding_flag_name=None,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        self.clf_embedding_flag_name = clf_embedding_flag_name
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )

    def query(
        self,
        X,
        y,
        clf,
        fit_clf=True,
        sample_weight=None,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        clf : skactiveml.base.SkactivemlClassifier
            Classifier implementing the methods `fit` and `predict_proba`.
        fit_clf : bool, default=True
            Defines whether the classifier `clf` should be fitted on `X`,
            `y`, and `sample_weight`.
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        candidates : None or array-like of shape (n_candidates,), dtype=int \
                or array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from `(X,y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the candidate
              samples are directly given in `candidates` (not necessarily
              contained in `X`).
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If `True`, also return the utilities based on the query
            strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size,)
            The query indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or \
                numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of samples after each selected sample of the
            batch, e.g., `utilities[0]` indicates the utilities used for
            selecting the first sample (with index `query_indices[0]`) of
            the batch. Utilities for labeled samples will be set to np.nan.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        """
        # Validate input parameters.
        X, y, candidates, batch_size, return_utilities = self._validate_data(
            X, y, candidates, batch_size, return_utilities, reset=True
        )
        X_cand, mapping = self._transform_candidates(candidates, X, y)

        # Validate the classifier and the flag parameters.
        check_type(clf, "clf", SkactivemlClassifier)
        check_equal_missing_label(clf.missing_label, self.missing_label_)
        check_scalar(fit_clf, "fit_clf", bool)
        if self.clf_embedding_flag_name is not None:
            check_scalar(
                self.clf_embedding_flag_name, "clf_embedding_flag_name", str
            )

        # Fit a clone so the classifier passed by the caller stays untouched.
        if fit_clf:
            if sample_weight is None:
                clf = clone(clf).fit(X, y)
            else:
                clf = clone(clf).fit(X, y, sample_weight)

        # Determine the unlabeled samples together with `unlbld_mapping`,
        # which maps them back to the indexing used for the outputs.
        if candidates is None:
            X_unlbld = X_cand
            unlbld_mapping = mapping
        elif mapping is not None:
            # Fix: use the validated `missing_label_` attribute for
            # consistency with the `check_equal_missing_label` call above.
            unlbld_mapping = unlabeled_indices(
                y[mapping], missing_label=self.missing_label_
            )
            X_unlbld = X_cand[unlbld_mapping]
            unlbld_mapping = mapping[unlbld_mapping]
        else:
            X_unlbld = X_cand
            unlbld_mapping = np.arange(len(X_cand))

        # Predict class-membership probabilities and obtain the sample
        # representations (learned embeddings or the raw input samples).
        if self.clf_embedding_flag_name is not None:
            probas, X_unlbld = clf.predict_proba(
                X_unlbld, **{self.clf_embedding_flag_name: True}
            )
        else:
            probas = clf.predict_proba(X_unlbld)
            if isinstance(probas, tuple):
                probas, X_unlbld = probas

        # Gradient embedding: (p - onehot(argmax p)) outer-producted with
        # the representation, flattened per sample.
        y_pred = probas.argmax(axis=-1)
        proba_factor = probas - np.eye(probas.shape[1])[y_pred]
        g_x = proba_factor[:, :, None] * X_unlbld[:, None, :]
        g_x = g_x.reshape(*g_x.shape[:-2], -1)

        # Initialize the utilities; np.nan marks non-candidate samples.
        if mapping is not None:
            utilities = np.full(
                shape=(batch_size, X.shape[0]), fill_value=np.nan
            )
        else:
            utilities = np.full(
                shape=(batch_size, X_cand.shape[0]), fill_value=np.nan
            )

        # Sample the batch via kmeans++ seeding on the gradient embeddings.
        query_indices = []
        query_indices_in_unlbld = []
        idx_in_unlbld = []
        d_2_s = []
        for i in range(batch_size):
            if i == 0:
                d_2 = _d_2(g_x, idx_in_unlbld)
            else:
                d_2 = _d_2(g_x, [idx_in_unlbld], d_2_s[i - 1])
            d_2_s.append(d_2)
            d_2_sum = np.sum(d_2)
            if d_2_sum == 0:
                # Every embedding coincides with a selected center: fall
                # back to a uniform distribution over unselected samples.
                d_2_s[-1] = np.full(shape=len(g_x), fill_value=np.inf)
                d_2 = np.ones(shape=len(g_x))
                d_2[query_indices_in_unlbld] = 0
                d_2_sum = np.sum(d_2)
            d_probas = d_2 / d_2_sum
            utilities[i, unlbld_mapping] = d_probas
            utilities[i, query_indices] = np.nan
            if i == 0 and d_2_sum != 0:
                # First sample: deterministically the largest D^2 value.
                idx_in_unlbld = np.argmax(d_2, axis=-1)
            else:
                # Later samples: drawn with probability proportional to D^2.
                idx_in_unlbld_array = self.random_state_.choice(
                    len(d_probas), 1, replace=False, p=d_probas
                )
                idx_in_unlbld = idx_in_unlbld_array[0]
            query_indices_in_unlbld.append(idx_in_unlbld)
            idx = unlbld_mapping[idx_in_unlbld]
            query_indices.append(idx)

        # Fix: return an ndarray as documented (a plain list was returned
        # before, contradicting the docstring's `numpy.ndarray` contract).
        query_indices = np.array(query_indices)
        if return_utilities:
            return query_indices, utilities
        else:
            return query_indices
def _d_2(g_x, query_indices, d_latest=None): """ Calculates the D^2 value of the embedding features of unlabeled data. Parameters ---------- g_x : np.ndarray of shape (n_unlabeled_samples, n_features) The results after gradient embedding query_indices : numpy.ndarray of shape (n_query_indices,) the query indications that correspond to the unlabeled samples. d_latest : np.ndarray of shape (n_unlabeled_samples,) default=None The distance between each data point and its nearest centre. This is used to simplify the calculation of the later distances for the next selected sample. Returns ------- D2 : numpy.ndarray of shape (n_unlabeled_samples,) The D^2 value, for the first sample, is the value inf. """ if len(query_indices) == 0: return np.sum(g_x**2, axis=-1) query_indices = g_x[query_indices] _, D = pairwise_distances_argmin_min(X=g_x, Y=query_indices) if d_latest is not None: D2 = np.minimum(d_latest, np.square(D)) return D2