Source code for skactiveml.pool._badge

"""
Module implementing the pool-based query strategy Batch Active Learning by
Diverse Gradient Embedding (BADGE).
"""

import numpy as np
from sklearn import clone
from sklearn.metrics import pairwise_distances_argmin_min

from ..base import SingleAnnotatorPoolQueryStrategy, SkactivemlClassifier
from ..utils import (
    MISSING_LABEL,
    check_type,
    check_equal_missing_label,
    unlabeled_indices,
    check_scalar,
)


class Badge(SingleAnnotatorPoolQueryStrategy):
    """Batch Active Learning by Diverse Gradient Embedding (BADGE)

    This class implements the BADGE algorithm [1]_. This query strategy is
    designed to incorporate both predictive uncertainty and sample diversity
    into every selected batch.

    Parameters
    ----------
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : None or int or np.random.RandomState, default=None
        The random state to use.
    clf_embedding_flag_name : str or None, default=None
        Name of the flag that is passed to the `predict_proba` method of the
        classifier to obtain the (learned) sample representations. If
        `clf_embedding_flag_name=None` and `predict_proba` returns only one
        output, the input samples `X` are used. If `predict_proba` returns
        two outputs or `clf_embedding_flag_name` is not `None`,
        `(proba, embeddings)` are expected as outputs.

    References
    ----------
    .. [1] Jordan T. Ash, Chicheng Zhang, Akshay Krishnamurthy, John
       Langford, and Alekh Agarwal. "Deep Batch Active Learning by Diverse,
       Uncertain Gradient Lower Bounds." ICLR, 2020.
    """

    def __init__(
        self,
        clf_embedding_flag_name=None,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        self.clf_embedding_flag_name = clf_embedding_flag_name
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )
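    # Editor's sketch (not part of the library source): a minimal usage
    # example, assuming skactiveml's `ParzenWindowClassifier` and
    # scikit-learn's `make_classification` are available.
    #
    #     import numpy as np
    #     from sklearn.datasets import make_classification
    #     from skactiveml.classifier import ParzenWindowClassifier
    #     from skactiveml.pool import Badge
    #     from skactiveml.utils import MISSING_LABEL
    #
    #     X, y_true = make_classification(n_samples=100, random_state=0)
    #     y = np.full(shape=len(X), fill_value=MISSING_LABEL)  # unlabeled
    #     clf = ParzenWindowClassifier(classes=[0, 1], random_state=0)
    #     qs = Badge(random_state=0)
    #     query_indices = qs.query(X, y, clf, batch_size=5)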
    def query(
        self,
        X,
        y,
        clf,
        fit_clf=True,
        sample_weight=None,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Query the next samples to be labeled.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            samples, indicated by `self.missing_label`).
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        fit_clf : bool, optional (default=True)
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        sample_weight : array-like of shape (n_samples,), optional
            (default=None)
            Weights of the training samples in `X`.
        candidates : None or array-like of shape (n_candidates,), dtype=int
            or array-like of shape (n_candidates, n_features), optional
            (default=None)
            If `candidates` is None, the unlabeled samples from `(X, y)` are
            considered as candidates.
            If `candidates` is of shape `(n_candidates,)` and of type int,
            `candidates` is considered as the indices of the samples in
            `(X, y)`.
            If `candidates` is of shape `(n_candidates, n_features)`, the
            candidates are directly given in `candidates` (not necessarily
            contained in `X`). This is not supported by all query
            strategies.
        batch_size : int, optional (default=1)
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, optional (default=False)
            If True, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size,)
            The `query_indices` indicate for which candidate sample a label
            is queried, e.g., `query_indices[0]` indicates the first
            selected sample.
            If `candidates` is None or of shape `(n_candidates,)`, the
            indexing refers to the samples in `X`.
            If `candidates` is of shape `(n_candidates, n_features)`, the
            indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or
            numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of the samples before each selection step of the
            batch, e.g., `utilities[0]` indicates the utilities used for
            selecting the first sample (with index `query_indices[0]`) of
            the batch. Utilities of labeled samples are set to `np.nan`.
            Since all but the first sample of a batch are drawn at random
            from the candidates, the utilities of each step sum to 1 and
            represent the probabilities of the samples being chosen.
            If `candidates` is None or of shape `(n_candidates,)`, the
            indexing refers to the samples in `X`.
            If `candidates` is of shape `(n_candidates, n_features)`, the
            indexing refers to the samples in `candidates`.
""" # Validate input parameters X, y, candidates, batch_size, return_utilities = self._validate_data( X, y, candidates, batch_size, return_utilities, reset=True ) X_cand, mapping = self._transform_candidates(candidates, X, y) # Validate classifier type check_type(clf, "clf", SkactivemlClassifier) check_equal_missing_label(clf.missing_label, self.missing_label_) check_scalar(fit_clf, "fit_clf", bool) if self.clf_embedding_flag_name is not None: check_scalar( self.clf_embedding_flag_name, "clf_embedding_flag_name", str ) # Fit the classifier if fit_clf: if sample_weight is None: clf = clone(clf).fit(X, y) else: clf = clone(clf).fit(X, y, sample_weight) # find the unlabeled dataset if candidates is None: X_unlbld = X_cand unlbld_mapping = mapping elif mapping is not None: unlbld_mapping = unlabeled_indices( y[mapping], missing_label=self.missing_label ) X_unlbld = X_cand[unlbld_mapping] unlbld_mapping = mapping[unlbld_mapping] else: X_unlbld = X_cand unlbld_mapping = np.arange(len(X_cand)) # gradient embedding, aka predict class membership probabilities if self.clf_embedding_flag_name is not None: probas, X_unlbld = clf.predict_proba( X_unlbld, **{self.clf_embedding_flag_name: True} ) else: probas = clf.predict_proba(X_unlbld) if isinstance(probas, tuple): probas, X_unlbld = probas y_pred = probas.argmax(axis=-1) proba_factor = probas - np.eye(probas.shape[1])[y_pred] g_x = proba_factor[:, :, None] * X_unlbld[:, None, :] g_x = g_x.reshape(*g_x.shape[:-2], -1) # init the utilities if mapping is not None: utilities = np.full( shape=(batch_size, X.shape[0]), fill_value=np.nan ) else: utilities = np.full( shape=(batch_size, X_cand.shape[0]), fill_value=np.nan ) # sampling with kmeans++ query_indicies = [] query_indicies_in_unlbld = [] idx_in_unlbld = [] d_2_s = [] for i in range(batch_size): if i == 0: d_2 = _d_2(g_x, idx_in_unlbld) else: d_2 = _d_2(g_x, [idx_in_unlbld], d_2_s[i - 1]) d_2_s.append(d_2) d_2_sum = np.sum(d_2) if d_2_sum == 0: d_2_s[-1] = np.full(shape=len(g_x), fill_value=np.inf) d_2 = np.ones(shape=len(g_x)) d_2[query_indicies_in_unlbld] = 0 d_2_sum = np.sum(d_2) d_probas = d_2 / d_2_sum utilities[i, unlbld_mapping] = d_probas utilities[i, query_indicies] = np.nan if i == 0 and d_2_sum != 0: idx_in_unlbld = np.argmax(d_2, axis=-1) else: idx_in_unlbld_array = self.random_state_.choice( len(d_probas), 1, replace=False, p=d_probas ) idx_in_unlbld = idx_in_unlbld_array[0] query_indicies_in_unlbld.append(idx_in_unlbld) idx = unlbld_mapping[idx_in_unlbld] query_indicies.append(idx) if return_utilities: return query_indicies, utilities else: return query_indicies
def _d_2(g_x, query_indices, d_latest=None):
    """Calculate the D^2 values, i.e., the squared distances between the
    gradient embeddings of the unlabeled samples and their respective
    closest already selected sample.

    Parameters
    ----------
    g_x : np.ndarray of shape (n_unlabeled_samples, n_features)
        The gradient embeddings of the unlabeled samples.
    query_indices : array-like of shape (n_query_indices,)
        Indices of the already selected samples, relative to the unlabeled
        samples.
    d_latest : np.ndarray of shape (n_unlabeled_samples,), default=None
        The squared distance between each unlabeled sample and its nearest
        already selected sample. This simplifies the calculation of the
        distances for the next selected sample.

    Returns
    -------
    D2 : numpy.ndarray of shape (n_unlabeled_samples,)
        The D^2 values. If no sample has been selected yet
        (`query_indices` is empty), the squared norms of the gradient
        embeddings are returned.
    """
    if len(query_indices) == 0:
        # No sample selected yet: use the squared embedding norms.
        return np.sum(g_x**2, axis=-1)
    g_query = g_x[query_indices]
    _, D = pairwise_distances_argmin_min(X=g_x, Y=g_query)
    D2 = np.square(D)
    if d_latest is not None:
        # Keep the minimum over all previously selected samples.
        D2 = np.minimum(d_latest, D2)
    return D2
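# Editor's sketch (not part of the library source): traces how `_d_2`
# drives the k-means++-style selection in `Badge.query` for three toy
# gradient embeddings.
def _example_d_2_sampling():
    g_x = np.array([[0.0, 0.0], [1.0, 0.0], [3.0, 4.0]])
    d_2 = _d_2(g_x, [])  # [0., 1., 25.]: squared embedding norms
    first = int(np.argmax(d_2))  # 2: largest gradient norm is picked first
    # Minimum of the previous values and the squared distances to
    # g_x[first]:
    d_2 = _d_2(g_x, [first], d_2)  # [0., 1., 0.]
    # Sampling probabilities for the next batch element; a sample with zero
    # gradient norm (a maximally confident prediction) is never drawn,
    # while samples far from all selected embeddings are preferred.
    return d_2 / d_2.sum()  # [0., 1., 0.]: the next draw picks index 1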