"""
Module implementing the pool-based query strategy Batch Active Learning by
Diverse Gradient Embedding (BADGE).
"""
import numpy as np
from sklearn import clone
from sklearn.metrics import pairwise_distances_argmin_min
from ..base import SingleAnnotatorPoolQueryStrategy, SkactivemlClassifier
from ..utils import (
    MISSING_LABEL,
    check_type,
    check_equal_missing_label,
    unlabeled_indices,
    check_scalar,
)


class Badge(SingleAnnotatorPoolQueryStrategy):
"""
Batch Active Learning by Diverse Gradient Embedding (BADGE)
This class implements the BADGE algorithm [1]. This query strategy is
designed to incorporate both predictive uncertainty and
sample diversity into every selected batch.
Parameters
----------
missing_label : scalar or string or np.nan or None, default=np.nan
Value to represent a missing label.
random_state : None or int or np.random.RandomState, default=None
The random state to use.
clf_embedding_flag_name : str or None, default=None
Name of the flag, which is passed to the `predict_proba` method for
getting the (learned) sample representations. If
`clf_embedding_flag_name=None` and `predict_proba` returns only one
output, the input samples `X` are used. If `predict_proba` returns
two outputs or `clf_embedding_name` is not `None`,
`(proba, embeddings)` are expected as outputs.
References
----------
.. [1] J. Ash, Jordan T., Chicheng Zhang, Akshay Krishnamurthy, John
Langford, and Alekh Agarwal, "Deep Batch Active Learning by Diverse,
Uncertain Gradient Lower Bounds." ICLR, 2019.
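
    Examples
    --------
    A minimal usage sketch; `ParzenWindowClassifier` is only one possible
    model choice, any `SkactivemlClassifier` implementing `fit` and
    `predict_proba` can be used:

    >>> import numpy as np
    >>> from skactiveml.pool import Badge
    >>> from skactiveml.classifier import ParzenWindowClassifier
    >>> X = np.random.default_rng(0).random((10, 2))
    >>> y = np.full(10, np.nan)
    >>> clf = ParzenWindowClassifier(classes=[0, 1])
    >>> qs = Badge(random_state=0)
    >>> query_idx = qs.query(X, y, clf, batch_size=2)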
"""

    def __init__(
        self,
        clf_embedding_flag_name=None,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        self.clf_embedding_flag_name = clf_embedding_flag_name
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )

    def query(
        self,
        X,
        y,
        clf,
        fit_clf=True,
        sample_weight=None,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
"""Query the next samples to be labeled.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training data set, usually complete, i.e. including the labeled and
unlabeled samples.
y : array-like of shape (n_samples, )
Labels of the training data set (possibly including unlabeled
samples, indicated by self.missing_label).
clf : skactiveml.base.SkactivemlClassifier
Model implementing the methods `fit` and `predict_proba`.
fit_clf : bool, optional (default=True)
Defines whether the classifier should be fitted on `X`, `y`, and
`sample_weight`.
sample_weight: array-like of shape (n_samples), optional (default=None)
Weights of training samples in `X`.
candidates : None or array-like of shape (n_candidates), dtype=int or
array-like of shape (n_candidates, n_features),
optional (default=None)
If candidates is None, the unlabeled samples from (X,y) are
considered as candidates.
If candidates is of shape (n_candidates) and of type int,
candidates is considered as the indices of the samples in (X,y).
If candidates is of shape (n_candidates, n_features), the
candidates are directly given in candidates (not necessarily
contained in X). This is not supported by all query strategies.
batch_size : int, optional (default=1)
The number of samples to be selected in one AL cycle.
return_utilities : bool, optional (default=False)
If true, also return the utilities based on the query strategy.
Returns
-------
query_indices : numpy.ndarray of shape (batch_size)
The query_indices indicate for which candidate sample a label is
being queried for a label, e.g., `query_indices[0]` indicates the
first selected sample.
If candidates is None or of shape (n_candidates), the indexing
refers to samples in X.
If candidates is of shape (n_candidates, n_features), the indexing
refers to samples in candidates.
utilities : numpy.ndarray of shape (batch_size, n_samples) or
numpy.ndarray of shape (batch_size, n_candidates)
The utilities of samples before each selected sample of the batch,
e.g., `utilities[0]` indicates the utilities used for selecting
the first sample (with index `query_indices[0]`) of the batch.
Utilities for labeled samples will be set to np.nan.
For the case where the samples are uniformly randomly selected from
the set, the sum of all utility of samples will be 1.
The utilities represent here the probabilities of samples being
chosen.
If candidates is None or of shape (n_candidates), the indexing
refers to samples in X.
If candidates is of shape (n_candidates, n_features), the indexing
refers to samples in candidates.
"""
        # Validate input parameters.
        X, y, candidates, batch_size, return_utilities = self._validate_data(
            X, y, candidates, batch_size, return_utilities, reset=True
        )
        X_cand, mapping = self._transform_candidates(candidates, X, y)

        # Validate classifier type.
        check_type(clf, "clf", SkactivemlClassifier)
        check_equal_missing_label(clf.missing_label, self.missing_label_)
        check_scalar(fit_clf, "fit_clf", bool)
        if self.clf_embedding_flag_name is not None:
            check_scalar(
                self.clf_embedding_flag_name, "clf_embedding_flag_name", str
            )

        # Fit the classifier.
        if fit_clf:
            if sample_weight is None:
                clf = clone(clf).fit(X, y)
            else:
                clf = clone(clf).fit(X, y, sample_weight)

        # Find the unlabeled candidate samples.
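        # Depending on how `candidates` was passed, the indices of the
        # unlabeled candidates are mapped back to indices in `X` or refer
        # directly to rows of the explicitly given candidate matrix.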
        if candidates is None:
            X_unlbld = X_cand
            unlbld_mapping = mapping
        elif mapping is not None:
            unlbld_mapping = unlabeled_indices(
                y[mapping], missing_label=self.missing_label_
            )
            X_unlbld = X_cand[unlbld_mapping]
            unlbld_mapping = mapping[unlbld_mapping]
        else:
            X_unlbld = X_cand
            unlbld_mapping = np.arange(len(X_cand))

        # Predict class-membership probabilities and, if provided by the
        # classifier, the learned sample embeddings.
        if self.clf_embedding_flag_name is not None:
            probas, X_unlbld = clf.predict_proba(
                X_unlbld, **{self.clf_embedding_flag_name: True}
            )
        else:
            probas = clf.predict_proba(X_unlbld)
            if isinstance(probas, tuple):
                probas, X_unlbld = probas
        y_pred = probas.argmax(axis=-1)
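
        # Compute the gradient embeddings: using the predicted class
        # `y_pred` as a pseudo-label, the gradient of the cross-entropy
        # loss w.r.t. the weights of a (hypothetical) last linear layer is
        # the outer product of `(probas - onehot(y_pred))` with the sample
        # embedding, flattened into one vector per candidate.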
        proba_factor = probas - np.eye(probas.shape[1])[y_pred]
        g_x = proba_factor[:, :, None] * X_unlbld[:, None, :]
        g_x = g_x.reshape(*g_x.shape[:-2], -1)

        # Initialize the utilities.
        if mapping is not None:
            utilities = np.full(
                shape=(batch_size, X.shape[0]), fill_value=np.nan
            )
        else:
            utilities = np.full(
                shape=(batch_size, X_cand.shape[0]), fill_value=np.nan
            )

        # Sampling with kmeans++.
        query_indices = []
        query_indices_in_unlbld = []
        idx_in_unlbld = []
        d_2_s = []
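        # In each iteration, the squared distance of every candidate's
        # gradient embedding to its nearest already selected embedding is
        # computed; the next sample is then drawn with probability
        # proportional to this squared distance (kmeans++ seeding).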
        for i in range(batch_size):
            if i == 0:
                d_2 = _d_2(g_x, idx_in_unlbld)
            else:
                d_2 = _d_2(g_x, [idx_in_unlbld], d_2_s[i - 1])
            d_2_s.append(d_2)
            d_2_sum = np.sum(d_2)
            if d_2_sum == 0:
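                # All candidates coincide with already selected centers:
                # fall back to uniform sampling over the candidates that
                # have not been selected yet.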
                d_2_s[-1] = np.full(shape=len(g_x), fill_value=np.inf)
                d_2 = np.ones(shape=len(g_x))
                d_2[query_indices_in_unlbld] = 0
                d_2_sum = np.sum(d_2)
            d_probas = d_2 / d_2_sum
            utilities[i, unlbld_mapping] = d_probas
            utilities[i, query_indices] = np.nan
            if i == 0 and d_2_sum != 0:
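                # The first sample of a batch is selected greedily as the
                # candidate with the largest gradient-embedding norm; all
                # subsequent samples are drawn randomly, proportionally to
                # their squared distances.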
                idx_in_unlbld = np.argmax(d_2, axis=-1)
            else:
                idx_in_unlbld_array = self.random_state_.choice(
                    len(d_probas), 1, replace=False, p=d_probas
                )
                idx_in_unlbld = idx_in_unlbld_array[0]
            query_indices_in_unlbld.append(idx_in_unlbld)
            idx = unlbld_mapping[idx_in_unlbld]
            query_indices.append(idx)

        if return_utilities:
            return np.array(query_indices), utilities
        else:
            return np.array(query_indices)


def _d_2(g_x, query_indices, d_latest=None):
    """
    Calculates the D^2 value of the gradient embeddings of the unlabeled
    data.

    Parameters
    ----------
    g_x : np.ndarray of shape (n_unlabeled_samples, n_features)
        The gradient embeddings of the unlabeled samples.
    query_indices : numpy.ndarray of shape (n_query_indices,)
        The indices of the already selected samples within `g_x`.
    d_latest : np.ndarray of shape (n_unlabeled_samples,), default=None
        The squared distance between each data point and its nearest
        center, used to simplify the computation of the distances for the
        next selected sample.

    Returns
    -------
    D2 : numpy.ndarray of shape (n_unlabeled_samples,)
        The updated squared distance of each sample to its nearest
        selected center. If `query_indices` is empty, the squared norms of
        the gradient embeddings are returned.
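
    Examples
    --------
    A small sketch of the incremental update (toy values, not from the
    paper):

    >>> g_x = np.array([[0.0, 0.0], [3.0, 4.0]])
    >>> d_2 = _d_2(g_x, [])                 # squared norms: [0., 25.]
    >>> d_2 = _d_2(g_x, [1], d_latest=d_2)  # center g_x[1] -> [0., 0.]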
"""
    if len(query_indices) == 0:
        return np.sum(g_x**2, axis=-1)
    g_query = g_x[query_indices]
    _, D = pairwise_distances_argmin_min(X=g_x, Y=g_query)
    D2 = np.square(D)
    if d_latest is not None:
        D2 = np.minimum(d_latest, D2)
    return D2