"""
Module implementing the pool-based query strategy Batch Active Learning by
Diverse Gradient Embedding (BADGE).
"""
import numpy as np
from sklearn import clone
from sklearn.metrics import pairwise_distances_argmin_min
from ..base import SingleAnnotatorPoolQueryStrategy, SkactivemlClassifier
from ..utils import (
MISSING_LABEL,
check_type,
check_equal_missing_label,
unlabeled_indices,
check_scalar,
)
class Badge(SingleAnnotatorPoolQueryStrategy):
"""Batch Active Learning by Diverse Gradient Embedding (BADGE)
This class implements the BADGE algorithm [1]_, which is designed to
incorporate both predictive uncertainty and sample diversity into every
selected batch.
Parameters
----------
    clf_embedding_flag_name : str or None, default=None
        Name of the flag that is passed to the `predict_proba` method to
        obtain the (learned) sample representations.

        - If `clf_embedding_flag_name=None` and `predict_proba` returns
          only one output, the input samples `X` are used.
        - If `predict_proba` returns two outputs or `clf_embedding_flag_name`
          is not `None`, `(proba, embeddings)` are expected as outputs.
missing_label : scalar or string or np.nan or None, default=np.nan
Value to represent a missing label.
random_state : None or int or np.random.RandomState, default=None
The random state to use.

    References
----------
.. [1] J. T. Ash, C. Zhang, A. Krishnamurthy, J. Langford, and A. Agarwal.
Deep Batch Active Learning by Diverse, Uncertain Gradient Lower Bounds.
In Int. Conf. Learn. Represent., 2020.
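
    Examples
    --------
    A minimal usage sketch on hypothetical random data (any
    `SkactivemlClassifier` implementing `predict_proba` works; here a
    scikit-learn model is wrapped via `SklearnClassifier`):

    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> from skactiveml.classifier import SklearnClassifier
    >>> from skactiveml.pool import Badge
    >>> rng = np.random.RandomState(0)
    >>> X = rng.rand(100, 2)
    >>> y = np.full(100, np.nan)
    >>> y[:10] = rng.randint(0, 2, 10)
    >>> clf = SklearnClassifier(LogisticRegression(), classes=[0, 1])
    >>> qs = Badge(random_state=0)
    >>> query_indices = qs.query(X, y, clf, batch_size=5)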
"""
def __init__(
self,
clf_embedding_flag_name=None,
missing_label=MISSING_LABEL,
random_state=None,
):
self.clf_embedding_flag_name = clf_embedding_flag_name
super().__init__(
missing_label=missing_label, random_state=random_state
)

    def query(
self,
X,
y,
clf,
fit_clf=True,
sample_weight=None,
candidates=None,
batch_size=1,
return_utilities=False,
):
"""Determines for which candidate samples labels are to be queried.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training data set, usually complete, i.e., including the labeled
and unlabeled samples.
y : array-like of shape (n_samples,)
Labels of the training data set (possibly including unlabeled ones
indicated by `self.missing_label`).
clf : skactiveml.base.SkactivemlClassifier
Classifier implementing the methods `fit` and `predict_proba`.
fit_clf : bool, default=True
Defines whether the classifier `clf` should be fitted on `X`, `y`,
and `sample_weight`.
        sample_weight : array-like of shape (n_samples,), default=None
Weights of training samples in `X`.
candidates : None or array-like of shape (n_candidates,), dtype=int or\
array-like of shape (n_candidates, n_features), default=None
- If `candidates` is `None`, the unlabeled samples from
`(X,y)` are considered as `candidates`.
- If `candidates` is of shape `(n_candidates,)` and of type
`int`, `candidates` is considered as the indices of the
samples in `(X,y)`.
- If `candidates` is of shape `(n_candidates, *)`, the
candidate samples are directly given in `candidates` (not
necessarily contained in `X`).
batch_size : int, default=1
The number of samples to be selected in one AL cycle.
return_utilities : bool, default=False
If `True`, also return the utilities based on the query strategy.

        Returns
-------
query_indices : numpy.ndarray of shape (batch_size,)
The query indices indicate for which candidate sample a label is
to be queried, e.g., `query_indices[0]` indicates the first
selected sample.
- If `candidates` is `None` or of shape
`(n_candidates,)`, the indexing refers to the samples in
`X`.
- If `candidates` is of shape `(n_candidates, n_features)`,
the indexing refers to the samples in `candidates`.
utilities : numpy.ndarray of shape (batch_size, n_samples) or \
numpy.ndarray of shape (batch_size, n_candidates)
The utilities of samples after each selected sample of the batch,
e.g., `utilities[0]` indicates the utilities used for selecting
the first sample (with index `query_indices[0]`) of the batch.
            Utilities for labeled samples will be set to `np.nan`.
- If `candidates` is `None` or of shape
`(n_candidates,)`, the indexing refers to the samples in
`X`.
- If `candidates` is of shape `(n_candidates, n_features)`,
the indexing refers to the samples in `candidates`.
"""
# Validate input parameters
X, y, candidates, batch_size, return_utilities = self._validate_data(
X, y, candidates, batch_size, return_utilities, reset=True
)
X_cand, mapping = self._transform_candidates(candidates, X, y)
# Validate classifier type
check_type(clf, "clf", SkactivemlClassifier)
check_equal_missing_label(clf.missing_label, self.missing_label_)
check_scalar(fit_clf, "fit_clf", bool)
if self.clf_embedding_flag_name is not None:
check_scalar(
self.clf_embedding_flag_name, "clf_embedding_flag_name", str
)
# Fit the classifier
if fit_clf:
if sample_weight is None:
clf = clone(clf).fit(X, y)
else:
clf = clone(clf).fit(X, y, sample_weight)
        # Find the unlabeled samples among the candidates
if candidates is None:
X_unlbld = X_cand
unlbld_mapping = mapping
elif mapping is not None:
unlbld_mapping = unlabeled_indices(
                y[mapping], missing_label=self.missing_label_
)
X_unlbld = X_cand[unlbld_mapping]
unlbld_mapping = mapping[unlbld_mapping]
else:
X_unlbld = X_cand
unlbld_mapping = np.arange(len(X_cand))
        # Predict class-membership probabilities (and, if available, the
        # learned sample embeddings) to build the gradient embedding
if self.clf_embedding_flag_name is not None:
probas, X_unlbld = clf.predict_proba(
X_unlbld, **{self.clf_embedding_flag_name: True}
)
else:
probas = clf.predict_proba(X_unlbld)
if isinstance(probas, tuple):
probas, X_unlbld = probas
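        # BADGE gradient embedding: for each sample, the gradient of the
        # cross-entropy loss w.r.t. a last linear layer, evaluated at the
        # predicted pseudo-label `y_pred`, equals `(proba - onehot(y_pred))`
        # outer-multiplied with the sample's embedding; it is flattened
        # into a single vector per sample below.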
y_pred = probas.argmax(axis=-1)
proba_factor = probas - np.eye(probas.shape[1])[y_pred]
g_x = proba_factor[:, :, None] * X_unlbld[:, None, :]
g_x = g_x.reshape(*g_x.shape[:-2], -1)
        # Initialize the utilities
if mapping is not None:
utilities = np.full(
shape=(batch_size, X.shape[0]), fill_value=np.nan
)
else:
utilities = np.full(
shape=(batch_size, X_cand.shape[0]), fill_value=np.nan
)
        # Sample the batch via k-means++ seeding on the gradient embeddings
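        # In each iteration, a candidate is drawn with probability
        # proportional to its squared distance (D^2) to the closest already
        # selected gradient embedding; the first candidate is picked as the
        # one with the largest gradient norm.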
        query_indices = []
        query_indices_in_unlbld = []
        idx_in_unlbld = []
        d_2_s = []
for i in range(batch_size):
if i == 0:
d_2 = _d_2(g_x, idx_in_unlbld)
else:
d_2 = _d_2(g_x, [idx_in_unlbld], d_2_s[i - 1])
d_2_s.append(d_2)
d_2_sum = np.sum(d_2)
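            # If all D^2 values are zero (e.g., only duplicate embeddings
            # remain), fall back to a uniform distribution over the not yet
            # selected samples; storing `inf` distances lets the next
            # iteration's `np.minimum` recover the freshly computed D^2.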
if d_2_sum == 0:
d_2_s[-1] = np.full(shape=len(g_x), fill_value=np.inf)
d_2 = np.ones(shape=len(g_x))
                d_2[query_indices_in_unlbld] = 0
d_2_sum = np.sum(d_2)
d_probas = d_2 / d_2_sum
utilities[i, unlbld_mapping] = d_probas
            utilities[i, query_indices] = np.nan
if i == 0 and d_2_sum != 0:
idx_in_unlbld = np.argmax(d_2, axis=-1)
else:
idx_in_unlbld_array = self.random_state_.choice(
len(d_probas), 1, replace=False, p=d_probas
)
idx_in_unlbld = idx_in_unlbld_array[0]
            query_indices_in_unlbld.append(idx_in_unlbld)
idx = unlbld_mapping[idx_in_unlbld]
            query_indices.append(idx)
        query_indices = np.array(query_indices)
        if return_utilities:
            return query_indices, utilities
        else:
            return query_indices


def _d_2(g_x, query_indices, d_latest=None):
"""
Calculates the D^2 value of the embedding features of unlabeled data.
Parameters
----------
g_x : np.ndarray of shape (n_unlabeled_samples, n_features)
The results after gradient embedding
query_indices : numpy.ndarray of shape (n_query_indices,)
the query indications that correspond to the unlabeled samples.
d_latest : np.ndarray of shape (n_unlabeled_samples,) default=None
The distance between each data point and its nearest centre.
This is used to simplify the calculation of the later distances for the
next selected sample.
Returns
-------
D2 : numpy.ndarray of shape (n_unlabeled_samples,)
The D^2 value, for the first sample, is the value inf.
"""
    if len(query_indices) == 0:
        # No centre has been selected yet: D^2 reduces to the squared
        # gradient norms.
        return np.sum(g_x**2, axis=-1)
    centres = g_x[query_indices]
    _, D = pairwise_distances_argmin_min(X=g_x, Y=centres)
    if d_latest is not None:
        # Only the newest centre can decrease a sample's D^2 value.
        D2 = np.minimum(d_latest, np.square(D))
    else:
        D2 = np.square(D)
    return D2