Source code for skactiveml.pool._uncertainty_sampling

"""
Module implementing various uncertainty based query strategies.
"""

# Authors: Pascal Mergard <Pascal.Mergard@student.uni-kassel.de>
#          Marek Herde <marek.herde@uni-kassel.de>

import numpy as np
from sklearn import clone
from sklearn.utils.validation import check_array

from ..base import SingleAnnotatorPoolQueryStrategy, SkactivemlClassifier
from ..utils import (
    MISSING_LABEL,
    check_cost_matrix,
    simple_batch,
    check_classes,
    check_type,
    check_equal_missing_label,
)


class UncertaintySampling(SingleAnnotatorPoolQueryStrategy):
    """Uncertainty Sampling (US)

    This class implements various uncertainty based query strategies, i.e.,
    the standard uncertainty measures [1]_, cost-sensitive ones [2]_, and one
    optimizing expected average precision [3]_.

    Parameters
    ----------
    method : 'least_confident' or 'margin_sampling' or 'entropy' or \
            'expected_average_precision', default='least_confident'
        The method to calculate the uncertainty.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` defining the cost of predicting
        class `j` for a sample with the actual class `i`. Only supported for
        the `least_confident` and `margin_sampling` variants.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or np.random.RandomState, default=None
        The random state to use.

    References
    ----------
    .. [1] Settles, Burr. Active Learning Literature Survey. University of
       Wisconsin-Madison, Department of Computer Sciences, 2009.
    .. [2] P.-L. Chen and H.-T. Lin. Active Learning for Multiclass
       Cost-Sensitive Classification Using Probabilistic Models. In Conf.
       Technol. Appl. Artif. Intell., pages 13–18, 2013.
    .. [3] H. Wang, X. Chang, L. Shi, Y. Yang, and Y.-D. Shen. Uncertainty
       Sampling for Action Recognition via Maximizing Expected Average
       Precision. In Int. Jt. Conf. Artif. Intell., pages 964–970, 2018.
    """

    def __init__(
        self,
        method="least_confident",
        cost_matrix=None,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )
        self.method = method
        self.cost_matrix = cost_matrix

    def query(
        self,
        X,
        y,
        clf,
        fit_clf=True,
        sample_weight=None,
        utility_weight=None,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        fit_clf : bool, default=True
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        utility_weight : array-like, default=None
            Weight for each candidate (multiplied with the utilities).
            Usually, this is the density of a candidate. The length of
            `utility_weight` is usually `n_samples`, except when `candidates`
            contains samples directly (ndim >= 2). Then the length is
            `n_candidates`.
        candidates : None or array-like of shape (n_candidates,), dtype=int or \
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from `(X, y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the samples
              in `(X, y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the candidate
              samples are directly given in `candidates` (not necessarily
              contained in `X`).
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If true, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size,)
            The query indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples)
            The utilities of samples after each selected sample of the batch,
            e.g., `utilities[0]` indicates the utilities used for selecting
            the first sample (with index `query_indices[0]`) of the batch.
            Utilities for labeled samples will be set to np.nan.

            - If `candidates` is `None`, the indexing refers to the samples
              in `X`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `utilities` refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, *)`, `utilities`
              refers to the indexing in `candidates`.
        """
        # Validate input parameters.
        X, y, candidates, batch_size, return_utilities = self._validate_data(
            X, y, candidates, batch_size, return_utilities, reset=True
        )
        X_cand, mapping = self._transform_candidates(candidates, X, y)

        # Validate classifier type.
        check_type(clf, "clf", SkactivemlClassifier)
        check_equal_missing_label(clf.missing_label, self.missing_label_)

        # Validate `fit_clf` type.
        check_type(fit_clf, "fit_clf", bool)

        # Check `utility_weight`.
        if utility_weight is None:
            if mapping is None:
                utility_weight = np.ones(len(X_cand))
            else:
                utility_weight = np.ones(len(X))
        utility_weight = check_array(utility_weight, ensure_2d=False)
        if mapping is None and not len(X_cand) == len(utility_weight):
            raise ValueError(
                f"'utility_weight' must have length 'n_candidates' but "
                f"{len(X_cand)} != {len(utility_weight)}."
            )
        if mapping is not None and not len(X) == len(utility_weight):
            raise ValueError(
                f"'utility_weight' must have length 'n_samples' but "
                f"{len(utility_weight)} != {len(X)}."
            )

        # Validate method.
        if not isinstance(self.method, str):
            raise TypeError(
                "{} is an invalid type for method. Type {} is "
                "expected.".format(type(self.method), str)
            )

        # `sample_weight` is checked by `clf` when fitted.

        # Fit the classifier.
        if fit_clf:
            if sample_weight is not None:
                clf = clone(clf).fit(X, y, sample_weight)
            else:
                clf = clone(clf).fit(X, y)

        # Predict class-membership probabilities.
        probas = clf.predict_proba(X_cand)

        # Choose the method and calculate the corresponding utilities.
        with np.errstate(divide="ignore"):
            if self.method in [
                "least_confident",
                "margin_sampling",
                "entropy",
            ]:
                utilities_cand = uncertainty_scores(
                    probas=probas,
                    method=self.method,
                    cost_matrix=self.cost_matrix,
                )
            elif self.method == "expected_average_precision":
                classes = clf.classes_
                utilities_cand = expected_average_precision(classes, probas)
            else:
                raise ValueError(
                    "The given method {} is not valid. Supported methods are "
                    "'entropy', 'least_confident', 'margin_sampling' and "
                    "'expected_average_precision'.".format(self.method)
                )

        if mapping is None:
            utilities = utilities_cand
        else:
            utilities = np.full(len(X), np.nan)
            utilities[mapping] = utilities_cand

        utilities *= utility_weight

        return simple_batch(
            utilities,
            self.random_state_,
            batch_size=batch_size,
            return_utilities=return_utilities,
        )
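

# --- Illustrative usage sketch (not part of the library) --------------------
# The helper below sketches how `UncertaintySampling.query` is typically
# combined with a probabilistic classifier on a toy data set. It assumes
# `ParzenWindowClassifier` from `skactiveml.classifier` is available; the
# function name `_example_uncertainty_sampling` and the toy data are made up
# for this sketch.
def _example_uncertainty_sampling():
    from sklearn.datasets import make_classification
    from skactiveml.classifier import ParzenWindowClassifier

    # Toy data set: all labels are initially missing except the first five.
    X, y_true = make_classification(n_samples=100, random_state=0)
    y = np.full(shape=y_true.shape, fill_value=MISSING_LABEL)
    y[:5] = y_true[:5]

    # Classifier and query strategy must agree on `missing_label`.
    clf = ParzenWindowClassifier(classes=[0, 1], missing_label=MISSING_LABEL)
    qs = UncertaintySampling(method="entropy", random_state=0)

    # Select the most uncertain unlabeled sample; `utilities` contains the
    # (weighted) uncertainty of every sample in `X` (np.nan for labeled ones).
    query_idx, utilities = qs.query(
        X=X, y=y, clf=clf, batch_size=1, return_utilities=True
    )
    return query_idx, utilities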


def uncertainty_scores(probas, cost_matrix=None, method="least_confident"):
    """Computes uncertainty scores.

    Three methods are available: least confident ('least_confident'), margin
    sampling ('margin_sampling'), and entropy based uncertainty ('entropy')
    [1]_. For the least confident and margin sampling methods, cost-sensitive
    variants are implemented in case of a given cost matrix (see [2]_ for
    more information).

    Parameters
    ----------
    probas : array-like of shape (n_samples, n_classes)
        Class membership probabilities for each sample.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` defining the cost of predicting
        class `j` for a sample with the actual class `i`. Only supported for
        'least_confident' and 'margin_sampling'.
    method : 'least_confident' or 'margin_sampling' or 'entropy', \
            default='least_confident'
        The method to calculate the uncertainty.

    Returns
    -------
    scores : np.ndarray of shape (n_samples,)
        The uncertainty score of each sample.

    References
    ----------
    .. [1] Settles, Burr. Active Learning Literature Survey. University of
       Wisconsin-Madison, Department of Computer Sciences, 2009.
    .. [2] P.-L. Chen and H.-T. Lin. Active Learning for Multiclass
       Cost-Sensitive Classification Using Probabilistic Models. In Conf.
       Technol. Appl. Artif. Intell., pages 13–18, 2013.
    """
    # Check probabilities.
    probas = check_array(probas)
    if not np.allclose(np.sum(probas, axis=1), 1, rtol=0, atol=1.0e-3):
        raise ValueError(
            "'probas' are invalid. The sum over axis 1 must be one."
        )
    n_classes = probas.shape[1]

    # Check cost matrix.
    if cost_matrix is not None:
        cost_matrix = check_cost_matrix(cost_matrix, n_classes=n_classes)

    # Compute uncertainties.
    if method == "least_confident":
        if cost_matrix is None:
            return 1 - np.max(probas, axis=1)
        else:
            costs = probas @ cost_matrix
            costs = np.partition(costs, 1, axis=1)[:, :2]
            return costs[:, 0]
    elif method == "margin_sampling":
        if cost_matrix is None:
            probas = -(np.partition(-probas, 1, axis=1)[:, :2])
            return 1 - np.abs(probas[:, 0] - probas[:, 1])
        else:
            costs = probas @ cost_matrix
            costs = np.partition(costs, 1, axis=1)[:, :2]
            return -np.abs(costs[:, 0] - costs[:, 1])
    elif method == "entropy":
        if cost_matrix is None:
            with np.errstate(divide="ignore", invalid="ignore"):
                return np.nansum(-probas * np.log(probas), axis=1)
        else:
            raise ValueError(
                "Method 'entropy' does not support cost matrices, but "
                "`cost_matrix` was not None."
            )
    else:
        raise ValueError(
            "Supported methods are ['least_confident', 'margin_sampling', "
            "'entropy'], the given one is: {}.".format(method)
        )
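

# --- Illustrative check of `uncertainty_scores` (not part of the library) ---
# A minimal sketch showing the three standard measures on a hand-crafted
# probability matrix; the function name `_example_uncertainty_scores` is made
# up for this sketch.
def _example_uncertainty_scores():
    probas = np.array(
        [
            [0.1, 0.9],  # confident prediction -> low uncertainty
            [0.5, 0.5],  # maximally uncertain prediction
            [0.7, 0.3],
        ]
    )
    lc = uncertainty_scores(probas, method="least_confident")
    ms = uncertainty_scores(probas, method="margin_sampling")
    en = uncertainty_scores(probas, method="entropy")
    # All three measures rank the second sample as the most uncertain one.
    return lc, ms, en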


def expected_average_precision(classes, probas):
    """Calculate the expected average precision [1]_.

    Parameters
    ----------
    classes : array-like of shape (n_classes,)
        Holds the label for each class.
    probas : array-like of shape (n_samples, n_classes)
        Class membership probabilities for each sample.

    Returns
    -------
    score : np.ndarray of shape (n_samples,)
        The expected average precision score of all samples.

    References
    ----------
    .. [1] H. Wang, X. Chang, L. Shi, Y. Yang, and Y.-D. Shen. Uncertainty
       Sampling for Action Recognition via Maximizing Expected Average
       Precision. In Int. Jt. Conf. Artif. Intell., pages 964–970, 2018.
    """
    # Check if `probas` is valid.
    probas = check_array(
        probas,
        accept_sparse=False,
        accept_large_sparse=True,
        dtype="numeric",
        order=None,
        copy=False,
        ensure_all_finite=True,
        ensure_2d=True,
        allow_nd=False,
        ensure_min_samples=1,
        ensure_min_features=1,
        estimator=None,
    )
    if not np.allclose(np.sum(probas, axis=1), 1, rtol=0, atol=1.0e-3):
        raise ValueError(
            "'probas' are invalid. The sum over axis 1 must be one."
        )

    # Check if `classes` are valid.
    check_classes(classes)
    if len(classes) < 2:
        raise ValueError("`classes` must contain at least 2 entries.")
    if len(classes) != probas.shape[1]:
        raise ValueError(
            "`classes` must have the same length as `probas` has columns."
        )

    score = np.zeros(len(probas))
    for i in range(len(classes)):
        for j in range(len(probas)):
            # The i-th column of `probas` without the entry `probas[j, i]`.
            p = probas[:, i]
            p = np.delete(p, [j])
            # Sort `p` in descending order.
            p = np.flipud(np.sort(p, axis=0))
            # Calculate `g_arr`.
            g_arr = np.zeros((len(p), len(p)))
            for n in range(len(p)):
                for h in range(n + 1):
                    g_arr[n, h] = _g(n, h, p, g_arr)
            # Calculate `f_arr`.
            f_arr = np.zeros((len(p) + 1, len(p) + 1))
            for a in range(len(p) + 1):
                for b in range(a + 1):
                    f_arr[a, b] = _f(a, b, p, f_arr, g_arr)
            # Calculate the score.
            for t in range(len(p)):
                score[j] += f_arr[len(p), t + 1] / (t + 1)
    return score
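

# --- Illustrative check of `expected_average_precision` (not part of the ----
# --- library) ----------------------------------------------------------------
# A minimal sketch on a tiny probability matrix. In `UncertaintySampling.query`
# these scores are used directly as utilities of the candidates. The function
# name `_example_expected_average_precision` is made up for this sketch.
def _example_expected_average_precision():
    classes = [0, 1]
    probas = np.array(
        [
            [0.9, 0.1],
            [0.5, 0.5],
            [0.2, 0.8],
        ]
    )
    # Returns one expected average precision score per row of `probas`.
    return expected_average_precision(classes, probas)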


# g-function for `expected_average_precision`.
def _g(n, t, p, g_arr):
    if t > n or (t == 0 and n > 0):
        return 0
    if t == 0 and n == 0:
        return 1
    return p[n - 1] * g_arr[n - 1, t - 1] + (1 - p[n - 1]) * g_arr[n - 1, t]


# f-function for `expected_average_precision`.
def _f(n, t, p, f_arr, g_arr):
    if t > n or (t == 0 and n > 0):
        return 0
    if t == 0 and n == 0:
        return 1
    return (
        p[n - 1] * f_arr[n - 1, t - 1]
        + p[n - 1] * t * g_arr[n - 1, t - 1] / n
        + (1 - p[n - 1]) * f_arr[n - 1, t]
    )
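

# --- Illustrative sketch of the `_g`/`_f` recursions (not part of the -------
# --- library) ----------------------------------------------------------------
# Both helpers read previously filled table entries, so they must be evaluated
# bottom-up, exactly as in `expected_average_precision`: `g_arr` is completed
# before `_f` consumes it. The function name `_example_recursion_tables` and
# the probability vector are made up for this sketch.
def _example_recursion_tables():
    p = np.array([0.9, 0.6, 0.3])  # probabilities in descending order
    g_arr = np.zeros((len(p), len(p)))
    for n in range(len(p)):
        for t in range(n + 1):
            g_arr[n, t] = _g(n, t, p, g_arr)
    f_arr = np.zeros((len(p) + 1, len(p) + 1))
    for n in range(len(p) + 1):
        for t in range(n + 1):
            f_arr[n, t] = _f(n, t, p, f_arr, g_arr)
    return g_arr, f_arr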