Source code for skactiveml.pool._uncertainty_sampling

"""
Module implementing various uncertainty based query strategies.
"""

# Authors: Pascal Mergard <Pascal.Mergard@student.uni-kassel.de>
#          Marek Herde <marek.herde@uni-kassel.de>

import numpy as np
from sklearn import clone
from sklearn.utils.validation import check_array

from ..base import SingleAnnotatorPoolQueryStrategy, SkactivemlClassifier
from ..utils import (
    MISSING_LABEL,
    check_cost_matrix,
    simple_batch,
    check_classes,
    check_type,
    check_equal_missing_label,
)


class UncertaintySampling(SingleAnnotatorPoolQueryStrategy):
    """Uncertainty Sampling (US)

    This class implements various uncertainty based query strategies, i.e.,
    the standard uncertainty measures [1]_, cost-sensitive ones [2]_, and one
    optimizing expected average precision [3]_.

    Parameters
    ----------
    method : 'least_confident' or 'margin_sampling' or 'entropy' or \
            'expected_average_precision', default='least_confident'
        The method to calculate the uncertainty.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` defining the cost of predicting
        class `j` for a sample with the actual class `i`. Only supported for
        the `least_confident` and `margin_sampling` variants.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or np.random.RandomState, default=None
        The random state to use.

    References
    ----------
    .. [1] Settles, Burr. Active Learning Literature Survey. University of
       Wisconsin-Madison, Department of Computer Sciences, 2009.
    .. [2] P.-L. Chen and H.-T. Lin. Active Learning for Multiclass
       Cost-Sensitive Classification Using Probabilistic Models. In Conf.
       Technol. Appl. Artif. Intell., pages 13–18, 2013.
    .. [3] H. Wang, X. Chang, L. Shi, Y. Yang, and Y.-D. Shen. Uncertainty
       Sampling for Action Recognition via Maximizing Expected Average
       Precision. In Int. Jt. Conf. Artif. Intell., pages 964–970, 2018.
    """

    def __init__(
        self,
        method="least_confident",
        cost_matrix=None,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )
        self.method = method
        self.cost_matrix = cost_matrix

    def query(
        self,
        X,
        y,
        clf,
        fit_clf=True,
        sample_weight=None,
        utility_weight=None,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        fit_clf : bool, default=True
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        utility_weight : array-like, default=None
            Weight for each candidate (multiplied with the utilities).
            Usually, this is the density of a candidate. The length of
            `utility_weight` is usually `n_samples`, except when `candidates`
            contains samples directly (ndim >= 2). Then the length is
            `n_candidates`.
        candidates : None or array-like of shape (n_candidates,), dtype=int or \
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from `(X, y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the samples
              in `(X, y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the candidate
              samples are directly given in `candidates` (not necessarily
              contained in `X`).
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If true, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size,)
            The query indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples)
            The utilities of samples after each selected sample of the batch,
            e.g., `utilities[0]` indicates the utilities used for selecting
            the first sample (with index `query_indices[0]`) of the batch.
            Utilities for labeled samples will be set to np.nan.

            - If `candidates` is `None`, the indexing refers to the samples
              in `X`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `utilities` refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, *)`, `utilities`
              refers to the indexing in `candidates`.
        """
        # Validate input parameters.
        X, y, candidates, batch_size, return_utilities = self._validate_data(
            X, y, candidates, batch_size, return_utilities, reset=True
        )
        X_cand, mapping = self._transform_candidates(candidates, X, y)

        # Validate classifier type.
        check_type(clf, "clf", SkactivemlClassifier)
        check_equal_missing_label(clf.missing_label, self.missing_label_)

        # Validate `fit_clf` type.
        check_type(fit_clf, "fit_clf", bool)

        # Check `utility_weight`.
        if utility_weight is None:
            if mapping is None:
                utility_weight = np.ones(len(X_cand))
            else:
                utility_weight = np.ones(len(X))
        utility_weight = check_array(utility_weight, ensure_2d=False)
        if mapping is None and not len(X_cand) == len(utility_weight):
            raise ValueError(
                f"'utility_weight' must have length 'n_candidates' but "
                f"{len(X_cand)} != {len(utility_weight)}."
            )
        if mapping is not None and not len(X) == len(utility_weight):
            raise ValueError(
                f"'utility_weight' must have length 'n_samples' but "
                f"{len(utility_weight)} != {len(X)}."
            )

        # Validate method.
        if not isinstance(self.method, str):
            raise TypeError(
                "{} is an invalid type for method. Type {} is "
                "expected.".format(type(self.method), str)
            )

        # `sample_weight` is checked by `clf` when fitted.

        # Fit the classifier.
        if fit_clf:
            if sample_weight is not None:
                clf = clone(clf).fit(X, y, sample_weight)
            else:
                clf = clone(clf).fit(X, y)

        # Predict class-membership probabilities.
        probas = clf.predict_proba(X_cand)

        # Choose the method and calculate the corresponding utilities.
        with np.errstate(divide="ignore"):
            if self.method in [
                "least_confident",
                "margin_sampling",
                "entropy",
            ]:
                utilities_cand = uncertainty_scores(
                    probas=probas,
                    method=self.method,
                    cost_matrix=self.cost_matrix,
                )
            elif self.method == "expected_average_precision":
                classes = clf.classes_
                utilities_cand = expected_average_precision(classes, probas)
            else:
                raise ValueError(
                    "The given method {} is not valid. Supported methods are "
                    "'entropy', 'least_confident', 'margin_sampling' and "
                    "'expected_average_precision'.".format(self.method)
                )

        if mapping is None:
            utilities = utilities_cand
        else:
            utilities = np.full(len(X), np.nan)
            utilities[mapping] = utilities_cand

        utilities *= utility_weight

        return simple_batch(
            utilities,
            self.random_state_,
            batch_size=batch_size,
            return_utilities=return_utilities,
        )
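

# --- Illustrative usage sketch (not part of the library) --------------------
# The helper below sketches how `UncertaintySampling.query` is typically
# combined with a probabilistic classifier on a toy data set. It assumes
# `ParzenWindowClassifier` from `skactiveml.classifier` is available; the
# function name `_example_uncertainty_sampling` and the toy data are made up
# for this sketch.
def _example_uncertainty_sampling():
    from sklearn.datasets import make_classification
    from skactiveml.classifier import ParzenWindowClassifier

    # Toy data set: all labels are initially missing except the first five.
    X, y_true = make_classification(n_samples=100, random_state=0)
    y = np.full(shape=y_true.shape, fill_value=MISSING_LABEL)
    y[:5] = y_true[:5]

    # Classifier and query strategy must agree on `missing_label`.
    clf = ParzenWindowClassifier(classes=[0, 1], missing_label=MISSING_LABEL)
    qs = UncertaintySampling(method="entropy", random_state=0)

    # Select the most uncertain unlabeled sample; `utilities` contains the
    # (weighted) uncertainty of every sample in `X` (np.nan for labeled ones).
    query_idx, utilities = qs.query(
        X=X, y=y, clf=clf, batch_size=1, return_utilities=True
    )
    return query_idx, utilities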


def uncertainty_scores(probas, cost_matrix=None, method="least_confident"):
    """Computes uncertainty scores.

    Three methods are available: least confident ('least_confident'), margin
    sampling ('margin_sampling'), and entropy based uncertainty ('entropy')
    [1]_. For the least confident and margin sampling methods, cost-sensitive
    variants are implemented in case of a given cost matrix (see [2]_ for
    more information).

    Parameters
    ----------
    probas : array-like of shape (n_samples, n_classes)
        Class membership probabilities for each sample.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` defining the cost of predicting
        class `j` for a sample with the actual class `i`. Only supported for
        'least_confident' and 'margin_sampling'.
    method : 'least_confident' or 'margin_sampling' or 'entropy', \
            default='least_confident'
        The method to calculate the uncertainty.

    Returns
    -------
    scores : np.ndarray of shape (n_samples,)
        The uncertainty score of each sample.

    References
    ----------
    .. [1] Settles, Burr. Active Learning Literature Survey. University of
       Wisconsin-Madison, Department of Computer Sciences, 2009.
    .. [2] P.-L. Chen and H.-T. Lin. Active Learning for Multiclass
       Cost-Sensitive Classification Using Probabilistic Models. In Conf.
       Technol. Appl. Artif. Intell., pages 13–18, 2013.
    """
    # Check probabilities.
    probas = check_array(probas)
    if not np.allclose(np.sum(probas, axis=1), 1, rtol=0, atol=1.0e-3):
        raise ValueError(
            "'probas' are invalid. The sum over axis 1 must be one."
        )
    n_classes = probas.shape[1]

    # Check cost matrix.
    if cost_matrix is not None:
        cost_matrix = check_cost_matrix(cost_matrix, n_classes=n_classes)

    # Compute uncertainties.
    if method == "least_confident":
        if cost_matrix is None:
            return 1 - np.max(probas, axis=1)
        else:
            costs = probas @ cost_matrix
            costs = np.partition(costs, 1, axis=1)[:, :2]
            return costs[:, 0]
    elif method == "margin_sampling":
        if cost_matrix is None:
            probas = -(np.partition(-probas, 1, axis=1)[:, :2])
            return 1 - np.abs(probas[:, 0] - probas[:, 1])
        else:
            costs = probas @ cost_matrix
            costs = np.partition(costs, 1, axis=1)[:, :2]
            return -np.abs(costs[:, 0] - costs[:, 1])
    elif method == "entropy":
        if cost_matrix is None:
            with np.errstate(divide="ignore", invalid="ignore"):
                return np.nansum(-probas * np.log(probas), axis=1)
        else:
            raise ValueError(
                "Method 'entropy' does not support cost matrices, but "
                "`cost_matrix` was not None."
            )
    else:
        raise ValueError(
            "Supported methods are ['least_confident', 'margin_sampling', "
            "'entropy'], the given one is: {}.".format(method)
        )
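

# --- Illustrative check of `uncertainty_scores` (not part of the library) ---
# A minimal sketch showing the three standard measures on a hand-crafted
# probability matrix; the function name `_example_uncertainty_scores` is made
# up for this sketch.
def _example_uncertainty_scores():
    probas = np.array(
        [
            [0.1, 0.9],  # confident prediction -> low uncertainty
            [0.5, 0.5],  # maximally uncertain prediction
            [0.7, 0.3],
        ]
    )
    lc = uncertainty_scores(probas, method="least_confident")
    ms = uncertainty_scores(probas, method="margin_sampling")
    en = uncertainty_scores(probas, method="entropy")
    # All three measures rank the second sample as the most uncertain one.
    return lc, ms, en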


def expected_average_precision(classes, probas):
    """Calculate the expected average precision [1]_.

    Parameters
    ----------
    classes : array-like of shape (n_classes,)
        Holds the label for each class.
    probas : array-like of shape (n_samples, n_classes)
        Class membership probabilities for each sample.

    Returns
    -------
    score : np.ndarray of shape (n_samples,)
        The expected average precision score of all samples.

    References
    ----------
    .. [1] H. Wang, X. Chang, L. Shi, Y. Yang, and Y.-D. Shen. Uncertainty
       Sampling for Action Recognition via Maximizing Expected Average
       Precision. In Int. Jt. Conf. Artif. Intell., pages 964–970, 2018.
    """
    # Check if `probas` is valid.
    probas = check_array(
        probas,
        accept_sparse=False,
        accept_large_sparse=True,
        dtype="numeric",
        order=None,
        copy=False,
        ensure_all_finite=True,
        ensure_2d=True,
        allow_nd=False,
        ensure_min_samples=1,
        ensure_min_features=1,
        estimator=None,
    )
    if not np.allclose(np.sum(probas, axis=1), 1, rtol=0, atol=1.0e-3):
        raise ValueError(
            "'probas' are invalid. The sum over axis 1 must be one."
        )

    # Check if `classes` are valid.
    check_classes(classes)
    if len(classes) < 2:
        raise ValueError("`classes` must contain at least 2 entries.")
    if len(classes) != probas.shape[1]:
        raise ValueError(
            "`classes` must have the same length as `probas` has columns."
        )

    score = np.zeros(len(probas))
    for i in range(len(classes)):
        for j in range(len(probas)):
            # The i-th column of `probas` without the entry `probas[j, i]`.
            p = probas[:, i]
            p = np.delete(p, [j])
            # Sort `p` in descending order.
            p = np.flipud(np.sort(p, axis=0))
            # Calculate `g_arr`.
            g_arr = np.zeros((len(p), len(p)))
            for n in range(len(p)):
                for h in range(n + 1):
                    g_arr[n, h] = _g(n, h, p, g_arr)
            # Calculate `f_arr`.
            f_arr = np.zeros((len(p) + 1, len(p) + 1))
            for a in range(len(p) + 1):
                for b in range(a + 1):
                    f_arr[a, b] = _f(a, b, p, f_arr, g_arr)
            # Calculate the score.
            for t in range(len(p)):
                score[j] += f_arr[len(p), t + 1] / (t + 1)
    return score
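

# --- Illustrative check of `expected_average_precision` (not part of the ----
# --- library) ----------------------------------------------------------------
# A minimal sketch on a tiny probability matrix. In `UncertaintySampling.query`
# these scores are used directly as utilities of the candidates. The function
# name `_example_expected_average_precision` is made up for this sketch.
def _example_expected_average_precision():
    classes = [0, 1]
    probas = np.array(
        [
            [0.9, 0.1],
            [0.5, 0.5],
            [0.2, 0.8],
        ]
    )
    # Returns one expected average precision score per row of `probas`.
    return expected_average_precision(classes, probas)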


# g-function for `expected_average_precision`.
def _g(n, t, p, g_arr):
    if t > n or (t == 0 and n > 0):
        return 0
    if t == 0 and n == 0:
        return 1
    return p[n - 1] * g_arr[n - 1, t - 1] + (1 - p[n - 1]) * g_arr[n - 1, t]


# f-function for `expected_average_precision`.
def _f(n, t, p, f_arr, g_arr):
    if t > n or (t == 0 and n > 0):
        return 0
    if t == 0 and n == 0:
        return 1
    return (
        p[n - 1] * f_arr[n - 1, t - 1]
        + p[n - 1] * t * g_arr[n - 1, t - 1] / n
        + (1 - p[n - 1]) * f_arr[n - 1, t]
    )
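

# --- Illustrative sketch of the `_g`/`_f` recursions (not part of the -------
# --- library) ----------------------------------------------------------------
# Both helpers read previously filled table entries, so they must be evaluated
# bottom-up, exactly as in `expected_average_precision`: `g_arr` is completed
# before `_f` consumes it. The function name `_example_recursion_tables` and
# the probability vector are made up for this sketch.
def _example_recursion_tables():
    p = np.array([0.9, 0.6, 0.3])  # probabilities in descending order
    g_arr = np.zeros((len(p), len(p)))
    for n in range(len(p)):
        for t in range(n + 1):
            g_arr[n, t] = _g(n, t, p, g_arr)
    f_arr = np.zeros((len(p) + 1, len(p) + 1))
    for n in range(len(p) + 1):
        for t in range(n + 1):
            f_arr[n, t] = _f(n, t, p, f_arr, g_arr)
    return g_arr, f_arr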