Source code for skactiveml.pool._falcun

"""
Module implementing `Falcun`, which is a deep active learning strategy jointly
selecting uncertain and diverse samples.
"""

import numpy as np

from sklearn.base import clone

from ..base import SingleAnnotatorPoolQueryStrategy, SkactivemlClassifier
from ..utils import (
    MISSING_LABEL,
    check_scalar,
    check_type,
    check_equal_missing_label,
)
from ._uncertainty_sampling import uncertainty_scores


class Falcun(SingleAnnotatorPoolQueryStrategy):
    """Fast Active Learning by Contrastive UNcertainty (FALCUN)

    This class implements the "Fast Active Learning by Contrastive
    UNcertainty" (FALCUN) query strategy [1]_, which is a hybrid pool-based
    strategy that jointly selects uncertain samples via margin sampling while
    considering batch diversity within the class probability space.

    Parameters
    ----------
    gamma : float >= 0, default=10
        Controls the randomness in the selection. A value of 0 corresponds to
        random sampling, while a value going to infinity corresponds to
        selecting the sample with the highest utility (relevance).
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : None or int or np.random.RandomState, default=None
        The random state to use.

    References
    ----------
    .. [1] S. Gilhuber, A. Beer, Y. Ma, and T. Seidl. FALCUN: A Simple and
       Efficient Deep Active Learning Strategy. In Joint Eur. Conf. Mach.
       Learn. Knowl. Discov. Databases, pages 421–439, 2024.
    """

    def __init__(
        self,
        gamma=10,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )
        self.gamma = gamma

    def query(
        self,
        X,
        y,
        clf,
        fit_clf=True,
        sample_weight=None,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Query the next samples to be labeled.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`.)
        clf : skactiveml.base.SkactivemlClassifier
            Classifier implementing the methods `fit` and `predict_proba`.
        fit_clf : bool, default=True
            Defines whether the classifier `clf` should be fitted on `X`, `y`,
            and `sample_weight`. If `True`, a clone of `clf` is fitted, so the
            passed object itself is left untouched.
        sample_weight: array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        candidates : None or array-like of shape (n_candidates), dtype=int or \
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from
              `(X,y)` are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`).
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If true, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size)
            The query indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.

            - If `candidates` is `None` or of shape
              `(n_candidates,)`, the indexing refers to the samples in
              `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`,
              the indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples)
            The utilities of samples after each selected sample of the batch,
            e.g., `utilities[0]` indicates the utilities used for selecting
            the first sample (with index `query_indices[0]`) of the batch.
            Utilities for labeled samples will be set to np.nan. The utilities
            are the sampling probabilities of the respective batch step;
            samples already selected within the batch are set to np.nan.
            Only returned if `return_utilities` is true.

            - If `candidates` is `None`, the indexing refers to the samples
              in `X`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `utilities` refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, *)`, `utilities`
              refers to the indexing in `candidates`.
        """
        # Check parameters.
        X, y, candidates, batch_size, return_utilities = self._validate_data(
            X, y, candidates, batch_size, return_utilities, reset=True
        )
        X_cand, mapping = self._transform_candidates(candidates, X, y)
        check_scalar(
            self.gamma,
            "gamma",
            min_val=0,
            target_type=(float, int),
            min_inclusive=True,
        )
        check_type(clf, "clf", SkactivemlClassifier)
        check_equal_missing_label(clf.missing_label, self.missing_label_)
        check_scalar(fit_clf, "fit_clf", bool)

        # Fit classifier, if requested.
        if fit_clf:
            if sample_weight is None:
                clf = clone(clf).fit(X, y)
            else:
                clf = clone(clf).fit(X, y, sample_weight)

        # Compute uncertainties via margin sampling (cf. Eq. (1) in [1]).
        probas_cand = clf.predict_proba(X_cand)
        unc_cand = uncertainty_scores(probas_cand, method="margin_sampling")

        # Initialize distances in probability space (cf. Eq. (3) in [1]).
        dist_cand = unc_cand.copy()

        n_cand = len(X_cand)
        query_indices = []
        utilities_cand = np.full((batch_size, n_cand), np.nan)
        cand_indices = np.arange(n_cand)
        for b in range(batch_size):
            if b > 0:
                # Update distances (diversity) values in the class probability
                # space (cf. Eqs. (2) and (4) in [1]): L1 distance to the most
                # recently selected sample, combined with the previous
                # distances via the element-wise minimum and then rescaled
                # to [0, 1].
                probas_q = probas_cand[[query_indices[-1]]]
                dist_new = np.abs(probas_cand - probas_q).sum(axis=1)
                dist_cand = np.minimum(dist_new, dist_cand)
                dist_min = dist_cand.min()
                dist_range = dist_cand.max() - dist_min
                dist_cand -= dist_min
                if dist_range > 0:
                    dist_cand /= dist_range

            # Compute relevance scores for candidates (cf. Eq. (5) and
            # (6) in [1]). The scores are rescaled by their maximum over the
            # not yet selected candidates before exponentiation. This leaves
            # the normalized probabilities mathematically unchanged but
            # avoids overflow to `inf` (and thus NaN probabilities) as well
            # as underflow to an all-zero vector for large `gamma`.
            scores = unc_cand + dist_cand
            scores[query_indices] = 0
            score_max = scores.max(initial=0.0)
            if score_max > 0:
                scores /= score_max
            rel_cand = scores**self.gamma
            # Already selected samples must never be drawn again (this also
            # covers `gamma=0`, where `0 ** 0 == 1`).
            rel_cand[query_indices] = 0
            rel_cand_sum = np.sum(rel_cand)
            if rel_cand_sum == 0:
                # No candidate has a positive relevance: fall back to uniform
                # sampling among the not yet selected candidates.
                rel_cand = np.ones_like(rel_cand)
                rel_cand[query_indices] = 0
            rel_cand = rel_cand / np.sum(rel_cand)

            # Sample instance to be labeled (cf. Eq. (6) in [1]).
            query_idx = self.random_state_.choice(
                cand_indices, p=rel_cand, size=1
            )
            rel_cand[query_indices] = np.nan
            utilities_cand[b] = rel_cand
            query_indices.append(query_idx[0])

        # Always return an array, also if the candidates were passed directly
        # as samples (i.e., `mapping is None`).
        query_indices = np.asarray(query_indices, dtype=int)
        if mapping is not None:
            query_indices = mapping[query_indices]
            utilities = np.full((batch_size, len(X)), np.nan)
            utilities[:, mapping] = utilities_cand
        else:
            utilities = utilities_cand

        if return_utilities:
            return query_indices, utilities
        else:
            return query_indices