# Source code for skactiveml.pool._prob_cover

"""
Module implementing `ProbCover`, which is a deep active learning strategy
suited for low budgets.
"""

import numpy as np
import warnings

from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans
from sklearn.utils.validation import column_or_1d

from ..base import SingleAnnotatorPoolQueryStrategy
from ..utils import (
    MISSING_LABEL,
    rand_argmax,
    check_scalar,
)


class ProbCover(SingleAnnotatorPoolQueryStrategy):
    """Probability Coverage

    This class implements the Probability Coverage (ProbCover) query
    strategy [1]_, which aims at maximizing the probability coverage in a
    meaningful sample embedding space.

    Parameters
    ----------
    n_classes : None or int, default=None
        This parameter is used to determine the delta value. If
        `n_classes=None`, the number of classes is extracted from the given
        labels. If this extracted number of classes is below 2, `n_classes=2`
        is used as a fallback.
    deltas : None or array-like of shape (n_deltas,), default=None
        List of deltas (ball radii) to be tested for finding the maximum
        value satisfying a sample coverage >= `alpha`. If no value in
        `deltas` satisfies this constraint, a warning is raised where the
        minimum `delta` value is used. If `deltas=None`, the values
        `np.arange(0.1, 2.1, 0.1)` are used.
    alpha : float in (0, 1), default=0.95
        Minimum coverage as a constraint for the `delta` selection.
    cluster_algo : ClusterMixin.__class__, default=sklearn.cluster.KMeans
        The cluster algorithm to be used for determining the best delta
        value.
    cluster_algo_dict : dict, default=None
        The parameters passed to the clustering algorithm `cluster_algo`,
        excluding the parameter for the number of clusters.
    n_cluster_param_name : string, default="n_clusters"
        The name of the parameter for the number of clusters.
    distance_func : callable, default=sklearn.metrics.pairwise_distances
        Takes as input `X` to compute the distances between each pair of
        samples. This function can also only return the precomputed
        distances of each pair in `X` for speedup.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : None or int or np.random.RandomState, default=None
        The random state to use.

    References
    ----------
    .. [1] Yehuda, Ofer, Avihu Dekel, Guy Hacohen, and Daphna Weinshall.
       "Active Learning Through a Covering Lens." NeurIPS, 2022.
    """

    def __init__(
        self,
        n_classes=None,
        deltas=None,
        alpha=0.95,
        cluster_algo=KMeans,
        cluster_algo_dict=None,
        n_cluster_param_name="n_clusters",
        distance_func=pairwise_distances,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )
        # Parameters are stored unvalidated (scikit-learn convention);
        # validation happens in `query`.
        self.deltas = deltas
        self.alpha = alpha
        self.n_classes = n_classes
        self.cluster_algo = cluster_algo
        self.cluster_algo_dict = cluster_algo_dict
        self.n_cluster_param_name = n_cluster_param_name
        self.distance_func = distance_func

    def query(
        self,
        X,
        y,
        candidates=None,
        batch_size=1,
        return_utilities=False,
        update=False,
    ):
        """Query the next samples to be labeled

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e. including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by self.missing_label).
        candidates : None or array-like of shape (n_candidates) with \
                dtype=int, default=None
            If `candidates` is None, the unlabeled samples from `(X, y)`
            are considered as candidates. If `candidates` is of shape
            `(n_candidates,)` and of type int, `candidates` is considered
            as a list of the indices of the samples in `(X, y)`.
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If True, also return the utilities based on the query strategy.
        update : bool, default=False
            This boolean flag determines whether the computed `delta_max_`
            and the `distances_` shall be updated in the `query`. For the
            first call of `query`, this parameter has no impact because
            both quantities are computed for the first time.

        Returns
        ----------
        query_indices : numpy.ndarray of shape (batch_size,)
            The `query_indices` indicate for which candidate sample a label
            is to be queried, e.g., `query_indices[0]` indicates the first
            selected sample. If `candidates` in `None` or of shape
            `(n_candidates,)`, the indexing refers to samples in `X`.
        utilities : numpy.ndarray of shape (batch_size, n_samples)
            The utilities of samples for selecting each sample of the
            batch. Here, utilities mean the out-degree of the candidate
            samples. If `candidates` is `None` or of shape
            `(n_candidates,)`, the indexing refers to the samples in `X`.
        """
        # Check parameters.
        X, y, candidates, batch_size, return_utilities = self._validate_data(
            X, y, candidates, batch_size, return_utilities, reset=True
        )
        _, mapping = self._transform_candidates(
            candidates, X, y, enforce_mapping=True
        )
        is_candidate = np.full(len(X), fill_value=False)
        is_candidate[mapping] = True
        n_classes = self.n_classes
        if n_classes is None:
            # Extract the number of classes from the labeled samples,
            # falling back to 2 as documented on the class.
            n_classes = max(len(np.unique(y[~is_candidate])), 2)
        check_scalar(
            n_classes,
            "n_classes",
            min_val=2,
            min_inclusive=True,
            target_type=int,
        )
        if self.deltas is None:
            # Default grid of ball radii as documented on the class.
            deltas = np.arange(0.1, 2.1, 0.1)
        else:
            deltas = column_or_1d(self.deltas, dtype=float)
        # Ascending order is required: the search below keeps enlarging
        # `delta` until the purity constraint is violated.
        deltas = np.sort(deltas)
        if (deltas < 0).any():
            raise ValueError("`deltas` must contain non-negative floats.")
        check_scalar(
            self.alpha,
            "alpha",
            min_val=0,
            max_val=1,
            min_inclusive=False,
            max_inclusive=False,
            target_type=float,
        )
        if not (
            isinstance(self.cluster_algo_dict, dict)
            or self.cluster_algo_dict is None
        ):
            raise TypeError(
                "Pass a dictionary with corresponding parameter names and "
                "values according to the `init` function of `cluster_algo`."
            )
        # Copy to avoid mutating the user-provided dict when the number of
        # clusters is injected below.
        cluster_algo_dict = (
            {}
            if self.cluster_algo_dict is None
            else self.cluster_algo_dict.copy()
        )
        check_scalar(update, name="update", target_type=bool)

        if update or not hasattr(self, "delta_max_"):
            # Compute distances between each pair of observed samples.
            self.distances_ = self.distance_func(X)

            # Compute the maximum `delta` value satisfying a purity
            # >= `alpha`.
            self.delta_max_ = deltas[0]
            max_purity = -1
            if len(deltas) > 1:
                # Cluster labels serve as pseudo-labels to measure purity.
                cluster_algo_dict[self.n_cluster_param_name] = n_classes
                cluster_obj = self.cluster_algo(**cluster_algo_dict)
                y_cluster = cluster_obj.fit_predict(X)
                is_impure = y_cluster[:, None] != y_cluster
                for delta in deltas:
                    edges = self.distances_ <= delta
                    # A sample is impure if any neighbor within `delta`
                    # belongs to a different cluster.
                    purity = 1 - (edges * is_impure).any(axis=1).mean()
                    max_purity = max(max_purity, purity)
                    if purity < self.alpha:
                        break
                    self.delta_max_ = delta

                # Check whether condition defined by `alpha` was satisfied.
                if max_purity < self.alpha:
                    warnings.warn(
                        f"The maximum purity was {max_purity} being smaller "
                        f"than the required value `alpha={self.alpha}`. You "
                        f"must provide smaller values in `deltas` to avoid "
                        f"this warning."
                    )

        # Compute edges of the graph with the samples as vertices.
        edges = self.distances_ <= self.delta_max_

        # Perform sample-wise selection of the batch.
        query_indices = np.full(batch_size, fill_value=-1, dtype=int)
        utilities = np.full((batch_size, len(X)), fill_value=np.nan)
        for b in range(batch_size):
            # Step (ii) in [1]: Remove incoming edges for covered samples.
            is_covered = edges[~is_candidate].any(axis=0)
            edges[:, is_covered] = False
            # Step (i) in [1]: Query the sample with the highest out-degree.
            utilities[b][is_candidate] = edges[is_candidate].sum(axis=1)
            idx = rand_argmax(utilities[b], random_state=self.random_state_)[0]
            is_candidate[idx] = False
            query_indices[b] = idx

        if return_utilities:
            return query_indices, utilities
        else:
            return query_indices