Source code for skactiveml.base

"""
The :mod:`skactiveml.base` package implements the base classes for
:mod:`skactiveml`.
"""

import warnings
from abc import ABC, abstractmethod
from copy import deepcopy

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.metrics import accuracy_score
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import (
    check_array,
    check_consistent_length,
    column_or_1d,
)

from .exceptions import MappingError
from .utils import (
    MISSING_LABEL,
    is_labeled,
    is_unlabeled,
    unlabeled_indices,
    ExtLabelEncoder,
    rand_argmin,
    check_classifier_params,
    check_random_state,
    check_cost_matrix,
    check_scalar,
    check_class_prior,
    check_missing_label,
    check_indices,
    check_n_features,
)

# `__all__` lists the names exported by this module; it is also necessary to
# create the sphinx docs.
__all__ = [
    "QueryStrategy",
    "SingleAnnotatorPoolQueryStrategy",
    "MultiAnnotatorPoolQueryStrategy",
    "BudgetManager",
    "SingleAnnotatorStreamQueryStrategy",
    "SkactivemlClassifier",
    "ClassFrequencyEstimator",
    "AnnotatorModelMixin",
    "SkactivemlRegressor",
    "ProbabilisticRegressor",
]


class QueryStrategy(ABC, BaseEstimator):
    """Abstract root of all query strategies in scikit-activeml.

    Parameters
    ----------
    random_state : int or RandomState instance, optional (default=None)
        Controls the randomness of the estimator.
    """

    def __init__(self, random_state=None):
        # The parameter is stored unchanged; nothing is validated here.
        self.random_state = random_state

    @abstractmethod
    def query(self, *args, **kwargs):
        """Determine the query for active learning based on the input
        arguments.

        This method is abstract and has to be overridden by subclasses.
        """
        raise NotImplementedError
class PoolQueryStrategy(QueryStrategy):
    """Common base of all pool-based active learning query strategies in
    scikit-activeml.

    Parameters
    ----------
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or RandomState instance or None, default=None
        Controls the randomness of the estimator.
    """

    def __init__(self, missing_label=MISSING_LABEL, random_state=None):
        super().__init__(random_state=random_state)
        self.missing_label = missing_label

    def _validate_data(
        self,
        X,
        y,
        candidates,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate the input data and all attributes, and set or check the
        `n_features_in_` attribute.

        As a side effect, the attributes `missing_label_` and `random_state_`
        are set on the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples, *)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`).
        candidates : None or array-like of shape (n_candidates), dtype=int or\
                array-like of shape (n_candidates, n_features)
            - If `candidates` is `None`, the unlabeled samples from `(X,y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
        batch_size : int
            The number of samples to be selected in one AL cycle.
        return_utilities : bool
            If true, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        check_X_dict : dict or None, default=None
            Keyword arguments passed to :func:`sklearn.utils.check_array` when
            checking `X`. If `None`, `{"allow_nd": True}` is used. A copy of
            these arguments with `ensure_2d=False` is used to check
            `candidates` that are given as samples.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, *)
            Checked labels of the training data set.
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or\
                np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        batch_size : int
            Checked number of samples to be selected in one AL cycle.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        # Samples: fall back to the default `check_array` options if none
        # were supplied.
        array_params = (
            {"allow_nd": True} if check_X_dict is None else check_X_dict
        )
        X = check_array(X, **array_params)

        # Number of features.
        check_n_features(self, X, reset=reset)

        # Labels: any dtype is accepted and NaN entries are allowed.
        y = check_array(
            y, ensure_2d=False, ensure_all_finite="allow-nan", dtype=None
        )
        check_consistent_length(X, y)

        # Missing label.
        check_missing_label(self.missing_label, target_type=y.dtype)
        self.missing_label_ = self.missing_label

        # Multiplier handed to `check_random_state` below; the +1 avoids a
        # zero multiplier when no label is missing.
        n_unlabeled = int(np.sum(is_unlabeled(y, self.missing_label_)))
        seed_mult = n_unlabeled + 1

        # Candidates: a 1-d array is treated as indices into `y`, anything
        # else as an array of samples.
        if candidates is not None:
            candidates = np.array(candidates)
            if candidates.ndim != 1:
                candidate_params = deepcopy(array_params)
                candidate_params["ensure_2d"] = False
                candidates = check_array(candidates, **candidate_params)
                check_n_features(self, candidates, reset=False)
            else:
                candidates = check_indices(candidates, y, dim=0)

        # Flag for returning utilities.
        check_scalar(return_utilities, "return_utilities", bool)

        # Batch size.
        check_scalar(batch_size, target_type=int, name="batch_size", min_val=1)

        # Random state.
        self.random_state_ = check_random_state(self.random_state, seed_mult)

        return X, y, candidates, batch_size, return_utilities
class SingleAnnotatorPoolQueryStrategy(PoolQueryStrategy):
    """Common base of all pool-based active learning query strategies with a
    single annotator in scikit-activeml.
    """

    @abstractmethod
    def query(
        self,
        X,
        y,
        *args,
        candidates=None,
        batch_size=1,
        return_utilities=False,
        **kwargs,
    ):
        """Determine for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`).
        candidates : None or array-like of shape (n_candidates), dtype=int or\
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from `(X,y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If true, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size,)
            The query indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or\
                numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of samples after each selected sample of the batch,
            e.g., `utilities[0]` indicates the utilities used for selecting
            the first sample (with index `query_indices[0]`) of the batch.
            Utilities for labeled samples will be set to np.nan.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        """
        raise NotImplementedError

    def _validate_data(
        self,
        X,
        y,
        candidates,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate the input data and all attributes, and set or check the
        `n_features_in_` attribute.

        On top of the checks of the parent class, `y` is converted to a 1-d
        array and `batch_size` is reduced (with a warning) to the number of
        available candidates if it exceeds that number.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`).
        candidates : None or array-like of shape (n_candidates), dtype=int or\
                array-like of shape (n_candidates, n_features)
            - If `candidates` is `None`, the unlabeled samples from `(X,y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
        batch_size : int
            The number of samples to be selected in one AL cycle.
        return_utilities : bool
            If true, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        check_X_dict : dict or None, default=None
            Keyword arguments forwarded to the parent class, which passes
            them to :func:`sklearn.utils.check_array`.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples,)
            Checked labels of the training data set.
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or\
                np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        batch_size : int
            Checked number of samples to be selected in one AL cycle.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        checked = super()._validate_data(
            X,
            y,
            candidates,
            batch_size,
            return_utilities,
            reset=reset,
            check_X_dict=check_X_dict,
        )
        X, y, candidates, batch_size, return_utilities = checked
        y = column_or_1d(y, warn=True)

        # Number of samples that can actually be selected.
        if candidates is not None:
            n_candidates = len(candidates)
        else:
            unlabeled_mask = is_unlabeled(y, missing_label=self.missing_label_)
            n_candidates = int(np.sum(unlabeled_mask))

        # A batch cannot contain more samples than there are candidates.
        if batch_size > n_candidates:
            warnings.warn(
                f"'batch_size={batch_size}' is larger than number of "
                f"candidates. Instead, 'batch_size={n_candidates}' was set."
            )
            batch_size = n_candidates

        return X, y, candidates, batch_size, return_utilities

    def _transform_candidates(
        self,
        candidates,
        X,
        y,
        enforce_mapping=False,
        allow_only_unlabeled=False,
    ):
        """Transform the `candidates` parameter into a sample array and the
        corresponding index array `mapping` such that
        `candidates = X[mapping]`.

        Parameters
        ----------
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or\
                np.ndarray of shape (n_candidates, n_features)
            - If `candidates` is `None`, the unlabeled samples from `(X,y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`).
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples,)
            Checked labels of the training data set.
        enforce_mapping : bool, default=False
            If True, an exception is raised when no exact mapping can be
            determined (i.e., `mapping` is None).
        allow_only_unlabeled : bool, default=False
            If True, an exception is raised when indices of candidates contain
            labeled samples.

        Returns
        -------
        candidates : np.ndarray of shape (n_candidates, n_features)
            Candidate samples from which the strategy can query the label.
        mapping : np.ndarray of shape (n_candidates) or None
            Index array that maps `candidates` to `X`
            (`candidates = X[mapping]`).

        Raises
        ------
        ValueError
            If `allow_only_unlabeled` is True and the candidate indices
            contain labeled samples.
        MappingError
            If `enforce_mapping` is True and `candidates` is given as an
            array of samples.
        """
        # No candidates given: all unlabeled samples are candidates.
        if candidates is None:
            mapping = unlabeled_indices(y, self.missing_label_)
            return X[mapping], mapping

        # Candidates given as indices into `X`.
        if candidates.ndim == 1:
            if (
                allow_only_unlabeled
                and is_labeled(y[candidates], self.missing_label_).any()
            ):
                raise ValueError(
                    "Candidates must not contain labeled samples."
                )
            return X[candidates], candidates

        # Candidates given as samples: there is no mapping to `X`.
        if enforce_mapping:
            raise MappingError(
                "Mapping `candidates` to `X` is not possible but "
                "`enforce_mapping` is True. Use index array for "
                "`candidates` instead."
            )
        return candidates, None
class MultiAnnotatorPoolQueryStrategy(PoolQueryStrategy):
    """Base class for all pool-based active learning query strategies with
    multiple annotators in scikit-activeml.

    Parameters
    ----------
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or RandomState instance, default=None
        Controls the randomness of the estimator.
    """

    @abstractmethod
    def query(
        self,
        X,
        y,
        *args,
        candidates=None,
        annotators=None,
        batch_size=1,
        return_utilities=False,
        **kwargs,
    ):
        """Determines which candidate sample is to be annotated by which
        annotator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples, n_annotators)
            Labels of the training data set for each annotator (possibly
            including unlabeled ones indicated by `self.missing_label`),
            meaning that `y[i, j]` contains the label annotated by annotator
            `j` for sample `i`.
        candidates : None or array-like of shape (n_candidates), dtype=int or\
                array-like of shape (n_candidates, n_features), default=None
            See parameter `annotators`.
        annotators : None or array-like of shape (n_avl_annotators),\
                dtype=int or array-like of shape (n_candidates,\
                n_annotators), default=None
            - If candidate samples and annotators are not specified, i.e.,
              `candidates=None`, `annotators=None`, the unlabeled target
              values, `y`, are the candidate annotator-sample-pairs.
            - If candidate samples and available annotators are specified:
              The annotator-sample-pairs, for which the sample is a candidate
              sample and the annotator is an available annotator, are
              considered as candidate annotator-sample-pairs.
            - If `candidates` is None, all samples of `X` are considered as
              candidate samples. In this case `n_candidates` equals `len(X)`.
            - If `candidates` is of shape `(n_candidates,)` and of type int,
              `candidates` is considered as the indices of the sample
              candidates in `(X, y)`.
            - If `candidates` is of shape (n_candidates, n_features), the
              sample candidates are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
            - If `annotators` is `None`, all annotators are considered as
              available annotators.
            - If `annotators` is of shape (n_avl_annotators), and of type int,
              `annotators` is considered as the indices of the available
              annotators.
            - If `annotators` is a boolean array of shape
              `(n_candidates, n_annotators)`, the annotator-sample-pairs, for
              which the sample is a candidate sample and the boolean matrix
              has entry `True`, are considered as candidate annotator-sample
              pairs.
        batch_size : int or str, default=1
            The number of annotator-sample pairs to be selected in one AL
            cycle. If `adaptive=True`, `batch_size='adaptive'` is allowed.
        return_utilities : bool, default=False
            If True, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : np.ndarray of shape (batch_size, 2)
            The `query_indices` indicate which candidate sample is to be
            annotated by which annotator, e.g., `query_indices[:, 0]`
            indicates the selected candidate samples and
            `query_indices[:, 1]` indicates the respectively selected
            annotators.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples,\
                n_annotators) or numpy.ndarray of shape (batch_size,\
                n_candidates, n_annotators)
            The utilities of all candidate samples w.r.t. the available
            annotators after each selected sample of the batch, e.g.,
            `utilities[0, :, j]` indicates the utilities used for selecting
            the first sample-annotator-pair (with indices
            `query_indices[0]`).

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to samples in `candidates`.
        """
        raise NotImplementedError

    def _validate_data(
        self,
        X,
        y,
        candidates,
        annotators,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate input data, all attributes and set or check the
        `n_features_in_` attribute.

        On top of the checks of the parent class, `y` must be 2-dimensional,
        `annotators` is validated, and `batch_size` is reduced (with a
        warning) to the number of candidate annotator-sample pairs if it
        exceeds that number.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples, n_annotators)
            Labels of the training data set for each annotator (possibly
            including unlabeled ones indicated by `self.missing_label`),
            meaning that `y[i, j]` contains the label annotated by annotator
            `j` for sample `i`.
        candidates : None or array-like of shape (n_candidates), dtype=int or\
                array-like of shape (n_candidates, n_features)
            See parameter `annotators`.
        annotators : None or array-like of shape (n_avl_annotators),\
                dtype=int or array-like of shape (n_candidates, n_annotators)
            - If candidate samples and annotators are not specified, i.e.,
              `candidates=None`, `annotators=None`, the unlabeled target
              values, `y`, are the candidate annotator-sample-pairs.
            - If candidate samples and available annotators are specified:
              The annotator-sample-pairs, for which the sample is a candidate
              sample and the annotator is an available annotator, are
              considered as candidate annotator-sample-pairs.
            - If `candidates` is None, all samples of `X` are considered as
              candidate samples. In this case `n_candidates` equals `len(X)`.
            - If `candidates` is of shape `(n_candidates,)` and of type int,
              `candidates` is considered as the indices of the sample
              candidates in `(X, y)`.
            - If `candidates` is of shape (n_candidates, n_features), the
              sample candidates are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
            - If `annotators` is `None`, all annotators are considered as
              available annotators.
            - If `annotators` is of shape (n_avl_annotators), and of type int,
              `annotators` is considered as the indices of the available
              annotators.
            - If `annotators` is a boolean array of shape
              `(n_candidates, n_annotators)`, the annotator-sample-pairs, for
              which the sample is a candidate sample and the boolean matrix
              has entry `True`, are considered as candidate annotator-sample
              pairs.
        batch_size : int or string
            The number of annotator-sample pairs to be selected in one AL
            cycle. If `adaptive=True`, `batch_size='adaptive'` is allowed.
        return_utilities : bool
            If true, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        check_X_dict : dict or None, default=None
            Keyword arguments forwarded to the parent class, which passes
            them to :func:`sklearn.utils.check_array`.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, n_annotators)
            Checked labels of the training data set.
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or\
                np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        annotators : None or np.ndarray of shape (n_avl_annotators),\
                dtype=int or np.ndarray of shape (n_candidates, n_annotators)
            Checked annotator indices or checked annotator boolean array.
        batch_size : int
            Checked number of annotator-sample pairs to be selected in one AL
            cycle.
        return_utilities : bool
            Checked boolean value of `return_utilities`.

        Raises
        ------
        ValueError
            If `annotators` is neither None nor a 1d or 2d array-like.
        """
        (
            X,
            y,
            candidates,
            batch_size,
            return_utilities,
        ) = super()._validate_data(
            X, y, candidates, batch_size, return_utilities, reset, check_X_dict
        )

        # The labels must form a (n_samples, n_annotators) matrix. Only the
        # dimensionality is checked here; `dtype=None` keeps this check in
        # line with the label check of the parent class, so that non-numeric
        # labels are not rejected.
        check_array(
            y, ensure_2d=True, ensure_all_finite="allow-nan", dtype=None
        )
        unlabeled_pairs = is_unlabeled(y, missing_label=self.missing_label_)

        if annotators is not None:
            annotators = check_array(
                annotators, ensure_2d=False, allow_nd=True
            )
            if annotators.ndim == 1:
                # Indices of the available annotators (columns of `y`).
                annotators = check_indices(annotators, y, dim=1)
            elif annotators.ndim == 2:
                # Boolean matrix of available annotator-sample pairs: one row
                # per candidate sample and one column per annotator.
                annotators = check_array(annotators, dtype=bool)
                if candidates is None:
                    check_consistent_length(X, annotators)
                else:
                    check_consistent_length(candidates, annotators)
                check_consistent_length(y.T, annotators.T)
            else:
                raise ValueError(
                    "`annotators` must be either None, 1d or 2d array-like."
                )

        # Count the annotator-sample pairs that can actually be selected.
        if annotators is None and candidates is None:
            n_candidate_pairs = int(np.sum(unlabeled_pairs))
        elif annotators is not None and annotators.ndim == 2:
            n_candidate_pairs = int(np.sum(annotators))
        else:
            n_cand_samples = len(X) if candidates is None else len(candidates)
            n_avl_annotators = (
                y.shape[1] if annotators is None else len(annotators)
            )
            n_candidate_pairs = n_cand_samples * n_avl_annotators

        # A batch cannot contain more pairs than there are candidate pairs.
        if n_candidate_pairs < batch_size:
            warnings.warn(
                f"'batch_size={batch_size}' is larger than number of "
                f"candidates pairs. Instead, 'batch_size={n_candidate_pairs}'"
                f" was set."
            )
            batch_size = n_candidate_pairs

        return X, y, candidates, annotators, batch_size, return_utilities

    def _transform_cand_annot(
        self, candidates, annotators, X, y, enforce_mapping=False
    ):
        """Transforms the `candidates` parameter into a sample array and the
        corresponding index array `mapping` such that
        `candidates = X[mapping]`, and transforms `annotators` into a boolean
        array such that `A_cand` represents the available annotator-sample
        pairs for the samples of candidates.

        Parameters
        ----------
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or\
                np.ndarray of shape (n_candidates, n_features)
            See parameter `annotators`.
        annotators : None or np.ndarray of shape (n_avl_annotators),\
                dtype=int or np.ndarray of shape (n_candidates, n_annotators)
            - If candidate samples and annotators are not specified, i.e.,
              `candidates=None`, `annotators=None`, the unlabeled target
              values, `y`, are the candidate annotator-sample-pairs.
            - If candidate samples and available annotators are specified:
              The annotator-sample-pairs, for which the sample is a candidate
              sample and the annotator is an available annotator, are
              considered as candidate annotator-sample-pairs.
            - If `candidates` is None, all samples of `X` are considered as
              candidate samples. In this case `n_candidates` equals `len(X)`.
            - If `candidates` is of shape `(n_candidates,)` and of type int,
              `candidates` is considered as the indices of the sample
              candidates in `(X, y)`.
            - If `candidates` is of shape (n_candidates, n_features), the
              sample candidates are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
            - If `annotators` is `None`, all annotators are considered as
              available annotators.
            - If `annotators` is of shape (n_avl_annotators), and of type int,
              `annotators` is considered as the indices of the available
              annotators.
            - If `annotators` is a boolean array of shape
              `(n_candidates, n_annotators)`, the annotator-sample-pairs, for
              which the sample is a candidate sample and the boolean matrix
              has entry `True`, are considered as candidate annotator-sample
              pairs.
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, n_annotators)
            Checked labels of the training data set.
        enforce_mapping : bool, default=False
            If `True`, an exception is raised when no exact mapping can be
            determined (i.e., `mapping` is `None`).

        Returns
        -------
        candidates : np.ndarray of shape (n_selectable_candidates, n_features)
            Candidate samples from which the strategy can query the label.
        mapping : np.ndarray of shape (n_selectable_candidates) or None
            Index array that maps `candidates` to `X`
            (`candidates = X[mapping]`).
        A_cand : np.ndarray of shape (n_selectable_candidates, n_annotators)
            Available annotator-sample-pairs with respect to `candidates`.

        Raises
        ------
        ValueError
            If `enforce_mapping` is True and `candidates` is given as an
            array of samples.
        """
        n_annotators = y.shape[1]

        def available_pairs(n_rows):
            # Boolean mask of shape (n_rows, n_annotators) derived from
            # `annotators`. The mask is always created with a boolean dtype;
            # deriving it from `y` (e.g., via `np.full_like(y, False)`) would
            # inherit the float or object dtype of the labels.
            if annotators is None:
                return np.full((n_rows, n_annotators), True)
            if annotators.ndim == 1:
                mask = np.full((n_rows, n_annotators), False)
                mask[:, annotators] = True
                return mask
            return annotators

        # Candidates given as samples: no mapping to `X` exists.
        if candidates is not None and candidates.ndim == 2:
            if enforce_mapping:
                # NOTE(review): the single-annotator counterpart in this file
                # raises `MappingError` here; `ValueError` is kept so that
                # existing callers are not affected.
                raise ValueError(
                    "Mapping `candidates` to `X` is not possible but "
                    "`enforce_mapping` is True. Use index array for "
                    "`candidates` instead."
                )
            return candidates, None, available_pairs(len(candidates))

        # From here on a mapping to `X` exists.
        if candidates is None:
            if annotators is None:
                # Only samples with at least one missing annotation are
                # selectable, and only their unlabeled pairs are available.
                unlbd_pairs = is_unlabeled(y, self.missing_label_)
                mapping = np.argwhere(np.any(unlbd_pairs, axis=1)).flatten()
                return X[mapping], mapping, unlbd_pairs[mapping]
            # All samples of `X` are candidate samples.
            mapping = np.arange(len(X), dtype=int)
        else:
            # `candidates` is an index array into `X`.
            mapping = candidates

        return X[mapping], mapping, available_pairs(len(mapping))
class BudgetManager(ABC, BaseEstimator):
    """Common base of all budget managers, which model budgeting constraints
    for stream-based active learning.

    Parameters
    ----------
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled. The
        value is checked as a float with `min_val=0.0` (with
        `min_inclusive=False`) and `max_val=1.0`. If `budget` is `None`, it
        is replaced with the default budget 0.1.
    """

    def __init__(self, budget=None):
        self.budget = budget

    @abstractmethod
    def query_by_utility(self, utilities, *args, **kwargs):
        """Ask the budget manager which `utilities` are sufficient to query
        the corresponding labels.

        Parameters
        ----------
        utilities : array-like of shape (n_samples,)
            The utilities provided by the stream-based active learning
            strategy, which are used to determine whether querying a sample
            is worth it given the budgeting constraint.

        Returns
        -------
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices < n_candidates`.
        """
        raise NotImplementedError

    @abstractmethod
    def update(self, candidates, queried_indices, *args, **kwargs):
        """Update the budget manager.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices < n_candidates`.

        Returns
        -------
        self : BudgetManager
            The budget manager returns itself, after it is updated.
        """
        raise NotImplementedError

    def _validate_budget(self):
        """Check the assigned `budget` and store it as `budget_`, using the
        default value 0.1 if `budget` is set to `None`.
        """
        self.budget_ = 0.1 if self.budget is None else self.budget
        check_scalar(
            self.budget_,
            "budget",
            float,
            min_val=0.0,
            max_val=1.0,
            min_inclusive=False,
        )

    def _validate_data(self, utilities, *args, **kwargs):
        """Validate the input data and the budget.

        Parameters
        ----------
        utilities : np.ndarray of shape (n_samples,)
            The `utilities` provided by the stream-based active learning
            strategy. Only the type is checked; anything other than an
            `np.ndarray` is rejected.

        Returns
        -------
        utilities : np.ndarray of shape (n_samples,)
            Checked `utilities`, returned unchanged.

        Raises
        ------
        TypeError
            If `utilities` is not an `np.ndarray`.
        """
        # Utilities must already be given as an array.
        if not isinstance(utilities, np.ndarray):
            raise TypeError(
                f"{type(utilities)} is not a valid type for utilities"
            )

        # Budget.
        self._validate_budget()
        return utilities
[docs]class SingleAnnotatorStreamQueryStrategy(QueryStrategy): """Base class for all stream-based active learning query strategies. Parameters ---------- budget : float Specifies the ratio of labels which are allowed to be queried, with `0 <= budget <= 1`. random_state : int or RandomState instance or None, default=None Controls the randomness of the estimator. """ def __init__(self, budget, random_state=None): super().__init__(random_state=random_state) self.budget = budget
[docs] @abstractmethod def query(self, candidates, *args, return_utilities=False, **kwargs): """Determines for which candidate samples labels are to be queried. The query startegy determines the most useful samples in candidates, which can be acquired within the budgeting constraint specified by `budget`. Please note that, this method does not change the internal state of the query strategy. To adapt the query strategy to the selected candidates, use `update(...)`. Parameters ---------- candidates : {array-like, sparse matrix} of shape\ (n_candidates, n_features) The samples which may be queried. Sparse matrices are accepted only if they are supported by the base query strategy. return_utilities : bool, default=False If `True`, also return the utilities based on the query strategy. Returns ------- queried_indices : np.ndarray of shape (n_queried_indices,) The indices of samples in candidates whose labels are queried, with `0 <= queried_indices <= n_candidates`. utilities: np.ndarray of shape (n_candidates,), The utilities based on the query strategy. Only provided if `return_utilities` is `True`. """ raise NotImplementedError
[docs] @abstractmethod def update( self, candidates, queried_indices, *args, budget_manager_param_dict=None, **kwargs, ): """Updates the budget manager and the count for seen and queried labels. This function should be used in conjunction with the `query` function. Parameters ---------- candidates : {array-like, sparse matrix} of shape\ (n_candidates, n_features) The samples which may be queried. Sparse matrices are accepted only if they are supported by the base query strategy. queried_indices : np.ndarray of shape (n_queried_indices,) The indices of samples in candidates whose labels are queried, with `0 <= queried_indices <= n_candidates`. budget_manager_param_dict : dict, default=None Optional kwargs for budget_manager. Returns ------- self : SingleAnnotatorStreamQueryStrategy The query strategy returns itself, after it is updated. """ raise NotImplementedError
def _validate_random_state(self):
    """Create or refresh the checked random state `random_state_`.

    On the first call, `random_state_` is initialised with a deep copy of
    `random_state`; on later calls the stored `random_state_` is reused.
    In both cases the value is passed through `check_random_state`.

    See also :func:`~sklearn.utils.check_random_state`
    """
    needs_init = not hasattr(self, "random_state_")
    if needs_init:
        # Deep copy so the object given as `random_state` is not shared
        # with the internally used `random_state_`.
        self.random_state_ = deepcopy(self.random_state)
    self.random_state_ = check_random_state(self.random_state_)


def _validate_budget(self):
    """Store the effective budget in `budget_` and validate it.

    `budget_` is set to `budget`, or to 0.1 if `budget` is `None`. It is
    then checked via `check_scalar` to be a float with `min_val=0.0`
    (excluded, since `min_inclusive=False`) and `max_val=1.0`.
    """
    self.budget_ = 0.1 if self.budget is None else self.budget
    check_scalar(
        self.budget_,
        "budget",
        float,
        min_val=0.0,
        max_val=1.0,
        min_inclusive=False,
    )


def _validate_data(
    self,
    candidates,
    return_utilities,
    *args,
    reset=True,
    **check_candidates_params,
):
    """Validate input data and set or check the `n_features_in_`
    attribute.

    Parameters
    ----------
    candidates : array-like of shape (n_candidates, n_features)
        The samples which may be queried. Sparse matrices are accepted
        only if they are supported by the base query strategy.
    return_utilities : bool
        If `True`, also return the utilities based on the query strategy.
    reset : bool, default=True
        Whether to reset the `n_features_in_` attribute. If False, the
        input will be checked for consistency with data provided when
        reset was last True.
    **check_candidates_params : kwargs
        Parameters passed to :func:`sklearn.utils.check_array`.

    Returns
    -------
    candidates : np.ndarray of shape (n_candidates, n_features)
        Checked candidate samples.
    return_utilities : bool
        Checked boolean value of `return_utilities`.
    """
    # Candidate samples first, then their number of features.
    candidates = check_array(candidates, **check_candidates_params)
    check_n_features(self, candidates, reset=reset)

    # `return_utilities` has to be a boolean flag.
    check_scalar(return_utilities, "return_utilities", bool)

    # Finally, validate the estimator's own state: random state and budget.
    self._validate_random_state()
    self._validate_budget()

    return candidates, return_utilities
class SkactivemlClassifier(ClassifierMixin, BaseEstimator, ABC):
    """Skactiveml Classifier

    Base class for `scikit-activeml` classifiers such that missing labels,
    user-defined classes, and cost-sensitive classification (i.e., cost
    matrix) can be handled.

    Parameters
    ----------
    classes : array-like of shape (n_classes,), default=None
        Holds the label for each class. If `None`, the classes are
        determined during the fit.
    missing_label : scalar, string, np.nan, or None, default=np.nan
        Value to represent a missing label.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` indicating cost of predicting
        class `classes[j]` for a sample of class `classes[i]`. Can be only
        set, if `classes` is not `None`.
    random_state : int or RandomState instance or None, default=None
        Determines random number for `predict` method. Pass an int for
        reproducible results across multiple method calls.

    Attributes
    ----------
    classes_ : array-like of shape (n_classes,)
        Holds the label for each class after fitting.
    cost_matrix_ : array-like of shape (n_classes, n_classes)
        Cost matrix after fitting with `cost_matrix_[i,j]` indicating cost
        of predicting class `classes_[j]` for a sample of class
        `classes_[i]`.
    """

    def __init__(
        self,
        classes=None,
        missing_label=MISSING_LABEL,
        cost_matrix=None,
        random_state=None,
    ):
        self.classes = classes
        self.missing_label = missing_label
        self.cost_matrix = cost_matrix
        self.random_state = random_state

    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """Fit the model using X as training data and y as class labels.

        Parameters
        ----------
        X : matrix-like of shape (n_samples, n_features)
            The sample matrix `X` is the feature matrix representing the
            samples.
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            It contains the class labels of the training samples. The
            number of class labels may be variable for the samples, where
            missing labels are represented by the attribute
            `missing_label`.
        sample_weight : array-like of shape (n_samples,) or \
                (n_samples, n_outputs), default=None
            It contains the weights of the training samples' class labels.
            It must have the same shape as `y`.

        Returns
        -------
        self : skactiveml.base.SkactivemlClassifier
            The `skactiveml.base.SkactivemlClassifier` object fitted on the
            training data.
        """
        raise NotImplementedError

    def predict_proba(self, X):
        """Return probability estimates for the test data X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        Returns
        -------
        P : numpy.ndarray of shape (n_samples, classes)
            The class probabilities of the test samples. Classes are
            ordered according to `self.classes_`.

        Raises
        ------
        NotImplementedError
            Always in this base class; subclasses have to override it.
        """
        raise NotImplementedError

    def predict(self, X):
        """Return class label predictions for the test samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        y : numpy.ndarray of shape (n_samples,)
            Predicted class labels of the test samples `X`.
        """
        P = self.predict_proba(X)
        # Expected cost per class: probabilities weighted by the cost
        # matrix. The class with minimal cost is predicted; `rand_argmin`
        # receives `random_state_` for its selection.
        costs = np.dot(P, self.cost_matrix_)
        y_pred = rand_argmin(costs, random_state=self.random_state_, axis=1)
        # Map the selected class indices back to the original labels.
        y_pred = self._le.inverse_transform(y_pred)
        y_pred = np.asarray(y_pred, dtype=self.classes_.dtype)
        return y_pred

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the given test data and labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.
        y : array-like of shape (n_samples,)
            True labels for `X`.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of `self.predict(X)` regarding `y`.
        """
        # Compare true and predicted labels in the encoder's label space.
        y = self._le.transform(y)
        y_pred = self._le.transform(self.predict(X))
        return accuracy_score(y, y_pred, sample_weight=sample_weight)

    def _validate_data(
        self,
        X,
        y,
        sample_weight=None,
        check_X_dict=None,
        check_y_dict=None,
        y_ensure_1d=True,
        reset=True,
    ):
        """Validate the training data and set the fitted attributes.

        Besides checking `X`, `y`, and `sample_weight`, this method sets
        `random_state_`, the label encoder `_le`, `classes_`, and
        `cost_matrix_`.

        Parameters
        ----------
        X : array-like
            Training samples, checked via `check_array` with
            `check_X_dict`.
        y : array-like
            Labels of the training samples, checked via `check_array` with
            `check_y_dict`. If non-empty, `y` is encoded by the label
            encoder.
        sample_weight : array-like or None, default=None
            Weights of the labels. If given, it must have the same shape as
            `y`.
        check_X_dict : dict or None, default=None
            Keyword arguments passed to `check_array` for `X`. If `None`,
            `ensure_min_samples=0` and `ensure_min_features=0` are used.
            The passed dictionary is never modified.
        check_y_dict : dict or None, default=None
            Keyword arguments passed to `check_array` for `y` and
            `sample_weight`. If `None`, permissive defaults are used
            (no minimum sizes, no 2d or finiteness requirement, no dtype
            conversion).
        y_ensure_1d : bool, default=True
            Whether a non-empty `y` is passed through `column_or_1d`.
        reset : bool, default=True
            Passed to `check_n_features` as `reset`.

        Returns
        -------
        X : np.ndarray
            Checked training samples.
        y : np.ndarray
            Checked (and, if non-empty, encoded) labels.
        sample_weight : np.ndarray or None
            Checked sample weights.

        Raises
        ------
        ValueError
            If no class label is known, i.e., `y` contains no actual class
            labels and `classes` is `None`, or if `y` and `sample_weight`
            differ in shape.
        """
        if check_X_dict is None:
            check_X_dict = {"ensure_min_samples": 0, "ensure_min_features": 0}
        if check_y_dict is None:
            check_y_dict = {
                "ensure_min_samples": 0,
                "ensure_min_features": 0,
                "ensure_2d": False,
                "ensure_all_finite": False,
                "dtype": None,
            }

        # Check common classifier parameters.
        check_classifier_params(
            self.classes, self.missing_label, self.cost_matrix
        )

        # Store and check random state.
        self.random_state_ = check_random_state(self.random_state)

        # Create label encoder.
        self._le = ExtLabelEncoder(
            classes=self.classes, missing_label=self.missing_label
        )

        # Check input parameters.
        y = check_array(y, **check_y_dict)
        error_msg = (
            "No class label is known because 'y' contains no actual "
            "class labels and 'classes' is not defined. Change at "
            "least on of both to overcome this error."
        )
        if len(y) > 0:
            y = column_or_1d(y) if y_ensure_1d else y
            y = self._le.fit_transform(y)
            # NOTE(review): the encoded `y` presumably marks missing labels
            # with -1 -- confirm against `ExtLabelEncoder`.
            is_lbdl = is_labeled(y, missing_label=-1)
            if len(y[is_lbdl]) > 0:
                check_classification_targets(y[is_lbdl])
            if len(self._le.classes_) == 0:
                raise ValueError(error_msg)
        else:
            if self.classes is None:
                raise ValueError(error_msg)
            self._le.fit(self.classes)
            # Without labels, `X` is not required to be 2d. A new dict is
            # built so that a dictionary supplied by the caller is not
            # modified as a side effect.
            check_X_dict = {**check_X_dict, "ensure_2d": False}
        X = check_array(X, **check_X_dict)
        check_consistent_length(X, y)
        check_n_features(self, X, reset=reset)

        # Update detected classes.
        self.classes_ = self._le.classes_

        # Check sample weights.
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, **check_y_dict)
            if not np.array_equal(y.shape, sample_weight.shape):
                raise ValueError(
                    f"`y` has the shape {y.shape} and `sample_weight` has the "
                    f"shape {sample_weight.shape}. Both need to have "
                    f"identical shapes."
                )

        # Update cost matrix: default is the 0-1 cost matrix.
        self.cost_matrix_ = (
            1 - np.eye(len(self.classes_))
            if self.cost_matrix is None
            else self.cost_matrix
        )
        self.cost_matrix_ = check_cost_matrix(
            self.cost_matrix_, len(self.classes_)
        )
        if self.classes is not None:
            # Reorder rows and columns by the sort order of the
            # user-defined `classes`.
            class_indices = np.argsort(self.classes)
            self.cost_matrix_ = self.cost_matrix_[class_indices]
            self.cost_matrix_ = self.cost_matrix_[:, class_indices]

        return X, y, sample_weight
class ClassFrequencyEstimator(SkactivemlClassifier):
    """Class Frequency Estimator

    Extends `scikit-activeml` classifiers to estimators that are able to
    estimate class frequencies for given samples (by calling
    `predict_freq`).

    Parameters
    ----------
    class_prior : float or array-like of shape (n_classes,), default=0
        Prior observations of the class frequency estimates. If
        `class_prior` is an array, the entry `class_prior[i]` indicates the
        non-negative prior number of samples belonging to class
        `classes_[i]`. If `class_prior` is a float, `class_prior` indicates
        the non-negative prior number of samples per class.
    classes : array-like of shape (n_classes,), default=None
        Holds the label for each class. If `None`, the classes are
        determined during the fit.
    missing_label : scalar or str or np.nan or None, default=np.nan
        Value to represent a missing label.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` indicating cost of predicting
        class `classes[j]` for a sample of class `classes[i]`. Can be only
        set, if classes is not `None`.
    random_state : int or np.RandomState or None, default=None
        Determines random number for `predict` method. Pass an int for
        reproducible results across multiple method calls.

    Attributes
    ----------
    classes_ : np.ndarray of shape (n_classes,)
        Holds the label for each class after fitting.
    class_prior_ : np.ndarray of shape (n_classes,)
        Prior observations of the class frequency estimates. The entry
        `class_prior_[i]` indicates the non-negative prior number of
        samples belonging to class `classes_[i]`.
    cost_matrix_ : np.ndarray of shape (n_classes, n_classes)
        Cost matrix with `cost_matrix_[i,j]` indicating cost of predicting
        class `classes_[j]` for a sample of class `classes_[i]`.
    """

    def __init__(
        self,
        class_prior=0,
        classes=None,
        missing_label=MISSING_LABEL,
        cost_matrix=None,
        random_state=None,
    ):
        super().__init__(
            classes=classes,
            missing_label=missing_label,
            cost_matrix=cost_matrix,
            random_state=random_state,
        )
        self.class_prior = class_prior

    @abstractmethod
    def predict_freq(self, X):
        """Return class frequency estimates for the test samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples whose class frequencies are to be estimated.

        Returns
        -------
        F : array-like of shape (n_samples, classes)
            The class frequency estimates of the test samples `X`. Classes
            are ordered according to attribute `classes_`.
        """
        raise NotImplementedError

    def predict_proba(self, X):
        """Return probability estimates for the test data `X`.

        The probabilities are the row-wise normalized sums of the frequency
        estimates from `predict_freq` and `class_prior_`. Rows whose sum is
        zero get a uniform distribution over all classes.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        P : array-like of shape (n_samples, classes)
            The class probabilities of the test samples. Classes are
            ordered according to `self.classes_`.
        """
        # Normalize probabilities of each sample.
        P = self.predict_freq(X) + self.class_prior_
        normalizer = np.sum(P, axis=1)
        P[normalizer > 0] /= normalizer[normalizer > 0, np.newaxis]
        # No observations at all: fall back to the uniform distribution.
        P[normalizer == 0, :] = [1 / len(self.classes_)] * len(self.classes_)
        return P

    def sample_proba(self, X, n_samples=10, random_state=None):
        """Samples probability vectors from Dirichlet distributions whose
        parameters `alphas` are defined as the sum of the frequency
        estimates returned by `predict_freq` and the `class_prior`.

        Parameters
        ----------
        X : array-like of shape (n_test_samples, n_features)
            Test samples for which `n_samples` probability vectors are to
            be sampled.
        n_samples : int, default=10
            Number of probability vectors to sample for each `X[i]`.
        random_state : int or numpy.random.RandomState or None, \
                default=None
            Ensure reproducibility when sampling probability vectors from
            the Dirichlet distributions.

        Returns
        -------
        P : array-like of shape (n_samples, n_test_samples, n_classes)
            There are `n_samples` class probability vectors for each test
            sample in `X`. Classes are ordered according to
            `self.classes_`.

        Raises
        ------
        ValueError
            If any entry of `alphas` is zero.
        """
        random_state = check_random_state(random_state)
        alphas = self.predict_freq(X) + self.class_prior_
        # Each row of `alphas` is repeated `n_samples` times in a row.
        alphas = alphas.repeat(repeats=n_samples, axis=0)
        if (alphas == 0).any():
            raise ValueError(
                "There are zero frequency observations. "
                "Set `class_prior > 0` to avoid this error."
            )
        # Dirichlet samples are obtained by normalizing gamma samples.
        R = random_state.standard_gamma(alphas)
        R_sums = R.sum(axis=-1)
        # Rows in which all gamma samples are zero cannot be normalized;
        # put the full mass on one randomly chosen class instead.
        is_zero = (R_sums == 0.0).ravel()
        sampled_class_indices = random_state.choice(
            np.array(R.shape[-1]), size=is_zero.sum()
        )
        R[is_zero, sampled_class_indices] = 1.0
        P = R / R.sum(axis=-1, keepdims=True)
        # Rows are grouped per test sample; Fortran order turns the row
        # index into (sample index, test sample index).
        P = P.reshape(n_samples, len(X), P.shape[-1], order="F")
        return P

    def _validate_data(
        self,
        X,
        y,
        sample_weight=None,
        check_X_dict=None,
        check_y_dict=None,
        y_ensure_1d=True,
        reset=True,
    ):
        """Validate the training data and additionally set `class_prior_`.

        All arguments are forwarded to
        `SkactivemlClassifier._validate_data`. Afterwards, `class_prior` is
        checked against the number of classes and stored as
        `class_prior_`.

        Parameters
        ----------
        X : array-like
            Training samples.
        y : array-like
            Labels of the training samples.
        sample_weight : array-like or None, default=None
            Weights of the labels.
        check_X_dict : dict or None, default=None
            Keyword arguments used for checking `X`.
        check_y_dict : dict or None, default=None
            Keyword arguments used for checking `y` and `sample_weight`.
        y_ensure_1d : bool, default=True
            Whether `y` is required to be one-dimensional.
        reset : bool, default=True
            Forwarded to the parent's `_validate_data`, where it is passed
            to `check_n_features`.

        Returns
        -------
        X : np.ndarray
            Checked training samples.
        y : np.ndarray
            Checked labels.
        sample_weight : np.ndarray or None
            Checked sample weights.
        """
        X, y, sample_weight = super()._validate_data(
            X=X,
            y=y,
            sample_weight=sample_weight,
            check_X_dict=check_X_dict,
            check_y_dict=check_y_dict,
            y_ensure_1d=y_ensure_1d,
            reset=reset,
        )

        # Check class prior.
        self.class_prior_ = check_class_prior(
            self.class_prior, len(self.classes_)
        )

        return X, y, sample_weight
class SkactivemlRegressor(RegressorMixin, BaseEstimator, ABC):
    """Skactiveml Regressor

    Base class for `scikit-activeml` regressors.

    Parameters
    ----------
    missing_label : scalar, string, np.nan, or None, default=np.nan
        Value to represent a missing label.
    random_state : int, RandomState or None, default=None
        Determines random number for `fit` and `predict` method. Pass an
        int for reproducible results across multiple method calls.
    """

    def __init__(self, missing_label=MISSING_LABEL, random_state=None):
        self.missing_label = missing_label
        self.random_state = random_state

    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """Fit the model using `X` as training data and y as numerical
        labels.

        Parameters
        ----------
        X : matrix-like of shape (n_samples, n_features)
            The sample matrix X is the feature matrix representing the
            samples.
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            It contains the labels of the training samples. The number of
            numerical labels may be variable for the samples, where missing
            labels are represented as `missing_label_`.
        sample_weight : array-like of shape (n_samples,), default=None
            It contains the weights of the training samples' values.

        Returns
        -------
        self : skactiveml.base.SkactivemlRegressor
            The `skactiveml.base.SkactivemlRegressor` object fitted on the
            training data.
        """
        raise NotImplementedError

    @abstractmethod
    def predict(self, X):
        """Return value predictions for the test samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        y : numpy.ndarray of shape (n_samples,)
            Predicted values of the test samples `X`.
        """
        raise NotImplementedError

    def _validate_data(
        self,
        X,
        y,
        sample_weight=None,
        check_X_dict=None,
        check_y_dict=None,
        y_ensure_1d=True,
        reset=True,
    ):
        """Validate the training data and set the fitted attributes.

        Besides checking `X`, `y`, and `sample_weight`, this method sets
        `missing_label_` and `random_state_`.

        Parameters
        ----------
        X : array-like
            Training samples, checked via `check_array` with
            `check_X_dict`.
        y : array-like
            Labels of the training samples, checked via `check_array` with
            `check_y_dict`.
        sample_weight : array-like or None, default=None
            Weights of the labels. If given, it must have the same shape as
            `y`.
        check_X_dict : dict or None, default=None
            Keyword arguments passed to `check_array` for `X`. If `None`,
            `ensure_min_samples=0` and `ensure_min_features=0` are used.
            The passed dictionary is never modified.
        check_y_dict : dict or None, default=None
            Keyword arguments passed to `check_array` for `y` and
            `sample_weight`. If `None`, permissive defaults are used
            (no minimum sizes, no 2d or finiteness requirement, no dtype
            conversion).
        y_ensure_1d : bool, default=True
            Whether a non-empty `y` is passed through `column_or_1d`.
        reset : bool, default=True
            Passed to `check_n_features` as `reset`.

        Returns
        -------
        X : np.ndarray
            Checked training samples.
        y : np.ndarray
            Checked labels.
        sample_weight : np.ndarray or None
            Checked sample weights.

        Raises
        ------
        ValueError
            If `y` and `sample_weight` differ in shape.
        """
        if check_X_dict is None:
            check_X_dict = {"ensure_min_samples": 0, "ensure_min_features": 0}
        if check_y_dict is None:
            check_y_dict = {
                "ensure_min_samples": 0,
                "ensure_min_features": 0,
                "ensure_2d": False,
                "ensure_all_finite": False,
                "dtype": None,
            }

        check_missing_label(self.missing_label)
        self.missing_label_ = self.missing_label

        # Store and check random state.
        self.random_state_ = check_random_state(self.random_state)

        y = check_array(y, **check_y_dict)
        if len(y) > 0:
            y = column_or_1d(y) if y_ensure_1d else y
        else:
            # Without labels, `X` is not required to be 2d. A new dict is
            # built so that a dictionary supplied by the caller is not
            # modified as a side effect.
            check_X_dict = {**check_X_dict, "ensure_2d": False}

        # Check sample weights.
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, **check_y_dict)
            if not np.array_equal(y.shape, sample_weight.shape):
                raise ValueError(
                    f"`y` has the shape {y.shape} and `sample_weight` has the "
                    f"shape {sample_weight.shape}. Both need to have "
                    f"identical shapes."
                )

        X = check_array(X, **check_X_dict)
        check_consistent_length(X, y)
        check_n_features(self, X, reset=reset)

        return X, y, sample_weight
class ProbabilisticRegressor(SkactivemlRegressor):
    """ProbabilisticRegressor

    Base class for `scikit-activeml` probabilistic regressors, i.e.,
    regressors providing a predicted target distribution from which the
    mean, standard deviation, entropy, and random samples are derived.
    """

    @abstractmethod
    def predict_target_distribution(self, X):
        """Return the predicted target distribution conditioned on the
        test samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        dist : scipy.stats._distn_infrastructure.rv_frozen
            The distribution of the targets at the test samples.
        """
        raise NotImplementedError

    def predict(self, X, return_std=False, return_entropy=False):
        """Return the mean, std (optional) and differential entropy
        (optional) of the predicted target distribution conditioned on the
        test samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.
        return_std : bool, default=False
            Whether to return the standard deviation.
        return_entropy : bool, default=False
            Whether to return the differential entropy.

        Returns
        -------
        mu : numpy.ndarray of shape (n_samples,)
            Predicted mean conditioned on `X`.
        std : numpy.ndarray of shape (n_samples,), optional
            Predicted standard deviation conditioned on `X`.
        entropy : numpy.ndarray, optional
            Predicted differential entropy conditioned on `X`.

        Notes
        -----
        If neither `return_std` nor `return_entropy` is set, only `mu` is
        returned; otherwise a tuple in the order given above is returned.
        """
        check_scalar(return_std, "return_std", bool)
        check_scalar(return_entropy, "return_entropy", bool)

        target_dist = self.predict_target_distribution(X)
        outputs = [target_dist.mean()]
        if return_std:
            outputs.append(target_dist.std())
        if return_entropy:
            outputs.append(target_dist.entropy())

        # A lone mean is returned directly rather than as a 1-tuple.
        if len(outputs) == 1:
            return outputs[0]
        return tuple(outputs)

    def sample_y(self, X, n_samples=1, random_state=None):
        """Return random samples from the predicted target distribution
        conditioned on the test samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples_X, n_features)
            Input samples, where the target values are drawn from.
        n_samples : int, default=1
            Number of random samples to be drawn.
        random_state : int or RandomState instance or None, default=None
            Determines random number generation to randomly draw samples.
            Pass an int for reproducible results across multiple method
            calls.

        Returns
        -------
        y_samples : numpy.ndarray of shape (n_samples_X, n_samples)
            Drawn random target samples.
        """
        target_dist = self.predict_target_distribution(X)
        # Draw with shape (n_samples, n_samples_X) and transpose so that
        # each row belongs to one input sample.
        draws = target_dist.rvs(
            size=(n_samples, len(X)), random_state=random_state
        )
        return draws.T
class AnnotatorModelMixin(ABC):
    """Annotator Model

    Base class of all annotator models, which estimate the performances of
    annotators for given samples.
    """

    @abstractmethod
    def predict_annotator_perf(self, X):
        """Calculate the performance of each annotator to provide the true
        label for a given sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        Returns
        -------
        P_annot : numpy.ndarray of shape (n_samples, n_annotators)
            `P_annot[i,l]` is the performance of annotator `l` regarding
            the annotation of sample `X[i]`.

        Raises
        ------
        NotImplementedError
            Always; concrete annotator models must override this method.
        """
        raise NotImplementedError