# Source code for skactiveml.base

"""
The :mod:`skactiveml.base` package implements the base classes for
:mod:`skactiveml`.
"""

import warnings
from abc import ABC, abstractmethod
from copy import deepcopy

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.metrics import accuracy_score
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import (
    check_array,
    check_consistent_length,
    column_or_1d,
)

from .exceptions import MappingError
from .utils import (
    MISSING_LABEL,
    is_labeled,
    is_unlabeled,
    unlabeled_indices,
    ExtLabelEncoder,
    rand_argmin,
    check_classifier_params,
    check_random_state,
    check_cost_matrix,
    check_scalar,
    check_class_prior,
    check_missing_label,
    check_indices,
)

# '__all__' is necessary to create the sphinx docs; it also declares the
# public API of this module.
# NOTE(review): `PoolQueryStrategy` is defined below but not exported here —
# presumably it is treated as an internal base class; confirm before
# relying on it being importable via `from skactiveml.base import *`.
__all__ = [
    "QueryStrategy",
    "SingleAnnotatorPoolQueryStrategy",
    "MultiAnnotatorPoolQueryStrategy",
    "BudgetManager",
    "SingleAnnotatorStreamQueryStrategy",
    "SkactivemlClassifier",
    "ClassFrequencyEstimator",
    "AnnotatorModelMixin",
    "SkactivemlRegressor",
    "ProbabilisticRegressor",
]


class QueryStrategy(ABC, BaseEstimator):
    """Abstract superclass of every query strategy in scikit-activeml.

    Parameters
    ----------
    random_state : int or RandomState instance, optional (default=None)
        Controls the randomness of the estimator.
    """

    def __init__(self, random_state=None):
        # Stored as-is; subclasses validate it when `query` is called.
        self.random_state = random_state

    @abstractmethod
    def query(self, *args, **kwargs):
        """Determine the query for active learning based on the input
        arguments. Concrete subclasses implement the selection logic.
        """
        raise NotImplementedError
class PoolQueryStrategy(QueryStrategy):
    """Common superclass of all pool-based active learning query
    strategies in scikit-activeml.

    Parameters
    ----------
    missing_label : scalar or string or np.nan or None, optional
    (default=np.nan)
        Value to represent a missing label.
    random_state : int or RandomState instance, optional (default=None)
        Controls the randomness of the estimator.
    """

    def __init__(self, missing_label=MISSING_LABEL, random_state=None):
        super().__init__(random_state=random_state)
        self.missing_label = missing_label

    def _validate_data(
        self,
        X,
        y,
        candidates,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate the inputs of `query` and set or check the
        `n_features_in_` attribute.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including labeled
            and unlabeled samples.
        y : array-like of shape (n_samples, *)
            Labels of the training data set (possibly containing
            `self.missing_label` for unlabeled samples).
        candidates : None or array-like of shape (n_candidates,),
        dtype=int or array-like of shape (n_candidates, n_features)
            None selects the unlabeled samples of `(X, y)`; a 1d integer
            array is interpreted as indices into `(X, y)`; a 2d array
            gives the candidate samples directly (not necessarily part
            of `X`).
        batch_size : int
            Number of samples to be selected in one AL cycle.
        return_utilities : bool
            If True, also return the utilities of the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute. If False,
            the input is checked for consistency with data provided when
            reset was last True.
        check_X_dict : dict, optional (default=None)
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, *)
            Checked labels.
        candidates : None or np.ndarray
            Checked candidates.
        batch_size : int
            Checked batch size.
        return_utilities : bool
            Checked flag.
        """
        # Samples: fall back to the default `check_array` configuration.
        if check_X_dict is None:
            check_X_dict = {"allow_nd": True}
        X = check_array(X, **check_X_dict)
        self._check_n_features(X, reset=reset)

        # Labels may legitimately contain NaN entries (missing labels).
        y = check_array(
            y, ensure_2d=False, force_all_finite="allow-nan", dtype=None
        )
        check_consistent_length(X, y)

        # The missing-label placeholder must fit the dtype of `y`.
        check_missing_label(self.missing_label, target_type=y.dtype)
        self.missing_label_ = self.missing_label

        # The number of unlabeled samples seeds the random state
        # (+1 avoids a zero multiplier).
        seed_multiplier = (
            int(np.sum(is_unlabeled(y, self.missing_label_))) + 1
        )

        if candidates is not None:
            candidates = np.array(candidates)
            if candidates.ndim == 1:
                # Index array referring to rows of `(X, y)`.
                candidates = check_indices(candidates, y, dim=0)
            else:
                # Explicit candidate samples; their feature count must
                # match the one of `X`.
                cand_check_kwargs = deepcopy(check_X_dict)
                cand_check_kwargs["ensure_2d"] = False
                candidates = check_array(candidates, **cand_check_kwargs)
                self._check_n_features(candidates, reset=False)

        check_scalar(return_utilities, "return_utilities", bool)
        check_scalar(batch_size, target_type=int, name="batch_size", min_val=1)
        self.random_state_ = check_random_state(
            self.random_state, seed_multiplier
        )

        return X, y, candidates, batch_size, return_utilities
class SingleAnnotatorPoolQueryStrategy(PoolQueryStrategy):
    """Base class of all pool-based active learning query strategies
    that ask a single annotator for labels.
    """

    @abstractmethod
    def query(
        self,
        X,
        y,
        *args,
        candidates=None,
        batch_size=1,
        return_utilities=False,
        **kwargs,
    ):
        """Determine for which candidate samples labels are to be
        queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly containing
            `self.missing_label` for unlabeled samples).
        candidates : None or array-like of shape (n_candidates,),
        dtype=int or array-like of shape (n_candidates, n_features),
        optional (default=None)
            None selects the unlabeled samples of `(X, y)` as
            candidates; a 1d integer array is interpreted as indices
            into `(X, y)`; a 2d array gives the candidate samples
            directly (not necessarily contained in `X`). The latter is
            not supported by all query strategies.
        batch_size : int, optional (default=1)
            Number of samples to be selected in one AL cycle.
        return_utilities : bool, optional (default=False)
            If True, also return the utilities of the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size,)
            Indices of the selected samples; `query_indices[0]` is the
            first selected sample. They refer to samples in `X` when
            `candidates` is None or 1d, and to samples in `candidates`
            when it is 2d.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or
        (batch_size, n_candidates)
            Utilities after each selection of the batch, e.g.,
            `utilities[0]` are the utilities used to pick
            `query_indices[0]`. Utilities of labeled samples are np.nan.
            The indexing convention matches `query_indices`.
        """
        raise NotImplementedError

    def _validate_data(
        self,
        X,
        y,
        candidates,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate the `query` inputs for the single-annotator setting.

        Runs the generic pool validation of the superclass, forces `y`
        to be one-dimensional, and shrinks `batch_size` (with a warning)
        when fewer candidates than requested are available.

        Parameters
        ----------
        X, y, candidates, batch_size, return_utilities
            See :meth:`query`.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
        check_X_dict : dict, optional (default=None)
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples,)
            Checked labels.
        candidates : None or np.ndarray
            Checked candidates.
        batch_size : int
            Checked (possibly reduced) batch size.
        return_utilities : bool
            Checked flag.
        """
        (
            X,
            y,
            candidates,
            batch_size,
            return_utilities,
        ) = super()._validate_data(
            X, y, candidates, batch_size, return_utilities, reset, check_X_dict
        )
        y = column_or_1d(y, warn=True)

        # The number of selectable samples caps the batch size.
        if candidates is None:
            n_candidates = int(
                np.sum(is_unlabeled(y, missing_label=self.missing_label_))
            )
        else:
            n_candidates = len(candidates)
        if n_candidates < batch_size:
            warnings.warn(
                f"'batch_size={batch_size}' is larger than number of "
                f"candidates. Instead, 'batch_size={n_candidates}' was set."
            )
            batch_size = n_candidates

        return X, y, candidates, batch_size, return_utilities

    def _transform_candidates(
        self,
        candidates,
        X,
        y,
        enforce_mapping=False,
        allow_only_unlabeled=False,
    ):
        """Transform `candidates` into a sample array plus an index
        array `mapping` such that `candidates = X[mapping]`.

        Parameters
        ----------
        candidates : None or np.ndarray of shape (n_candidates,),
        dtype=int or np.ndarray of shape (n_candidates, n_features)
            Checked candidates (see :meth:`query`).
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples,)
            Checked labels of the training data set.
        enforce_mapping : bool, default=False
            If True, raise when no exact mapping to `X` can be
            determined (i.e., `mapping` would be None).
        allow_only_unlabeled : bool, default=False
            If True, raise when the candidate indices contain labeled
            samples.

        Returns
        -------
        candidates : np.ndarray of shape (n_candidates, n_features)
            Candidate samples from which the strategy can query labels.
        mapping : np.ndarray of shape (n_candidates,) or None
            Index array mapping `candidates` to `X`
            (`candidates = X[mapping]`), or None when the candidates are
            not part of `X`.
        """
        if candidates is None:
            # Default: all unlabeled samples of (X, y) are candidates.
            unlbld = unlabeled_indices(y, self.missing_label_)
            return X[unlbld], unlbld
        if candidates.ndim == 1:
            # Index array into X.
            if allow_only_unlabeled and is_labeled(
                y[candidates], self.missing_label_
            ).any():
                raise ValueError(
                    "Candidates must not contain labeled samples."
                )
            return X[candidates], candidates
        # 2d candidates: samples given directly, no mapping exists.
        if enforce_mapping:
            raise MappingError(
                "Mapping `candidates` to `X` is not possible but "
                "`enforce_mapping` is True. Use index array for "
                "`candidates` instead."
            )
        return candidates, None
class MultiAnnotatorPoolQueryStrategy(PoolQueryStrategy):
    """Base class for all pool-based active learning query strategies
    with multiple annotators in scikit-activeml.

    Parameters
    ----------
    missing_label : scalar or string or np.nan or None, optional
    (default=np.nan)
        Value to represent a missing label.
    random_state : int or RandomState instance, optional (default=None)
        Controls the randomness of the estimator.
    """

    @abstractmethod
    def query(
        self,
        X,
        y,
        *args,
        candidates=None,
        annotators=None,
        batch_size=1,
        return_utilities=False,
        **kwargs,
    ):
        """Determine which candidate sample is to be annotated by which
        annotator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including labeled
            and unlabeled samples.
        y : array-like of shape (n_samples, n_annotators)
            Labels of the training data set for each annotator (possibly
            containing `self.missing_label`), i.e., `y[i, j]` is the
            label of sample `i` provided by annotator `j`.
        candidates : None or array-like of shape (n_candidates,),
        dtype=int or array-like of shape (n_candidates, n_features),
        optional (default=None)
            See parameter `annotators`.
        annotators : None or array-like of shape (n_avl_annotators,),
        dtype=int or array-like of shape (n_candidates, n_annotators),
        optional (default=None)
            If both `candidates` and `annotators` are None, the
            unlabeled entries of `y` are the candidate
            annotator-sample-pairs. Otherwise, the candidate pairs are
            the cross product of candidate samples and available
            annotators: `candidates=None` makes every sample of `X` a
            candidate; a 1d integer `candidates` is interpreted as
            indices into `(X, y)`; a 2d `candidates` gives the samples
            directly (not necessarily contained in `X`; not supported by
            all strategies). `annotators=None` makes every annotator
            available; a 1d integer `annotators` lists the available
            annotator indices; a 2d boolean `annotators` marks each
            candidate annotator-sample-pair with entry `True`.
        batch_size : int, optional (default=1)
            Number of annotator-sample pairs to be selected in one AL
            cycle.
        return_utilities : bool, optional (default=False)
            If True, also return the utilities of the query strategy.

        Returns
        -------
        query_indices : np.ndarray of shape (batch_size, 2)
            Selected annotator-sample pairs: `query_indices[:, 0]` are
            the sample indices and `query_indices[:, 1]` the annotator
            indices. Sample indices refer to `X` when `candidates` is
            None or 1d, and to `candidates` when it is 2d.
        utilities : numpy.ndarray of shape (batch_size, n_samples,
        n_annotators) or (batch_size, n_candidates, n_annotators)
            Utilities of all candidate pairs after each selection of the
            batch, e.g., `utilities[0, :, j]` are the utilities used for
            the first selected pair. Indexing matches `query_indices`.
        """
        raise NotImplementedError

    def _validate_data(
        self,
        X,
        y,
        candidates,
        annotators,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate the `query` inputs and set or check the
        `n_features_in_` attribute.

        Parameters
        ----------
        X, y, candidates, annotators, batch_size, return_utilities
            See :meth:`query`.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
        check_X_dict : dict, optional (default=None)
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, n_annotators)
            Checked labels.
        candidates : None or np.ndarray
            Checked candidates.
        annotators : None or np.ndarray
            Checked annotator indices or boolean matrix.
        batch_size : int
            Checked (possibly reduced) batch size.
        return_utilities : bool
            Checked flag.

        Raises
        ------
        ValueError
            If `annotators` is neither None nor a 1d/2d array-like.
        """
        (
            X,
            y,
            candidates,
            batch_size,
            return_utilities,
        ) = super()._validate_data(
            X, y, candidates, batch_size, return_utilities, reset, check_X_dict
        )

        # `y` must be a 2d (n_samples, n_annotators) label matrix.
        check_array(y, ensure_2d=True, force_all_finite="allow-nan")
        unlabeled_pairs = is_unlabeled(y, missing_label=self.missing_label_)

        if annotators is not None:
            annotators = check_array(
                annotators, ensure_2d=False, allow_nd=True
            )
            if annotators.ndim == 1:
                # Indices of available annotators (columns of `y`).
                annotators = check_indices(annotators, y, dim=1)
            elif annotators.ndim == 2:
                # Boolean pair matrix; rows must match the candidates
                # (or all samples) and columns the annotators.
                annotators = check_array(annotators, dtype=bool)
                if candidates is None:
                    check_consistent_length(X, annotators)
                else:
                    check_consistent_length(candidates, annotators)
                check_consistent_length(y.T, annotators.T)
            else:
                raise ValueError(
                    "`annotators` must be either None, 1d or 2d array-like."
                )

        # Count the selectable annotator-sample pairs to cap batch_size.
        if annotators is None:
            if candidates is None:
                n_candidate_pairs = int(np.sum(unlabeled_pairs))
            else:
                n_candidate_pairs = len(candidates) * len(y.T)
        elif annotators.ndim == 1:
            if candidates is None:
                n_candidate_pairs = len(X) * len(annotators)
            else:
                n_candidate_pairs = len(candidates) * len(annotators)
        else:
            n_candidate_pairs = int(np.sum(annotators))

        if n_candidate_pairs < batch_size:
            warnings.warn(
                f"'batch_size={batch_size}' is larger than number of "
                f"candidates pairs. Instead, 'batch_size={n_candidate_pairs}'"
                f" was set."
            )
            batch_size = n_candidate_pairs

        return X, y, candidates, annotators, batch_size, return_utilities

    def _transform_cand_annot(
        self, candidates, annotators, X, y, enforce_mapping=False
    ):
        """Transform `candidates` and `annotators` into a sample array,
        an index array `mapping` with `candidates = X[mapping]` (when a
        mapping exists), and a boolean matrix `A_cand` of available
        annotator-sample pairs for those candidates.

        Parameters
        ----------
        candidates : None or np.ndarray of shape (n_candidates,),
        dtype=int or np.ndarray of shape (n_candidates, n_features)
            See :meth:`query`.
        annotators : None or np.ndarray of shape (n_avl_annotators,),
        dtype=int or np.ndarray of shape (n_candidates, n_annotators)
            See :meth:`query`.
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, n_annotators)
            Checked labels of the training data set.
        enforce_mapping : bool, optional (default=False)
            If True, raise when no exact mapping to `X` can be
            determined (i.e., `mapping` would be None).

        Returns
        -------
        candidates : np.ndarray of shape (n_selectable_candidates,
        n_features)
            Candidate samples from which the strategy can query labels.
        mapping : np.ndarray of shape (n_selectable_candidates,) or None
            Index array mapping `candidates` to `X`
            (`candidates = X[mapping]`), or None when the candidates are
            not part of `X`.
        A_cand : np.ndarray of shape (n_selectable_candidates,
        n_annotators)
            Boolean matrix of available annotator-sample pairs w.r.t.
            `candidates`.
        """
        unlbd_pairs = is_unlabeled(y, self.missing_label_)
        unlbd_sample_indices = np.argwhere(
            np.any(unlbd_pairs, axis=1)
        ).flatten()
        n_annotators = y.shape[1]

        # Case 1: 2d `candidates` holds samples directly -> no mapping.
        if candidates is not None and candidates.ndim == 2:
            if enforce_mapping:
                # Fail fast before building A_cand. Fixed message:
                # previously the concatenated literals lacked separating
                # spaces ("posssiblebut", "indexarray") and contained a
                # typo ("posssible").
                raise ValueError(
                    "Mapping `candidates` to `X` is not possible "
                    "but `enforce_mapping` is True. Use index "
                    "array for `candidates` instead."
                )
            if annotators is None:
                A_cand = np.full((len(candidates), n_annotators), True)
            elif annotators.ndim == 1:
                A_cand = np.full((len(candidates), n_annotators), False)
                A_cand[:, annotators] = True
            else:
                A_cand = annotators
            return candidates, None, A_cand

        # Case 2: a mapping to `X` exists.
        if candidates is None:
            if annotators is None:
                # Only samples with at least one missing label.
                candidates = unlbd_sample_indices
                A_cand = unlbd_pairs[unlbd_sample_indices]
            elif annotators.ndim == 1:
                candidates = np.arange(len(X), dtype=int)
                # dtype=bool fixes the former `np.full_like(y, False)`,
                # which silently inherited `y`'s (typically float) dtype
                # although `A_cand` is documented as boolean.
                A_cand = np.full_like(y, False, dtype=bool)
                A_cand[:, annotators] = True
            else:
                candidates = np.arange(len(X), dtype=int)
                A_cand = annotators
        else:  # `candidates` is an index array into `X`.
            if annotators is None:
                A_cand = np.full((len(candidates), n_annotators), True)
            elif annotators.ndim == 1:
                A_cand = np.full((len(candidates), n_annotators), False)
                A_cand[:, annotators] = True
            else:
                A_cand = annotators

        return X[candidates], candidates, A_cand
class BudgetManager(ABC, BaseEstimator):
    """Abstract base class of all budget managers for stream-based
    active learning in scikit-activeml, modeling budgeting constraints.

    Parameters
    ----------
    budget : float (default=None)
        Ratio of instances which are allowed to be sampled, with
        0 <= budget <= 1. None is replaced by the default budget 0.1.
    """

    def __init__(self, budget=None):
        self.budget = budget

    @abstractmethod
    def query_by_utility(self, utilities, *args, **kwargs):
        """Decide which utilities are sufficient to query the
        corresponding instance under the budgeting constraint.

        Parameters
        ----------
        utilities : ndarray of shape (n_samples,)
            Utilities provided by the stream-based query strategy, used
            to decide whether sampling an instance is worth it.

        Returns
        -------
        queried_indices : ndarray of shape (n_queried_instances,)
            Indices (into `utilities`) of the instances to query, with
            0 <= n_queried_instances <= n_samples.
        """
        raise NotImplementedError

    @abstractmethod
    def update(self, candidates, queried_indices, *args, **kwargs):
        """Inform the budget manager about the queries actually made.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape (n_samples,
        n_features)
            The instances which may be queried. Sparse matrices are
            accepted only if supported by the base query strategy.
        queried_indices : array-like
            Indicates which instances of `candidates` were queried.

        Returns
        -------
        self : BudgetManager
            The updated budget manager.
        """
        raise NotImplementedError

    def _validate_budget(self):
        """Resolve `budget` into `budget_`, substituting the default
        0.1 for None, and validate it as a float in (0, 1]."""
        self.budget_ = 0.1 if self.budget is None else self.budget
        check_scalar(
            self.budget_,
            "budget",
            float,
            min_val=0.0,
            max_val=1.0,
            min_inclusive=False,
        )

    def _validate_data(self, utilities, *args, **kwargs):
        """Validate `utilities` and the budget.

        Parameters
        ----------
        utilities : ndarray of shape (n_samples,)
            Utilities provided by the stream-based query strategy.

        Returns
        -------
        utilities : ndarray of shape (n_samples,)
            The checked utilities.

        Raises
        ------
        TypeError
            If `utilities` is not a numpy array.
        """
        if not isinstance(utilities, np.ndarray):
            raise TypeError(
                "{} is not a valid type for utilities".format(type(utilities))
            )
        self._validate_budget()
        return utilities
class SingleAnnotatorStreamQueryStrategy(QueryStrategy):
    """Base class of all stream-based active learning query strategies
    in scikit-activeml.

    Parameters
    ----------
    budget : float, default=None
        The budget which models the budgeting constraint of the
        stream-based active learning setting.
    random_state : int, RandomState instance, default=None
        Controls the randomness of the estimator.
    """

    def __init__(self, budget, random_state=None):
        super().__init__(random_state=random_state)
        self.budget = budget

    @abstractmethod
    def query(self, candidates, *args, return_utilities=False, **kwargs):
        """Select the instances in `candidates` to acquire within the
        budgeting constraint specified by the budget manager.

        This method does not alter the internal state of the strategy;
        to adapt the strategy to the selected candidates, call
        `update(...)` with them.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape (n_samples,
        n_features)
            The instances which may be queried. Sparse matrices are
            accepted only if supported by the base query strategy.
        return_utilities : bool, optional
            If True, also return the utilities of the query strategy.
            The default is False.

        Returns
        -------
        queried_indices : ndarray of shape (n_sampled_instances,)
            Indices of the instances in `candidates` to be sampled, with
            0 <= n_sampled_instances <= n_samples.
        utilities : ndarray of shape (n_samples,), optional
            Utilities of the query strategy; only provided when
            `return_utilities` is True.
        """
        raise NotImplementedError

    @abstractmethod
    def update(
        self,
        candidates,
        queried_indices,
        *args,
        budget_manager_param_dict=None,
        **kwargs,
    ):
        """Adapt the query strategy to the decisions actually taken.

        Use this together with `query` when the instances queried in the
        end may differ from those proposed by `query(...)` — e.g., when
        developing wrapper query strategies.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape (n_samples,
        n_features)
            The instances which could be queried. Sparse matrices are
            accepted only if supported by the base query strategy.
        queried_indices : array-like
            Indicates which instances of `candidates` were queried.
        budget_manager_param_dict : kwargs, optional
            Optional kwargs for the budget manager.

        Returns
        -------
        self : StreamBasedQueryStrategy
            The updated query strategy.
        """
        raise NotImplementedError

    def _validate_random_state(self):
        """Create `random_state_` from `random_state` on first use and
        normalize it via :func:`~sklearn.utils.check_random_state`."""
        if not hasattr(self, "random_state_"):
            self.random_state_ = deepcopy(self.random_state)
        self.random_state_ = check_random_state(self.random_state_)

    def _validate_budget(self):
        """Resolve `budget` into `budget_` (default 0.1 for None) and
        validate it as a float in (0, 1]."""
        self.budget_ = 0.1 if self.budget is None else self.budget
        check_scalar(
            self.budget_,
            "budget",
            float,
            min_val=0.0,
            max_val=1.0,
            min_inclusive=False,
        )

    def _validate_data(
        self,
        candidates,
        return_utilities,
        *args,
        reset=True,
        **check_candidates_params,
    ):
        """Validate the `query` inputs and set or check the
        `n_features_in_` attribute.

        Parameters
        ----------
        candidates : array-like of shape (n_candidates, n_features)
            The instances which may be queried.
        return_utilities : bool
            If True, also return the utilities of the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
        **check_candidates_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        candidates : np.ndarray of shape (n_candidates, n_features)
            The checked candidate samples.
        return_utilities : bool
            The checked flag.
        """
        candidates = check_array(candidates, **check_candidates_params)
        self._check_n_features(candidates, reset=reset)
        check_scalar(return_utilities, "return_utilities", bool)
        self._validate_random_state()
        self._validate_budget()
        return candidates, return_utilities
class SkactivemlClassifier(BaseEstimator, ClassifierMixin, ABC):
    """SkactivemlClassifier

    Base class for scikit-activeml classifiers such that missing labels,
    user-defined classes, and cost-sensitive classification (i.e., cost
    matrix) can be handled.

    Parameters
    ----------
    classes : array-like of shape (n_classes), default=None
        Holds the label for each class. If None, the classes are determined
        during the fit.
    missing_label : scalar, string, np.nan, or None, default=np.nan
        Value to represent a missing label.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` indicating cost of predicting
        class `classes[j]` for a sample of class `classes[i]`. Can only be
        set if `classes` is not None.
    random_state : int or RandomState instance or None, default=None
        Determines random number for `predict` method. Pass an int for
        reproducible results across multiple method calls.

    Attributes
    ----------
    classes_ : array-like of shape (n_classes)
        Holds the label for each class after fitting.
    cost_matrix_ : array-like of shape (n_classes, n_classes)
        Cost matrix after fitting with `cost_matrix_[i,j]` indicating cost
        of predicting class `classes_[j]` for a sample of class
        `classes_[i]`.
    """

    def __init__(
        self,
        classes=None,
        missing_label=MISSING_LABEL,
        cost_matrix=None,
        random_state=None,
    ):
        # Following the sklearn convention, constructor parameters are stored
        # unmodified; validation happens in `fit` via `_validate_data`.
        self.classes = classes
        self.missing_label = missing_label
        self.cost_matrix = cost_matrix
        self.random_state = random_state
    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """Fit the model using `X` as training data and `y` as class labels.

        Parameters
        ----------
        X : matrix-like of shape (n_samples, n_features)
            The sample matrix `X` is the feature matrix representing the
            samples.
        y : array-like of shape (n_samples) or (n_samples, n_outputs)
            It contains the class labels of the training samples. The number
            of class labels may be variable for the samples, where missing
            labels are represented by the attribute `missing_label`.
        sample_weight : array-like of shape (n_samples) or (n_samples, n_outputs)
            It contains the weights of the training samples' class labels.
            It must have the same shape as `y`.

        Returns
        -------
        self : skactiveml.base.SkactivemlClassifier
            The `skactiveml.base.SkactivemlClassifier` object fitted on the
            training data.
        """
        raise NotImplementedError
    def predict_proba(self, X):
        """Return probability estimates for the test data `X`.

        This base implementation is a stub; concrete classifiers are
        expected to override it.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        Returns
        -------
        P : numpy.ndarray of shape (n_samples, classes)
            The class probabilities of the test samples. Classes are ordered
            according to `classes_`.
        """
        raise NotImplementedError
[docs] def predict(self, X): """Return class label predictions for the test samples `X`. Parameters ---------- X : array-like of shape (n_samples, n_features) Input samples. Returns ------- y : numpy.ndarray of shape (n_samples) Predicted class labels of the test samples `X`. Classes are ordered according to `classes_`. """ P = self.predict_proba(X) costs = np.dot(P, self.cost_matrix_) y_pred = rand_argmin(costs, random_state=self.random_state_, axis=1) y_pred = self._le.inverse_transform(y_pred) y_pred = np.asarray(y_pred, dtype=self.classes_.dtype) return y_pred
[docs] def score(self, X, y, sample_weight=None): """Return the mean accuracy on the given test data and labels. Parameters ---------- X : array-like of shape (n_samples, n_features) Test samples. y : array-like of shape (n_samples,) True labels for `X`. sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns ------- score : float Mean accuracy of `self.predict(X)` regarding `y`. """ y = self._le.transform(y) y_pred = self._le.transform(self.predict(X)) return accuracy_score(y, y_pred, sample_weight=sample_weight)
def _validate_data( self, X, y, sample_weight=None, check_X_dict=None, check_y_dict=None, y_ensure_1d=True, ): if check_X_dict is None: check_X_dict = {"ensure_min_samples": 0, "ensure_min_features": 0} if check_y_dict is None: check_y_dict = { "ensure_min_samples": 0, "ensure_min_features": 0, "ensure_2d": False, "force_all_finite": False, "dtype": None, } # Check common classifier parameters. check_classifier_params( self.classes, self.missing_label, self.cost_matrix ) # Store and check random state. self.random_state_ = check_random_state(self.random_state) # Create label encoder. self._le = ExtLabelEncoder( classes=self.classes, missing_label=self.missing_label ) # Check input parameters. y = check_array(y, **check_y_dict) if len(y) > 0: y = column_or_1d(y) if y_ensure_1d else y y = self._le.fit_transform(y) is_lbdl = is_labeled(y, missing_label=-1) if len(y[is_lbdl]) > 0: check_classification_targets(y[is_lbdl]) if len(self._le.classes_) == 0: raise ValueError( "No class label is known because 'y' contains no actual " "class labels and 'classes' is not defined. Change at " "least on of both to overcome this error." ) else: self._le.fit(self.classes) check_X_dict["ensure_2d"] = False X = check_array(X, **check_X_dict) check_consistent_length(X, y) # Update detected classes. self.classes_ = self._le.classes_ # Check classes. if sample_weight is not None: sample_weight = check_array(sample_weight, **check_y_dict) if not np.array_equal(y.shape, sample_weight.shape): raise ValueError( f"`y` has the shape {y.shape} and `sample_weight` has the " f"shape {sample_weight.shape}. Both need to have " f"identical shapes." ) # Update cost matrix. 
self.cost_matrix_ = ( 1 - np.eye(len(self.classes_)) if self.cost_matrix is None else self.cost_matrix ) self.cost_matrix_ = check_cost_matrix( self.cost_matrix_, len(self.classes_) ) if self.classes is not None: class_indices = np.argsort(self.classes) self.cost_matrix_ = self.cost_matrix_[class_indices] self.cost_matrix_ = self.cost_matrix_[:, class_indices] return X, y, sample_weight def _check_n_features(self, X, reset): if reset: self.n_features_in_ = X.shape[1] if len(X) > 0 else None elif not reset: if self.n_features_in_ is not None: super()._check_n_features(X, reset=reset)
class ClassFrequencyEstimator(SkactivemlClassifier):
    """ClassFrequencyEstimator

    Extends scikit-activeml classifiers to estimators that are able to
    estimate class frequencies for given samples (by calling
    `predict_freq`).

    Parameters
    ----------
    classes : array-like of shape (n_classes), default=None
        Holds the label for each class. If None, the classes are determined
        during the fit.
    missing_label : scalar or str or np.nan or None, default=np.nan
        Value to represent a missing label.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` indicating cost of predicting
        class `classes[j]` for a sample of class `classes[i]`. Can only be
        set if `classes` is not None.
    class_prior : float or array-like of shape (n_classes), default=0
        Prior observations of the class frequency estimates. If
        `class_prior` is an array, the entry `class_prior[i]` indicates the
        non-negative prior number of samples belonging to class
        `classes_[i]`. If `class_prior` is a float, `class_prior` indicates
        the non-negative prior number of samples per class.
    random_state : int or np.random.RandomState or None, default=None
        Determines random number for `predict` method. Pass an int for
        reproducible results across multiple method calls.

    Attributes
    ----------
    classes_ : np.ndarray of shape (n_classes)
        Holds the label for each class after fitting.
    class_prior_ : np.ndarray of shape (n_classes)
        Prior observations of the class frequency estimates. The entry
        `class_prior_[i]` indicates the non-negative prior number of samples
        belonging to class `classes_[i]`.
    cost_matrix_ : np.ndarray of shape (n_classes, n_classes)
        Cost matrix with `cost_matrix_[i,j]` indicating cost of predicting
        class `classes_[j]` for a sample of class `classes_[i]`.
    """

    def __init__(
        self,
        class_prior=0,
        classes=None,
        missing_label=MISSING_LABEL,
        cost_matrix=None,
        random_state=None,
    ):
        super().__init__(
            classes=classes,
            missing_label=missing_label,
            cost_matrix=cost_matrix,
            random_state=random_state,
        )
        # Stored unmodified; validated into `class_prior_` in `_validate_data`.
        self.class_prior = class_prior
    @abstractmethod
    def predict_freq(self, X):
        """Return class frequency estimates for the test samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples whose class frequencies are to be estimated.

        Returns
        -------
        F : array-like of shape (n_samples, classes)
            The class frequency estimates of the test samples `X`. Classes
            are ordered according to attribute `classes_`.
        """
        raise NotImplementedError
[docs] def predict_proba(self, X): """Return probability estimates for the test data `X`. Parameters ---------- X : array-like, shape (n_samples, n_features) or shape (n_samples, m_samples) if metric == 'precomputed' Input samples. Returns ------- P : array-like of shape (n_samples, classes) The class probabilities of the test samples. Classes are ordered according to classes_. """ # Normalize probabilities of each sample. P = self.predict_freq(X) + self.class_prior_ normalizer = np.sum(P, axis=1) P[normalizer > 0] /= normalizer[normalizer > 0, np.newaxis] P[normalizer == 0, :] = [1 / len(self.classes_)] * len(self.classes_) return P
    def sample_proba(self, X, n_samples=10, random_state=None):
        """Samples probability vectors from Dirichlet distributions whose
        parameters `alphas` are defined as the sum of the frequency
        estimates returned by `predict_freq` and the `class_prior`.

        Parameters
        ----------
        X : array-like of shape (n_test_samples, n_features)
            Test samples for which `n_samples` probability vectors are to be
            sampled.
        n_samples : int, default=10
            Number of probability vectors to sample for each `X[i]`.
        random_state : int or numpy.random.RandomState or None, default=None
            Ensure reproducibility when sampling probability vectors from
            the Dirichlet distributions.

        Returns
        -------
        P : array-like of shape (n_samples, n_test_samples, n_classes)
            There are `n_samples` class probability vectors for each test
            sample in `X`. Classes are ordered according to `classes_`.

        Raises
        ------
        ValueError
            If any Dirichlet parameter is zero (i.e., zero observed
            frequency and zero prior for some class).
        """
        random_state = check_random_state(random_state)
        # Dirichlet parameters: observed frequencies plus prior pseudo-counts.
        alphas = self.predict_freq(X) + self.class_prior_
        # Repeat each row `n_samples` times so each repetition yields one
        # independent probability vector for that test sample.
        alphas = alphas.repeat(repeats=n_samples, axis=0)
        if (alphas == 0).any():
            raise ValueError(
                "There are zero frequency observations. "
                "Set `class_prior > 0` to avoid this error."
            )
        # Sample Dirichlet vectors via normalized gamma draws:
        # Gamma(alpha_i) / sum_j Gamma(alpha_j) ~ Dirichlet(alpha).
        R = random_state.standard_gamma(alphas)
        R_sums = R.sum(axis=-1)
        # Numerically degenerate rows (all gamma draws equal to zero) would
        # divide by zero; repair them by placing all mass on one randomly
        # chosen class.
        is_zero = (R_sums == 0.0).ravel()
        sampled_class_indices = random_state.choice(
            np.array(R.shape[-1]), size=is_zero.sum()
        )
        R[is_zero, sampled_class_indices] = 1.0
        P = R / R.sum(axis=-1, keepdims=True)
        # Fortran order regroups the repeated rows into the leading
        # `n_samples` axis.
        P = P.reshape(n_samples, len(X), P.shape[-1], order="F")
        return P
def _validate_data( self, X, y, sample_weight=None, check_X_dict=None, check_y_dict=None, y_ensure_1d=True, ): X, y, sample_weight = super()._validate_data( X=X, y=y, sample_weight=sample_weight, check_X_dict=check_X_dict, check_y_dict=check_y_dict, y_ensure_1d=y_ensure_1d, ) # Check class prior. self.class_prior_ = check_class_prior( self.class_prior, len(self.classes_) ) return X, y, sample_weight
class SkactivemlRegressor(BaseEstimator, RegressorMixin, ABC):
    """SkactivemlRegressor

    Base class for scikit-activeml regressors.

    Parameters
    ----------
    missing_label : scalar, string, np.nan, or None, optional
    (default=skactiveml.utils.MISSING_LABEL)
        Value to represent a missing label.
    random_state : int, RandomState or None, optional (default=None)
        Determines random number for `fit` and `predict` method. Pass an int
        for reproducible results across multiple method calls.
    """

    def __init__(self, missing_label=MISSING_LABEL, random_state=None):
        # sklearn convention: store parameters unmodified; validation is
        # deferred to `fit` via `_validate_data`.
        self.missing_label = missing_label
        self.random_state = random_state
    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """Fit the model using `X` as training data and `y` as numerical
        labels.

        Parameters
        ----------
        X : matrix-like of shape (n_samples, n_features)
            The sample matrix `X` is the feature matrix representing the
            samples.
        y : array-like of shape (n_samples) or (n_samples, n_targets)
            It contains the labels of the training samples. The number of
            numerical labels may be variable for the samples, where missing
            labels are represented by the attribute `missing_label`.
        sample_weight : array-like of shape (n_samples)
            It contains the weights of the training samples' values.

        Returns
        -------
        self : skactiveml.base.SkactivemlRegressor
            The `skactiveml.base.SkactivemlRegressor` object fitted on the
            training data.
        """
        raise NotImplementedError
    @abstractmethod
    def predict(self, X):
        """Return value predictions for the test samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        y : numpy.ndarray of shape (n_samples)
            Predicted values of the test samples `X`.
        """
        raise NotImplementedError
def _validate_data( self, X, y, sample_weight=None, check_X_dict=None, check_y_dict=None, y_ensure_1d=True, ): if check_X_dict is None: check_X_dict = {"ensure_min_samples": 0, "ensure_min_features": 0} if check_y_dict is None: check_y_dict = { "ensure_min_samples": 0, "ensure_min_features": 0, "ensure_2d": False, "force_all_finite": False, "dtype": None, } check_missing_label(self.missing_label) self.missing_label_ = self.missing_label # Store and check random state. self.random_state_ = check_random_state(self.random_state) X = check_array(X, **check_X_dict) y = check_array(y, **check_y_dict) if len(y) > 0: y = column_or_1d(y) if y_ensure_1d else y if sample_weight is not None: sample_weight = check_array(sample_weight, **check_y_dict) if not np.array_equal(y.shape, sample_weight.shape): raise ValueError( f"`y` has the shape {y.shape} and `sample_weight` has the " f"shape {sample_weight.shape}. Both need to have " f"identical shapes." ) return X, y, sample_weight
class ProbabilisticRegressor(SkactivemlRegressor):
    """ProbabilisticRegressor

    Base class for scikit-activeml probabilistic regressors, i.e.,
    regressors that predict a full target distribution (see
    `predict_target_distribution`) rather than only a point estimate.
    """
    @abstractmethod
    def predict_target_distribution(self, X):
        """Returns the predicted target distribution conditioned on the test
        samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        dist : scipy.stats._distn_infrastructure.rv_frozen
            The distribution of the targets at the test samples.
        """
        raise NotImplementedError
[docs] def predict(self, X, return_std=False, return_entropy=False): """Returns the mean, std (optional) and differential entropy (optional) of the predicted target distribution conditioned on the test samples `X`. Parameters ---------- X : array-like, shape (n_samples, n_features) Input samples. return_std : bool, optional (default=False) Whether to return the standard deviation. return_entropy : bool, optional (default=False) Whether to return the differential entropy. Returns ------- mu : numpy.ndarray, shape (n_samples) Predicted mean conditioned on `X`. std : numpy.ndarray, shape (n_samples), optional Predicted standard deviation conditioned on `X`. entropy : numpy..ndarray, optional Predicted differential entropy conditioned on `X`. """ check_scalar(return_std, "return_std", bool) check_scalar(return_entropy, "return_entropy", bool) rv = self.predict_target_distribution(X) result = (rv.mean(),) if return_std: result += (rv.std(),) if return_entropy: result += (rv.entropy(),) if len(result) == 1: result = result[0] return result
[docs] def sample_y(self, X, n_samples=1, random_state=None): """Returns random samples from the predicted target distribution conditioned on the test samples `X`. Parameters ---------- X : array-like, shape (n_samples_X, n_features) Input samples, where the target values are drawn from. n_samples: int, optional (default=1) Number of random samples to be drawn. random_state : int, RandomState instance or None, optional (default=None) Determines random number generation to randomly draw samples. Pass an int for reproducible results across multiple method calls. Returns ------- y_samples : numpy.ndarray, shape (n_samples_X, n_samples) Drawn random target samples. """ rv = self.predict_target_distribution(X) rv_samples = rv.rvs( size=(n_samples, len(X)), random_state=random_state ) return rv_samples.T
class AnnotatorModelMixin(ABC):
    """AnnotatorModelMixin

    Base class of all annotator models estimating the performances of
    annotators for given samples (see `predict_annotator_perf`).
    """
    @abstractmethod
    def predict_annotator_perf(self, X):
        """Calculates the performance of an annotator to provide the true
        label for a given sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        Returns
        -------
        P_annot : numpy.ndarray of shape (n_samples, n_annotators)
            `P_annot[i,l]` is the performance of annotator `l` regarding the
            annotation of sample `X[i]`.
        """
        raise NotImplementedError