# Source code for skactiveml.base

"""
The :mod:`skactiveml.base` package implements the base classes for
:mod:`skactiveml`.
"""

import numpy as np
import warnings

from abc import ABC, abstractmethod
from copy import deepcopy
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.metrics import accuracy_score
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import (
    check_array,
    check_consistent_length,
    column_or_1d,
)

from .exceptions import MappingError
from .utils import (
    MISSING_LABEL,
    is_labeled,
    is_unlabeled,
    unlabeled_indices,
    ExtLabelEncoder,
    rand_argmin,
    check_classifier_params,
    check_random_state,
    check_cost_matrix,
    check_scalar,
    check_class_prior,
    check_missing_label,
    check_indices,
    check_n_features,
    check_type,
)

# Public API of :mod:`skactiveml.base`: abstract base classes for query
# strategies (pool- and stream-based), budget managers, and estimators.
__all__ = [
    "QueryStrategy",
    "PoolQueryStrategy",
    "SingleAnnotatorPoolQueryStrategy",
    "MultiAnnotatorPoolQueryStrategy",
    "BudgetManager",
    "SingleAnnotatorStreamQueryStrategy",
    "SkactivemlClassifier",
    "ClassFrequencyEstimator",
    "SkactivemlRegressor",
    "ProbabilisticRegressor",
]

# Optional dependency guard: skorch/torch-backed helpers are only usable if
# `skorch` (and its torch dependency) is installed. The flag lets the rest of
# the package check availability without re-attempting the import.
successful_skorch_torch_import = False
try:
    from collections.abc import Sequence
    from skorch import NeuralNet
    from skorch.utils import to_numpy
    from .utils import _check_forward_outputs

    successful_skorch_torch_import = True
except ImportError:  # pragma: no cover
    # skorch/torch not installed; related features stay disabled.
    pass


class QueryStrategy(ABC, BaseEstimator):
    """Base class for all query strategies in scikit-activeml.

    Parameters
    ----------
    random_state : int or RandomState instance, optional (default=None)
        Controls the randomness of the estimator.
    """

    def __init__(self, random_state=None):
        # Stored unvalidated (sklearn convention); validation happens in
        # `_validate_data` of the concrete subclasses.
        self.random_state = random_state

    @abstractmethod
    def query(self, *args, **kwargs):
        """Determines the query for active learning based on input arguments.

        Concrete subclasses define the signature and the selection logic.
        """
        raise NotImplementedError
class PoolQueryStrategy(QueryStrategy):
    """Base class for all pool-based active learning query strategies in
    scikit-activeml.

    Parameters
    ----------
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or RandomState instance or None, default=None
        Controls the randomness of the estimator.
    """

    def __init__(self, missing_label=MISSING_LABEL, random_state=None):
        super().__init__(random_state=random_state)
        self.missing_label = missing_label

    def _validate_data(
        self,
        X,
        y,
        candidates,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate input data, all attributes and set or check the
        `n_features_in_` attribute.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e. including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples, ...)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        candidates : None or array-like of shape (n_candidates), dtype=int or \
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from `(X,y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, ...)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
        batch_size : int
            The number of samples to be selected in one AL cycle.
        return_utilities : bool
            If true, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute. If False, the
            input will be checked for consistency with data provided when
            reset was last True.
        check_X_dict : dict, default=None
            Parameters passed to :func:`sklearn.utils.check_array` when
            checking `X` (and, deep-copied, `candidates`).

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, ...)
            Checked labels of the training data set.
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or\
                np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        batch_size : int
            Checked number of samples to be selected in one AL cycle.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        # Check samples.
        if check_X_dict is None:
            check_X_dict = {"allow_nd": True}
        X = check_array(X, **check_X_dict)

        # Check number of features.
        check_n_features(self, X, reset=reset)

        # Check labels: NaN allowed because it commonly encodes missing labels.
        y = check_array(
            y, ensure_2d=False, ensure_all_finite="allow-nan", dtype=None
        )
        check_consistent_length(X, y)

        # Check missing_label against the label dtype.
        check_missing_label(self.missing_label, target_type=y.dtype)
        self.missing_label_ = self.missing_label

        # Check candidates (+1 to avoid zero multiplier).
        seed_mult = int(np.sum(is_unlabeled(y, self.missing_label_))) + 1
        if candidates is not None:
            candidates = np.array(candidates)
            if candidates.ndim == 1:
                # 1d candidates are indices into (X, y).
                candidates = check_indices(candidates, y, dim=0)
            else:
                # 2d candidates are samples; validated like X but may be 1d-2d.
                check_candidates_dict = deepcopy(check_X_dict)
                check_candidates_dict["ensure_2d"] = False
                candidates = check_array(candidates, **check_candidates_dict)
                check_n_features(self, candidates, reset=False)

        # Check return_utilities.
        check_scalar(return_utilities, "return_utilities", bool)

        # Check batch size.
        check_scalar(batch_size, target_type=int, name="batch_size", min_val=1)

        # Check random state; seed_mult makes successive calls with shrinking
        # unlabeled pools produce different random streams.
        self.random_state_ = check_random_state(self.random_state, seed_mult)

        return X, y, candidates, batch_size, return_utilities
class SingleAnnotatorPoolQueryStrategy(PoolQueryStrategy):
    """Base class for all pool-based active learning query strategies with a
    single annotator in scikit-activeml.
    """

    @abstractmethod
    def query(
        self,
        X,
        y,
        *args,
        candidates=None,
        batch_size=1,
        return_utilities=False,
        **kwargs,
    ):
        """Determines for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e. including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        candidates : None or array-like of shape (n_candidates), dtype=int or \
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from `(X,y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, ...)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If true, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size,)
            The query indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or \
                numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of samples after each selected sample of the batch,
            e.g., `utilities[0]` indicates the utilities used for selecting
            the first sample (with index `query_indices[0]`) of the batch.
            Utilities for labeled samples will be set to np.nan.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        """
        raise NotImplementedError

    def _validate_data(
        self,
        X,
        y,
        candidates,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate input data, all attributes and set or check the
        `n_features_in_` attribute.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e. including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        candidates : None or array-like of shape (n_candidates), dtype=int or \
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from `(X,y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, ...)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
        batch_size : int
            The number of samples to be selected in one AL cycle.
        return_utilities : bool
            If true, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute. If False, the
            input will be checked for consistency with data provided when
            reset was last True.
        check_X_dict : dict, default=None
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples,)
            Checked labels of the training data set.
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or
                np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        batch_size : int
            Checked number of samples to be selected in one AL cycle.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        (
            X,
            y,
            candidates,
            batch_size,
            return_utilities,
        ) = super()._validate_data(
            X, y, candidates, batch_size, return_utilities, reset, check_X_dict
        )
        # Single-annotator setting: labels must be a 1d vector.
        y = column_or_1d(y, warn=True)

        if candidates is None:
            n_candidates = int(
                np.sum(is_unlabeled(y, missing_label=self.missing_label_))
            )
        else:
            n_candidates = len(candidates)

        # Clamp batch_size to the number of available candidates.
        if n_candidates < batch_size:
            warnings.warn(
                f"'batch_size={batch_size}' is larger than number of "
                f"candidates. Instead, 'batch_size={n_candidates}' was set."
            )
            batch_size = n_candidates

        return X, y, candidates, batch_size, return_utilities

    def _transform_candidates(
        self,
        candidates,
        X,
        y,
        enforce_mapping=False,
        allow_only_unlabeled=False,
    ):
        """Transforms the `candidates` parameter into a sample array and the
        corresponding index array `mapping` such that
        `candidates = X[mapping]`.

        Parameters
        ----------
        candidates : None or array-like of shape (n_candidates), dtype=int or \
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from `(X,y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, ...)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`).
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples,)
            Checked labels of the training data set.
        enforce_mapping : bool, default=False
            If True, an exception is raised when no exact mapping can be
            determined (i.e., `mapping` is None).
        allow_only_unlabeled : bool, default=False
            If True, an exception is raised when indices of candidates
            contain labeled samples.

        Returns
        -------
        candidates : np.ndarray of shape (n_candidates, n_features)
            Candidate samples from which the strategy can query the label.
        mapping : np.ndarray of shape (n_candidates) or None
            Index array that maps `candidates` to `X`
            (`candidates = X[mapping]`).
        """
        if candidates is None:
            # Default: all unlabeled samples are candidates.
            ulbd_idx = unlabeled_indices(y, self.missing_label_)
            return X[ulbd_idx], ulbd_idx
        elif candidates.ndim == 1:
            # Index array into X.
            if allow_only_unlabeled:
                if is_labeled(y[candidates], self.missing_label_).any():
                    raise ValueError(
                        "Candidates must not contain labeled " "samples."
                    )
            return X[candidates], candidates
        else:
            # Explicit sample array: no mapping to X exists.
            if enforce_mapping:
                raise MappingError(
                    "Mapping `candidates` to `X` is not "
                    "possible but `enforce_mapping` is True. "
                    "Use index array for `candidates` instead."
                )
            else:
                return candidates, None
class MultiAnnotatorPoolQueryStrategy(PoolQueryStrategy):
    """Base class for all pool-based active learning query strategies with
    multiple annotators in scikit-activeml.
    """

    @abstractmethod
    def query(
        self,
        X,
        y,
        *args,
        candidates=None,
        annotators=None,
        batch_size=1,
        return_utilities=False,
        **kwargs,
    ):
        """Determines which candidate sample is to be annotated by which
        annotator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples, n_annotators)
            Labels of the training data set for each annotator (possibly
            including unlabeled ones indicated by `self.missing_label`),
            meaning that `y[i, j]` contains the label annotated by annotator
            `j` for sample `i`.
        candidates : None or array-like of shape (n_candidates), dtype=int or\
                array-like of shape (n_candidates, n_features), default=None
            See parameter `annotators`.
        annotators : None or array-like of shape (n_avl_annotators), dtype=int\
                or array-like of shape (n_candidates, n_annotators),\
                default=None
            - If candidate samples and annotators are not specified, i.e.,
              `candidates=None`, `annotators=None`, the unlabeled target
              values, `y`, are the candidate annotator-sample-pairs.
            - If candidate samples and available annotators are specified:
              the annotator-sample-pairs, for which the sample is a candidate
              sample and the annotator is an available annotator, are
              considered as candidate annotator-sample-pairs.
            - If `candidates` is None, all samples of `X` are considered as
              candidate samples. In this case `n_candidates` equals `len(X)`.
            - If `candidates` is of shape `(n_candidates,)` and of type int,
              `candidates` is considered as the indices of the sample
              candidates in `(X, y)`.
            - If `candidates` is of shape (n_candidates, n_features), the
              sample candidates are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
            - If `annotators` is `None`, all annotators are considered as
              available annotators.
            - If `annotators` is of shape (n_avl_annotators), and of type
              int, `annotators` is considered as the indices of the
              available annotators.
            - If `annotators` is a boolean array of shape `(n_candidates,
              n_annotators)` the annotator-sample-pairs, for which the
              sample is a candidate sample and the boolean matrix has entry
              `True`, are considered as candidate annotator-sample-pairs.
        batch_size : int or str, default=1
            The number of annotator-sample pairs to be selected in one AL
            cycle. If `adaptive=True`, `batch_size='adaptive'` is allowed.
        return_utilities : bool, default=False
            If True, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : np.ndarray of shape (batch_size, 2)
            The `query_indices` indicate which candidate sample pairs are to
            be queried, i.e., which candidate sample is to be annotated by
            which annotator, e.g., `query_indices[:, 0]` indicates the
            selected candidate samples and `query_indices[:, 1]` indicates
            the respectively selected annotators.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples,\
                n_annotators) or numpy.ndarray of shape (batch_size,\
                n_candidates, n_annotators)
            The utilities of all candidate samples w.r.t. the available
            annotators after each selected sample of the batch, e.g.,
            `utilities[0, :, j]` indicates the utilities used for selecting
            the first sample-annotator-pair (with indices
            `query_indices[0]`).

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to samples in `candidates`.
        """
        raise NotImplementedError

    def _validate_data(
        self,
        X,
        y,
        candidates,
        annotators,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate input data, all attributes and set or check the
        `n_features_in_` attribute.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples, n_annotators)
            Labels of the training data set for each annotator (possibly
            including unlabeled ones indicated by `self.missing_label`),
            meaning that `y[i, j]` contains the label annotated by annotator
            `j` for sample `i`.
        candidates : None or array-like of shape (n_candidates), dtype=int or\
                array-like of shape (n_candidates, n_features)
            See parameter `annotators`.
        annotators : None or array-like of shape (n_avl_annotators), dtype=int\
                or array-like of shape (n_candidates, n_annotators)
            See the `query` method for the detailed semantics of the
            `candidates`/`annotators` combinations.
        batch_size : int or string
            The number of annotator-sample pairs to be selected in one AL
            cycle. If `adaptive=True`, `batch_size='adaptive'` is allowed.
        return_utilities : bool
            If true, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute. If False, the
            input will be checked for consistency with data provided when
            reset was last True.
        check_X_dict : dict, default=None
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, n_annotators)
            Checked labels of the training data set.
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or\
                np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        annotators : None or np.ndarray of shape (n_avl_annotators), dtype=int\
                or np.ndarray of shape (n_candidates, n_annotators)
            Checked annotator boolean array.
        batch_size : int
            Checked number of samples to be selected in one AL cycle.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        (
            X,
            y,
            candidates,
            batch_size,
            return_utilities,
        ) = super()._validate_data(
            X, y, candidates, batch_size, return_utilities, reset, check_X_dict
        )

        # Multi-annotator setting: labels must be 2d (samples x annotators).
        check_array(y, ensure_2d=True, ensure_all_finite="allow-nan")
        unlabeled_pairs = is_unlabeled(y, missing_label=self.missing_label_)

        if annotators is not None:
            annotators = check_array(
                annotators, ensure_2d=False, allow_nd=True
            )
            if annotators.ndim == 1:
                # Indices of available annotators.
                annotators = check_indices(annotators, y, dim=1)
            elif annotators.ndim == 2:
                # Boolean availability matrix; rows must match candidates
                # (or X if candidates is None), columns must match y.
                annotators = check_array(annotators, dtype=bool)
                if candidates is None:
                    check_consistent_length(X, annotators)
                else:
                    check_consistent_length(candidates, annotators)
                check_consistent_length(y.T, annotators.T)
            else:
                raise ValueError(
                    "`annotators` must be either None, 1d or 2d array-like."
                )

        # Count selectable annotator-sample pairs to clamp batch_size.
        if annotators is None:
            if candidates is None:
                n_candidate_pairs = int(np.sum(unlabeled_pairs))
            else:
                n_candidate_pairs = len(candidates) * len(y.T)
        elif annotators.ndim == 1:
            if candidates is None:
                n_candidate_pairs = len(X) * len(annotators)
            else:
                n_candidate_pairs = len(candidates) * len(annotators)
        else:
            n_candidate_pairs = int(np.sum(annotators))

        if n_candidate_pairs < batch_size:
            warnings.warn(
                f"'batch_size={batch_size}' is larger than number of "
                f"candidates pairs. Instead, 'batch_size={n_candidate_pairs}'"
                f" was set."
            )
            batch_size = n_candidate_pairs

        return X, y, candidates, annotators, batch_size, return_utilities

    def _transform_cand_annot(
        self, candidates, annotators, X, y, enforce_mapping=False
    ):
        """Transforms the `candidates` parameter into a sample array and the
        corresponding index array `mapping` such that
        `candidates = X[mapping]`, and transforms `annotators` into a boolean
        array such that `A_cand` represents the available annotator-sample
        pairs for the samples of candidates.

        Parameters
        ----------
        candidates : None or array-like of shape (n_candidates), dtype=int or\
                array-like of shape (n_candidates, n_features)
            See parameter `annotators`.
        annotators : None or array-like of shape (n_avl_annotators), dtype=int\
                or array-like of shape (n_candidates, n_annotators)
            See the `query` method for the detailed semantics of the
            `candidates`/`annotators` combinations.
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, n_annotators)
            Checked labels of the training data set.
        enforce_mapping : bool, default=False
            If `True`, an exception is raised when no exact mapping can be
            determined (i.e., `mapping` is `None`).

        Returns
        -------
        candidates : np.ndarray of shape (n_selectable_candidates, n_features)
            Candidate samples from which the strategy can query the label.
        mapping : np.ndarray of shape (n_selectable_candidates) or None
            Index array that maps `candidates` to `X`
            (`candidates = X[mapping]`).
        A_cand : np.ndarray of shape (n_selectable_candidates, n_annotators)
            Available annotator-sample-pairs with respect to `candidates`.
        """
        unlbd_pairs = is_unlabeled(y, self.missing_label_)
        unlbd_sample_indices = np.argwhere(
            np.any(unlbd_pairs, axis=1)
        ).flatten()
        n_annotators = y.shape[1]

        # Case 1: candidates given as explicit sample array -> no mapping.
        if candidates is not None and candidates.ndim == 2:
            n_candidates = len(candidates)
            if annotators is None:
                A_cand = np.full((n_candidates, n_annotators), True)
            elif annotators.ndim == 1:
                A_cand = np.full((n_candidates, n_annotators), False)
                A_cand[:, annotators] = True
            else:
                A_cand = annotators

            if enforce_mapping:
                # FIX: message previously read "not posssiblebut ... Use
                # indexarray ..." due to a typo and missing spaces in the
                # implicitly concatenated string literals.
                raise ValueError(
                    "Mapping `candidates` to `X` is not possible "
                    "but `enforce_mapping` is True. Use index "
                    "array for `candidates` instead."
                )
            else:
                return candidates, None, A_cand

        # Case 2: mapping to X exists.
        if candidates is None:
            if annotators is None:
                candidates = unlbd_sample_indices
                A_cand = unlbd_pairs[unlbd_sample_indices]
            elif annotators.ndim == 1:
                candidates = np.arange(len(X), dtype=int)
                # NOTE(review): np.full_like(y, False) inherits y's dtype
                # (often float), so A_cand is not boolean here, unlike the
                # other branches — confirm whether dtype=bool is intended.
                A_cand = np.full_like(y, False)
                A_cand[:, annotators] = True
            else:
                candidates = np.arange(len(X), dtype=int)
                A_cand = annotators
        else:  # candidates is an index array into X
            if annotators is None:
                A_cand = np.full((len(candidates), y.shape[1]), True)
            elif annotators.ndim == 1:
                A_cand = np.full((len(candidates), y.shape[1]), False)
                A_cand[:, annotators] = True
            else:
                A_cand = annotators

        return X[candidates], candidates, A_cand
class BudgetManager(ABC, BaseEstimator):
    """Base class for all budget managers for stream-based active learning
    to model budgeting constraints.

    Parameters
    ----------
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled, with
        `0 <= budget <= 1`. If `budget` is `None`, it is replaced with the
        default budget 0.1.
    """

    def __init__(self, budget=None):
        self.budget = budget

    @abstractmethod
    def query_by_utility(self, utilities, *args, **kwargs):
        """Ask the budget manager which `utilities` are sufficient to query
        the corresponding labels.

        Parameters
        ----------
        utilities : array-like of shape (n_samples,)
            The utilities provided by the stream-based active learning
            strategy, which are used to determine whether querying a sample
            is worth it given the budgeting constraint.

        Returns
        -------
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices <= n_candidates`.
        """
        raise NotImplementedError

    @abstractmethod
    def update(self, candidates, queried_indices, *args, **kwargs):
        """Updates the budget manager.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices <= n_candidates`.

        Returns
        -------
        self : BudgetManager
            The budget manager returns itself, after it is updated.
        """
        raise NotImplementedError

    def _validate_budget(self):
        """Check the assigned `budget` and set the default value 0.1 if
        `budget` is set to `None`.
        """
        if self.budget is not None:
            self.budget_ = self.budget
        else:
            self.budget_ = 0.1
        # budget must be a float in (0, 1].
        check_scalar(
            self.budget_,
            "budget",
            float,
            min_val=0.0,
            max_val=1.0,
            min_inclusive=False,
        )

    def _validate_data(self, utilities, *args, **kwargs):
        """Validate input data.

        Parameters
        ----------
        utilities : array-like of shape (n_samples,)
            The `utilities` provided by the stream-based active learning
            strategy.

        Returns
        -------
        utilities : ndarray of shape (n_samples,)
            Checked `utilities`.
        """
        # Check if utilities is set.
        if not isinstance(utilities, np.ndarray):
            raise TypeError(
                "{} is not a valid type for utilities".format(type(utilities))
            )
        # Check budget.
        self._validate_budget()
        return utilities
[docs] class SingleAnnotatorStreamQueryStrategy(QueryStrategy): """Base class for all stream-based active learning query strategies. Parameters ---------- budget : float Specifies the ratio of labels which are allowed to be queried, with `0 <= budget <= 1`. random_state : int or RandomState instance or None, default=None Controls the randomness of the estimator. """ def __init__(self, budget, random_state=None): super().__init__(random_state=random_state) self.budget = budget
[docs] @abstractmethod def query(self, candidates, *args, return_utilities=False, **kwargs): """Determines for which candidate samples labels are to be queried. The query startegy determines the most useful samples in candidates, which can be acquired within the budgeting constraint specified by `budget`. Please note that, this method does not change the internal state of the query strategy. To adapt the query strategy to the selected candidates, use `update(...)`. Parameters ---------- candidates : {array-like, sparse matrix} of shape\ (n_candidates, n_features) The samples which may be queried. Sparse matrices are accepted only if they are supported by the base query strategy. return_utilities : bool, default=False If `True`, also return the utilities based on the query strategy. Returns ------- queried_indices : np.ndarray of shape (n_queried_indices,) The indices of samples in candidates whose labels are queried, with `0 <= queried_indices <= n_candidates`. utilities: np.ndarray of shape (n_candidates,), The utilities based on the query strategy. Only provided if `return_utilities` is `True`. """ raise NotImplementedError
[docs] @abstractmethod def update( self, candidates, queried_indices, *args, budget_manager_param_dict=None, **kwargs, ): """Updates the budget manager and the count for seen and queried labels. This function should be used in conjunction with the `query` function. Parameters ---------- candidates : {array-like, sparse matrix} of shape\ (n_candidates, n_features) The samples which may be queried. Sparse matrices are accepted only if they are supported by the base query strategy. queried_indices : np.ndarray of shape (n_queried_indices,) The indices of samples in candidates whose labels are queried, with `0 <= queried_indices <= n_candidates`. budget_manager_param_dict : dict, default=None Optional kwargs for budget_manager. Returns ------- self : SingleAnnotatorStreamQueryStrategy The query strategy returns itself, after it is updated. """ raise NotImplementedError
    def _validate_random_state(self):
        """Create `random_state_` as a copy of `random_state`.

        The copy is created only once, so repeated calls keep advancing the
        same random state across queries. See also
        :func:`~sklearn.utils.check_random_state`.
        """
        if not hasattr(self, "random_state_"):
            # Deep copy so the user-supplied random state is not mutated.
            self.random_state_ = deepcopy(self.random_state)
        self.random_state_ = check_random_state(self.random_state_)

    def _validate_budget(self):
        """Create and check `budget_` as a copy of `budget`.

        If `budget` is `None`, `budget_` defaults to 0.1. `budget_` must be
        a float in the half-open interval (0, 1].
        """
        if self.budget is not None:
            self.budget_ = self.budget
        else:
            self.budget_ = 0.1
        check_scalar(
            self.budget_,
            "budget",
            float,
            min_val=0.0,
            max_val=1.0,
            min_inclusive=False,
        )

    def _validate_data(
        self,
        candidates,
        return_utilities,
        *args,
        reset=True,
        **check_candidates_params,
    ):
        """Validate input data and set or check `n_features_in_`.

        Parameters
        ----------
        candidates : array-like of shape (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        return_utilities : bool
            If `True`, also return the utilities based on the query
            strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute. If `False`,
            the input will be checked for consistency with data provided
            when reset was last `True`.
        **check_candidates_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        candidates : np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        # Check candidate samples.
        candidates = check_array(candidates, **check_candidates_params)

        # Check number of features.
        check_n_features(self, candidates, reset=reset)

        # Check return_utilities.
        check_scalar(return_utilities, "return_utilities", bool)

        # Check random state.
        self._validate_random_state()

        # Check budget manager.
        self._validate_budget()

        return candidates, return_utilities
class SkactivemlClassifier(ClassifierMixin, BaseEstimator, ABC):
    """Skactiveml Classifier

    Base class for `scikit-activeml` classifiers such that missing labels,
    user-defined classes, and cost-sensitive classification (i.e., cost
    matrix) can be handled.

    Parameters
    ----------
    classes : array-like of shape (n_classes,), default=None
        Holds the label for each class. If `None`, the classes are
        determined during the fit.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` indicating cost of predicting
        class `classes[j]` for a sample of class `classes[i]`. Can be only
        set, if `classes` is not `None`.
    random_state : int or RandomState instance or None, default=None
        Determines random number for `predict` method. Pass an int for
        reproducible results across multiple method calls.

    Attributes
    ----------
    classes_ : array-like of shape (n_classes,)
        Holds the label for each class after fitting.
    cost_matrix_ : array-like of shape (n_classes, n_classes)
        Cost matrix after fitting with `cost_matrix_[i,j]` indicating cost
        of predicting class `classes_[j]` for a sample of class
        `classes_[i]`.
    """

    def __init__(
        self,
        classes=None,
        missing_label=MISSING_LABEL,
        cost_matrix=None,
        random_state=None,
    ):
        # Following the sklearn convention, parameters are stored
        # unvalidated; validation happens in `_validate_data` during `fit`.
        self.classes = classes
        self.missing_label = missing_label
        self.cost_matrix = cost_matrix
        self.random_state = random_state
    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """Fit the model using `X` as training data and `y` as class labels.

        Parameters
        ----------
        X : matrix-like of shape (n_samples, n_features)
            The sample matrix `X` is the feature matrix representing the
            samples.
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            It contains the class labels of the training samples. Missing
            labels are represented by the attribute `missing_label`.
        sample_weight : array-like of shape (n_samples,) or\
                (n_samples, n_outputs), default=None
            It contains the weights of the training samples' class labels.
            It must have the same shape as `y`.

        Returns
        -------
        self : skactiveml.base.SkactivemlClassifier
            The `skactiveml.base.SkactivemlClassifier` object fitted on the
            training data.
        """
        raise NotImplementedError
    def predict_proba(self, X, **kwargs):
        """Return probability estimates for the test data `X`.

        NOTE: this base implementation is deliberately unimplemented;
        subclasses that support probability estimates override it.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        Returns
        -------
        P : numpy.ndarray of shape (n_samples, n_classes)
            The class probabilities of the test samples. Classes are
            ordered according to `self.classes_`.
        """
        raise NotImplementedError
[docs] def predict(self, X, **kwargs): """Return class label predictions for the test samples `X`. Parameters ---------- X : array-like of shape (n_samples, n_features) Input samples. Returns ------- y : numpy.ndarray of shape (n_samples,) Predicted class labels of the test samples `X`. """ out = self.predict_proba(X, **kwargs) P = out[0] if isinstance(out, tuple) else out costs = np.dot(P, self.cost_matrix_) y_pred = rand_argmin(costs, random_state=self.random_state_, axis=1) y_pred = self._le.inverse_transform(y_pred) y_pred = np.asarray(y_pred, dtype=self.classes_.dtype) if isinstance(out, tuple): return (y_pred,) + out[1:] else: return y_pred
[docs] def score(self, X, y, sample_weight=None): """Return the mean accuracy on the given test data and labels. Parameters ---------- X : array-like of shape (n_samples, n_features) Test samples. y : array-like of shape (n_samples,) True labels for `X`. sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns ------- score : float Mean accuracy of `self.predict(X)` regarding `y`. """ y_pred = self.predict(X) y_pred = self._le.transform(y_pred) y = self._le.transform(y) return accuracy_score(y, y_pred, sample_weight=sample_weight)
def _validate_data( self, X, y, sample_weight=None, check_X_dict=None, check_y_dict=None, y_ensure_1d=True, reset=True, ): if check_X_dict is None: check_X_dict = {"ensure_min_samples": 0, "ensure_min_features": 0} if check_y_dict is None: check_y_dict = { "ensure_min_samples": 0, "ensure_min_features": 0, "ensure_2d": False, "ensure_all_finite": False, "dtype": None, } # Check common classifier parameters. check_classifier_params( self.classes, self.missing_label, self.cost_matrix ) # Store and check random state. self.random_state_ = check_random_state(self.random_state) # Create label encoder. self._le = ExtLabelEncoder( classes=self.classes, missing_label=self.missing_label ) # Check input parameters. y = check_array(y, **check_y_dict) error_msg = ( "No class label is known because 'y' contains no actual " "class labels and 'classes' is not defined. Change at " "least on of both to overcome this error." ) if len(y) > 0: y = column_or_1d(y) if y_ensure_1d else y y = self._le.fit_transform(y) is_lbdl = is_labeled(y, missing_label=-1) if len(y[is_lbdl]) > 0: check_classification_targets(y[is_lbdl]) if len(self._le.classes_) == 0: raise ValueError(error_msg) else: if self.classes is None: raise ValueError(error_msg) self._le.fit(self.classes) check_X_dict["ensure_2d"] = False X = check_array(X, **check_X_dict) check_consistent_length(X, y) check_n_features(self, X, reset=reset) # Update detected classes. self.classes_ = self._le.classes_ # Check classes. if sample_weight is not None: sample_weight = check_array(sample_weight, **check_y_dict) if not np.array_equal(y.shape, sample_weight.shape): raise ValueError( f"`y` has the shape {y.shape} and `sample_weight` has the " f"shape {sample_weight.shape}. Both need to have " f"identical shapes." ) # Update cost matrix. 
self.cost_matrix_ = ( 1 - np.eye(len(self.classes_)) if self.cost_matrix is None else self.cost_matrix ) self.cost_matrix_ = check_cost_matrix( self.cost_matrix_, len(self.classes_) ) if self.classes is not None: class_indices = np.argsort(self.classes) self.cost_matrix_ = self.cost_matrix_[class_indices] self.cost_matrix_ = self.cost_matrix_[:, class_indices] return X, y, sample_weight
class ClassFrequencyEstimator(SkactivemlClassifier):
    """Class Frequency Estimator

    Extends `scikit-activeml` classifiers to estimators that are able to
    estimate class frequencies for given samples (by calling
    `predict_freq`).

    Parameters
    ----------
    classes : array-like of shape (n_classes,), default=None
        Holds the label for each class. If `None`, the classes are
        determined during the fit.
    missing_label : scalar or str or np.nan or None, default=np.nan
        Value to represent a missing label.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` indicating cost of predicting
        class `classes[j]` for a sample of class `classes[i]`. Can be only
        set, if `classes` is not `None`.
    class_prior : float or array-like of shape (n_classes,), default=0
        Prior observations of the class frequency estimates. If
        `class_prior` is an array, the entry `class_prior[i]` indicates the
        non-negative prior number of samples belonging to class
        `classes_[i]`. If `class_prior` is a float, `class_prior` indicates
        the non-negative prior number of samples per class.
    random_state : int or np.RandomState or None, default=None
        Determines random number for `predict` method. Pass an int for
        reproducible results across multiple method calls.

    Attributes
    ----------
    classes_ : np.ndarray of shape (n_classes,)
        Holds the label for each class after fitting.
    class_prior_ : np.ndarray of shape (n_classes,)
        Prior observations of the class frequency estimates. The entry
        `class_prior_[i]` indicates the non-negative prior number of
        samples belonging to class `classes_[i]`.
    cost_matrix_ : np.ndarray of shape (n_classes, n_classes)
        Cost matrix with `cost_matrix_[i,j]` indicating cost of predicting
        class `classes_[j]` for a sample of class `classes_[i]`.
    """

    def __init__(
        self,
        class_prior=0,
        classes=None,
        missing_label=MISSING_LABEL,
        cost_matrix=None,
        random_state=None,
    ):
        super().__init__(
            classes=classes,
            missing_label=missing_label,
            cost_matrix=cost_matrix,
            random_state=random_state,
        )
        # Validated into `class_prior_` during `_validate_data`.
        self.class_prior = class_prior
    @abstractmethod
    def predict_freq(self, X, **kwargs):
        """Return class frequency estimates for the test samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples whose class frequencies are to be estimated.

        Returns
        -------
        F : array-like of shape (n_samples, n_classes)
            The class frequency estimates of the test samples `X`. Classes
            are ordered according to the attribute `classes_`.
        """
        raise NotImplementedError
[docs] def predict_proba(self, X, **kwargs): """Return probability estimates for the test data `X`. Parameters ---------- X : array-like of shape (n_samples, n_features) Input samples. Returns ------- P : array-like of shape (n_samples, classes) The class probabilities of the test samples. Classes are ordered according to `self.classes_`. """ # Normalize probabilities of each sample. P = self.predict_freq(X) + self.class_prior_ normalizer = np.sum(P, axis=1) P[normalizer > 0] /= normalizer[normalizer > 0, np.newaxis] P[normalizer == 0, :] = [1 / len(self.classes_)] * len(self.classes_) return P
    def sample_proba(self, X, n_samples=10, random_state=None):
        """Sample probability vectors from Dirichlet distributions.

        The Dirichlet parameters `alphas` are defined as the sum of the
        frequency estimates returned by `predict_freq` and the
        `class_prior`.

        Parameters
        ----------
        X : array-like of shape (n_test_samples, n_features)
            Test samples for which `n_samples` probability vectors are to
            be sampled.
        n_samples : int, default=10
            Number of probability vectors to sample for each `X[i]`.
        random_state : int or numpy.random.RandomState or None, default=None
            Ensure reproducibility when sampling probability vectors from
            the Dirichlet distributions.

        Returns
        -------
        P : array-like of shape (n_samples, n_test_samples, n_classes)
            There are `n_samples` class probability vectors for each test
            sample in `X`. Classes are ordered according to
            `self.classes_`.

        Raises
        ------
        ValueError
            If any Dirichlet parameter is zero (requires
            `class_prior > 0`).
        """
        random_state = check_random_state(random_state)
        # Dirichlet parameters: frequency estimates plus prior counts.
        alphas = self.predict_freq(X) + self.class_prior_
        # Repeat each row `n_samples` times so that one Gamma draw per
        # repeated row yields one probability vector.
        alphas = alphas.repeat(repeats=n_samples, axis=0)
        if (alphas == 0).any():
            raise ValueError(
                "There are zero frequency observations. "
                "Set `class_prior > 0` to avoid this error."
            )
        # Sample Dirichlet via normalized independent Gamma draws.
        R = random_state.standard_gamma(alphas)
        R_sums = R.sum(axis=-1)
        # Guard against all-zero Gamma rows (possible underflow for very
        # small alphas): assign full mass to one uniformly drawn class.
        is_zero = (R_sums == 0.0).ravel()
        sampled_class_indices = random_state.choice(
            np.array(R.shape[-1]), size=is_zero.sum()
        )
        R[is_zero, sampled_class_indices] = 1.0
        # Normalize Gamma draws to probability vectors.
        P = R / R.sum(axis=-1, keepdims=True)
        # Reshape to (n_samples, n_test_samples, n_classes); order="F"
        # undoes the row-wise repeat above.
        P = P.reshape(n_samples, len(X), P.shape[-1], order="F")
        return P
def _validate_data( self, X, y, sample_weight=None, check_X_dict=None, check_y_dict=None, y_ensure_1d=True, reset=True, ): X, y, sample_weight = super()._validate_data( X=X, y=y, sample_weight=sample_weight, check_X_dict=check_X_dict, check_y_dict=check_y_dict, y_ensure_1d=y_ensure_1d, reset=reset, ) # Check class prior. self.class_prior_ = check_class_prior( self.class_prior, len(self.classes_) ) return X, y, sample_weight
class SkactivemlRegressor(RegressorMixin, BaseEstimator, ABC):
    """Skactiveml Regressor

    Base class for `scikit-activeml` regressors.

    Parameters
    ----------
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or RandomState or None, default=None
        Determines random number for `fit` and `predict` method. Pass an
        int for reproducible results across multiple method calls.
    """

    def __init__(self, missing_label=MISSING_LABEL, random_state=None):
        # Parameters are stored unvalidated (sklearn convention);
        # validation happens in `_validate_data` during `fit`.
        self.missing_label = missing_label
        self.random_state = random_state
    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """Fit the model using `X` as training data and `y` as numerical
        labels.

        Parameters
        ----------
        X : matrix-like of shape (n_samples, n_features)
            The sample matrix `X` is the feature matrix representing the
            samples.
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            It contains the labels of the training samples. Missing labels
            are represented as `missing_label_`.
        sample_weight : array-like of shape (n_samples,), default=None
            It contains the weights of the training samples' values.

        Returns
        -------
        self : skactiveml.base.SkactivemlRegressor
            The `skactiveml.base.SkactivemlRegressor` object fitted on the
            training data.
        """
        raise NotImplementedError
    @abstractmethod
    def predict(self, X):
        """Return value predictions for the test samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        y : numpy.ndarray of shape (n_samples,)
            Predicted values of the test samples `X`.
        """
        raise NotImplementedError
    def _validate_data(
        self,
        X,
        y,
        sample_weight=None,
        check_X_dict=None,
        check_y_dict=None,
        y_ensure_1d=True,
        reset=True,
    ):
        """Validate regression training data.

        Checks `missing_label`, the random state, and the consistency of
        `X`, `y`, and `sample_weight`, setting the fitted attributes
        `missing_label_` and `random_state_` and the `n_features_in_`
        bookkeeping.

        Parameters
        ----------
        X : matrix-like of shape (n_samples, n_features)
            Feature matrix of the training samples.
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Numerical labels; may contain missing labels.
        sample_weight : array-like with same shape as `y`, default=None
            Weights of the training samples' values.
        check_X_dict : dict, default=None
            Parameters passed to `sklearn.utils.check_array` for `X`.
        check_y_dict : dict, default=None
            Parameters passed to `sklearn.utils.check_array` for `y` and
            `sample_weight`.
        y_ensure_1d : bool, default=True
            Whether a non-empty `y` is flattened to one dimension.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.

        Returns
        -------
        X : np.ndarray
            Checked feature matrix.
        y : np.ndarray
            Checked targets.
        sample_weight : np.ndarray or None
            Checked sample weights.
        """
        if check_X_dict is None:
            check_X_dict = {"ensure_min_samples": 0, "ensure_min_features": 0}
        if check_y_dict is None:
            check_y_dict = {
                "ensure_min_samples": 0,
                "ensure_min_features": 0,
                "ensure_2d": False,
                "ensure_all_finite": False,
                "dtype": None,
            }

        check_missing_label(self.missing_label)
        self.missing_label_ = self.missing_label

        # Store and check random state.
        self.random_state_ = check_random_state(self.random_state)

        y = check_array(y, **check_y_dict)
        if len(y) > 0:
            y = column_or_1d(y) if y_ensure_1d else y
        else:
            # With no samples, `X` cannot be forced to be 2-dimensional.
            check_X_dict["ensure_2d"] = False

        # Check sample weights against the shape of `y`.
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, **check_y_dict)
            if not np.array_equal(y.shape, sample_weight.shape):
                raise ValueError(
                    f"`y` has the shape {y.shape} and `sample_weight` has the "
                    f"shape {sample_weight.shape}. Both need to have "
                    f"identical shapes."
                )

        X = check_array(X, **check_X_dict)
        check_consistent_length(X, y)
        check_n_features(self, X, reset=reset)

        return X, y, sample_weight
class ProbabilisticRegressor(SkactivemlRegressor):
    """ProbabilisticRegressor

    Base class for `scikit-activeml` probabilistic regressors, i.e.,
    regressors that predict a full target distribution (see
    `predict_target_distribution`) instead of only a point estimate.
    """
    @abstractmethod
    def predict_target_distribution(self, X):
        """Return the predicted target distribution conditioned on the test
        samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        dist : scipy.stats._distn_infrastructure.rv_frozen
            The distribution of the targets at the test samples.
        """
        raise NotImplementedError
[docs] def predict(self, X, return_std=False, return_entropy=False): """Returns the mean, std (optional) and differential entropy (optional) of the predicted target distribution conditioned on the test samples `X`. Parameters ---------- X : array-like of shape (n_samples, n_features) Input samples. return_std : bool, default=False Whether to return the standard deviation. return_entropy : bool, default=False Whether to return the differential entropy. Returns ------- mu : numpy.ndarray, shape (n_samples,) Predicted mean conditioned on `X`. std : numpy.ndarray, shape (n_samples,), optional Predicted standard deviation conditioned on `X`. entropy : numpy.ndarray, optional Predicted differential entropy conditioned on `X`. """ check_scalar(return_std, "return_std", bool) check_scalar(return_entropy, "return_entropy", bool) rv = self.predict_target_distribution(X) result = (rv.mean(),) if return_std: result += (rv.std(),) if return_entropy: result += (rv.entropy(),) if len(result) == 1: result = result[0] return result
[docs] def sample_y(self, X, n_samples=1, random_state=None): """Returns random samples from the predicted target distribution conditioned on the test samples `X`. Parameters ---------- X : array-like of shape (n_samples_X, n_features) Input samples, where the target values are drawn from. n_samples: int, default=1 Number of random samples to be drawn. random_state : int or RandomState instance or None, default=None Determines random number generation to randomly draw samples. Pass an int for reproducible results across multiple method calls. Returns ------- y_samples : numpy.ndarray of shape (n_samples_X, n_samples) Drawn random target samples. """ rv = self.predict_target_distribution(X) rv_samples = rv.rvs( size=(n_samples, len(X)), random_state=random_state ) return rv_samples.T
# Export `SkorchMixin` only when the optional skorch/torch dependencies
# were importable at module load time (see the guarded import above).
if successful_skorch_torch_import:
    __all__ += ["SkorchMixin"]
class SkorchMixin(ABC):
    """Minimal mixin to build and train a `skorch.NeuralNet`.

    Subclasses must implement the abstract methods to provide the module,
    criterion, validation kwargs, and training data. This mixin always
    rebuilds and initializes `self.neural_net_` on `initialize` and fits
    only on training data in `_fit`.
    """
[docs] def initialize(self, X=None, y=None, enforce_check_X_y=False): """ Initialize the wrapper and (optionally) validate inputs. If any data is provided or `enforce_check_X_y` is True, inputs are validated via `_validate_data`. A new `skorch.NeuralNet` is then created and assigned to `self.neural_net_`. Parameters ---------- X : array-like of shape (n_samples, ...), default=None Input samples for optional validation. y : array-like of shape (n_samples, ...), default=None Target values for optional validation. enforce_check_X_y : bool, default=False Whether to validate even if both `X` and `y` are `None`. Returns ------- self : SkorchMixin Returned when no input data was supplied (both `X` and `y` are `None`). X_out, y_out : tuple of nd.array, optional Validated `X` and `y` as a tuple, returned when `enforce_check_X_y=True`. """ has_data = (X is not None) or (y is not None) vd_kwargs = self._validate_data_kwargs() if enforce_check_X_y or has_data: X, y, _ = self._validate_data(X=X, y=y, **vd_kwargs) module, criterion, nn_params = self._net_parts(X=X, y=y) check_type(nn_params, "neural_net_param_dict", dict) nn_params = dict(nn_params) invalid_keys = ["module", "criterion", "predict_nonlinearity"] for k in invalid_keys: if k in nn_params: raise ValueError( f"{k} must not be a key in `neural_net_param_dict`." ) self.neural_net_ = NeuralNet( module=module, criterion=criterion, predict_nonlinearity=None, **nn_params, ).initialize() return (self, X, y) if enforce_check_X_y else self
    def _fit(self, fit_function, X, y, **fit_params):
        """Initialize and fit the internal `skorch` model on training data.

        If the model is uninitialized, or `fit_function == 'fit'` and
        `self.neural_net_.warm_start` is `False`, the network is
        re-initialized.

        Parameters
        ----------
        fit_function : {'fit', 'partial_fit'}
            Name of the caller, used to decide whether to reinitialize when
            warm start is off.
        X : array-like of shape (n_samples, ...)
            Training inputs (may include unlabeled samples).
        y : array-like of shape (n_samples, ...)
            Training targets; unlabeled entries must follow the subclass'
            convention (e.g., `self.missing_label`).
        **fit_params : dict
            Extra keyword arguments forwarded to
            `self.neural_net_.partial_fit`.

        Returns
        -------
        self : SkorchMixin
            The fitted estimator.
        """
        # Re-initialize when no network exists yet or a cold `fit` (no
        # warm start) is requested.
        need_reinit = (not hasattr(self, "neural_net_")) or (
            fit_function == "fit"
            and not getattr(self.neural_net_, "warm_start", False)
        )
        if need_reinit:
            # `initialize` also validates and returns the data.
            _, X, y = self.initialize(X=X, y=y, enforce_check_X_y=True)
        else:
            vd_kwargs = self._validate_data_kwargs()
            X, y, _ = self._validate_data(X=X, y=y, **vd_kwargs)
        X_train, y_train = self._return_training_data(X=X, y=y)
        # Fit only when actual training data exists.
        if X_train is not None and y_train is not None:
            self.neural_net_.partial_fit(X_train, y_train, **fit_params)
        return self

    def _forward_with_named_outputs(
        self,
        X,
        forward_outputs,
        extra_outputs=None,
    ):
        """Run `module.forward(X)` once and return the primary output plus
        optionally requested extra outputs as NumPy arrays.

        The primary output is defined as the first entry of
        `forward_outputs` (after applying its transform, if any), or the
        sole output of `module.forward` if `forward_outputs` is `None`.
        Primary and extra outputs are always returned after applying their
        configured transforms.

        Parameters
        ----------
        X : array-like of shape (n_samples, ...)
            Input samples. It is assumed that `X` has already been
            validated and that `self.neural_net_` is initialized.
        forward_outputs : dict[str, tuple[int, Callable | None]]
            `dict` that describes how to obtain and post-process the
            outputs of `module.forward` for prediction. Given
            `raw_outputs = module.forward(X)`, each entry
            `name -> (idx, transform)` is interpreted as:

            - `idx`: integer index into `raw_outputs` (0-based).
            - `transform`: callable `f(tensor) -> tensor` or `None`. If
              `transform` is not `None`, it is applied to the selected raw
              tensor; otherwise the raw tensor is used.
        extra_outputs : None or str or sequence of str, default=None
            Names of additional outputs to return next to the primary
            output. Must be a subset of `forward_outputs.keys()` if
            `forward_outputs` is not `None`. The first key in
            `forward_outputs` (the primary output) is not allowed here.
            Duplicate entries are not allowed.

        Returns
        -------
        output : numpy.ndarray or tuple of numpy.ndarray
            If `extra_outputs is None`, returns the primary output as a
            single NumPy array. Otherwise, returns a tuple whose first
            element is the primary output and whose remaining elements are
            the requested extra outputs in the order specified by
            `extra_outputs`.
        """
        # Check `forward_outputs` is properly configured.
        _check_forward_outputs(forward_outputs=forward_outputs)

        # Primary output = first configured output
        # (dicts preserve insertion order).
        primary_name = next(iter(forward_outputs))

        # Normalize and validate extra_outputs:
        # - None / str / sequence of str,
        # - subset of forward_outputs.keys(),
        # - no duplicates,
        # - no primary_name.
        extra_names = self._normalize_extra_outputs(
            extra_outputs,
            allowed_names=forward_outputs.keys(),
            primary_name=primary_name,
        )

        # Run module forward once.
        fw_out = self.neural_net_.forward(X)

        # Normalize to tuple of raw outputs.
        if isinstance(fw_out, tuple):
            raw_outputs = fw_out
        else:
            raw_outputs = (fw_out,)

        # Check that all indices are within range of raw_outputs.
        if forward_outputs:
            max_idx = max(idx for idx, _ in forward_outputs.values())
            if max_idx >= len(raw_outputs):
                raise ValueError(
                    f"`forward_outputs` references raw output index "
                    f"{max_idx}, but module.forward returned only "
                    f"{len(raw_outputs)} object(s)."
                )

        # Helper to extract and transform a single named output lazily.
        def _get_named(name: str):
            idx, transform = forward_outputs[name]
            value = raw_outputs[idx]
            if transform is not None:
                value = transform(value)
            return to_numpy(value)

        # Primary output (transform applied here).
        primary_np = _get_named(primary_name)

        # No extra outputs.
        if not extra_names:
            return primary_np

        extras_np = tuple(_get_named(name) for name in extra_names)
        return (primary_np,) + extras_np

    @staticmethod
    def _normalize_extra_outputs(
        extra_outputs, allowed_names, primary_name=None
    ):
        """Validate `extra_outputs` and return a list of names.

        Parameters
        ----------
        extra_outputs : None or str or sequence of str
            User-specified extra outputs.
        allowed_names : Collection[str]
            Set or iterable of allowed names, e.g.,
            `forward_outputs.keys()`.
        primary_name : str or None, default=None
            Name of the primary output which must not be requested as
            extra.

        Returns
        -------
        list[str]
            Validated list of extra output names.

        Raises
        ------
        TypeError
            If `extra_outputs` is not None, a string, or a sequence of
            strings.
        ValueError
            On duplicates, unknown names, or a request for the primary
            output.
        """
        if extra_outputs is None:
            return []
        # Normalize to a list of strings.
        if isinstance(extra_outputs, str):
            names = [extra_outputs]
        elif isinstance(extra_outputs, Sequence) and not isinstance(
            extra_outputs, bytes
        ):
            names = list(extra_outputs)
        else:
            raise TypeError(
                "`extra_outputs` must be None, a string, or a sequence "
                f"of strings, got {type(extra_outputs)}."
            )
        if not all(isinstance(n, str) for n in names):
            raise TypeError(
                "All entries in `extra_outputs` must be strings."
            )
        # No duplicates.
        if len(set(names)) != len(names):
            raise ValueError(
                "`extra_outputs` must not contain duplicate names."
            )
        allowed_names = set(allowed_names)
        unknown = [n for n in names if n not in allowed_names]
        if unknown:
            raise ValueError(
                f"Requested extra output(s) {unknown!r} are not defined; "
                f"allowed names are {sorted(allowed_names)!r}."
            )
        if primary_name is not None and primary_name in names:
            raise ValueError(
                f"Primary output {primary_name!r} (first key in "
                f"`forward_outputs`) cannot be requested again as an "
                f"`extra_output`."
            )
        return names

    @abstractmethod
    def _net_parts(self, X=None, y=None):
        """Assemble and validate network components.

        Implementations should perform any optional checks or
        normalization of constructor/init parameters (e.g., shape
        consistency, dtype checks, wrapping criteria), then return the
        ready-to-use pieces for `skorch.NeuralNet`.

        Parameters
        ----------
        X : array-like of shape (n_samples, ...), default=None
            Input samples for optional validation.
        y : array-like of shape (n_samples, ...), default=None
            Target values for optional validation.

        Returns
        -------
        module : torch.nn.Module.__class__ or torch.nn.Module
            A PyTorch `torch.nn.Module`. In general, the uninstantiated
            class should be passed, although instantiated modules will
            also work.
        criterion : torch.nn.Module.__class__
            The criterion (loss) used to optimize the module.
        params : dict
            Keyword arguments (excluding `predict_nonlinearity`) for
            `skorch.NeuralNet` construction. Must be a mapping and may be
            empty.
        """
        raise NotImplementedError

    @abstractmethod
    def _validate_data_kwargs(self):
        """Return kwargs forwarded to `_validate_data`.

        Returns
        -------
        kwargs : dict or None
            Keyword arguments consumed by `_validate_data`.
        """
        raise NotImplementedError

    @abstractmethod
    def _validate_data(self, X, y, **kwargs):
        """Validate inputs and return cleaned arrays.

        Parameters
        ----------
        X : array-like of shape (n_samples, ...)
            Input samples.
        y : array-like of shape (n_samples, ...)
            Target values.
        **kwargs
            Additional arguments controlling validation.

        Returns
        -------
        X_out : np.ndarray
            Validated `X`.
        y_out : np.ndarray
            Validated `y`.
        sample_weight_or_dummy : Any
            Third return to maintain compatibility with callers expecting
            sample weights.
        """
        raise NotImplementedError

    @abstractmethod
    def _return_training_data(self, X, y):
        """Return only samples and labels required for training.

        Parameters
        ----------
        X : array-like of shape (n_samples, ...)
            Input samples.
        y : array-like of shape (n_samples, ...)
            Targets with unlabeled entries following the subclass'
            convention.

        Returns
        -------
        X_train : np.ndarray or None
            Training samples or `None` if none exist.
        y_train : np.ndarray or None
            Training labels or `None` if none exist.
        """
        raise NotImplementedError