"""
The :mod:`skactiveml.base` package implements the base classes for
:mod:`skactiveml`.
"""
import warnings
from abc import ABC, abstractmethod
from copy import deepcopy
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.metrics import accuracy_score
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import (
check_array,
check_consistent_length,
column_or_1d,
)
from .exceptions import MappingError
from .utils import (
MISSING_LABEL,
is_labeled,
is_unlabeled,
unlabeled_indices,
ExtLabelEncoder,
rand_argmin,
check_classifier_params,
check_random_state,
check_cost_matrix,
check_scalar,
check_class_prior,
check_missing_label,
check_indices,
check_n_features,
)
# `__all__` declares the public API of this module and is required for
# building the sphinx docs. Note that `PoolQueryStrategy` is defined in this
# module but is intentionally not part of this list.
__all__ = [
"QueryStrategy",
"SingleAnnotatorPoolQueryStrategy",
"MultiAnnotatorPoolQueryStrategy",
"BudgetManager",
"SingleAnnotatorStreamQueryStrategy",
"SkactivemlClassifier",
"ClassFrequencyEstimator",
"AnnotatorModelMixin",
"SkactivemlRegressor",
"ProbabilisticRegressor",
]
class QueryStrategy(ABC, BaseEstimator):
    """Abstract root class of every query strategy in scikit-activeml.

    Parameters
    ----------
    random_state : int or RandomState instance, optional (default=None)
        Controls the randomness of the estimator.
    """

    def __init__(self, random_state=None):
        # The constructor argument is stored unmodified under the same name.
        self.random_state = random_state

    @abstractmethod
    def query(self, *args, **kwargs):
        """Determine the query for active learning from the given arguments.

        This base implementation only raises `NotImplementedError`; concrete
        query strategies have to override it.

        Raises
        ------
        NotImplementedError
            Always, when the base implementation is invoked.
        """
        raise NotImplementedError
class PoolQueryStrategy(QueryStrategy):
    """Common base class of all pool-based active learning query strategies
    in scikit-activeml.

    Parameters
    ----------
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or RandomState instance or None, default=None
        Controls the randomness of the estimator.
    """

    def __init__(self, missing_label=MISSING_LABEL, random_state=None):
        super().__init__(random_state=random_state)
        self.missing_label = missing_label

    def _validate_data(
        self,
        X,
        y,
        candidates,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Check the inputs of a query call and derive fitted attributes.

        Besides validating the arguments, this method sets the attributes
        `missing_label_` and `random_state_` and sets or checks the
        `n_features_in_` attribute.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples, *)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        candidates : None or array-like of shape (n_candidates), dtype=int or \
                array-like of shape (n_candidates, n_features)
            - If `candidates` is `None`, the unlabeled samples from
              `(X,y)` are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
        batch_size : int
            The number of samples to be selected in one AL cycle. Must be
            at least 1.
        return_utilities : bool
            If true, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        check_X_dict : dict or None, default=None
            Keyword arguments passed to :func:`sklearn.utils.check_array`
            when checking `X`. A copy with `ensure_2d=False` is used for
            `candidates` given as samples. If `None`,
            `{"allow_nd": True}` is used.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, *)
            Checked labels of the training data set.
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or\
                np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        batch_size : int
            Checked number of samples to be selected in one AL cycle.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        # Samples and their number of features.
        array_kwargs = (
            {"allow_nd": True} if check_X_dict is None else check_X_dict
        )
        X = check_array(X, **array_kwargs)
        check_n_features(self, X, reset=reset)

        # Labels: NaN entries are allowed and the dtype is left untouched.
        y = check_array(
            y, ensure_2d=False, ensure_all_finite="allow-nan", dtype=None
        )
        check_consistent_length(X, y)

        # Missing label must be compatible with the dtype of the labels.
        check_missing_label(self.missing_label, target_type=y.dtype)
        self.missing_label_ = self.missing_label

        # The number of unlabeled entries (+1 to avoid a zero multiplier) is
        # handed to `check_random_state` further below.
        n_unlabeled = int(np.sum(is_unlabeled(y, self.missing_label_)))
        seed_mult = n_unlabeled + 1

        # Candidates are either indices (1-d) or samples (otherwise).
        if candidates is not None:
            candidates = np.array(candidates)
            if candidates.ndim != 1:
                candidate_kwargs = deepcopy(array_kwargs)
                candidate_kwargs["ensure_2d"] = False
                candidates = check_array(candidates, **candidate_kwargs)
                check_n_features(self, candidates, reset=False)
            else:
                candidates = check_indices(candidates, y, dim=0)

        # Remaining scalar arguments.
        check_scalar(return_utilities, "return_utilities", bool)
        check_scalar(batch_size, target_type=int, name="batch_size", min_val=1)

        # Random state used by the strategy during this query.
        self.random_state_ = check_random_state(self.random_state, seed_mult)

        return X, y, candidates, batch_size, return_utilities
class SingleAnnotatorPoolQueryStrategy(PoolQueryStrategy):
    """Base class for all pool-based active learning query strategies with a
    single annotator in scikit-activeml.
    """

    @abstractmethod
    def query(
        self,
        X,
        y,
        *args,
        candidates=None,
        batch_size=1,
        return_utilities=False,
        **kwargs,
    ):
        """Determine for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        candidates : None or array-like of shape (n_candidates), dtype=int or \
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from
              `(X,y)` are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If true, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size,)
            The query indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.

            - If `candidates` is `None` or of shape
              `(n_candidates,)`, the indexing refers to the samples in
              `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`,
              the indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or \
                numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of samples after each selected sample of the
            batch, e.g., `utilities[0]` indicates the utilities used for
            selecting the first sample (with index `query_indices[0]`) of
            the batch. Utilities for labeled samples will be set to np.nan.

            - If `candidates` is `None` or of shape
              `(n_candidates,)`, the indexing refers to the samples in
              `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`,
              the indexing refers to the samples in `candidates`.
        """
        raise NotImplementedError

    def _validate_data(
        self,
        X,
        y,
        candidates,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Check the inputs of a query call for the single-annotator case.

        On top of the checks of the parent class, `y` is converted to a
        1-d array and `batch_size` is reduced (with a warning) if it exceeds
        the number of available candidates.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        candidates : None or array-like of shape (n_candidates), dtype=int or \
                array-like of shape (n_candidates, n_features)
            Candidate indices or candidate samples; `None` selects the
            unlabeled samples of `(X, y)`. See `query` for details.
        batch_size : int
            The number of samples to be selected in one AL cycle.
        return_utilities : bool
            If true, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        check_X_dict : dict or None, default=None
            Keyword arguments passed to :func:`sklearn.utils.check_array`
            by the parent class.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples,)
            Checked labels of the training data set.
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or \
                np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        batch_size : int
            Checked number of samples to be selected in one AL cycle.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        checked = super()._validate_data(
            X,
            y,
            candidates,
            batch_size,
            return_utilities,
            reset=reset,
            check_X_dict=check_X_dict,
        )
        X, y, candidates, batch_size, return_utilities = checked
        y = column_or_1d(y, warn=True)

        # Count how many candidates can be selected at most.
        if candidates is not None:
            n_candidates = len(candidates)
        else:
            unlabeled_mask = is_unlabeled(
                y, missing_label=self.missing_label_
            )
            n_candidates = int(np.sum(unlabeled_mask))

        # A batch cannot be larger than the pool of candidates.
        if batch_size > n_candidates:
            warnings.warn(
                f"'batch_size={batch_size}' is larger than number of "
                f"candidates. Instead, 'batch_size={n_candidates}' was set."
            )
            batch_size = n_candidates

        return X, y, candidates, batch_size, return_utilities

    def _transform_candidates(
        self,
        candidates,
        X,
        y,
        enforce_mapping=False,
        allow_only_unlabeled=False,
    ):
        """Turn `candidates` into a sample array plus an index array
        `mapping` such that `candidates = X[mapping]`.

        Parameters
        ----------
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or \
                np.ndarray of shape (n_candidates, n_features)
            - If `candidates` is `None`, the unlabeled samples from
              `(X,y)` are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`).
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples,)
            Checked labels of the training data set.
        enforce_mapping : bool, default=False
            If True, an exception is raised when no exact mapping can be
            determined (i.e., `mapping` is None).
        allow_only_unlabeled : bool, default=False
            If True, an exception is raised when indices of candidates
            contain labeled samples.

        Returns
        -------
        candidates : np.ndarray of shape (n_candidates, n_features)
            Candidate samples from which the strategy can query the label.
        mapping : np.ndarray of shape (n_candidates) or None
            Index array that maps `candidates` to `X`
            (`candidates = X[mapping]`).

        Raises
        ------
        ValueError
            If `allow_only_unlabeled` is True and the candidate indices
            refer to labeled samples.
        MappingError
            If `enforce_mapping` is True and `candidates` is given as
            samples instead of indices.
        """
        # Default: every unlabeled sample is a candidate.
        if candidates is None:
            unlabeled_idx = unlabeled_indices(y, self.missing_label_)
            return X[unlabeled_idx], unlabeled_idx

        # Index array: the mapping is the index array itself.
        if candidates.ndim == 1:
            if (
                allow_only_unlabeled
                and is_labeled(y[candidates], self.missing_label_).any()
            ):
                raise ValueError(
                    "Candidates must not contain labeled samples."
                )
            return X[candidates], candidates

        # Sample array: no mapping to `X` is available.
        if enforce_mapping:
            raise MappingError(
                "Mapping `candidates` to `X` is not "
                "possible but `enforce_mapping` is True. "
                "Use index array for `candidates` instead."
            )
        return candidates, None
class MultiAnnotatorPoolQueryStrategy(PoolQueryStrategy):
    """Base class for all pool-based active learning query strategies with
    multiple annotators in scikit-activeml.

    Parameters
    ----------
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or RandomState instance, default=None
        Controls the randomness of the estimator.
    """

    @abstractmethod
    def query(
        self,
        X,
        y,
        *args,
        candidates=None,
        annotators=None,
        batch_size=1,
        return_utilities=False,
        **kwargs,
    ):
        """Determines which candidate sample is to be annotated by which
        annotator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples, n_annotators)
            Labels of the training data set for each annotator (possibly
            including unlabeled ones indicated by `self.missing_label`),
            meaning that `y[i, j]` contains the label annotated by annotator
            `j` for sample `i`.
        candidates : None or array-like of shape (n_candidates), dtype=int or\
                array-like of shape (n_candidates, n_features), default=None
            See parameter `annotators`.
        annotators : None or array-like of shape (n_avl_annotators), dtype=int\
                or array-like of shape (n_candidates, n_annotators),\
                default=None
            - If candidate samples and annotators are not specified, i.e.,
              `candidates=None`, `annotators=None` the unlabeled target
              values, `y`, are the candidates annotator-sample-pairs.
            - If candidate samples and available annotators are specified:
              The annotator-sample-pairs, for which the sample is a
              candidate sample and the annotator is an available annotator
              are considered as candidate annotator-sample-pairs.
            - If `candidates` is None, all samples of `X` are considered as
              candidate samples. In this case `n_candidates` equals
              `len(X)`.
            - If `candidates` is of shape `(n_candidates,)` and of type int,
              `candidates` is considered as the indices of the sample
              candidates in `(X, y)`.
            - If `candidates` is of shape (n_candidates, n_features), the
              sample candidates are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
            - If `annotators` is `None`, all annotators are considered as
              available annotators.
            - If `annotators` is of shape (n_avl_annotators), and of type
              int, `annotators` is considered as the indices of the
              available annotators.
            - If `annotators` is a boolean array of shape `(n_candidates,
              n_annotators)` the annotator-sample-pairs, for which the
              sample is a candidate sample and the boolean matrix has entry
              `True` are considered as candidate annotator-sample pairs.
        batch_size : int or str, default=1
            The number of annotators-sample pairs to be selected in one AL
            cycle. If `adaptive=True`, `batch_size='adaptive'` is allowed.
        return_utilities : bool, default=False
            If True, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : np.ndarray of shape (batch_size, 2)
            The `query_indices` indicate which candidate sample is to be
            annotated by which annotator, e.g., `query_indices[:, 0]`
            indicates the selected candidate samples and
            `query_indices[:, 1]` indicates the respectively selected
            annotators.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples,\
                n_annotators) or numpy.ndarray of shape (batch_size,\
                n_candidates, n_annotators)
            The utilities of all candidate samples w.r.t. the available
            annotators after each selected sample of the batch, e.g.,
            `utilities[0]` indicates the utilities used for selecting
            the first sample-annotator-pair (with indices
            `query_indices[0]`).

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to samples in `candidates`.
        """
        raise NotImplementedError

    def _validate_data(
        self,
        X,
        y,
        candidates,
        annotators,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate input data, all attributes and set or check the
        `n_features_in_` attribute.

        In addition to the checks of the parent class, `y` must be
        two-dimensional, `annotators` is validated, and `batch_size` is
        reduced (with a warning) if it exceeds the number of candidate
        annotator-sample pairs.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples, n_annotators)
            Labels of the training data set for each annotator (possibly
            including unlabeled ones indicated by `self.missing_label`),
            meaning that `y[i, j]` contains the label annotated by annotator
            `j` for sample `i`.
        candidates : None or array-like of shape (n_candidates), dtype=int or\
                array-like of shape (n_candidates, n_features)
            Candidate indices or candidate samples. See `query` for the
            full semantics in combination with `annotators`.
        annotators : None or array-like of shape (n_avl_annotators), dtype=int\
                or array-like of shape (n_candidates, n_annotators)
            Indices of available annotators or a boolean matrix of
            available annotator-sample pairs. See `query` for the full
            semantics in combination with `candidates`.
        batch_size : int
            The number of annotator-sample pairs to be selected in one AL
            cycle.
        return_utilities : bool
            If true, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        check_X_dict : dict or None, default=None
            Keyword arguments passed to :func:`sklearn.utils.check_array`
            by the parent class.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, n_annotators)
            Checked labels of the training data set.
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or\
                np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        annotators : None or np.ndarray of shape (n_avl_annotators), dtype=int\
                or np.ndarray of shape (n_candidates, n_annotators)
            Checked annotator indices or boolean array.
        batch_size : int
            Checked number of pairs to be selected in one AL cycle.
        return_utilities : bool
            Checked boolean value of `return_utilities`.

        Raises
        ------
        ValueError
            If `annotators` has more than two dimensions.
        """
        (
            X,
            y,
            candidates,
            batch_size,
            return_utilities,
        ) = super()._validate_data(
            X, y, candidates, batch_size, return_utilities, reset, check_X_dict
        )

        # This call is made only for its validation (`ensure_2d=True`); its
        # return value is not used.
        # NOTE(review): no `dtype` is passed here, unlike the label check of
        # the parent class (`dtype=None`) -- confirm this is intended for
        # non-numeric labels.
        check_array(y, ensure_2d=True, ensure_all_finite="allow-nan")
        unlabeled_pairs = is_unlabeled(y, missing_label=self.missing_label_)

        if annotators is not None:
            annotators = check_array(
                annotators, ensure_2d=False, allow_nd=True
            )
            if annotators.ndim == 1:
                # Indices of available annotators (columns of `y`).
                annotators = check_indices(annotators, y, dim=1)
            elif annotators.ndim == 2:
                # Boolean matrix: one row per candidate sample and one
                # column per annotator of `y`.
                annotators = check_array(annotators, dtype=bool)
                if candidates is None:
                    check_consistent_length(X, annotators)
                else:
                    check_consistent_length(candidates, annotators)
                check_consistent_length(y.T, annotators.T)
            else:
                raise ValueError(
                    "`annotators` must be either None, 1d or 2d array-like."
                )

        # Determine the number of selectable annotator-sample pairs.
        if annotators is None:
            if candidates is None:
                n_candidate_pairs = int(np.sum(unlabeled_pairs))
            else:
                n_candidate_pairs = len(candidates) * len(y.T)
        elif annotators.ndim == 1:
            n_cand_samples = len(X) if candidates is None else len(candidates)
            n_candidate_pairs = n_cand_samples * len(annotators)
        else:
            n_candidate_pairs = int(np.sum(annotators))

        # A batch cannot contain more pairs than are selectable.
        if n_candidate_pairs < batch_size:
            warnings.warn(
                f"'batch_size={batch_size}' is larger than number of "
                f"candidates pairs. Instead, 'batch_size={n_candidate_pairs}'"
                f" was set."
            )
            batch_size = n_candidate_pairs

        return X, y, candidates, annotators, batch_size, return_utilities

    @staticmethod
    def _build_annotator_mask(annotators, n_rows, n_annotators):
        """Return the boolean availability matrix described by `annotators`.

        Parameters
        ----------
        annotators : None or np.ndarray of shape (n_avl_annotators,) or \
                np.ndarray of shape (n_rows, n_annotators)
            `None` marks every annotator as available, a 1-d array holds
            the indices of the available annotators, and a 2-d array is
            returned unchanged.
        n_rows : int
            Number of candidate samples, i.e., rows of the mask.
        n_annotators : int
            Number of annotators, i.e., columns of the mask.

        Returns
        -------
        mask : np.ndarray of shape (n_rows, n_annotators)
            Availability of each annotator-sample pair.
        """
        if annotators is None:
            return np.full((n_rows, n_annotators), True)
        if annotators.ndim == 1:
            mask = np.full((n_rows, n_annotators), False)
            mask[:, annotators] = True
            return mask
        return annotators

    def _transform_cand_annot(
        self, candidates, annotators, X, y, enforce_mapping=False
    ):
        """Transforms the `candidates` parameter into a sample array and the
        corresponding index array `mapping` such that
        `candidates = X[mapping]`, and transforms `annotators` into a boolean
        array such that `A_cand` represents the available annotator-sample
        pairs for the samples of candidates.

        Parameters
        ----------
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or\
                np.ndarray of shape (n_candidates, n_features)
            Candidate indices or candidate samples. See `query` for the
            full semantics in combination with `annotators`.
        annotators : None or np.ndarray of shape (n_avl_annotators), dtype=int\
                or np.ndarray of shape (n_candidates, n_annotators)
            Indices of available annotators or a boolean matrix of
            available annotator-sample pairs. See `query` for the full
            semantics in combination with `candidates`.
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, n_annotators)
            Checked labels of the training data set.
        enforce_mapping : bool, default=False
            If `True`, an exception is raised when no exact mapping can be
            determined (i.e., `mapping` is `None`).

        Returns
        -------
        candidates : np.ndarray of shape (n_selectable_candidates, n_features)
            Candidate samples from which the strategy can query the label.
        mapping : np.ndarray of shape (n_selectable_candidates) or None
            Index array that maps `candidates` to `X`
            (`candidates = X[mapping]`).
        A_cand : np.ndarray of shape (n_selectable_candidates, n_annotators)
            Available annotator-sample-pairs with respect to `candidates`.

        Raises
        ------
        ValueError
            If `enforce_mapping` is True and `candidates` is given as
            samples instead of indices.
        """
        n_annotators = y.shape[1]

        # Candidates given as samples: no mapping to `X` exists.
        if candidates is not None and candidates.ndim == 2:
            if enforce_mapping:
                raise ValueError(
                    "Mapping `candidates` to `X` is not possible "
                    "but `enforce_mapping` is True. Use index "
                    "array for `candidates` instead."
                )
            A_cand = self._build_annotator_mask(
                annotators, len(candidates), n_annotators
            )
            return candidates, None, A_cand

        # From here on, a mapping to `X` exists.
        if candidates is None:
            if annotators is None:
                # Neither given: every unlabeled pair is a candidate pair;
                # keep only samples with at least one unlabeled entry.
                unlbd_pairs = is_unlabeled(y, self.missing_label_)
                candidates = np.argwhere(
                    np.any(unlbd_pairs, axis=1)
                ).flatten()
                A_cand = unlbd_pairs[candidates]
            else:
                # All samples of `X` are candidates. The mask is built with
                # an explicit boolean dtype instead of inheriting the dtype
                # of `y`, which may be float or string.
                candidates = np.arange(len(X), dtype=int)
                A_cand = self._build_annotator_mask(
                    annotators, y.shape[0], n_annotators
                )
        else:
            # Candidates given as an index array.
            A_cand = self._build_annotator_mask(
                annotators, len(candidates), n_annotators
            )

        return X[candidates], candidates, A_cand
class BudgetManager(ABC, BaseEstimator):
    """Base class for all budget managers for stream-based active learning
    to model budgeting constraints.

    Parameters
    ----------
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled. It
        is validated with a lower bound of 0.0 (passed to `check_scalar`
        with `min_inclusive=False`) and an upper bound of 1.0. If `budget`
        is `None`, it is replaced with the default budget 0.1.
    """

    def __init__(self, budget=None):
        self.budget = budget

    @abstractmethod
    def query_by_utility(self, utilities, *args, **kwargs):
        """Ask the budget manager which `utilities` are sufficient to query
        the corresponding labels.

        Parameters
        ----------
        utilities : array-like of shape (n_samples,)
            The utilities provided by the stream-based active learning
            strategy, which are used to determine whether querying a sample
            is worth it given the budgeting constraint.

        Returns
        -------
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices < n_candidates`.
        """
        raise NotImplementedError

    @abstractmethod
    def update(self, candidates, queried_indices, *args, **kwargs):
        """Updates the budget manager.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices < n_candidates`.

        Returns
        -------
        self : BudgetManager
            The budget manager returns itself, after it is updated.
        """
        raise NotImplementedError

    def _validate_budget(self):
        """Set `budget_` from `budget` (default 0.1 if `budget` is `None`)
        and check that it is a float within the allowed range.
        """
        self.budget_ = 0.1 if self.budget is None else self.budget
        check_scalar(
            self.budget_,
            "budget",
            float,
            min_val=0.0,
            max_val=1.0,
            min_inclusive=False,
        )

    def _validate_data(self, utilities, *args, **kwargs):
        """Validate input data.

        Parameters
        ----------
        utilities : np.ndarray of shape (n_samples,)
            The `utilities` provided by the stream-based active learning
            strategy.

        Returns
        -------
        utilities : np.ndarray of shape (n_samples,)
            Checked `utilities`.

        Raises
        ------
        TypeError
            If `utilities` is not a `np.ndarray`.
        """
        # Only numpy arrays are accepted as utilities.
        if not isinstance(utilities, np.ndarray):
            raise TypeError(
                f"{type(utilities)} is not a valid type for utilities"
            )
        # The budget is validated as part of every data validation.
        self._validate_budget()
        return utilities
class SingleAnnotatorStreamQueryStrategy(QueryStrategy):
    """Base class for all stream-based active learning query strategies.

    Parameters
    ----------
    budget : float
        Specifies the ratio of labels which are allowed to be queried. It
        is validated with a lower bound of 0.0 (passed to `check_scalar`
        with `min_inclusive=False`) and an upper bound of 1.0. If `budget`
        is `None`, the default 0.1 is used.
    random_state : int or RandomState instance or None, default=None
        Controls the randomness of the estimator.
    """

    def __init__(self, budget, random_state=None):
        super().__init__(random_state=random_state)
        self.budget = budget

    @abstractmethod
    def query(self, candidates, *args, return_utilities=False, **kwargs):
        """Determines for which candidate samples labels are to be queried.

        The query strategy determines the most useful samples in
        candidates, which can be acquired within the budgeting constraint
        specified by `budget`. Please note that this method does not change
        the internal state of the query strategy. To adapt the query
        strategy to the selected candidates, use `update(...)`.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        return_utilities : bool, default=False
            If `True`, also return the utilities based on the query
            strategy.

        Returns
        -------
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices < n_candidates`.
        utilities : np.ndarray of shape (n_candidates,)
            The utilities based on the query strategy. Only provided if
            `return_utilities` is `True`.
        """
        raise NotImplementedError

    @abstractmethod
    def update(
        self,
        candidates,
        queried_indices,
        *args,
        budget_manager_param_dict=None,
        **kwargs,
    ):
        """Updates the budget manager and the count for seen and queried
        labels. This function should be used in conjunction with the `query`
        function.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices < n_candidates`.
        budget_manager_param_dict : dict, default=None
            Optional kwargs for budget_manager.

        Returns
        -------
        self : SingleAnnotatorStreamQueryStrategy
            The query strategy returns itself, after it is updated.
        """
        raise NotImplementedError

    def _validate_random_state(self):
        """Make sure `random_state_` exists and has passed
        `check_random_state`.

        On the first call, `random_state_` is initialized with a deep copy
        of `random_state`; on later calls, the existing `random_state_` is
        reused. See also :func:`~sklearn.utils.check_random_state`.
        """
        if hasattr(self, "random_state_"):
            seed = self.random_state_
        else:
            # Work on a copy so that `random_state` itself stays untouched.
            seed = self.random_state_ = deepcopy(self.random_state)
        self.random_state_ = check_random_state(seed)

    def _validate_budget(self):
        """Set `budget_` from `budget` (default 0.1 if `budget` is `None`)
        and check that it is a float within the allowed range.
        """
        self.budget_ = 0.1 if self.budget is None else self.budget
        check_scalar(
            self.budget_,
            "budget",
            float,
            min_val=0.0,
            max_val=1.0,
            min_inclusive=False,
        )

    def _validate_data(
        self,
        candidates,
        return_utilities,
        *args,
        reset=True,
        **check_candidates_params,
    ):
        """Validate input data and set or check the `n_features_in_`
        attribute.

        The random state and the budget are validated as well.

        Parameters
        ----------
        candidates : array-like of shape (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        return_utilities : bool
            If `True`, also return the utilities based on the query
            strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        **check_candidates_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        candidates : np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        # Candidate samples and their number of features.
        checked_candidates = check_array(
            candidates, **check_candidates_params
        )
        check_n_features(self, checked_candidates, reset=reset)

        # Flag controlling whether utilities are returned.
        check_scalar(return_utilities, "return_utilities", bool)

        # Fitted attributes `random_state_` and `budget_`.
        self._validate_random_state()
        self._validate_budget()

        return checked_candidates, return_utilities
class SkactivemlClassifier(ClassifierMixin, BaseEstimator, ABC):
    """Skactiveml Classifier

    Base class for `scikit-activeml` classifiers such that missing labels,
    user-defined classes, and cost-sensitive classification (i.e., cost matrix)
    can be handled.

    Parameters
    ----------
    classes : array-like of shape (n_classes,), default=None
        Holds the label for each class. If `None`, the classes are determined
        during the fit.
    missing_label : scalar, string, np.nan, or None, default=np.nan
        Value to represent a missing label.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` indicating cost of predicting class
        `classes[j]` for a sample of class `classes[i]`. Can be only set, if
        `classes` is not `None`.
    random_state : int or RandomState instance or None, default=None
        Determines random number for `predict` method. Pass an int for
        reproducible results across multiple method calls.

    Attributes
    ----------
    classes_ : array-like of shape (n_classes,)
        Holds the label for each class after fitting.
    cost_matrix_ : array-like of shape (n_classes, n_classes)
        Cost matrix after fitting with `cost_matrix_[i,j]` indicating cost of
        predicting class `classes_[j]` for a sample of class `classes_[i]`.
        If `cost_matrix` is `None`, `1 - np.eye(n_classes)` is used.
    """

    def __init__(
        self,
        classes=None,
        missing_label=MISSING_LABEL,
        cost_matrix=None,
        random_state=None,
    ):
        self.classes = classes
        self.missing_label = missing_label
        self.cost_matrix = cost_matrix
        self.random_state = random_state

    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """Fit the model using X as training data and y as class labels.

        Parameters
        ----------
        X : matrix-like, shape (n_samples, n_features)
            The sample matrix `X` is the feature matrix representing the
            samples.
        y : array-like, shape (n_samples) or (n_samples, n_outputs)
            It contains the class labels of the training samples.
            The number of class labels may be variable for the samples, where
            missing labels are represented by the attribute `missing_label`.
        sample_weight : array-like, shape (n_samples) or (n_samples, n_outputs)
            It contains the weights of the training samples' class labels.
            It must have the same shape as `y`.

        Returns
        -------
        self: skactiveml.base.SkactivemlClassifier,
            The `skactiveml.base.SkactivemlClassifier` object fitted on the
            training data.
        """
        raise NotImplementedError

    def predict_proba(self, X):
        """Return probability estimates for the test data X.

        This base implementation always raises `NotImplementedError`; it is
        called by `predict`, so subclasses relying on the default `predict`
        have to override it.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        Returns
        -------
        P : numpy.ndarray of shape (n_samples, classes)
            The class probabilities of the test samples. Classes are ordered
            according to `self.classes_`.
        """
        raise NotImplementedError

    def predict(self, X):
        """Return class label predictions for the test samples `X`.

        The predicted class of a sample is the one with minimal expected
        cost, i.e., the minimal entry of `predict_proba(X) @ cost_matrix_`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        y : numpy.ndarray of shape (n_samples,)
            Predicted class labels of the test samples `X`.
        """
        P = self.predict_proba(X)
        # Expected cost of predicting each class for each sample.
        costs = np.dot(P, self.cost_matrix_)
        # `rand_argmin` receives the fitted random state (presumably to break
        # ties between equally cheap classes at random).
        y_pred = rand_argmin(costs, random_state=self.random_state_, axis=1)
        # Map the class indices back to the original class labels.
        y_pred = self._le.inverse_transform(y_pred)
        y_pred = np.asarray(y_pred, dtype=self.classes_.dtype)
        return y_pred

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the given test data and labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.
        y : array-like of shape (n_samples,)
            True labels for `X`.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of `self.predict(X)` regarding `y`.
        """
        # Compare true and predicted labels in the encoded label space.
        y = self._le.transform(y)
        y_pred = self._le.transform(self.predict(X))
        return accuracy_score(y, y_pred, sample_weight=sample_weight)

    def _validate_data(
        self,
        X,
        y,
        sample_weight=None,
        check_X_dict=None,
        check_y_dict=None,
        y_ensure_1d=True,
        reset=True,
    ):
        """Validate the training data and set the fitted attributes.

        Besides checking `X`, `y`, and `sample_weight`, this method sets the
        attributes `random_state_`, `_le`, `classes_`, and `cost_matrix_`.

        Parameters
        ----------
        X : array-like
            Samples, checked via `sklearn.utils.validation.check_array`.
        y : array-like
            Labels of the samples, checked via `check_array` and encoded by
            the label encoder `_le` if at least one entry is given.
        sample_weight : array-like or None, default=None
            Weights of the labels. If given, it must have the same shape as
            the checked `y`.
        check_X_dict : dict or None, default=None
            Keyword arguments passed to `check_array` for `X`. If `None`,
            `{"ensure_min_samples": 0, "ensure_min_features": 0}` is used.
            The passed dictionary is not modified.
        check_y_dict : dict or None, default=None
            Keyword arguments passed to `check_array` for `y` and
            `sample_weight`. If `None`, a default is used that allows empty,
            non-2d, and non-finite arrays of arbitrary dtype.
        y_ensure_1d : bool, default=True
            Whether a non-empty `y` is passed through `column_or_1d`.
        reset : bool, default=True
            Passed to `check_n_features` as `reset`.

        Returns
        -------
        X : numpy.ndarray
            Checked samples.
        y : numpy.ndarray
            Checked labels; encoded by `_le` if `y` is not empty.
        sample_weight : numpy.ndarray or None
            Checked label weights.

        Raises
        ------
        ValueError
            If no class label is known, i.e., `y` contains no labels and
            `classes` is `None`, or if `y` and `sample_weight` differ in
            their shapes.
        """
        # Work on a private copy because `ensure_2d` may be overridden below
        # and the caller's dictionary must not be modified as a side effect.
        if check_X_dict is None:
            check_X_dict = {"ensure_min_samples": 0, "ensure_min_features": 0}
        else:
            check_X_dict = dict(check_X_dict)
        if check_y_dict is None:
            check_y_dict = {
                "ensure_min_samples": 0,
                "ensure_min_features": 0,
                "ensure_2d": False,
                "ensure_all_finite": False,
                "dtype": None,
            }

        # Check common classifier parameters.
        check_classifier_params(
            self.classes, self.missing_label, self.cost_matrix
        )

        # Store and check random state.
        self.random_state_ = check_random_state(self.random_state)

        # Create label encoder.
        self._le = ExtLabelEncoder(
            classes=self.classes, missing_label=self.missing_label
        )

        # Check input parameters.
        y = check_array(y, **check_y_dict)
        error_msg = (
            "No class label is known because 'y' contains no actual "
            "class labels and 'classes' is not defined. Change at "
            "least on of both to overcome this error."
        )
        if len(y) > 0:
            y = column_or_1d(y) if y_ensure_1d else y
            y = self._le.fit_transform(y)
            # After encoding, `-1` is used as the missing label marker.
            is_lbdl = is_labeled(y, missing_label=-1)
            if len(y[is_lbdl]) > 0:
                check_classification_targets(y[is_lbdl])
            if len(self._le.classes_) == 0:
                raise ValueError(error_msg)
        else:
            if self.classes is None:
                raise ValueError(error_msg)
            self._le.fit(self.classes)
            # Without labels, `X` is allowed to be empty and non-2d.
            check_X_dict["ensure_2d"] = False
        X = check_array(X, **check_X_dict)
        check_consistent_length(X, y)
        check_n_features(self, X, reset=reset)

        # Update detected classes.
        self.classes_ = self._le.classes_

        # Check sample weights.
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, **check_y_dict)
            if not np.array_equal(y.shape, sample_weight.shape):
                raise ValueError(
                    f"`y` has the shape {y.shape} and `sample_weight` has the "
                    f"shape {sample_weight.shape}. Both need to have "
                    f"identical shapes."
                )

        # Update cost matrix.
        self.cost_matrix_ = (
            1 - np.eye(len(self.classes_))
            if self.cost_matrix is None
            else self.cost_matrix
        )
        self.cost_matrix_ = check_cost_matrix(
            self.cost_matrix_, len(self.classes_)
        )
        if self.classes is not None:
            # Reorder rows and columns by the sort order of the user-defined
            # `classes`.
            class_indices = np.argsort(self.classes)
            self.cost_matrix_ = self.cost_matrix_[class_indices]
            self.cost_matrix_ = self.cost_matrix_[:, class_indices]
        return X, y, sample_weight
class ClassFrequencyEstimator(SkactivemlClassifier):
    """Class Frequency Estimator

    Extends `scikit-activeml` classifiers to estimators that are able to
    estimate class frequencies for given samples (by calling `predict_freq`).

    Parameters
    ----------
    classes : array-like, shape (n_classes), default=None
        Holds the label for each class. If `None`, the classes are determined
        during the fit.
    missing_label : scalar or str or np.nan or None, default=np.nan
        Value to represent a missing label.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` indicating cost of predicting class
        `classes[j]` for a sample of class `classes[i]`. Can be only set, if
        classes is not `None`.
    class_prior : float or array-like, shape (n_classes), default=0
        Prior observations of the class frequency estimates. If `class_prior`
        is an array, the entry `class_prior[i]` indicates the non-negative
        prior number of samples belonging to class `classes_[i]`. If
        `class_prior` is a float, `class_prior` indicates the non-negative
        prior number of samples per class.
    random_state : int or np.RandomState or None, default=None
        Determines random number for `predict` method. Pass an int for
        reproducible results across multiple method calls.

    Attributes
    ----------
    classes_ : np.ndarray of shape (n_classes)
        Holds the label for each class after fitting.
    class_prior_ : np.ndarray of shape (n_classes)
        Prior observations of the class frequency estimates. The entry
        `class_prior_[i]` indicates the non-negative prior number of samples
        belonging to class `classes_[i]`.
    cost_matrix_ : np.ndarray of shape (classes, classes)
        Cost matrix with `cost_matrix_[i,j]` indicating cost of predicting
        class `classes_[j]` for a sample of class `classes_[i]`.
    """

    def __init__(
        self,
        class_prior=0,
        classes=None,
        missing_label=MISSING_LABEL,
        cost_matrix=None,
        random_state=None,
    ):
        super().__init__(
            classes=classes,
            missing_label=missing_label,
            cost_matrix=cost_matrix,
            random_state=random_state,
        )
        self.class_prior = class_prior

    @abstractmethod
    def predict_freq(self, X):
        """Return class frequency estimates for the test samples `X`.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            Test samples whose class frequencies are to be estimated.

        Returns
        -------
        F: array-like of shape (n_samples, classes)
            The class frequency estimates of the test samples `X`. Classes are
            ordered according to attribute `classes_`.
        """
        raise NotImplementedError

    def predict_proba(self, X):
        """Return probability estimates for the test data `X`.

        The probabilities are the row-wise normalized sums of the frequency
        estimates of `predict_freq` and `class_prior_`. Rows summing up to
        zero are assigned a uniform distribution over the classes.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        P : array-like of shape (n_samples, classes)
            The class probabilities of the test samples. Classes are ordered
            according to `self.classes_`.
        """
        # Normalize probabilities of each sample.
        P = self.predict_freq(X) + self.class_prior_
        normalizer = np.sum(P, axis=1)
        P[normalizer > 0] /= normalizer[normalizer > 0, np.newaxis]
        # Without any (prior) observation, fall back to uniform probabilities.
        P[normalizer == 0, :] = [1 / len(self.classes_)] * len(self.classes_)
        return P

    def sample_proba(self, X, n_samples=10, random_state=None):
        """Samples probability vectors from Dirichlet distributions whose
        parameters `alphas` are defined as the sum of the frequency estimates
        returned by `predict_freq` and the `class_prior`.

        Parameters
        ----------
        X : array-like of shape (n_test_samples, n_features)
            Test samples for which `n_samples` probability vectors are to be
            sampled.
        n_samples : int, default=10
            Number of probability vectors to sample for each `X[i]`.
        random_state : int or numpy.random.RandomState or None, default=None
            Ensure reproducibility when sampling probability vectors from the
            Dirichlet distributions.

        Returns
        -------
        P : array-like of shape (n_samples, n_test_samples, n_classes)
            There are `n_samples` class probability vectors for each test
            sample in `X`. Classes are ordered according to `self.classes_`.

        Raises
        ------
        ValueError
            If any Dirichlet parameter is zero, i.e., a class has neither a
            frequency estimate nor a prior observation for some sample.
        """
        random_state = check_random_state(random_state)
        alphas = self.predict_freq(X) + self.class_prior_
        # Each row of `alphas` is repeated `n_samples` times in a row.
        alphas = alphas.repeat(repeats=n_samples, axis=0)
        if (alphas == 0).any():
            raise ValueError(
                "There are zero frequency observations. "
                "Set `class_prior > 0` to avoid this error."
            )
        # Dirichlet samples are obtained by normalizing gamma samples.
        R = random_state.standard_gamma(alphas)
        R_sums = R.sum(axis=-1)
        # If all gamma draws of a row are zero, the normalization would divide
        # by zero; such rows get the full mass on a randomly chosen class.
        is_zero = (R_sums == 0.0).ravel()
        sampled_class_indices = random_state.choice(
            np.array(R.shape[-1]), size=is_zero.sum()
        )
        R[is_zero, sampled_class_indices] = 1.0
        P = R / R.sum(axis=-1, keepdims=True)
        # Rows are grouped per test sample (see `repeat` above), so Fortran
        # order makes the repetition index the first axis.
        P = P.reshape(n_samples, len(X), P.shape[-1], order="F")
        return P

    def _validate_data(
        self,
        X,
        y,
        sample_weight=None,
        check_X_dict=None,
        check_y_dict=None,
        y_ensure_1d=True,
        reset=True,
    ):
        """Validate the training data and set the fitted attributes.

        Delegates to `SkactivemlClassifier._validate_data` and additionally
        sets the attribute `class_prior_`.

        Parameters
        ----------
        X : array-like
            Samples to be checked.
        y : array-like
            Labels of the samples to be checked.
        sample_weight : array-like or None, default=None
            Weights of the labels.
        check_X_dict : dict or None, default=None
            Keyword arguments for checking `X`.
        check_y_dict : dict or None, default=None
            Keyword arguments for checking `y` and `sample_weight`.
        y_ensure_1d : bool, default=True
            Whether `y` is required to be one-dimensional.
        reset : bool, default=True
            Forwarded to the parent's `_validate_data`, which passes it on to
            `check_n_features`.

        Returns
        -------
        X, y, sample_weight
            The values returned by the parent's `_validate_data`.
        """
        X, y, sample_weight = super()._validate_data(
            X=X,
            y=y,
            sample_weight=sample_weight,
            check_X_dict=check_X_dict,
            check_y_dict=check_y_dict,
            y_ensure_1d=y_ensure_1d,
            reset=reset,
        )

        # Check class prior.
        self.class_prior_ = check_class_prior(
            self.class_prior, len(self.classes_)
        )
        return X, y, sample_weight
class SkactivemlRegressor(RegressorMixin, BaseEstimator, ABC):
    """Skactiveml Regressor

    Base class for `scikit-activeml` regressors.

    Parameters
    ----------
    missing_label : scalar, string, np.nan, or None, default=np.nan
        Value to represent a missing label.
    random_state : int, RandomState or None, default=None
        Determines random number for `fit` and `predict` method. Pass an int
        for reproducible results across multiple method calls.
    """

    def __init__(self, missing_label=MISSING_LABEL, random_state=None):
        self.missing_label = missing_label
        self.random_state = random_state

    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """Fit the model using `X` as training data and y as numerical labels.

        Parameters
        ----------
        X : matrix-like of shape (n_samples, n_features)
            The sample matrix X is the feature matrix representing the samples.
        y : array-like, shape (n_samples) or (n_samples, n_targets)
            It contains the labels of the training samples.
            The number of numerical labels may be variable for the samples,
            where missing labels are represented as `missing_label_`.
        sample_weight : array-like, shape (n_samples)
            It contains the weights of the training samples' values.

        Returns
        -------
        self: skactiveml.base.SkactivemlRegressor,
            The `skactiveml.base.SkactivemlRegressor` object fitted on the
            training data.
        """
        raise NotImplementedError

    @abstractmethod
    def predict(self, X):
        """Return value predictions for the test samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        y : numpy.ndarray of shape (n_samples,)
            Predicted values of the test samples `X`.
        """
        raise NotImplementedError

    def _validate_data(
        self,
        X,
        y,
        sample_weight=None,
        check_X_dict=None,
        check_y_dict=None,
        y_ensure_1d=True,
        reset=True,
    ):
        """Validate the training data and set the fitted attributes.

        Besides checking `X`, `y`, and `sample_weight`, this method sets the
        attributes `missing_label_` and `random_state_`.

        Parameters
        ----------
        X : array-like
            Samples, checked via `sklearn.utils.validation.check_array`.
        y : array-like
            Labels of the samples, checked via `check_array`.
        sample_weight : array-like or None, default=None
            Weights of the labels. If given, it must have the same shape as
            the checked `y`.
        check_X_dict : dict or None, default=None
            Keyword arguments passed to `check_array` for `X`. If `None`,
            `{"ensure_min_samples": 0, "ensure_min_features": 0}` is used.
            The passed dictionary is not modified.
        check_y_dict : dict or None, default=None
            Keyword arguments passed to `check_array` for `y` and
            `sample_weight`. If `None`, a default is used that allows empty,
            non-2d, and non-finite arrays of arbitrary dtype.
        y_ensure_1d : bool, default=True
            Whether a non-empty `y` is passed through `column_or_1d`.
        reset : bool, default=True
            Passed to `check_n_features` as `reset`.

        Returns
        -------
        X : numpy.ndarray
            Checked samples.
        y : numpy.ndarray
            Checked labels.
        sample_weight : numpy.ndarray or None
            Checked label weights.

        Raises
        ------
        ValueError
            If `y` and `sample_weight` differ in their shapes.
        """
        # Work on a private copy because `ensure_2d` may be overridden below
        # and the caller's dictionary must not be modified as a side effect.
        if check_X_dict is None:
            check_X_dict = {"ensure_min_samples": 0, "ensure_min_features": 0}
        else:
            check_X_dict = dict(check_X_dict)
        if check_y_dict is None:
            check_y_dict = {
                "ensure_min_samples": 0,
                "ensure_min_features": 0,
                "ensure_2d": False,
                "ensure_all_finite": False,
                "dtype": None,
            }

        check_missing_label(self.missing_label)
        self.missing_label_ = self.missing_label

        # Store and check random state.
        self.random_state_ = check_random_state(self.random_state)

        y = check_array(y, **check_y_dict)
        if len(y) > 0:
            y = column_or_1d(y) if y_ensure_1d else y
        else:
            # Without labels, `X` is allowed to be empty and non-2d.
            check_X_dict["ensure_2d"] = False

        if sample_weight is not None:
            sample_weight = check_array(sample_weight, **check_y_dict)
            if not np.array_equal(y.shape, sample_weight.shape):
                raise ValueError(
                    f"`y` has the shape {y.shape} and `sample_weight` has the "
                    f"shape {sample_weight.shape}. Both need to have "
                    f"identical shapes."
                )

        X = check_array(X, **check_X_dict)
        check_consistent_length(X, y)
        check_n_features(self, X, reset=reset)
        return X, y, sample_weight
class ProbabilisticRegressor(SkactivemlRegressor):
    """ProbabilisticRegressor

    Base class for `scikit-activeml` regressors that predict a full target
    distribution instead of a point estimate only.
    """

    @abstractmethod
    def predict_target_distribution(self, X):
        """Return the target distribution predicted for the samples `X`.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        dist : scipy.stats._distn_infrastructure.rv_frozen
            The distribution of the targets at the test samples.
        """
        raise NotImplementedError

    def predict(self, X, return_std=False, return_entropy=False):
        """Return the mean of the predicted target distribution and, on
        request, its standard deviation and differential entropy.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.
        return_std : bool, default=False
            Whether to return the standard deviation.
        return_entropy : bool, default=False
            Whether to return the differential entropy.

        Returns
        -------
        mu : numpy.ndarray, shape (n_samples,)
            Predicted mean conditioned on `X`.
        std : numpy.ndarray, shape (n_samples,), optional
            Predicted standard deviation conditioned on `X`.
        entropy : numpy.ndarray, optional
            Predicted differential entropy conditioned on `X`.

        Notes
        -----
        If neither `return_std` nor `return_entropy` is set, only `mu` is
        returned; otherwise, a tuple of the requested quantities is returned.
        """
        check_scalar(return_std, "return_std", bool)
        check_scalar(return_entropy, "return_entropy", bool)

        dist = self.predict_target_distribution(X)

        # Collect the requested statistics in the documented order.
        outputs = [dist.mean()]
        if return_std:
            outputs.append(dist.std())
        if return_entropy:
            outputs.append(dist.entropy())

        # A single statistic is returned as is instead of a 1-tuple.
        return outputs[0] if len(outputs) == 1 else tuple(outputs)

    def sample_y(self, X, n_samples=1, random_state=None):
        """Draw random target values from the distribution predicted for the
        samples `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples_X, n_features)
            Input samples, where the target values are drawn from.
        n_samples: int, default=1
            Number of random samples to be drawn.
        random_state : int or RandomState instance or None, default=None
            Determines random number generation to randomly draw samples. Pass
            an int for reproducible results across multiple method calls.

        Returns
        -------
        y_samples : numpy.ndarray of shape (n_samples_X, n_samples)
            Drawn random target samples.
        """
        dist = self.predict_target_distribution(X)
        # One row per draw; transposed below to get one row per input sample.
        draw_shape = (n_samples, len(X))
        draws = dist.rvs(size=draw_shape, random_state=random_state)
        return draws.T
class AnnotatorModelMixin(ABC):
    """Annotator Model

    Mixin defining the interface of models that estimate how well annotators
    perform on given samples.
    """

    @abstractmethod
    def predict_annotator_perf(self, X):
        """Estimate, per sample, the performance of each annotator in
        providing the true label.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        Returns
        -------
        P_annot : numpy.ndarray of shape (n_samples, n_annotators)
            `P_annot[i,l]` is the performance of annotator `l` regarding the
            annotation of sample `X[i]`.
        """
        raise NotImplementedError