# Source code for skactiveml.base

"""
The :mod:`skactiveml.base` package implements the base classes for
:mod:`skactiveml`.
"""

import warnings
from abc import ABC, abstractmethod
from copy import deepcopy

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.metrics import accuracy_score
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import (
    check_array,
    check_consistent_length,
    column_or_1d,
)

from .exceptions import MappingError
from .utils import (
    MISSING_LABEL,
    is_labeled,
    is_unlabeled,
    unlabeled_indices,
    ExtLabelEncoder,
    rand_argmin,
    check_classifier_params,
    check_random_state,
    check_cost_matrix,
    check_scalar,
    check_class_prior,
    check_missing_label,
    check_indices,
)

# '__all__' lists the public names of this module and is necessary to create
# the sphinx docs.
# NOTE(review): `PoolQueryStrategy` is defined in this module but is not
# listed here -- presumably intentional (shared base only); confirm.
__all__ = [
    "QueryStrategy",
    "SingleAnnotatorPoolQueryStrategy",
    "MultiAnnotatorPoolQueryStrategy",
    "BudgetManager",
    "SingleAnnotatorStreamQueryStrategy",
    "SkactivemlClassifier",
    "ClassFrequencyEstimator",
    "AnnotatorModelMixin",
    "SkactivemlRegressor",
    "ProbabilisticRegressor",
]


class QueryStrategy(ABC, BaseEstimator):
    """Base class for all query strategies in scikit-activeml.

    Parameters
    ----------
    random_state : int or RandomState instance, optional (default=None)
        Controls the randomness of the estimator.
    """

    def __init__(self, random_state=None):
        # Stored unvalidated; subclasses in this module turn it into a
        # `random_state_` attribute inside their `_validate_data`.
        self.random_state = random_state

    @abstractmethod
    def query(self, *args, **kwargs):
        """Determine the query for active learning based on input arguments.

        Abstract method: every concrete query strategy has to override it.
        The base implementation always raises.

        Raises
        ------
        NotImplementedError
            Always, when the base implementation is called.
        """
        raise NotImplementedError


class PoolQueryStrategy(QueryStrategy):
    """Common base of every pool-based active learning query strategy in
    scikit-activeml.

    Parameters
    ----------
    missing_label : scalar or string or np.nan or None, optional
    (default=np.nan)
        Value to represent a missing label.
    random_state : int or RandomState instance, optional (default=None)
        Controls the randomness of the estimator.
    """

    def __init__(self, missing_label=MISSING_LABEL, random_state=None):
        super().__init__(random_state=random_state)
        self.missing_label = missing_label

    def _validate_data(
        self,
        X,
        y,
        candidates,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Check all inputs of a pool-based query and derive the fitted
        attributes `missing_label_` and `random_state_`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e. including the labeled and
            unlabeled samples.
        y : array-like of shape (n_samples, *)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`).
        candidates : None or array-like of shape (n_candidates), dtype=int or
            array-like of shape (n_candidates, n_features),
            optional (default=None)
            If candidates is None, the unlabeled samples from (X,y) are
            considered as candidates.
            If candidates is of shape (n_candidates) and of type int,
            candidates is considered as the indices of the samples in (X,y).
            If candidates is of shape (n_candidates, n_features), the
            candidates are directly given in candidates (not necessarily
            contained in X). This is not supported by all query strategies.
        batch_size : int
            The number of samples to be selected in one AL cycle.
        return_utilities : bool
            If true, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        check_X_dict : dict or None, default=None
            Keyword arguments passed to :func:`sklearn.utils.check_array`
            when checking `X` (and, with `ensure_2d=False` added, when
            checking sample-valued `candidates`). If None,
            `{"allow_nd": True}` is used.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, *)
            Checked labels of the training data set.
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or
            np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        batch_size : int
            Checked number of samples to be selected in one AL cycle.
        return_utilities : bool,
            Checked boolean value of `return_utilities`.
        """
        # Samples and their number of features.
        x_check_kwargs = (
            {"allow_nd": True} if check_X_dict is None else check_X_dict
        )
        X = check_array(X, **x_check_kwargs)
        self._check_n_features(X, reset=reset)

        # Labels: any dtype is accepted and NaN entries are tolerated.
        y = check_array(
            y, dtype=None, ensure_2d=False, force_all_finite="allow-nan"
        )
        check_consistent_length(X, y)

        # Missing label must be compatible with the label dtype.
        check_missing_label(self.missing_label, target_type=y.dtype)
        self.missing_label_ = self.missing_label

        # Number of unlabeled entries, shifted by one so that the value
        # handed to `check_random_state` is never zero.
        n_unlabeled = int(np.sum(is_unlabeled(y, self.missing_label_)))
        seed_multiplier = n_unlabeled + 1

        # Candidates: a 1-d array is treated as indices into `y`, anything
        # else as samples that must match the feature count of `X`.
        if candidates is not None:
            candidates = np.array(candidates)
            if candidates.ndim != 1:
                cand_check_kwargs = deepcopy(x_check_kwargs)
                cand_check_kwargs["ensure_2d"] = False
                candidates = check_array(candidates, **cand_check_kwargs)
                self._check_n_features(candidates, reset=False)
            else:
                candidates = check_indices(candidates, y, dim=0)

        # Scalar options.
        check_scalar(return_utilities, "return_utilities", bool)
        check_scalar(batch_size, target_type=int, name="batch_size", min_val=1)

        # Random state used by the concrete strategy.
        self.random_state_ = check_random_state(
            self.random_state, seed_multiplier
        )

        return X, y, candidates, batch_size, return_utilities


class SingleAnnotatorPoolQueryStrategy(PoolQueryStrategy):
    """Base class for all pool-based active learning query strategies with a
    single annotator in scikit-activeml.
    """

    @abstractmethod
    def query(
        self,
        X,
        y,
        *args,
        candidates=None,
        batch_size=1,
        return_utilities=False,
        **kwargs,
    ):
        """Determines for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e. including the labeled and
            unlabeled samples.
        y : array-like of shape (n_samples)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`).
        candidates : None or array-like of shape (n_candidates), dtype=int or
            array-like of shape (n_candidates, n_features),
            optional (default=None)
            If candidates is None, the unlabeled samples from (X,y) are
            considered as candidates.
            If candidates is of shape (n_candidates) and of type int,
            candidates is considered as the indices of the samples in (X,y).
            If candidates is of shape (n_candidates, n_features), the
            candidates are directly given in candidates (not necessarily
            contained in X). This is not supported by all query strategies.
        batch_size : int, optional (default=1)
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, optional (default=False)
            If true, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size)
            The query_indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.
            If candidates is None or of shape (n_candidates), the indexing
            refers to samples in X.
            If candidates is of shape (n_candidates, n_features), the indexing
            refers to samples in candidates.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or
            numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of samples after each selected sample of the batch,
            e.g., `utilities[0]` indicates the utilities used for selecting
            the first sample (with index `query_indices[0]`) of the batch.
            Utilities for labeled samples will be set to np.nan.
            If candidates is None or of shape (n_candidates), the indexing
            refers to samples in X.
            If candidates is of shape (n_candidates, n_features), the indexing
            refers to samples in candidates.
        """
        raise NotImplementedError

    def _validate_data(
        self,
        X,
        y,
        candidates,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate input data, all attributes and set or check the
        `n_features_in_` attribute.

        On top of the checks of the parent class, `y` is flattened to one
        dimension and `batch_size` is capped at the number of candidates.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e. including the labeled and
            unlabeled samples.
        y : array-like of shape (n_samples)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`).
        candidates : None or array-like of shape (n_candidates,), dtype=int or
            array-like of shape (n_candidates, n_features),
            optional (default=None)
            If candidates is None, the unlabeled samples from (X,y) are
            considered as candidates.
            If candidates is of shape (n_candidates,) and of type int,
            candidates is considered as the indices of the samples in (X,y).
            If candidates is of shape (n_candidates, n_features), the
            candidates are directly given in candidates (not necessarily
            contained in X). This is not supported by all query strategies.
        batch_size : int
            The number of samples to be selected in one AL cycle.
        return_utilities : bool
            If true, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        check_X_dict : dict or None, default=None
            Keyword arguments forwarded by the parent class to
            :func:`sklearn.utils.check_array`.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples)
            Checked labels of the training data set.
        candidates :  None or np.ndarray of shape (n_candidates), dtype=int or
            np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        batch_size : int
            Checked number of samples to be selected in one AL cycle; reduced
            (with a warning) to the number of candidates if it exceeds it.
        return_utilities : bool,
            Checked boolean value of `return_utilities`.
        """
        checked = super()._validate_data(
            X, y, candidates, batch_size, return_utilities, reset, check_X_dict
        )
        X, y, candidates, batch_size, return_utilities = checked

        # A single annotator implies one label per sample.
        y = column_or_1d(y, warn=True)

        # Without explicit candidates, every unlabeled sample is one.
        if candidates is None:
            n_candidates = int(
                np.sum(is_unlabeled(y, missing_label=self.missing_label_))
            )
        else:
            n_candidates = len(candidates)

        # A batch cannot contain more samples than there are candidates.
        if n_candidates < batch_size:
            warnings.warn(
                f"'batch_size={batch_size}' is larger than number of "
                f"candidates. Instead, 'batch_size={n_candidates}' was set."
            )
            batch_size = n_candidates

        return X, y, candidates, batch_size, return_utilities

    def _transform_candidates(
        self,
        candidates,
        X,
        y,
        enforce_mapping=False,
        allow_only_unlabeled=False,
    ):
        """
        Transforms the `candidates` parameter into a sample array and the
        corresponding index array `mapping` such that
        `candidates = X[mapping]`.

        Parameters
        ----------
        candidates :  None or np.ndarray of shape (n_candidates), dtype=int or
            np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
            If candidates is None, the unlabeled samples from (X,y) are
            considered as candidates.
            If candidates is of shape (n_candidates) and of type int,
            candidates is considered as the indices of the samples in (X,y).
            If candidates is of shape (n_candidates, n_features), the
            candidates are directly given in candidates (not necessarily
            contained in X). This is not supported by all query strategies.
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples)
            Checked labels of the training data set.
        enforce_mapping : bool, default=False
            If True, an exception is raised when no exact mapping can be
            determined (i.e., `mapping` is None).
        allow_only_unlabeled : bool, default=False
            If True, an exception is raised when indices of candidates contain
            labeled samples.

        Returns
        -------
        candidates : np.ndarray of shape (n_candidates, n_features)
            Candidate samples from which the strategy can query the label.
        mapping : np.ndarray of shape (n_candidates) or None
            Index array that maps `candidates` to `X`.
            (`candidates = X[mapping]`)

        Raises
        ------
        ValueError
            If `allow_only_unlabeled` is True and the candidate indices
            refer to at least one labeled sample.
        MappingError
            If `enforce_mapping` is True but `candidates` holds samples
            instead of indices.
        """
        # No candidates given: fall back to all unlabeled samples.
        if candidates is None:
            unlabeled_idx = unlabeled_indices(y, self.missing_label_)
            return X[unlabeled_idx], unlabeled_idx

        # Index array: the candidates are their own mapping.
        if candidates.ndim == 1:
            if (
                allow_only_unlabeled
                and is_labeled(y[candidates], self.missing_label_).any()
            ):
                raise ValueError(
                    "Candidates must not contain labeled samples."
                )
            return X[candidates], candidates

        # Sample array: there is no index mapping into `X`.
        if enforce_mapping:
            raise MappingError(
                "Mapping `candidates` to `X` is not "
                "possible but `enforce_mapping` is True. "
                "Use index array for `candidates` instead."
            )
        return candidates, None


class MultiAnnotatorPoolQueryStrategy(PoolQueryStrategy):
    """Base class for all pool-based active learning query strategies with
    multiple annotators in scikit-activeml.

    Parameters
    ----------
    missing_label : scalar or string or np.nan or None, optional
    (default=np.nan)
        Value to represent a missing label.
    random_state : int or RandomState instance, optional (default=None)
        Controls the randomness of the estimator.
    """

    @abstractmethod
    def query(
        self,
        X,
        y,
        *args,
        candidates=None,
        annotators=None,
        batch_size=1,
        return_utilities=False,
        **kwargs,
    ):
        """Determines which candidate sample is to be annotated by which
        annotator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples, n_annotators)
            Labels of the training data set for each annotator (possibly
            including unlabeled ones indicated by `self.missing_label`),
            meaning that `y[i, j]` contains the label provided by annotator
            `j` for sample `i`.
        candidates : None or array-like of shape (n_candidates), dtype=int or
            array-like of shape (n_candidates, n_features),
            optional (default=None)
            See parameter `annotators`.
        annotators : None or array-like of shape (n_avl_annotators), dtype=int
            or array-like of shape (n_candidates, n_annotators), optional
            (default=None)
            If candidate samples and annotators are not specified, i.e.,
            `candidates=None`, `annotators=None` the unlabeled target values,
            `y`, are the candidates annotator-sample-pairs.
            If candidate samples and available annotators are specified:
            The annotator-sample-pairs, for which the sample is a candidate
            sample and the annotator is an available annotator are considered
            as candidate annotator-sample-pairs.
            If `candidates` is None, all samples of `X` are considered as
            candidate samples. In this case `n_candidates` equals `len(X)`.
            If `candidates` is of shape `(n_candidates,)` and of type int,
            `candidates` is considered as the indices of the sample candidates
            in `(X, y)`.
            If `candidates` is of shape (n_candidates, n_features), the
            sample candidates are directly given in `candidates` (not
            necessarily contained in `X`). This is not supported by all query
            strategies.
            If `annotators` is `None`, all annotators are considered as
            available annotators.
            If `annotators` is of shape (n_avl_annotators), and of type int,
            `annotators` is considered as the indices of the available
            annotators.
            If `annotators` is a boolean array of shape (n_candidates,
            n_annotators) the annotator-sample-pairs, for which the sample
            is a candidate sample and the boolean matrix has entry `True` are
            considered as candidate annotator-sample pairs.
        batch_size : int, optional (default=1)
            The number of annotators-sample pairs to be selected in one AL
            cycle.
        return_utilities : bool, optional (default=False)
            If True, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : np.ndarray of shape (batch_size, 2)
            The query_indices indicate which candidate sample is to be
            annotated by which annotator, e.g., `query_indices[:, 0]`
            indicates the selected candidate samples and
            `query_indices[:, 1]` indicates the respectively selected
            annotators.
            If candidates is None or of shape (n_candidates), the indexing
            refers to samples in X.
            If candidates is of shape (n_candidates, n_features), the indexing
            refers to samples in candidates.
        utilities: numpy.ndarray of shape (batch_size, n_samples, n_annotators)
         or numpy.ndarray of shape (batch_size, n_candidates, n_annotators)
            The utilities of all candidate samples w.r.t. to the available
            annotators after each selected sample of the batch, e.g.,
            `utilities[0, :, j]` indicates the utilities used for selecting
            the first sample-annotator-pair (with indices `query_indices[0]`).
            If `candidates is None` or of shape (n_candidates), the indexing
            refers to samples in `X`.
            If `candidates` is of shape (n_candidates, n_features), the
            indexing refers to samples in `candidates`.
        """
        raise NotImplementedError

    def _validate_data(
        self,
        X,
        y,
        candidates,
        annotators,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate input data, all attributes and set or check the
        `n_features_in_` attribute.

        On top of the checks of the parent class, `y` must be 2-d,
        `annotators` is checked, and `batch_size` is capped at the number of
        candidate annotator-sample-pairs.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples, n_annotators)
            Labels of the training data set for each annotator (possibly
            including unlabeled ones indicated by `self.missing_label`),
            meaning that `y[i, j]` contains the label provided by annotator
            `j` for sample `i`.
        candidates : None or array-like of shape (n_candidates), dtype=int or
            array-like of shape (n_candidates, n_features),
            optional (default=None)
            See `annotators`.
        annotators : None or array-like of shape (n_avl_annotators), dtype=int
            or array-like of shape (n_candidates, n_annotators), optional
            (default=None)
            If candidate samples and annotators are not specified, i.e.,
            `candidates=None`, `annotators=None` the unlabeled target values,
            `y`, are the candidates annotator-sample-pairs.
            If candidate samples and available annotators are specified:
            The annotator-sample-pairs, for which the sample is a candidate
            sample and the annotator is an available annotator are considered
            as candidate annotator-sample-pairs.
            If `candidates` is None, all samples of `X` are considered as
            candidate samples. In this case `n_candidates` equals `len(X)`.
            If `candidates` is of shape `(n_candidates,)` and of type int,
            `candidates` is considered as the indices of the sample candidates
            in `(X, y)`.
            If `candidates` is of shape `(n_candidates, n_features)`, the
            sample candidates are directly given in `candidates` (not
            necessarily contained in `X`). This is not supported by all query
            strategies.
            If `annotators` is `None`, all annotators are considered as
            available annotators.
            If `annotators` is of shape `(n_avl_annotators)`, and of type int,
            `annotators` is considered as the indices of the available
            annotators.
            If `annotators` is a boolean array of shape `(n_candidates,
            n_annotators)` the annotator-sample-pairs, for which the sample
            is a candidate sample and the boolean matrix has entry `True` are
            considered as candidate annotator-sample-pairs.
        batch_size : int
            The number of annotator-sample-pairs to be selected in one AL
            cycle.
        return_utilities : bool
            If true, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        check_X_dict : dict or None, default=None
            Keyword arguments forwarded by the parent class to
            :func:`sklearn.utils.check_array`.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, n_annotators)
            Checked labels of the training data set.
        candidates :  None or np.ndarray of shape (n_candidates), dtype=int or
            np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        annotators : None or np.ndarray of shape (n_avl_annotators), dtype=int
            or np.ndarray of shape (n_candidates, n_annotators)
            Checked annotator indices or boolean annotator array.
        batch_size : int
            Checked number of annotator-sample-pairs to be selected in one AL
            cycle; reduced (with a warning) to the number of candidate pairs
            if it exceeds it.
        return_utilities : bool,
            Checked boolean value of `return_utilities`.

        Raises
        ------
        ValueError
            If `annotators` has more than two dimensions.
        """
        checked = super()._validate_data(
            X, y, candidates, batch_size, return_utilities, reset, check_X_dict
        )
        X, y, candidates, batch_size, return_utilities = checked

        # Pure validation that `y` is 2-d; the converted copy is discarded
        # on purpose so that `y` keeps the dtype accepted by the parent.
        check_array(y, ensure_2d=True, force_all_finite="allow-nan")
        unlabeled_pairs = is_unlabeled(y, missing_label=self.missing_label_)
        n_annotators = y.shape[1]

        if annotators is not None:
            annotators = check_array(
                annotators, ensure_2d=False, allow_nd=True
            )

            if annotators.ndim == 1:
                # Indices of the available annotators (columns of `y`).
                annotators = check_indices(annotators, y, dim=1)
            elif annotators.ndim == 2:
                # Boolean matrix with one row per candidate sample and one
                # column per annotator.
                annotators = check_array(annotators, dtype=bool)
                if candidates is None:
                    check_consistent_length(X, annotators)
                else:
                    check_consistent_length(candidates, annotators)
                check_consistent_length(y.T, annotators.T)
            else:
                raise ValueError(
                    "`annotators` must be either None, 1d or 2d array-like."
                )

        # Count the selectable annotator-sample-pairs.
        if annotators is None:
            if candidates is None:
                n_candidate_pairs = int(np.sum(unlabeled_pairs))
            else:
                n_candidate_pairs = len(candidates) * n_annotators
        elif annotators.ndim == 1:
            if candidates is None:
                n_candidate_pairs = len(X) * len(annotators)
            else:
                n_candidate_pairs = len(candidates) * len(annotators)
        else:
            n_candidate_pairs = int(np.sum(annotators))

        # A batch cannot contain more pairs than there are candidates.
        if n_candidate_pairs < batch_size:
            warnings.warn(
                f"'batch_size={batch_size}' is larger than number of "
                f"candidates pairs. Instead, 'batch_size={n_candidate_pairs}'"
                f" was set."
            )
            batch_size = n_candidate_pairs

        return X, y, candidates, annotators, batch_size, return_utilities

    @staticmethod
    def _full_annotator_mask(annotators, n_candidates, n_annotators):
        """Build the annotator-sample-pair mask for `n_candidates` samples.

        `None` selects every annotator, a 1-d index array selects the given
        annotator columns, and a 2-d array is returned unchanged.
        """
        if annotators is None:
            return np.full((n_candidates, n_annotators), True)
        if annotators.ndim == 1:
            mask = np.full((n_candidates, n_annotators), False)
            mask[:, annotators] = True
            return mask
        return annotators

    def _transform_cand_annot(
        self, candidates, annotators, X, y, enforce_mapping=False
    ):
        """
        Transforms the `candidates` parameter into a sample array and the
        corresponding index array `mapping` such that
        `candidates = X[mapping]`, and transforms `annotators` into a boolean
        array such that `A_cand` represents the available annotator sample
        pairs for the samples of candidates.

        Parameters
        ----------
        candidates : None or np.ndarray of shape (n_candidates), dtype=int or
            np.ndarray of shape (n_candidates, n_features),
            optional (default=None)
            See `annotators`.
        annotators : None or np.ndarray of shape (n_avl_annotators), dtype=int
            or np.ndarray of shape (n_candidates, n_annotators), optional
            (default=None)
            If candidate samples and annotators are not specified, i.e.,
            `candidates=None`, `annotators=None` the unlabeled target values,
            `y`, are the candidates annotator-sample-pairs.
            If candidate samples and available annotators are specified:
            The annotator-sample-pairs, for which the sample is a candidate
            sample and the annotator is an available annotator are considered
            as candidate annotator-sample-pairs.
            If `candidates` is `None`, all samples of `X` are considered as
            candidate samples. In this case `n_candidates` equals `len(X)`.
            If `candidates` is of shape (n_candidates,) and of type int,
            `candidates` is considered as the indices of the sample candidates
            in `(X, y)`.
            If `candidates` is of shape `(n_candidates, n_features)`, the
            sample candidates are directly given in `candidates` (not
            necessarily contained in `X`). This is not supported by all query
            strategies.
            If `annotators` is `None`, all annotators are considered as
            available annotators.
            If `annotators` is of shape (n_avl_annotators), and of type int,
            `annotators` is considered as the indices of the available
            annotators.
            If `annotators` is a boolean array of shape `(n_candidates,
            n_annotators)` the annotator-sample-pairs, for which the sample
            is a candidate sample and the boolean matrix has entry `True` are
            considered as candidate annotator-sample-pairs.
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples, n_annotators)
            Checked labels of the training data set.
        enforce_mapping : bool, optional (default=False)
            If `True`, an exception is raised when no exact mapping can be
            determined (i.e., `mapping` is `None`).

        Returns
        -------
        candidates : np.ndarray of shape (n_selectable_candidates, n_features)
            Candidate samples from which the strategy can query the label.
        mapping : np.ndarray of shape (n_selectable_candidates) or None
            Index array that maps `candidates` to `X`
            (`candidates = X[mapping]`).
        A_cand : np.ndarray of shape(n_selectable_candidates, n_annotators)
            Available annotator-sample-pairs with respect to `candidates`.
            A 2-d `annotators` array is returned unchanged.

        Raises
        ------
        ValueError
            If `enforce_mapping` is True but `candidates` holds samples
            instead of indices.
        """
        n_annotators = y.shape[1]

        # Candidates given as samples: no index mapping into `X` exists.
        if candidates is not None and candidates.ndim == 2:
            if enforce_mapping:
                # NOTE(review): the single-annotator counterpart raises
                # `MappingError` here; `ValueError` is kept so that callers
                # catching it continue to work.
                raise ValueError(
                    "Mapping `candidates` to `X` is not possible "
                    "but `enforce_mapping` is True. Use index "
                    "array for `candidates` instead."
                )
            A_cand = self._full_annotator_mask(
                annotators, len(candidates), n_annotators
            )
            return candidates, None, A_cand

        # Neither candidates nor annotators given: the unlabeled entries of
        # `y` define the pairs, restricted to samples with at least one
        # unlabeled entry.
        if candidates is None and annotators is None:
            unlbd_pairs = is_unlabeled(y, self.missing_label_)
            mapping = np.flatnonzero(np.any(unlbd_pairs, axis=1))
            return X[mapping], mapping, unlbd_pairs[mapping]

        # Otherwise the mapping is either every sample of `X` or the given
        # index array.
        if candidates is None:
            mapping = np.arange(len(X), dtype=int)
        else:
            mapping = candidates

        # The mask is always built as a boolean array; deriving it from `y`
        # (e.g. via `np.full_like`) would inherit the dtype of the labels.
        A_cand = self._full_annotator_mask(
            annotators, len(mapping), n_annotators
        )
        return X[mapping], mapping, A_cand


class BudgetManager(ABC, BaseEstimator):
    """Base class for all budget managers for stream-based active learning
    in scikit-activeml to model budgeting constraints.

    Parameters
    ----------
    budget : float, default=None
        Specifies the ratio of instances which are allowed to be sampled, with
        0 < budget <= 1 (the lower bound is validated as exclusive). If budget
        is None, it is replaced with the default budget 0.1.
    """

    def __init__(self, budget=None):
        self.budget = budget

    @abstractmethod
    def query_by_utility(self, utilities, *args, **kwargs):
        """Ask the budget manager which utilities are sufficient to query the
        corresponding instance.

        Parameters
        ----------
        utilities : ndarray of shape (n_samples,)
            The utilities provided by the stream-based active learning
            strategy, which are used to determine whether sampling an instance
            is worth it given the budgeting constraint.

        Returns
        -------
        queried_indices : ndarray of shape (n_queried_instances,)
            The indices of instances represented by utilities which should be
            queried, with 0 <= n_queried_instances <= n_samples.
        """
        raise NotImplementedError

    @abstractmethod
    def update(self, candidates, queried_indices, *args, **kwargs):
        """Updates the BudgetManager.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape
        (n_samples, n_features)
            The instances which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        queried_indices : array-like
            Indicates which instances from candidates have been queried.

        Returns
        -------
        self : BudgetManager
            The BudgetManager returns itself, after it is updated.
        """
        raise NotImplementedError

    def _validate_budget(self):
        """Store the effective budget in `budget_` and validate it.

        `budget_` is `budget` if that is not None and the default value 0.1
        otherwise. The value is then checked via `check_scalar` to be a float
        with an exclusive lower bound of 0.0 and an upper bound of 1.0.
        """
        self.budget_ = 0.1 if self.budget is None else self.budget
        check_scalar(
            self.budget_,
            "budget",
            float,
            min_val=0.0,
            max_val=1.0,
            min_inclusive=False,
        )

    def _validate_data(self, utilities, *args, **kwargs):
        """Validate input data.

        Parameters
        ----------
        utilities : ndarray of shape (n_samples,)
            The utilities provided by the stream-based active learning
            strategy.

        Returns
        -------
        utilities : ndarray of shape (n_samples,)
            Checked utilities. The array is returned unchanged.

        Raises
        ------
        TypeError
            If `utilities` is not a `numpy.ndarray`.
        """
        # Only the container type is checked here; shape and dtype are not.
        if not isinstance(utilities, np.ndarray):
            raise TypeError(
                "{} is not a valid type for utilities".format(type(utilities))
            )
        # Validating the budget also (re-)sets the attribute `budget_`.
        self._validate_budget()
        return utilities


class SingleAnnotatorStreamQueryStrategy(QueryStrategy):
    """Base class for all stream-based active learning query strategies in
    scikit-activeml.

    Parameters
    ----------
    budget : float or None
        The budget which models the budgeting constraint used in
        the stream-based active learning setting. If None, the default
        budget 0.1 is used during validation.
    random_state : int, RandomState instance, default=None
        Controls the randomness of the estimator.
    """

    def __init__(self, budget, random_state=None):
        super().__init__(random_state=random_state)
        self.budget = budget

    @abstractmethod
    def query(self, candidates, *args, return_utilities=False, **kwargs):
        """Ask the query strategy which instances in candidates to acquire.

        The query strategy determines the most useful instances in candidates,
        which can be acquired within the budgeting constraint specified by the
        budget manager.
        Please note that this method does not alter the internal state of the
        query strategy. To adapt the query strategy to the selected candidates,
        use update(...) with the selected candidates.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape
        (n_samples, n_features)
            The instances which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.

        return_utilities : bool, optional
            If true, also return the utilities based on the query strategy.
            The default is False.

        Returns
        -------
        queried_indices : ndarray of shape (n_sampled_instances,)
            The indices of instances in candidates which should be sampled,
            with 0 <= n_sampled_instances <= n_samples.

        utilities : ndarray of shape (n_samples,), optional
            The utilities based on the query strategy. Only provided if
            return_utilities is True.
        """
        raise NotImplementedError

    @abstractmethod
    def update(
        self,
        candidates,
        queried_indices,
        *args,
        budget_manager_param_dict=None,
        **kwargs,
    ):
        """Update the query strategy with the decisions taken.

        This function should be used in conjunction with the query function,
        when the instances queried from query(...) may differ from the
        instances queried in the end. As query(...) is documented not to
        alter the internal state, the final decisions are provided via
        update(...). This is especially helpful when developing wrapper query
        strategies.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape
        (n_samples, n_features)
            The instances which could be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.

        queried_indices : array-like
            Indicates which instances from candidates have been queried.

        budget_manager_param_dict : kwargs, optional
            Optional kwargs for the budget manager.

        Returns
        -------
        self : SingleAnnotatorStreamQueryStrategy
            The query strategy returns itself, after it is updated.
        """
        raise NotImplementedError

    def _validate_random_state(self):
        """Initialise and validate the attribute `random_state_`.

        On the first call, `random_state_` is set to a deep copy of
        `random_state`. On every call, `random_state_` is passed through
        `check_random_state`. See also
        :func:`~sklearn.utils.check_random_state`.
        """
        # Copy only once so that later calls keep working on the same
        # `random_state_` instead of restarting from `random_state`.
        if not hasattr(self, "random_state_"):
            self.random_state_ = deepcopy(self.random_state)
        self.random_state_ = check_random_state(self.random_state_)

    def _validate_budget(self):
        """Store the effective budget in `budget_` and validate it.

        `budget_` is `budget` if that is not None and the default value 0.1
        otherwise. The value is then checked via `check_scalar` to be a float
        with an exclusive lower bound of 0.0 and an upper bound of 1.0.
        """
        self.budget_ = 0.1 if self.budget is None else self.budget
        check_scalar(
            self.budget_,
            "budget",
            float,
            min_val=0.0,
            max_val=1.0,
            min_inclusive=False,
        )

    def _validate_data(
        self,
        candidates,
        return_utilities,
        *args,
        reset=True,
        **check_candidates_params,
    ):
        """Validate input data and set or check the `n_features_in_` attribute.

        Besides the arguments, this also validates the random state and the
        budget, i.e., it sets the attributes `random_state_` and `budget_`.

        Parameters
        ----------
        candidates : array-like of shape (n_candidates, n_features)
            The instances which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        return_utilities : bool
            If true, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        **check_candidates_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        candidates : np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        # Check candidate instances.
        candidates = check_array(candidates, **check_candidates_params)

        # Check number of features.
        self._check_n_features(candidates, reset=reset)

        # Check return_utilities.
        check_scalar(return_utilities, "return_utilities", bool)

        # Check random state.
        self._validate_random_state()

        # Check budget.
        self._validate_budget()

        return candidates, return_utilities


class SkactivemlClassifier(BaseEstimator, ClassifierMixin, ABC):
    """SkactivemlClassifier

    Base class for scikit-activeml classifiers such that missing labels,
    user-defined classes, and cost-sensitive classification (i.e., cost matrix)
    can be handled.

    Parameters
    ----------
    classes : array-like of shape (n_classes), default=None
        Holds the label for each class. If none, the classes are determined
        during the fit.
    missing_label : scalar, string, np.nan, or None, default=np.nan
        Value to represent a missing label.
    cost_matrix : array-like of shape (n_classes, n_classes)
        Cost matrix with `cost_matrix[i,j]` indicating cost of predicting class
        `classes[j]`  for a sample of class `classes[i]`. Can be only set, if
        classes is not none.
    random_state : int or RandomState instance or None, default=None
        Determines random number for `predict` method. Pass an int for
        reproducible results across multiple method calls.

    Attributes
    ----------
    classes_ : array-like, shape (n_classes)
        Holds the label for each class after fitting.
    cost_matrix_ : array-like of shape (classes, classes)
        Cost matrix after fitting with `cost_matrix_[i,j]` indicating cost of
        predicting class `classes_[j]`  for a sample of class `classes_[i]`.
    """

    def __init__(
        self,
        classes=None,
        missing_label=MISSING_LABEL,
        cost_matrix=None,
        random_state=None,
    ):
        self.classes = classes
        self.missing_label = missing_label
        self.cost_matrix = cost_matrix
        self.random_state = random_state

    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """Fit the model using X as training data and y as class labels.

        Parameters
        ----------
        X : matrix-like, shape (n_samples, n_features)
            The sample matrix X is the feature matrix representing the samples.
        y : array-like, shape (n_samples) or (n_samples, n_outputs)
            It contains the class labels of the training samples.
            The number of class labels may be variable for the samples, where
            missing labels are represented by the attribute 'missing_label'.
        sample_weight : array-like, shape (n_samples) or (n_samples, n_outputs)
            It contains the weights of the training samples' class labels.
            It must have the same shape as y.

        Returns
        -------
        self: skactiveml.base.SkactivemlClassifier,
            The `skactiveml.base.SkactivemlClassifier` object fitted on the
            training data.
        """
        raise NotImplementedError

    def predict_proba(self, X):
        """Return probability estimates for the test data X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Test samples.

        Returns
        -------
        P : numpy.ndarray, shape (n_samples, classes)
            The class probabilities of the test samples. Classes are ordered
            according to 'classes_'.
        """
        raise NotImplementedError

    def predict(self, X):
        """Return class label predictions for the test samples `X`.

        The predicted class of a sample is the one with the minimum expected
        cost, where the expected costs are the product of the class
        probabilities and `cost_matrix_`. Ties are resolved by `rand_argmin`
        using `random_state_`.

        Parameters
        ----------
        X :  array-like of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        y : numpy.ndarray of shape (n_samples)
            Predicted class labels of the test samples `X`, cast to the dtype
            of `classes_`.
        """
        P = self.predict_proba(X)
        # costs[i, j] = sum_k P[i, k] * cost_matrix_[k, j], i.e., the expected
        # cost of predicting class j for sample i.
        costs = np.dot(P, self.cost_matrix_)
        y_pred = rand_argmin(costs, random_state=self.random_state_, axis=1)
        # Map the class indices back to the original class labels.
        y_pred = self._le.inverse_transform(y_pred)
        y_pred = np.asarray(y_pred, dtype=self.classes_.dtype)
        return y_pred

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the given test data and labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,)
            True labels for `X`.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of `self.predict(X)` regarding `y`.
        """
        # Both label vectors are encoded with the fitted label encoder before
        # they are compared.
        y = self._le.transform(y)
        y_pred = self._le.transform(self.predict(X))
        return accuracy_score(y, y_pred, sample_weight=sample_weight)

    def _validate_data(
        self,
        X,
        y,
        sample_weight=None,
        check_X_dict=None,
        check_y_dict=None,
        y_ensure_1d=True,
    ):
        """Validate the training data and set the fitted attributes.

        Sets the attributes `random_state_`, `_le`, `classes_`, and
        `cost_matrix_`.

        Parameters
        ----------
        X : array-like
            Training samples, checked via `check_array` with `check_X_dict`.
        y : array-like
            Class labels, checked via `check_array` with `check_y_dict` and
            encoded by the label encoder `_le` if `y` is not empty.
        sample_weight : array-like or None, default=None
            Weights of the labels, checked via `check_array` with
            `check_y_dict`. Must have the same shape as the checked `y`.
        check_X_dict : dict or None, default=None
            Keyword arguments passed to `check_array` for `X`. The passed
            dictionary is not modified.
        check_y_dict : dict or None, default=None
            Keyword arguments passed to `check_array` for `y` and
            `sample_weight`.
        y_ensure_1d : bool, default=True
            Whether a non-empty `y` is passed through `column_or_1d`.

        Returns
        -------
        X : numpy.ndarray
            Checked training samples.
        y : numpy.ndarray
            Checked (and, if not empty, encoded) class labels.
        sample_weight : numpy.ndarray or None
            Checked sample weights.

        Raises
        ------
        ValueError
            If no class label is known or if the shapes of `y` and
            `sample_weight` differ.
        """
        if check_X_dict is None:
            check_X_dict = {"ensure_min_samples": 0, "ensure_min_features": 0}
        else:
            # Work on a shallow copy because `ensure_2d` may be overwritten
            # below; the caller's dictionary must stay untouched.
            check_X_dict = dict(check_X_dict)
        if check_y_dict is None:
            check_y_dict = {
                "ensure_min_samples": 0,
                "ensure_min_features": 0,
                "ensure_2d": False,
                "force_all_finite": False,
                "dtype": None,
            }

        # Check common classifier parameters.
        check_classifier_params(
            self.classes, self.missing_label, self.cost_matrix
        )

        # Store and check random state.
        self.random_state_ = check_random_state(self.random_state)

        # Create label encoder.
        self._le = ExtLabelEncoder(
            classes=self.classes, missing_label=self.missing_label
        )

        # Check input parameters.
        y = check_array(y, **check_y_dict)
        if len(y) > 0:
            y = column_or_1d(y) if y_ensure_1d else y
            y = self._le.fit_transform(y)
            # The encoded labels are checked with -1 as missing label.
            is_lbdl = is_labeled(y, missing_label=-1)
            if len(y[is_lbdl]) > 0:
                check_classification_targets(y[is_lbdl])
            if len(self._le.classes_) == 0:
                raise ValueError(
                    "No class label is known because 'y' contains no actual "
                    "class labels and 'classes' is not defined. Change at "
                    "least on of both to overcome this error."
                )
        else:
            # Without any labels, only the user-defined classes can be used
            # and an empty `X` is not required to be two-dimensional.
            self._le.fit(self.classes)
            check_X_dict["ensure_2d"] = False
        X = check_array(X, **check_X_dict)
        check_consistent_length(X, y)

        # Update detected classes.
        self.classes_ = self._le.classes_

        # Check sample weights.
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, **check_y_dict)
            if not np.array_equal(y.shape, sample_weight.shape):
                raise ValueError(
                    f"`y` has the shape {y.shape} and `sample_weight` has the "
                    f"shape {sample_weight.shape}. Both need to have "
                    f"identical shapes."
                )

        # Update cost matrix: zero-one costs are the default.
        self.cost_matrix_ = (
            1 - np.eye(len(self.classes_))
            if self.cost_matrix is None
            else self.cost_matrix
        )
        self.cost_matrix_ = check_cost_matrix(
            self.cost_matrix_, len(self.classes_)
        )
        if self.classes is not None:
            # Permute rows and columns by the sorted order of the user-defined
            # classes (presumably to match the order of `classes_`).
            class_indices = np.argsort(self.classes)
            self.cost_matrix_ = self.cost_matrix_[class_indices]
            self.cost_matrix_ = self.cost_matrix_[:, class_indices]

        return X, y, sample_weight

    def _check_n_features(self, X, reset):
        """Set or check the attribute `n_features_in_`.

        Parameters
        ----------
        X : array-like
            Samples whose number of features is stored or checked.
        reset : bool
            If True, `n_features_in_` is set to `X.shape[1]` or to None if `X`
            is empty. If False, the check of the parent class is executed,
            unless `n_features_in_` is None.
        """
        if reset:
            self.n_features_in_ = X.shape[1] if len(X) > 0 else None
        elif self.n_features_in_ is not None:
            super()._check_n_features(X, reset=reset)


class ClassFrequencyEstimator(SkactivemlClassifier):
    """ClassFrequencyEstimator

    Extends scikit-activeml classifiers to estimators that are able to estimate
    class frequencies for given samples (by calling 'predict_freq').

    Parameters
    ----------
    classes : array-like, shape (n_classes), default=None
        Holds the label for each class. If none, the classes are determined
        during the fit.
    missing_label : scalar or str or np.nan or None, default=np.nan
        Value to represent a missing label.
    cost_matrix : array-like of shape (n_classes, n_classes)
        Cost matrix with `cost_matrix[i,j]` indicating cost of predicting class
        `classes[j]`  for a sample of class `classes[i]`. Can be only set, if
        classes is not none.
    class_prior : float or array-like, shape (n_classes), default=0
        Prior observations of the class frequency estimates. If `class_prior`
        is an array, the entry `class_prior[i]` indicates the non-negative
        prior number of samples belonging to class `classes_[i]`. If
        `class_prior` is a float, `class_prior` indicates the non-negative
        prior number of samples per class.
    random_state : int or np.RandomState or None, default=None
        Determines random number for 'predict' method. Pass an int for
        reproducible results across multiple method calls.

    Attributes
    ----------
    classes_ : np.ndarray of shape (n_classes)
        Holds the label for each class after fitting.
    class_prior_ : np.ndarray of shape (n_classes)
        Prior observations of the class frequency estimates. The entry
        `class_prior_[i]` indicates the non-negative prior number of samples
        belonging to class `classes_[i]`.
    cost_matrix_ : np.ndarray of shape (classes, classes)
        Cost matrix with `cost_matrix_[i,j]` indicating cost of predicting
        class `classes_[j]` for a sample of class `classes_[i]`.
    """

    def __init__(
        self,
        class_prior=0,
        classes=None,
        missing_label=MISSING_LABEL,
        cost_matrix=None,
        random_state=None,
    ):
        super().__init__(
            classes=classes,
            missing_label=missing_label,
            cost_matrix=cost_matrix,
            random_state=random_state,
        )
        self.class_prior = class_prior

    @abstractmethod
    def predict_freq(self, X):
        """Return class frequency estimates for the test samples `X`.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            Test samples whose class frequencies are to be estimated.

        Returns
        -------
        F: array-like of shape (n_samples, classes)
            The class frequency estimates of the test samples 'X'. Classes are
            ordered according to attribute 'classes_'.
        """
        raise NotImplementedError

    def predict_proba(self, X):
        """Return probability estimates for the test data `X`.

        The probabilities are the row-wise normalized sums of the frequency
        estimates returned by `predict_freq` and `class_prior_`. Rows summing
        up to zero are replaced by the uniform distribution.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features) or
        shape (n_samples, m_samples) if metric == 'precomputed'
            Input samples.

        Returns
        -------
        P : array-like of shape (n_samples, classes)
            The class probabilities of the test samples. Classes are ordered
            according to classes_.
        """
        # Normalize probabilities of each sample.
        P = self.predict_freq(X) + self.class_prior_
        normalizer = np.sum(P, axis=1)
        P[normalizer > 0] /= normalizer[normalizer > 0, np.newaxis]
        # Without any observation, each class is equally probable.
        P[normalizer == 0, :] = [1 / len(self.classes_)] * len(self.classes_)
        return P

    def sample_proba(self, X, n_samples=10, random_state=None):
        """Samples probability vectors from Dirichlet distributions whose
        parameters `alphas` are defined as the sum of the frequency estimates
        returned by `predict_freq` and the `class_prior`.

        Parameters
        ----------
        X : array-like of shape (n_test_samples, n_features)
            Test samples for which `n_samples` probability vectors are to be
            sampled.
        n_samples : int, default=10
            Number of probability vectors to sample for each `X[i]`.
        random_state : int or numpy.random.RandomState or None, default=None
            Ensure reproducibility when sampling probability vectors from the
            Dirichlet distributions.

        Returns
        -------
        P : array-like of shape (n_samples, n_test_samples, n_classes)
            There are `n_samples` class probability vectors for each test
            sample in `X`. Classes are ordered according to classes_.

        Raises
        ------
        ValueError
            If any entry of `alphas` is zero.
        """
        random_state = check_random_state(random_state)
        alphas = self.predict_freq(X) + self.class_prior_
        # Each row of `alphas` is repeated `n_samples` times in a row, such
        # that row `i * n_samples + s` belongs to sample `s` of `X[i]`.
        alphas = alphas.repeat(repeats=n_samples, axis=0)
        if (alphas == 0).any():
            raise ValueError(
                "There are zero frequency observations. "
                "Set `class_prior > 0` to avoid this error."
            )
        # Normalized independent gamma variates are Dirichlet distributed.
        R = random_state.standard_gamma(alphas)
        R_sums = R.sum(axis=-1)
        # Rows whose gamma variates are all zero cannot be normalized; for
        # them, a randomly selected class obtains the full probability mass.
        is_zero = (R_sums == 0.0).ravel()
        sampled_class_indices = random_state.choice(
            np.array(R.shape[-1]), size=is_zero.sum()
        )
        R[is_zero, sampled_class_indices] = 1.0
        P = R / R.sum(axis=-1, keepdims=True)
        # Fortran order lets the first axis (sample index) vary fastest, which
        # matches the row layout created by `repeat` above.
        P = P.reshape(n_samples, len(X), P.shape[-1], order="F")
        return P

    def _validate_data(
        self,
        X,
        y,
        sample_weight=None,
        check_X_dict=None,
        check_y_dict=None,
        y_ensure_1d=True,
    ):
        """Validate the training data and set the fitted attributes.

        Delegates to the validation of the parent class with the same
        arguments and additionally sets the attribute `class_prior_`.

        Parameters
        ----------
        X : array-like
            Training samples.
        y : array-like
            Class labels of the training samples.
        sample_weight : array-like or None, default=None
            Weights of the class labels.
        check_X_dict : dict or None, default=None
            Forwarded to the validation of the parent class.
        check_y_dict : dict or None, default=None
            Forwarded to the validation of the parent class.
        y_ensure_1d : bool, default=True
            Forwarded to the validation of the parent class.

        Returns
        -------
        X, y, sample_weight
            The values returned by the validation of the parent class.
        """
        X, y, sample_weight = super()._validate_data(
            X=X,
            y=y,
            sample_weight=sample_weight,
            check_X_dict=check_X_dict,
            check_y_dict=check_y_dict,
            y_ensure_1d=y_ensure_1d,
        )

        # Check class prior.
        self.class_prior_ = check_class_prior(
            self.class_prior, len(self.classes_)
        )

        return X, y, sample_weight


class SkactivemlRegressor(BaseEstimator, RegressorMixin, ABC):
    """SkactivemlRegressor

    Base class for scikit-activeml regressors.

    Parameters
    ----------
    missing_label : scalar, string, np.nan, or None, optional
    (default=skactiveml.utils.MISSING_LABEL)
        Value to represent a missing label.
    random_state : int, RandomState or None, optional (default=None)
        Determines random number for 'fit' and 'predict' method. Pass an int
        for reproducible results across multiple method calls.
    """

    def __init__(self, missing_label=MISSING_LABEL, random_state=None):
        self.missing_label = missing_label
        self.random_state = random_state

    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """Fit the model using X as training data and y as numerical labels.

        Parameters
        ----------
        X : matrix-like, shape (n_samples, n_features)
            The sample matrix X is the feature matrix representing the samples.
        y : array-like, shape (n_samples) or (n_samples, n_targets)
            It contains the labels of the training samples.
            The number of numerical labels may be variable for the samples,
            where missing labels are represented by the attribute
            'missing_label'.
        sample_weight : array-like, shape (n_samples)
            It contains the weights of the training samples' values.

        Returns
        -------
        self: skactiveml.base.SkactivemlRegressor,
            The `skactiveml.base.SkactivemlRegressor` object fitted on the
            training data.
        """
        raise NotImplementedError

    @abstractmethod
    def predict(self, X):
        """Return value predictions for the test samples X.

        Parameters
        ----------
        X :  array-like, shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        y : numpy.ndarray, shape (n_samples)
            Predicted values of the test samples 'X'.
        """
        raise NotImplementedError

    def _validate_data(
        self,
        X,
        y,
        sample_weight=None,
        check_X_dict=None,
        check_y_dict=None,
        y_ensure_1d=True,
    ):
        """Validate the training data and set the fitted attributes.

        Sets the attributes `missing_label_` and `random_state_`.

        Parameters
        ----------
        X : array-like
            Training samples, checked via `check_array` with `check_X_dict`.
        y : array-like
            Labels, checked via `check_array` with `check_y_dict`.
        sample_weight : array-like or None, default=None
            Weights of the labels, checked via `check_array` with
            `check_y_dict`. Must have the same shape as the checked `y`.
        check_X_dict : dict or None, default=None
            Keyword arguments passed to `check_array` for `X`.
        check_y_dict : dict or None, default=None
            Keyword arguments passed to `check_array` for `y` and
            `sample_weight`.
        y_ensure_1d : bool, default=True
            Whether a non-empty `y` is passed through `column_or_1d`.

        Returns
        -------
        X : numpy.ndarray
            Checked training samples.
        y : numpy.ndarray
            Checked labels.
        sample_weight : numpy.ndarray or None
            Checked sample weights.

        Raises
        ------
        ValueError
            If the shapes of `y` and `sample_weight` differ.
        """
        if check_X_dict is None:
            check_X_dict = {"ensure_min_samples": 0, "ensure_min_features": 0}
        if check_y_dict is None:
            check_y_dict = {
                "ensure_min_samples": 0,
                "ensure_min_features": 0,
                "ensure_2d": False,
                "force_all_finite": False,
                "dtype": None,
            }

        # Store and check missing label.
        check_missing_label(self.missing_label)
        self.missing_label_ = self.missing_label

        # Store and check random state.
        self.random_state_ = check_random_state(self.random_state)

        # Check samples and labels.
        X = check_array(X, **check_X_dict)
        y = check_array(y, **check_y_dict)
        if len(y) > 0:
            y = column_or_1d(y) if y_ensure_1d else y

        # Check sample weights.
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, **check_y_dict)
            if not np.array_equal(y.shape, sample_weight.shape):
                raise ValueError(
                    f"`y` has the shape {y.shape} and `sample_weight` has the "
                    f"shape {sample_weight.shape}. Both need to have "
                    f"identical shapes."
                )

        return X, y, sample_weight


class ProbabilisticRegressor(SkactivemlRegressor):
    """ProbabilisticRegressor

    Base class for scikit-activeml probabilistic regressors.

    """

    @abstractmethod
    def predict_target_distribution(self, X):
        """Returns the predicted target distribution conditioned on the test
        samples `X`.

        Parameters
        ----------
        X :  array-like, shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        dist : scipy.stats._distn_infrastructure.rv_frozen
            The distribution of the targets at the test samples.

        """
        raise NotImplementedError

    def predict(self, X, return_std=False, return_entropy=False):
        """Returns the mean, std (optional) and differential entropy (optional)
        of the predicted target distribution conditioned on the test samples
        `X`.

        Parameters
        ----------
        X :  array-like, shape (n_samples, n_features)
            Input samples.
        return_std : bool, optional (default=False)
            Whether to return the standard deviation.
        return_entropy : bool, optional (default=False)
            Whether to return the differential entropy.

        Returns
        -------
        mu : numpy.ndarray, shape (n_samples)
            Predicted mean conditioned on `X`.
        std : numpy.ndarray, shape (n_samples), optional
            Predicted standard deviation conditioned on `X`.
        entropy : numpy.ndarray, optional
            Predicted differential entropy conditioned on `X`.

        Notes
        -----
        If neither `return_std` nor `return_entropy` is True, only `mu` is
        returned. Otherwise, a tuple with the requested entries in the order
        given above is returned.
        """
        check_scalar(return_std, "return_std", bool)
        check_scalar(return_entropy, "return_entropy", bool)
        rv = self.predict_target_distribution(X)
        result = (rv.mean(),)
        if return_std:
            result += (rv.std(),)
        if return_entropy:
            result += (rv.entropy(),)
        # A single result is returned without the enclosing tuple.
        if len(result) == 1:
            result = result[0]
        return result

    def sample_y(self, X, n_samples=1, random_state=None):
        """Returns random samples from the predicted target distribution
        conditioned on the test samples `X`.

        Parameters
        ----------
        X :  array-like, shape (n_samples_X, n_features)
            Input samples, where the target values are drawn from.
        n_samples: int, optional (default=1)
            Number of random samples to be drawn.
        random_state : int, RandomState instance or None, optional
        (default=None)
            Determines random number generation to randomly draw samples. Pass
            an int for reproducible results across multiple method calls.

        Returns
        -------
        y_samples : numpy.ndarray, shape (n_samples_X, n_samples)
            Drawn random target samples.
        """
        rv = self.predict_target_distribution(X)
        # The samples are drawn with shape (n_samples, n_samples_X) and are
        # transposed to obtain one row per test sample.
        rv_samples = rv.rvs(
            size=(n_samples, len(X)), random_state=random_state
        )
        return rv_samples.T


class AnnotatorModelMixin(ABC):
    """AnnotatorModelMixin

    Base class of all annotator models estimating the performances of
    annotators for given samples.
    """

    @abstractmethod
    def predict_annotator_perf(self, X):
        """Calculates the performance of an annotator to provide the true label
        for a given sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        Returns
        -------
        P_annot : numpy.ndarray of shape (n_samples, n_annotators)
            `P_annot[i,l]` is the performance of annotator `l` regarding the
            annotation of sample `X[i]`.

        Raises
        ------
        NotImplementedError
            Always, since this method must be implemented by subclasses.
        """
        raise NotImplementedError