Source code for skactiveml.stream._uncertainty_zliobaite

import numpy as np
from sklearn.base import clone
from sklearn.utils import check_array, check_consistent_length
from copy import deepcopy

from .budgetmanager import (
    FixedUncertaintyBudgetManager,
    VariableUncertaintyBudgetManager,
    SplitBudgetManager,
    RandomVariableUncertaintyBudgetManager,
)
from ..base import (
    BudgetManager,
    SingleAnnotatorStreamQueryStrategy,
    SkactivemlClassifier,
)
from ..utils import (
    check_type,
    call_func,
    check_budget_manager,
)


class UncertaintyZliobaite(SingleAnnotatorStreamQueryStrategy):
    """Base class for the uncertainty sampling strategies proposed by Žliobaitė
    et al. in [1]_.

    The UncertaintyZliobaite class provides the base for query strategies
    proposed by Žliobaitė et al. in [1]_. The strategies evaluate the
    classifier's uncertainty based on its predictions and samples' labels are
    queried when the uncertainty exceeds a specific threshold. Žliobaitė et al.
    propose various techniques to calculate such a threshold.

    Parameters
    ----------
    budget_manager : BudgetManager, default=None
        The BudgetManager which models the budgeting constraint used in the
        stream-based active learning setting. if set to `None`, a default
        budger manager will be used.  The budget manager will be initialized
        based on the following conditions:

        - If only a `budget` is given, the default budget manager is
          initialized with the given budget.
        - If only a budget manager is given, use the budget manager.
        - If both are not given, the default budget manager with the default
          budget.
        - If both are given, and the budget differs from
          `budgetmanager.budget`, throw a warning and the budget manager is
          used as is.
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled, with
        `0 <= budget <= 1`. If `budget` is `None`, it is replaced with the
        default budget 0.1.
    random_state : int or RandomState instance, default=None
        Controls the randomness of the estimator.

    References
    ----------
    .. [1] I. Žliobaitė, A. Bifet, B. Pfahringer, and G. Holmes. Active
        Learning With Drifting Streaming Data. IEEE Trans. Neural Netw. Learn.
        Syst., 25(1):27–39, 2014.
    """

    def __init__(
        self,
        budget_manager=None,
        budget=None,
        random_state=None,
    ):
        super().__init__(budget=budget, random_state=random_state)
        self.budget_manager = budget_manager

    def query(
        self,
        candidates,
        clf,
        X=None,
        y=None,
        sample_weight=None,
        fit_clf=False,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        The query startegy determines the most useful samples in candidates,
        which can be acquired within the budgeting constraint specified by
        `budget`. Please note that, this method does not change the internal
        state of the query strategy. To adapt the query strategy to the
        selected candidates, use `update(...)`.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        X : array-like of shape (n_samples, n_features), default=None
            Training data set used to fit the classifier.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        fit_clf : bool, default=False
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        return_utilities : bool, default=False
            If `True`, also return the `utilities` based on the query strategy.

        Returns
        -------
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices <= n_candidates`.
        utilities: np.ndarray of shape (n_candidates,),
            The utilities based on the query strategy. Only provided if
            `return_utilities` is `True`.
        """
        (
            candidates,
            clf,
            X,
            y,
            sample_weight,
            fit_clf,
            return_utilities,
        ) = self._validate_data(
            candidates,
            clf=clf,
            X=X,
            y=y,
            sample_weight=sample_weight,
            fit_clf=fit_clf,
            return_utilities=return_utilities,
        )

        predict_proba = clf.predict_proba(candidates)
        confidence = np.max(predict_proba, axis=1)
        utilities = 1 - confidence

        queried_indices = self.budget_manager_.query_by_utility(utilities)

        if return_utilities:
            return queried_indices, utilities
        else:
            return queried_indices

    def update(
        self, candidates, queried_indices, budget_manager_param_dict=None
    ):
        """Updates the budget manager and the count for seen and queried
        labels. This function should be used in conjunction with the `query`
        function.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices <= n_candidates`.
        budget_manager_param_dict : dict, default=None
            Optional kwargs for `budget_manager`.

        Returns
        -------
        self : SingleAnnotatorStreamQueryStrategy
            The query strategy returns itself, after it is updated.
        """
        # check if a budgetmanager is set
        if not hasattr(self, "budget_manager_"):
            self._validate_random_state()
            random_seed = deepcopy(self.random_state_).randint(2**31 - 1)
            check_type(
                self.budget_manager,
                "budget_manager_",
                BudgetManager,
                type(None),
            )
            default_budget_manager_kwargs = (
                self._get_default_budget_manager_kwargs()
            )
            default_budget_manager_kwargs["random_state"] = random_seed
            self.budget_manager_ = check_budget_manager(
                self.budget,
                self.budget_manager,
                self._get_default_budget_manager(),
                default_budget_manager_kwargs,
            )

        budget_manager_param_dict = (
            {}
            if budget_manager_param_dict is None
            else budget_manager_param_dict
        )

        call_func(
            self.budget_manager_.update,
            candidates=candidates,
            queried_indices=queried_indices,
            **budget_manager_param_dict,
        )
        return self

    def _validate_data(
        self,
        candidates,
        clf,
        X,
        y,
        sample_weight,
        fit_clf,
        return_utilities,
        reset=True,
        **check_candidates_params,
    ):
        """Validate input data and set or check the `n_features_in_` attribute.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        X : array-like of shape (n_samples, n_features), default=None
            Training data set used to fit the classifier.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        fit_clf : bool, default=False
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        return_utilities : bool, default=False
            If `True`, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute. If False, the
            input will be checked for consistency with data provided when reset
            was last True.
        **check_candidates_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        candidates: np.ndarray, shape (n_candidates, n_features)
            Checked candidate samples.
        clf : SkactivemlClassifier
            Checked model implementing the methods `fit` and `predict_freq`.
        X: np.ndarray, shape (n_samples, n_features)
            Checked training data set.
        y: np.ndarray, shape (n_samples)
            Checked training labels.
        sampling_weight: np.ndarray, shape (n_candidates)
            Checked training sample weight.
        fit_clf : bool,
            Checked boolean value of `fit_clf`.
        return_utilities : bool,
            Checked boolean value of `return_utilities`.
        """
        candidates, return_utilities = super()._validate_data(
            candidates,
            return_utilities,
            reset=reset,
            **check_candidates_params,
        )
        self._validate_random_state()
        X, y, sample_weight = self._validate_X_y_sample_weight(
            X=X, y=y, sample_weight=sample_weight
        )
        clf = self._validate_clf(clf, X, y, sample_weight, fit_clf)

        # check if a budgetmanager is set
        if not hasattr(self, "budget_manager_"):
            random_seed = deepcopy(self.random_state_).randint(2**31 - 1)
            check_type(
                self.budget_manager,
                "budget_manager_",
                BudgetManager,
                type(None),
            )
            default_budget_manager_kwargs = (
                self._get_default_budget_manager_kwargs()
            )
            default_budget_manager_kwargs["random_state"] = random_seed
            self.budget_manager_ = check_budget_manager(
                self.budget,
                self.budget_manager,
                self._get_default_budget_manager(),
                default_budget_manager_kwargs,
            )

        return candidates, clf, X, y, sample_weight, fit_clf, return_utilities

    def _get_default_budget_manager_kwargs(self):
        """Provide the kwargs for the budget manager that will be used as
        default.

        Returns
        -------
        default_budget_manager_kwargs : dict
            The arguments necessary to initialize the budget manager.
        """
        return {}

    def _validate_clf(self, clf, X, y, sample_weight, fit_clf):
        """Validate if `clf` is a valid `SkactivemlClassifier`. If `clf` is
        untrained and `fit_clf`=`True`, `clf` is trained using X, y and
        sample_weight.

        Parameters
        ----------
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        X : array-like of shape (n_samples, n_features), default=None
            Training data set used to fit the classifier.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        fit_clf : bool, default=False
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        Returns
        -------
        clf : skactiveml.base.SkactivemlClassifier
            Checked model implementing the methods `fit` and `predict_freq`.
        """
        # Check if the classifier and its arguments are valid.
        check_type(clf, "clf", SkactivemlClassifier)
        check_type(fit_clf, "fit_clf", bool)
        if fit_clf:
            if sample_weight is None:
                clf = clone(clf).fit(X, y)
            else:
                clf = clone(clf).fit(X, y, sample_weight)
        return clf

    def _validate_X_y_sample_weight(self, X, y, sample_weight):
        """Validate if X, y and sample_weight are numeric and of equal length.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set used to fit the classifier.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,)
            Weights of training samples in `X`.
        Returns
        -------
        X : array-like of shape (n_samples, n_features)
            Checked training data set.
        y : array-like of shape (n_samples)
            Checked labels of the input samples `X`. Converts `y` to a numpy
            array.
        """
        if sample_weight is not None:
            sample_weight = np.array(sample_weight)
            check_consistent_length(sample_weight, y)
        if X is not None and y is not None:
            X = check_array(X)
            y = np.array(y)
            check_consistent_length(X, y)
        return X, y, sample_weight


[docs]class FixedUncertainty(UncertaintyZliobaite):
    """Fixed Uncertainty Strategy

    The FixedUncertainty (Fixed Uncertainty Strategy in [1]_) query strategy
    queries samples based on the classifiers uncertainty that is assessed
    based on the classifier's predictions. The sample is queried when the
    probability of the most likely class exceeds a threshold calculated based
    on the budget and the number of classes. See also
    :class:`.FixedUncertaintyBudgetManager`

    Parameters
    ----------
    classes : array-like of shape (n_classes,)
        Holds the label for each class.
    budget_manager : BudgetManager, default=None
        The BudgetManager which models the budgeting constraint used in the
        stream-based active learning setting. if set to `None`,
        `FixedUncertaintyBudgetManager` will be used by default.  The budget
        manager will be initialized based on the following conditions:

        - If only a `budget` is given, the default budget manager is
          initialized with the given budget.
        - If only a budget manager is given, use the budget manager.
        - If both are not given, the default budget manager with the default
          budget.
        - If both are given, and the budget differs from
          `budgetmanager.budget`, throw a warning and the budget manager is
          used as is.
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled, with
        `0 <= budget <= 1`. If `budget` is `None`, it is replaced with the
        default budget 0.1.
    random_state : int or RandomState instance, default=None
        Controls the randomness of the estimator.

    References
    ----------
    .. [1] I. Žliobaitė, A. Bifet, B. Pfahringer, and G. Holmes. Active
        Learning With Drifting Streaming Data. IEEE Trans. Neural Netw. Learn.
        Syst., 25(1):27–39, 2014.
    """

    def __init__(
        self,
        classes,
        budget_manager=None,
        budget=None,
        random_state=None,
    ):
        super().__init__(
            budget_manager=budget_manager,
            budget=budget,
            random_state=random_state,
        )
        self.classes = classes

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The BudgetManager that should be used by default.
        """
        return FixedUncertaintyBudgetManager

    def _get_default_budget_manager_kwargs(self):
        """Provide the kwargs for the budget manager that will be used as
        default.

        Returns
        -------
        default_budget_manager_kwargs : dict
            The arguments necessary to initialize the budget manager.
        """
        return {"classes": self.classes}


[docs]class VariableUncertainty(UncertaintyZliobaite):
    """Variable Uncertainty Strategy

    The VariableUncertainty query strategy (Variable Uncertainty Strategy in
    [1]_) queries labels based on the classifiers uncertainty assessed based on
    the classifier's predictions. The sample is queried when the probability
    of the most likely class exceeds a time-dependent threshold calculated
    based on the budget, number of observed and acquired samples. See also
    :class:`.VariableUncertaintyBudgetManager`

    Parameters
    ----------
    budget_manager : BudgetManager, default=None
        The BudgetManager which models the budgeting constraint used in the
        stream-based active learning setting. if set to `None`,
        `VariableUncertaintyBudgetManager` will be used by default. The budget
        manager will be initialized based on the following conditions:

        - If only a `budget` is given, the default budget manager is
          initialized with the given budget.
        - If only a budget manager is given, use the budget manager.
        - If both are not given, the default budget manager with the default
          budget.
        - If both are given, and the budget differs from
          `budgetmanager.budget`, throw a warning and the budget manager is
          used as is.
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled, with
        `0 <= budget <= 1`. If `budget` is `None`, it is replaced with the
        default budget 0.1.
    random_state : int or RandomState instance, default=None
        Controls the randomness of the estimator.

    References
    ----------
    .. [1] I. Žliobaitė, A. Bifet, B. Pfahringer, and G. Holmes. Active
        Learning With Drifting Streaming Data. IEEE Trans. Neural Netw. Learn.
        Syst., 25(1):27–39, 2014.
    """

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The BudgetManager that should be used by default.
        """
        return VariableUncertaintyBudgetManager


[docs]class RandomVariableUncertainty(UncertaintyZliobaite):
    """RandomVariableUncertainty

    The RandomVariableUncertainty (Uncertainty Strategy With Randomization in
    [1]_) query strategy samples samples based on the classifier's
    uncertainty assessed based on the classifier's predictions. The sample is
    queried when the probability of the most likely class exceeds a
    time-dependent threshold calculated based on the budget, and the number of
    observed and acquired samples. The threshold is randomized by being
    multiplied with a random number sampled from N(1,delta). See also
    :class:`.RandomVariableUncertaintyBudgetManager`

    Parameters
    ----------
    budget_manager : BudgetManager, default=None
        The BudgetManager which models the budgeting constraint used in the
        stream-based active learning setting. if set to `None`,
        `RandomVariableUncertaintyBudgetManager` will be used by default.  The
        budget manager will be initialized based on the following conditions:

        - If only a `budget` is given, the default budget manager is
          initialized with the given budget.
        - If only a budget manager is given, use the budget manager.
        - If both are not given, the default budget manager with the default
          budget.
        - If both are given, and the budget differs from
          `budgetmanager.budget`, throw a warning and the budget manager is
          used as is.
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled, with
        `0 <= budget <= 1`. If `budget` is `None`, it is replaced with the
        default budget 0.1.
    random_state : int or RandomState instance, default=None
        Controls the randomness of the estimator.

    References
    ----------
    .. [1] I. Žliobaitė, A. Bifet, B. Pfahringer, and G. Holmes. Active
        Learning With Drifting Streaming Data. IEEE Trans. Neural Netw. Learn.
        Syst., 25(1):27–39, 2014.
    """

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The BudgetManager that should be used by default.
        """
        return RandomVariableUncertaintyBudgetManager


[docs]class Split(UncertaintyZliobaite):
    """Split

    The Split query strategy (Split Strategy in [1]_) queries labels based on
    the classifiers uncertainty assessed based on the classifier's predictions.
    The sample is queried when the probability of the most likely class
    exceeds a time-dependent threshold calculated based on the budget, number
    of observed and acquired samples. It is a hybrid strategy that combines
    `VariableUncertainty` with randomly sampling samples with a given
    probability. See also :class:`.SplitBudgetManager`

    Parameters
    ----------
    budget_manager : BudgetManager, default=None
        The BudgetManager which models the budgeting constraint used in the
        stream-based active learning setting. if set to `None`,
        `SplitBudgetManager` will be used by default. The budget manager will
        be initialized based on the following conditions:

        - If only a `budget` is given, the default budget manager is
          initialized with the given budget.
        - If only a budget manager is given, use the budget manager.
        - If both are not given, the default budget manager with the default
          budget.
        - If both are given, and the budget differs from
          `budgetmanager.budget`, throw a warning and the budget manager is
          used as is.
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled, with
        `0 <= budget <= 1`. If `budget` is `None`, it is replaced with the
        default budget 0.1.
    random_state : int or RandomState instance, default=None
        Controls the randomness of the estimator.

    References
    ----------
    .. [1] I. Žliobaitė, A. Bifet, B. Pfahringer, and G. Holmes. Active
        Learning With Drifting Streaming Data. IEEE Trans. Neural Netw. Learn.
        Syst., 25(1):27–39, 2014
    """

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The BudgetManager that should be used by default.
        """
        return SplitBudgetManager