# Source code for skactiveml.stream._uncertainty_zliobaite

import numpy as np
from sklearn.base import clone
from sklearn.utils import check_array, check_consistent_length
from copy import deepcopy

from .budgetmanager import (
    FixedUncertaintyBudgetManager,
    VariableUncertaintyBudgetManager,
    SplitBudgetManager,
    RandomVariableUncertaintyBudgetManager,
)
from ..base import (
    BudgetManager,
    SingleAnnotatorStreamQueryStrategy,
    SkactivemlClassifier,
)
from ..utils import (
    check_type,
    call_func,
    check_budget_manager,
)


class UncertaintyZliobaite(SingleAnnotatorStreamQueryStrategy):
    """UncertaintyZliobaite

    The UncertaintyZliobaite class provides the base for query strategies
    proposed by Žliobaitė et al. in [1]. The strategies evaluate the
    classifier's uncertainty based on its predictions and instances' labels are
    queried when the uncertainty exceeds a specific threshold. Žliobaitė et al.
    propose various techniques to calculate such a threshold.

    Concrete strategies select the technique by implementing
    `_get_default_budget_manager`.

    Parameters
    ----------
    budget_manager : BudgetManager, optional (default=None)
        The BudgetManager which models the budgeting constraint used in
        the stream-based active learning setting. If set to None, the class
        returned by `_get_default_budget_manager` is used as default. The
        budget manager will be initialized based on the following conditions:
            If only a budget is given the default budget manager is initialized
            with the given budget.
            If only a budget manager is given use the budget manager.
            If both are not given the default budget manager with the
            default budget.
            If both are given and the budget differs from
            budget_manager.budget a warning is thrown.
    budget : float, optional (default=None)
        The budget which models the budgeting constraint used in
        the stream-based active learning setting.
    random_state : int, RandomState instance, optional (default=None)
        Controls the randomness of the estimator.

    References
    ----------
    [1] Žliobaitė, I., Bifet, A., Pfahringer, B., & Holmes, G. (2014). Active
        Learning With Drifting Streaming Data. IEEE Transactions on Neural
        Networks and Learning Systems, 25(1), 27-39.

    """

    def __init__(
        self,
        budget_manager=None,
        budget=None,
        random_state=None,
    ):
        super().__init__(budget=budget, random_state=random_state)
        # Stored unmodified; the validated manager lives in `budget_manager_`.
        self.budget_manager = budget_manager

    def query(
        self,
        candidates,
        clf,
        X=None,
        y=None,
        sample_weight=None,
        fit_clf=False,
        return_utilities=False,
    ):
        """Ask the query strategy which instances in candidates to acquire.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape
        (n_samples, n_features)
            The instances which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        clf : SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        X : array-like of shape (n_samples, n_features), optional
        (default=None)
            Input samples used to fit the classifier.
        y : array-like of shape (n_samples), optional (default=None)
            Labels of the input samples 'X'. There may be missing labels.
        sample_weight : array-like of shape (n_samples,), optional
        (default=None)
            Sample weights for X, used to fit the clf.
        fit_clf : bool, optional (default=False)
            If true, a clone of the classifier is fitted on X and y before
            predicting; this requires X and y to be given.
        return_utilities : bool, optional (default=False)
            If true, also return the utilities based on the query strategy.
            The default is False.

        Returns
        -------
        queried_indices : ndarray of shape (n_queried_instances,)
            The indices of instances in candidates which should be queried,
            with 0 <= n_queried_instances <= n_samples.

        utilities: ndarray of shape (n_samples,), optional
            The utilities based on the query strategy. Only provided if
            return_utilities is True.
        """
        (
            candidates,
            clf,
            X,
            y,
            sample_weight,
            fit_clf,
            return_utilities,
        ) = self._validate_data(
            candidates,
            clf=clf,
            X=X,
            y=y,
            sample_weight=sample_weight,
            fit_clf=fit_clf,
            return_utilities=return_utilities,
        )

        # Utility is the uncertainty of the prediction: one minus the
        # probability of the most likely class of each candidate.
        probas = clf.predict_proba(candidates)
        utilities = 1 - np.max(probas, axis=1)

        # The budget manager decides which utilities warrant a query.
        queried_indices = self.budget_manager_.query_by_utility(utilities)

        if return_utilities:
            return queried_indices, utilities
        return queried_indices

    def update(
        self, candidates, queried_indices, budget_manager_param_dict=None
    ):
        """Update the budget manager with the candidates and the queried
        indices.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape
        (n_samples, n_features)
            The instances which could be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.

        queried_indices : array-like of shape (n_samples,)
            Indicates which instances from candidates have been queried.

        budget_manager_param_dict : dict, optional (default=None)
            Optional keyword arguments forwarded to the update of the budget
            manager.

        Returns
        -------
        self : UncertaintyZliobaite
            The UncertaintyZliobaite returns itself, after it is updated.
        """
        if not hasattr(self, "budget_manager_"):
            # `update` may be the first method called on this object, so the
            # random state needed to seed the budget manager is set up here.
            self._validate_random_state()
        self._ensure_budget_manager()

        if budget_manager_param_dict is None:
            budget_manager_param_dict = {}

        call_func(
            self.budget_manager_.update,
            candidates=candidates,
            queried_indices=queried_indices,
            **budget_manager_param_dict
        )
        return self

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Concrete strategies must override this method.

        Returns
        -------
        budget_manager : BudgetManager
            The BudgetManager that should be used by default.

        Raises
        ------
        NotImplementedError
            Always, since the base class defines no default budget manager.
        """
        raise NotImplementedError(
            "Subclasses of UncertaintyZliobaite must implement "
            "`_get_default_budget_manager`."
        )

    def _ensure_budget_manager(self):
        """Create the attribute `budget_manager_` if it does not exist yet.

        Does nothing if `budget_manager_` is already set. Otherwise the
        attribute `random_state_` must exist, i.e. `_validate_random_state`
        must have been called before.
        """
        if hasattr(self, "budget_manager_"):
            return

        # The seed is drawn from a copy, which leaves `random_state_` itself
        # untouched by the initialization of the budget manager.
        random_seed = deepcopy(self.random_state_).randint(2**31 - 1)
        check_type(
            self.budget_manager,
            "budget_manager_",
            BudgetManager,
            type(None),
        )
        self.budget_manager_ = check_budget_manager(
            self.budget,
            self.budget_manager,
            self._get_default_budget_manager(),
            {"random_state": random_seed},
        )

    def _validate_data(
        self,
        candidates,
        clf,
        X,
        y,
        sample_weight,
        fit_clf,
        return_utilities,
        reset=True,
        **check_candidates_params
    ):
        """Validate input data and set or check the `n_features_in_` attribute.

        Parameters
        ----------
        candidates: array-like of shape (n_candidates, n_features)
            The instances which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        clf : SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        X : array-like of shape (n_samples, n_features)
            Input samples used to fit the classifier.
        y : array-like of shape (n_samples)
            Labels of the input samples 'X'. There may be missing labels.
        sample_weight : array-like of shape (n_samples,)
            Sample weights for X, used to fit the clf.
        fit_clf : bool,
            If true, refit the classifier also requires X and y to be given.
        return_utilities : bool,
            If true, also return the utilities based on the query strategy.
        reset : bool, optional (default=True)
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        **check_candidates_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        candidates: np.ndarray, shape (n_candidates, n_features)
            Checked candidate samples
        clf : SkactivemlClassifier
            Checked model implementing the methods `fit` and `predict_proba`.
        X: np.ndarray, shape (n_samples, n_features)
            Checked training samples
        y: np.ndarray, shape (n_samples)
            Checked training labels
        sample_weight: np.ndarray, shape (n_samples)
            Checked training sample weight
        fit_clf : bool,
            Checked boolean value of `fit_clf`
        return_utilities : bool,
            Checked boolean value of `return_utilities`.
        """
        candidates, return_utilities = super()._validate_data(
            candidates,
            return_utilities,
            reset=reset,
            **check_candidates_params
        )
        self._validate_random_state()
        X, y, sample_weight = self._validate_X_y_sample_weight(
            X=X, y=y, sample_weight=sample_weight
        )
        clf = self._validate_clf(clf, X, y, sample_weight, fit_clf)
        self._ensure_budget_manager()

        return candidates, clf, X, y, sample_weight, fit_clf, return_utilities

    def _validate_clf(self, clf, X, y, sample_weight, fit_clf):
        """Validate if clf is a valid SkactivemlClassifier. If `fit_clf` is
        true, a clone of clf is fitted using X, y and sample_weight.

        Parameters
        ----------
        clf : SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        X : array-like of shape (n_samples, n_features)
            Input samples used to fit the classifier.
        y : array-like of shape (n_samples)
            Labels of the input samples 'X'. There may be missing labels.
        sample_weight : array-like of shape (n_samples,)
            Sample weights for X, used to fit the clf.
        fit_clf : bool,
            If true, refit the classifier also requires X and y to be given.

        Returns
        -------
        clf : SkactivemlClassifier
            The given clf if `fit_clf` is false, otherwise the result of
            fitting a clone of clf.
        """
        # Check if the classifier and its arguments are valid.
        check_type(clf, "clf", SkactivemlClassifier)
        check_type(fit_clf, "fit_clf", bool)
        if fit_clf:
            # A clone is fitted, so the classifier passed in is not modified.
            # `sample_weight` is only passed on if it was actually given.
            if sample_weight is None:
                clf = clone(clf).fit(X, y)
            else:
                clf = clone(clf).fit(X, y, sample_weight)
        return clf

    def _validate_X_y_sample_weight(self, X, y, sample_weight):
        """Validate X, y and sample_weight and check for equal lengths.

        X and y are only checked if both of them are given.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples used to fit the classifier.

        y : array-like of shape (n_samples)
            Labels of the input samples 'X'. There may be missing labels.

        sample_weight : array-like of shape (n_samples,)
            Sample weights for X, used to fit the clf.

        Returns
        -------
        X : array-like of shape (n_samples, n_features)
            Checked Input samples.
        y : array-like of shape (n_samples)
            Checked Labels of the input samples 'X'. Converts y to a numpy
            array
        sample_weight : array-like of shape (n_samples,)
            Checked sample weights. Converted to a numpy array if given.
        """
        if sample_weight is not None:
            sample_weight = np.array(sample_weight)
            check_consistent_length(sample_weight, y)
        if X is not None and y is not None:
            X = check_array(X)
            y = np.array(y)
            check_consistent_length(X, y)
        return X, y, sample_weight


class FixedUncertainty(UncertaintyZliobaite):
    """FixedUncertainty

    The FixedUncertainty (Fixed-Uncertainty in [1]) query strategy samples
    instances based on the classifier's uncertainty assessed based on the
    classifier's predictions. The instance is queried when the uncertainty
    (one minus the probability of the most likely class) exceeds a threshold
    calculated based on the budget and the number of classes.

    Parameters
    ----------
    budget_manager : BudgetManager, optional (default=None)
        The BudgetManager which models the budgeting constraint used in
        the stream-based active learning setting. If set to None,
        FixedUncertaintyBudgetManager will be used by default. The
        budget manager will be initialized based on the following conditions:
            If only a budget is given the default budget manager is initialized
            with the given budget.
            If only a budget manager is given use the budget manager.
            If both are not given the default budget manager with the
            default budget.
            If both are given and the budget differs from
            budget_manager.budget a warning is thrown.
    budget : float, optional (default=None)
        The budget which models the budgeting constraint used in
        the stream-based active learning setting.
    random_state : int, RandomState instance, optional (default=None)
        Controls the randomness of the estimator.

    References
    ----------
    [1] Žliobaitė, I., Bifet, A., Pfahringer, B., & Holmes, G. (2014). Active
        Learning With Drifting Streaming Data. IEEE Transactions on Neural
        Networks and Learning Systems, 25(1), 27-39.

    """

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The BudgetManager class (not an instance) that should be used by
            default.
        """
        return FixedUncertaintyBudgetManager


class VariableUncertainty(UncertaintyZliobaite):
    """VariableUncertainty

    The VariableUncertainty (Var-Uncertainty in [1]) query strategy samples
    instances based on the classifier's uncertainty assessed based on the
    classifier's predictions. The instance is queried when the uncertainty
    (one minus the probability of the most likely class) exceeds a
    time-dependent threshold calculated based on the budget, the number of
    classes and the number of observed and acquired samples.

    Parameters
    ----------
    budget_manager : BudgetManager, optional (default=None)
        The BudgetManager which models the budgeting constraint used in
        the stream-based active learning setting. If set to None,
        VariableUncertaintyBudgetManager will be used by default. The
        budget manager will be initialized based on the following conditions:
            If only a budget is given the default budget manager is initialized
            with the given budget.
            If only a budget manager is given use the budget manager.
            If both are not given the default budget manager with the
            default budget.
            If both are given and the budget differs from
            budget_manager.budget a warning is thrown.
    budget : float, optional (default=None)
        The budget which models the budgeting constraint used in
        the stream-based active learning setting.
    random_state : int, RandomState instance, optional (default=None)
        Controls the randomness of the estimator.

    References
    ----------
    [1] Žliobaitė, I., Bifet, A., Pfahringer, B., & Holmes, G. (2014). Active
        Learning With Drifting Streaming Data. IEEE Transactions on Neural
        Networks and Learning Systems, 25(1), 27-39.

    """

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The BudgetManager class (not an instance) that should be used by
            default.
        """
        return VariableUncertaintyBudgetManager


class RandomVariableUncertainty(UncertaintyZliobaite):
    """RandomVariableUncertainty

    The RandomVariableUncertainty (Ran-Var-Uncertainty in [1]) query strategy
    samples instances based on the classifier's uncertainty assessed based on
    the classifier's predictions. The instance is queried when the uncertainty
    (one minus the probability of the most likely class) exceeds a
    time-dependent threshold calculated based on the budget, the number of
    classes and the number of observed and acquired samples. To better adapt
    at change detection the threshold is multiplied by a random number drawn
    from N(1, delta).

    Parameters
    ----------
    budget_manager : BudgetManager, optional (default=None)
        The BudgetManager which models the budgeting constraint used in
        the stream-based active learning setting. If set to None,
        RandomVariableUncertaintyBudgetManager will be used by default. The
        budget manager will be initialized based on the following conditions:
            If only a budget is given the default budget manager is initialized
            with the given budget.
            If only a budget manager is given use the budget manager.
            If both are not given the default budget manager with the
            default budget.
            If both are given and the budget differs from
            budget_manager.budget a warning is thrown.
    budget : float, optional (default=None)
        The budget which models the budgeting constraint used in
        the stream-based active learning setting.
    random_state : int, RandomState instance, optional (default=None)
        Controls the randomness of the estimator.

    References
    ----------
    [1] Žliobaitė, I., Bifet, A., Pfahringer, B., & Holmes, G. (2014). Active
        Learning With Drifting Streaming Data. IEEE Transactions on Neural
        Networks and Learning Systems, 25(1), 27-39.

    """

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The BudgetManager class (not an instance) that should be used by
            default.
        """
        return RandomVariableUncertaintyBudgetManager


class Split(UncertaintyZliobaite):
    """Split

    The Split [1] query strategy samples in 100*v% of instances randomly and
    in 100*(1-v)% of cases according to VariableUncertainty.

    Parameters
    ----------
    budget_manager : BudgetManager, optional (default=None)
        The BudgetManager which models the budgeting constraint used in
        the stream-based active learning setting. If set to None,
        SplitBudgetManager will be used by default. The
        budget manager will be initialized based on the following conditions:
            If only a budget is given the default budget manager is initialized
            with the given budget.
            If only a budget manager is given use the budget manager.
            If both are not given the default budget manager with the
            default budget.
            If both are given and the budget differs from
            budget_manager.budget a warning is thrown.
    budget : float, optional (default=None)
        The budget which models the budgeting constraint used in
        the stream-based active learning setting.
    random_state : int, RandomState instance, optional (default=None)
        Controls the randomness of the estimator.

    References
    ----------
    [1] Žliobaitė, I., Bifet, A., Pfahringer, B., & Holmes, G. (2014). Active
        Learning With Drifting Streaming Data. IEEE Transactions on Neural
        Networks and Learning Systems, 25(1), 27-39.

    """

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The BudgetManager class (not an instance) that should be used by
            default.
        """
        return SplitBudgetManager