Source code for skactiveml.stream.budgetmanager._threshold_budget

import numpy as np
from copy import deepcopy

from skactiveml.base import BudgetManager

from skactiveml.utils import check_scalar, check_random_state


class DensityBasedSplitBudgetManager(BudgetManager):
    """Budget Manager for DBALStream

    This budget manager is an adaptation of
    :class:`.RandomVariableUncertaintyBudgetManager` for DBALStream [1]_. It
    mainly differs in how the available budget is estimated. Instead of the
    estimated budget proposed by Žliobaitė et al. [2]_, this budget manager
    counts the number of queried and seen instances, such that the number of
    available queries is given as `n_seen_samples*budget-n_queried_samples`.

    Parameters
    ----------
    theta : float, default=1.0
        Specifies the initial value for `theta_` that is used for calculating
        the threshold.
    s : float, default=0.01
        Specifies the relative decrease or increase of the threshold if a
        sample is queried or not, respectively.
    delta : float, default=1.0
        Specifies the standard deviation of the normal distribution used for
        randomization of the threshold.
    random_state : int or RandomState instance or None, default=None
        Controls the randomness of the budget manager.
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled, with
        `0 <= budget <= 1`. If `budget` is `None`, it is replaced with the
        default budget 0.1.

    References
    ----------
    .. [1] D. Ienco, I. Žliobaitė, and B. Pfahringer. High density-focused
        uncertainty sampling for active learning over evolving stream data. In
        Int. Workshop Big Data Streams Heterog. Source Min. Algorithms Syst.
        Program. Models Appl., pages 133–148, 2014.
    .. [2] I. Žliobaitė, A. Bifet, B. Pfahringer, and G. Holmes. Active
        Learning With Drifting Streaming Data. IEEE Trans. Neural Netw. Learn.
        Syst., 25(1):27–39, 2014
    """

    def __init__(
        self,
        theta=1.0,
        s=0.01,
        delta=1.0,
        random_state=None,
        budget=None,
    ):
        super().__init__(budget)
        self.theta = theta
        self.s = s
        self.delta = delta
        self.random_state = random_state

    def query_by_utility(self, utilities):
        """Ask the budget manager which `utilities` are sufficient to query the
        corresponding labels.

        The internal state (`u_`, `t_`, `theta_` and the random state) is not
        modified by this method; the decision process is only simulated on
        temporary copies. Call :meth:`update` to commit the decisions.

        Parameters
        ----------
        utilities : array-like of shape (n_samples,)
            The utilities provided by the stream-based active learning
            strategy, which are used to determine whether querying a sample
            is worth it given the budgeting constraint.

        Returns
        -------
        queried_indices : list of int
            The indices of samples in `utilities` whose labels are queried,
            with `0 <= queried_indices < n_samples`.
        """
        utilities = self._validate_data(utilities)
        confidence = 1 - utilities

        # Work on temporary copies so that the persistent counters and the
        # threshold remain untouched until `update` is called.
        queried_indices = []
        tmp_u = self.u_
        tmp_t = self.t_
        tmp_theta = self.theta_

        # Remember the generator state; it is restored below so that repeated
        # calls with the same input yield the same decisions.
        prior_random_state = self.random_state_.get_state()

        for i, u in enumerate(confidence):
            tmp_t += 1
            # Budget is left as long as the ratio of queried to seen samples
            # stays below the budget.
            budget_left = self.budget_ > tmp_u / tmp_t
            if not budget_left:
                sample = False
            else:
                # Randomize the threshold with a factor drawn from N(1, delta).
                eta = self.random_state_.normal(1, self.delta)
                theta_random = tmp_theta * eta
                sample = u < theta_random
                if sample:
                    # Queried: shrink the threshold.
                    tmp_theta *= 1 - self.s
                    queried_indices.append(i)
                else:
                    # Not queried: grow the threshold.
                    tmp_theta *= 1 + self.s
            tmp_u += sample

        self.random_state_.set_state(prior_random_state)

        return queried_indices

    def update(self, candidates, queried_indices):
        """Updates the budget manager.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape
                (n_samples, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        queried_indices : array-like of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices < n_samples`.

        Returns
        -------
        self : DensityBasedSplitBudgetManager
            The budget manager returns itself, after it is updated.
        """
        # Called for its side effects only: parameter validation and
        # initialization of `u_`, `t_`, `theta_` and `random_state_`.
        self._validate_data(np.array([]))

        # `len()` raises a TypeError for SciPy sparse matrices; fall back to
        # the first dimension of `shape` in that case.
        try:
            n_candidates = len(candidates)
        except TypeError:
            n_candidates = candidates.shape[0]

        queried = np.zeros(n_candidates)
        queried[queried_indices] = 1
        # Advance the random state (the drawn values are discarded) so that
        # the next call to `query_by_utility` sees a different state.
        self.random_state_.random_sample(n_candidates)
        for was_queried in queried:
            self.t_ += 1
            # The threshold is only adapted while budget is left.
            if self.budget_ > self.u_ / self.t_:
                if was_queried:
                    self.theta_ *= 1 - self.s
                else:
                    self.theta_ *= 1 + self.s
            self.u_ += was_queried

        return self

    def _validate_data(self, utilities):
        """Validate input data.

        Besides checking the parameters, this method lazily initializes the
        counters `u_` (queried samples) and `t_` (seen samples), the threshold
        `theta_` and the random state `random_state_`.

        Parameters
        ----------
        utilities: array-like of shape (n_samples,)
            The `utilities` provided by the stream-based active learning
            strategy.

        Returns
        -------
        utilities: ndarray of shape (n_samples,)
            Checked `utilities`.
        """
        utilities = super()._validate_data(utilities)
        # Check theta
        self._validate_theta()
        # Check s
        check_scalar(
            self.s, "s", float, min_val=0, min_inclusive=False, max_val=1
        )
        # Check delta
        check_scalar(
            self.delta, "delta", float, min_val=0, min_inclusive=False
        )
        # Initialize the counters of queried (`u_`) and seen (`t_`) samples
        # on first use.
        if not hasattr(self, "u_"):
            self.u_ = 0
        if not hasattr(self, "t_"):
            self.t_ = 0
        self._validate_random_state()

        return utilities

    def _validate_theta(self):
        """Validate that `theta` is a float and initialize `theta_` with it
        on first use.
        """
        check_scalar(self.theta, "theta", float)
        # `theta_` holds the adaptive threshold; keep it once it exists.
        if not hasattr(self, "theta_"):
            self.theta_ = self.theta

    def _validate_random_state(self):
        """Create `random_state_` as a deep copy of `random_state` on first
        use and pass it through `check_random_state`. See also
        :func:`~sklearn.utils.check_random_state`
        """
        if not hasattr(self, "random_state_"):
            self.random_state_ = deepcopy(self.random_state)
        self.random_state_ = check_random_state(self.random_state_)