Source code for skactiveml.stream._stream_baselines

import numpy as np

from ..base import SingleAnnotatorStreamQueryStrategy
from ..utils import check_scalar


class StreamRandomSampling(SingleAnnotatorStreamQueryStrategy):
    """Random Sampling for Data Streams.

    The RandomSampling strategy queries labels completely at random. Each
    candidate is assigned a random utility and is queried if that utility is
    at least `1 - budget`. Given a budget of 10%, the utility reaches 0.9
    (1 - 0.1) with a probability of 10%. Samples are queried regardless of
    their position in the feature space, and any information about the
    sample is disregarded. Thus, it should only be used as a baseline
    strategy. The `allow_exceeding_budget` parameter allows configuring the
    strategy to strictly adhere to a given budget.

    Parameters
    ----------
    allow_exceeding_budget : bool, default=True
        If `True`, the query strategy is allowed to exceed its budget as long
        as the average number of queries will be within the budget. If
        `False`, queries are not allowed if the budget is exhausted.
    budget : float, default=None
        The budget which models the budgeting constraint used in the
        stream-based active learning setting.
    random_state : int or RandomState instance or None, default=None
        Controls the randomness of the estimator.
    """

    def __init__(
        self, allow_exceeding_budget=True, budget=None, random_state=None
    ):
        super().__init__(budget=budget, random_state=random_state)
        self.allow_exceeding_budget = allow_exceeding_budget

    def query(self, candidates, return_utilities=False):
        """Determines for which candidate samples labels are to be queried.

        The query strategy determines the most useful samples in candidates,
        which can be acquired within the budgeting constraint specified by
        `budget`. Please note that this method neither advances the sample
        counters nor the random number generator. To adapt the query strategy
        to the selected candidates, use `update(...)`.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
        (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        return_utilities : bool, default=False
            If `True`, also return the `utilities` based on the query
            strategy.

        Returns
        -------
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices < n_candidates`.
        utilities : np.ndarray of shape (n_candidates,)
            The utilities based on the query strategy. Only provided if
            `return_utilities` is `True`.
        """
        candidates, return_utilities = self._validate_data(
            candidates, return_utilities
        )
        n_candidates = len(candidates)

        # Draw the utilities, then restore the generator state so that
        # repeated `query` calls yield the same utilities until `update`
        # advances the generator by the same number of draws.
        prior_random_state = self.random_state_.get_state()
        utilities = self.random_state_.random_sample(n_candidates)
        self.random_state_.set_state(prior_random_state)

        # Mask recording which candidates are queried.
        queried = np.full(n_candidates, False)

        # A candidate is only eligible if its utility reaches this value.
        threshold = 1 - self.budget_

        # Work on local copies of the counters: the persistent counters are
        # only changed by `update`.
        tmp_observed_samples = self.observed_samples_
        tmp_queried_samples = self.queried_samples_

        # The candidates are processed in order, because every query reduces
        # the budget that is available for the following candidates.
        for i, utility in enumerate(utilities):
            tmp_observed_samples += 1
            available_budget = (
                tmp_observed_samples * self.budget_ - tmp_queried_samples
            )
            queried[i] = (
                self.allow_exceeding_budget or available_budget > 1
            ) and (utility >= threshold)
            tmp_queried_samples += queried[i]

        queried_indices = np.where(queried)[0]

        if return_utilities:
            return queried_indices, utilities
        return queried_indices

    def update(self, candidates, queried_indices):
        """Updates the count for seen and queried labels.

        This function should be used in conjunction with the `query`
        function.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
        (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices < n_candidates`.

        Returns
        -------
        self : SingleAnnotatorStreamQueryStrategy
            The query strategy returns itself, after it is updated.
        """
        # Make sure the counters exist and the random state has been
        # validated, in case `update` is called before `query`. The dummy
        # sample only serves to pass the input validation.
        self._validate_data([[0]], False)

        # `len` (instead of `.shape`) keeps plain array-likes such as lists
        # of lists working.
        n_candidates = len(candidates)

        # Mark the queried candidates; duplicate indices count only once.
        queried = np.zeros(n_candidates)
        queried[queried_indices] = 1
        self.observed_samples_ += n_candidates
        self.queried_samples_ += np.sum(queried)

        # Advance the generator by the number of draws that `query` made
        # (and rolled back) for these candidates.
        self.random_state_.random_sample(n_candidates)
        return self

    def _validate_data(
        self,
        candidates,
        return_utilities,
        reset=True,
        **check_candidates_params,
    ):
        """Validate input data and set or check the `n_features_in_`
        attribute.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
        (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        return_utilities : bool, default=False
            If `True`, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute. If False, the
            input will be checked for consistency with data provided when
            reset was last True.
        **check_candidates_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        candidates : np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        # Start counting samples on first use.
        if not hasattr(self, "observed_samples_"):
            self.observed_samples_ = 0
        if not hasattr(self, "queried_samples_"):
            self.queried_samples_ = 0

        check_scalar(
            self.allow_exceeding_budget, "allow_exceeding_budget", bool
        )
        candidates, return_utilities = super()._validate_data(
            candidates,
            return_utilities,
            reset=reset,
            **check_candidates_params,
        )
        self._validate_random_state()

        return candidates, return_utilities
class PeriodicSampling(SingleAnnotatorStreamQueryStrategy):
    """Periodic Sampling for Data Streams.

    The PeriodicSampling strategy samples labels periodically. The length of
    that period is determined by the `budget`. For instance, a `budget` of
    0.25 would result in querying every fourth sample. The main idea behind
    this query strategy is to exhaust a given budget as soon as it is
    available. Samples are queried regardless of their position in the
    feature space, and any information about the sample is disregarded.
    Thus, it should only be used as a baseline strategy.

    Parameters
    ----------
    budget : float, default=None
        The budget which models the budgeting constraint used in the
        stream-based active learning setting.
    random_state : int or RandomState instance or None, default=None
        Controls the randomness of the estimator.
    """

    def __init__(self, budget=None, random_state=None):
        super().__init__(budget=budget, random_state=random_state)

    def query(self, candidates, return_utilities=False):
        """Determines for which candidate samples labels are to be queried.

        The query strategy determines the most useful samples in candidates,
        which can be acquired within the budgeting constraint specified by
        `budget`. Please note that this method does not advance the sample
        counters of the query strategy. To adapt the query strategy to the
        selected candidates, use `update(...)`.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
        (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        return_utilities : bool, default=False
            If `True`, also return the `utilities` based on the query
            strategy.

        Returns
        -------
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices < n_candidates`.
        utilities : np.ndarray of shape (n_candidates,)
            The utilities based on the query strategy: 1 for queried
            candidates and 0 otherwise. Only provided if `return_utilities`
            is `True`.
        """
        candidates, return_utilities = self._validate_data(
            candidates, return_utilities
        )
        n_candidates = len(candidates)

        # Mask recording which candidates are queried.
        queried = np.full(n_candidates, False)

        # Work on local copies of the counters: the persistent counters are
        # only changed by `update`.
        tmp_observed_samples = self.observed_samples_
        tmp_queried_samples = self.queried_samples_

        # The candidates are processed in order, because every query reduces
        # the budget that is available for the following candidates.
        for i in range(n_candidates):
            tmp_observed_samples += 1
            remaining_budget = (
                tmp_observed_samples * self.budget_ - tmp_queried_samples
            )
            # Query as soon as the budget for one whole label has
            # accumulated.
            queried[i] = remaining_budget >= 1
            tmp_queried_samples += queried[i]

        # The utility is simply the query decision expressed as a float.
        utilities = queried.astype(float)
        queried_indices = np.where(queried)[0]

        if return_utilities:
            return queried_indices, utilities
        return queried_indices

    def update(self, candidates, queried_indices):
        """Updates the count for seen and queried labels.

        This function should be used in conjunction with the `query`
        function.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
        (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices < n_candidates`.

        Returns
        -------
        self : SingleAnnotatorStreamQueryStrategy
            The query strategy returns itself, after it is updated.
        """
        # Make sure the counters exist in case `update` is called before
        # `query`. The dummy sample only serves to pass the input validation.
        self._validate_data(np.array([[0]]), False)

        # Mark the queried candidates; duplicate indices count only once.
        queried = np.zeros(len(candidates))
        queried[queried_indices] = 1
        self.observed_samples_ += len(queried)
        self.queried_samples_ += np.sum(queried)
        return self

    def _validate_data(
        self,
        candidates,
        return_utilities,
        reset=True,
        **check_candidates_params,
    ):
        """Validate input data and set or check the `n_features_in_`
        attribute.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
        (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        return_utilities : bool, default=False
            If `True`, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute. If False, the
            input will be checked for consistency with data provided when
            reset was last True.
        **check_candidates_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        candidates : np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        candidates, return_utilities = super()._validate_data(
            candidates,
            return_utilities,
            reset=reset,
            **check_candidates_params,
        )
        self._validate_random_state()

        # Start counting samples on first use.
        if not hasattr(self, "observed_samples_"):
            self.observed_samples_ = 0
        if not hasattr(self, "queried_samples_"):
            self.queried_samples_ = 0

        return candidates, return_utilities