Source code for skactiveml.stream._stream_baselines

import numpy as np

from ..base import SingleAnnotatorStreamQueryStrategy
from ..utils import check_scalar


[docs]class StreamRandomSampling(SingleAnnotatorStreamQueryStrategy): """Random Sampling for Datastreams. The RandomSampling samples instances completely randomly. The probability to sample an instance is dependent on the budget specified in the budget manager. Given a budget of 10%, the utility exceeds 0.9 (1-0.1) with a probability of 10%. Instances are queried regardless of their position in the feature space. As this query strategy disregards any information about the instance. Thus, it should only be used as a baseline strategy. Parameters ---------- budget : float, optional (default=None) The budget which models the budgeting constraint used in the stream-based active learning setting. allow_exceeding_budget : bool, optional (default=True) If True, the query strategy is allowed to exceed it's budget as long as the average number of queries will be within the budget. If False, queries are not allowed if the budget is exhausted. random_state : int, RandomState instance, optional (default=None) Controls the randomness of the estimator. """ def __init__( self, budget=None, allow_exceeding_budget=True, random_state=None ): super().__init__(budget=budget, random_state=random_state) self.allow_exceeding_budget = allow_exceeding_budget
[docs] def query(self, candidates, return_utilities=False): """Ask the query strategy which instances in candidates to acquire. Please note that, when the decisions from this function may differ from the final sampling, simulate=True can set, so that the query strategy can be updated later with update(...) with the final sampling. This is especially helpful, when developing wrapper query strategies. Parameters ---------- candidates : array-like or sparse matrix of shape (n_samples, n_features) The instances which may be queried. Sparse matrices are accepted only if they are supported by the base query strategy. return_utilities : bool, optional (default=False) If true, also return the utilities based on the query strategy. The default is False. Returns ------- queried_indices : ndarray of shape (n_queried_instances,) The indices of instances in candidates which should be queried, with 0 <= n_queried_instances <= n_samples. utilities: ndarray of shape (n_samples,), optional The utilities based on the query strategy. Only provided if return_utilities is True. """ candidates, return_utilities = self._validate_data( candidates, return_utilities ) # copy random state in case of simulating the query prior_random_state = self.random_state_.get_state() utilities = self.random_state_.random_sample(len(candidates)) self.random_state_.set_state(prior_random_state) # keep record if the instance is queried and if there was budget left, # when assessing the corresponding utilities queried = np.full(len(utilities), False) # keep the internal state to reset it later if simulate is true tmp_observed_instances = self.observed_instances_ tmp_queried_instances = self.queried_instances_ # check for each sample separately if budget is left and the utility is # high enough for i, utility in enumerate(utilities): tmp_observed_instances += 1 available_budget = ( tmp_observed_instances * self.budget_ - tmp_queried_instances ) queried[i] = ( self.allow_exceeding_budget or available_budget > 1 ) and (utility >= 1 - self.budget_) tmp_queried_instances += queried[i] # get the indices instances that should be queried queried_indices = np.where(queried)[0] # queried_indices = self.budget_manager_.query_by_utility(utilities) if return_utilities: return queried_indices, utilities else: return queried_indices
[docs] def update(self, candidates, queried_indices): """Updates the budget manager and the count for seen and queried instances Parameters ---------- candidates : array-like or sparse matrix of shape (n_samples, n_features) The instances which could be queried. Sparse matrices are accepted only if they are supported by the base query strategy. queried_indices : array-like of shape (n_samples,) Indicates which instances from candidates have been queried. Returns ------- self : StreamRandomSampling The RandomSampling returns itself, after it is updated. """ # check if a random state is set self._validate_data([[0]], False) # update observed instances and queried instances queried = np.zeros(len(candidates)) queried[queried_indices] = 1 self.observed_instances_ += candidates.shape[0] self.queried_instances_ += np.sum(queried) # update the random state assuming, that query(..., simulate=True) was # used self.random_state_.random_sample(len(candidates)) return self
def _validate_data( self, candidates, return_utilities, reset=True, **check_candidates_params ): """Validate input data and set or check the `n_features_in_` attribute. Parameters ---------- candidates: array-like of shape (n_candidates, n_features) The instances which could be queried. Sparse matrices are accepted only if they are supported by the base query strategy. return_utilities : bool, If true, also return the utilities based on the query strategy. reset : bool, optional (default=True) Whether to reset the `n_features_in_` attribute. If False, the input will be checked for consistency with data provided when reset was last True. **check_candidates_params : kwargs Parameters passed to :func:`sklearn.utils.check_array`. Returns ------- candidates: np.ndarray of shape (n_candidates, n_features) Checked candidate samples. return_utilities : bool, Checked boolean value of `return_utilities`. """ # check if counting of instances has begun if not hasattr(self, "observed_instances_"): self.observed_instances_ = 0 if not hasattr(self, "queried_instances_"): self.queried_instances_ = 0 check_scalar( self.allow_exceeding_budget, "allow_exceeding_budget", bool ) candidates, return_utilities = super()._validate_data( candidates, return_utilities, reset=reset, **check_candidates_params ) self._validate_random_state() return candidates, return_utilities
[docs]class PeriodicSampling(SingleAnnotatorStreamQueryStrategy): """The PeriodicSampling samples instances periodically. The length of that period is determined by the budget specified in the budgetmanager. For instance, a budget of 25% would result in the PeriodicSampling sampling every fourth instance. The main idea behind this query strategy is to exhaust a given budget as soon it is available. Instances are queried regardless of their position in the feature space. As this query strategy disregards any information about the instance. Thus, it should only be used as a baseline strategy. Parameters ---------- budget : float, optional (default=None) The budget which models the budgeting constraint used in the stream-based active learning setting. random_state : int, RandomState instance, optional (default=None) Controls the randomness of the estimator. """ def __init__(self, budget=None, random_state=None): super().__init__(budget=budget, random_state=random_state)
[docs] def query(self, candidates, return_utilities=False): """Ask the query strategy which instances in candidates to acquire. This query strategy only evaluates the time each instance arrives at. The utilities returned, when return_utilities is set to True, are either 0 (the instance is not queried) or 1 (the instance is queried). Please note that, when the decisions from this function may differ from the final sampling, simulate=True can set, so that the query strategy can be updated later with update(...) with the final sampling. This is especially helpful, when developing wrapper query strategies. Parameters ---------- candidates : array-like or sparse matrix of shape (n_samples, n_features) The instances which may be queried. Sparse matrices are accepted only if they are supported by the base query strategy. return_utilities : bool, optional (default=False) If true, also return the utilities based on the query strategy. The default is False. Returns ------- queried_indices : ndarray of shape (n_queried_instances,) The indices of instances in candidates which should be queried, with 0 <= n_queried_instances <= n_samples. utilities: ndarray of shape (n_samples,), optional The utilities based on the query strategy. Only provided if return_utilities is True. """ candidates, return_utilities = self._validate_data( candidates, return_utilities ) utilities = np.zeros(candidates.shape[0]) # keep record if the instance is queried and if there was budget left, # when assessing the corresponding utilities queried = np.full(len(candidates), False) tmp_observed_instances = self.observed_instances_ tmp_queried_instances = self.queried_instances_ for i, x in enumerate(candidates): tmp_observed_instances += 1 remaining_budget = ( tmp_observed_instances * self.budget_ - tmp_queried_instances ) queried[i] = remaining_budget >= 1 if queried[i]: utilities[i] = 1 tmp_queried_instances += queried[i] # get the indices instances that should be queried queried_indices = np.where(queried)[0] # queried_indices = self.budget_manager_.query_by_utility(utilities) if return_utilities: return queried_indices, utilities else: return queried_indices
[docs] def update(self, candidates, queried_indices): """Updates the budget manager and the count for seen and queried instances Parameters ---------- candidates : array-like or sparse matrix of shape (n_samples, n_features) The instances which could be queried. Sparse matrices are accepted only if they are supported by the base query strategy. queried_indices : array-like of shape (n_samples,) Indicates which instances from candidates have been queried. Returns ------- self : PeriodicSampling The PeriodicSampler returns itself, after it is updated. """ # check if a budgetmanager is set self._validate_data(np.array([[0]]), False) queried = np.zeros(len(candidates)) queried[queried_indices] = 1 self.observed_instances_ += len(queried) self.queried_instances_ += np.sum(queried) return self
def _validate_data( self, candidates, return_utilities, reset=True, **check_candidates_params ): """Validate input data and set or check the `n_features_in_` attribute. Parameters ---------- candidates: array-like of shape (n_candidates, n_features) The instances which could be queried. Sparse matrices are accepted only if they are supported by the base query strategy. return_utilities : bool, If true, also return the utilities based on the query strategy. reset : bool, optional (default=True) Whether to reset the `n_features_in_` attribute. If False, the input will be checked for consistency with data provided when reset was last True. **check_candidates_params : kwargs Parameters passed to :func:`sklearn.utils.check_array`. Returns ------- candidates: np.ndarray of shape (n_candidates, n_features) Checked candidate samples. batch_size : int Checked number of samples to be selected in one AL cycle. return_utilities : bool, Checked boolean value of `return_utilities`. """ candidates, return_utilities = super()._validate_data( candidates, return_utilities, reset=reset, **check_candidates_params ) self._validate_random_state() # check if counting of instances has begun if not hasattr(self, "observed_instances_"): self.observed_instances_ = 0 if not hasattr(self, "queried_instances_"): self.queried_instances_ = 0 return candidates, return_utilities