"""Source code for skactiveml.stream._stream_baselines."""

import numpy as np

from ..base import SingleAnnotatorStreamQueryStrategy
from ..utils import check_scalar


class StreamRandomSampling(SingleAnnotatorStreamQueryStrategy):
    """Random Sampling for Datastreams.

    The RandomSampling samples instances completely randomly. The
    probability to sample an instance depends on the specified budget. Given
    a budget of 10%, the utility exceeds 0.9 (1-0.1) with a probability of
    10%. Instances are queried regardless of their position in the feature
    space. As this query strategy disregards any information about the
    instance, it should only be used as a baseline strategy.

    Parameters
    ----------
    budget : float, optional (default=None)
        The budget which models the budgeting constraint used in
        the stream-based active learning setting.

    allow_exceeding_budget : bool, optional (default=True)
        If True, the query strategy is allowed to exceed its budget as long as
        the average number of queries will be within the budget. If False,
        queries are not allowed if the budget is exhausted.

    random_state : int, RandomState instance, optional (default=None)
        Controls the randomness of the estimator.
    """

    def __init__(
        self, budget=None, allow_exceeding_budget=True, random_state=None
    ):
        super().__init__(budget=budget, random_state=random_state)
        self.allow_exceeding_budget = allow_exceeding_budget

    def query(self, candidates, return_utilities=False):
        """Ask the query strategy which instances in candidates to acquire.

        The decision is only simulated: the random state is restored after
        drawing the utilities and the instance counters are evaluated on
        temporary copies. Call update(...) with the final sampling to commit
        the decision. This is especially helpful when developing wrapper
        query strategies, whose final sampling may differ from the decisions
        returned here.

        Parameters
        ----------
        candidates : array-like or sparse matrix of shape
        (n_samples, n_features)
            The instances which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.

        return_utilities : bool, optional (default=False)
            If true, also return the utilities based on the query strategy.
            The default is False.

        Returns
        -------
        queried_indices : ndarray of shape (n_queried_instances,)
            The indices of instances in candidates which should be queried,
            with 0 <= n_queried_instances <= n_samples.

        utilities: ndarray of shape (n_samples,), optional
            The utilities based on the query strategy. Only provided if
            return_utilities is True.
        """
        candidates, return_utilities = self._validate_data(
            candidates, return_utilities
        )

        # Draw the utilities, then rewind the random state so that querying
        # alone has no lasting effect; update(...) re-draws the same amount
        # of random numbers to advance the state.
        prior_random_state = self.random_state_.get_state()
        utilities = self.random_state_.random_sample(len(candidates))
        self.random_state_.set_state(prior_random_state)

        # keep record if the instance is queried and if there was budget left,
        # when assessing the corresponding utilities
        queried = np.full(len(utilities), False)

        # work on local copies of the counters so that the internal state is
        # not modified by this method
        tmp_observed_instances = self.observed_instances_
        tmp_queried_instances = self.queried_instances_
        # an instance is only queried if its utility reaches this threshold
        utility_threshold = 1 - self.budget_
        # check for each sample separately if budget is left and the utility is
        # high enough
        for i, utility in enumerate(utilities):
            tmp_observed_instances += 1
            available_budget = (
                tmp_observed_instances * self.budget_ - tmp_queried_instances
            )
            queried[i] = (
                self.allow_exceeding_budget or available_budget > 1
            ) and (utility >= utility_threshold)
            tmp_queried_instances += queried[i]

        # get the indices of the instances that should be queried
        queried_indices = np.where(queried)[0]

        if return_utilities:
            return queried_indices, utilities
        else:
            return queried_indices

    def update(self, candidates, queried_indices):
        """Updates the counts for seen and queried instances and advances
        the random state.

        Parameters
        ----------
        candidates : array-like or sparse matrix of shape
        (n_samples, n_features)
            The instances which could be queried. Only the number of
            instances is used.

        queried_indices : array-like
            Indicates which instances from candidates have been queried.

        Returns
        -------
        self : StreamRandomSampling
            The RandomSampling returns itself, after it is updated.
        """
        # make sure the counters and the random state are initialized
        self._validate_data([[0]], False)
        # Determine the number of candidates once. `shape[0]` is preferred
        # because `len` is not defined for sparse matrices, while plain
        # sequences such as lists have no `shape` attribute.
        if hasattr(candidates, "shape"):
            n_candidates = candidates.shape[0]
        else:
            n_candidates = len(candidates)
        # update observed instances and queried instances
        queried = np.zeros(n_candidates)
        queried[queried_indices] = 1
        self.observed_instances_ += n_candidates
        self.queried_instances_ += np.sum(queried)
        # consume the random numbers that query(...) drew and rewound, so that
        # the next call of query(...) yields fresh utilities
        self.random_state_.random_sample(n_candidates)
        return self

    def _validate_data(
        self,
        candidates,
        return_utilities,
        reset=True,
        **check_candidates_params
    ):
        """Validate input data and set or check the `n_features_in_` attribute.

        Additionally initializes the counters `observed_instances_` and
        `queried_instances_` if they do not exist yet, checks the type of
        `allow_exceeding_budget` and validates the random state.

        Parameters
        ----------
        candidates: array-like of shape (n_candidates, n_features)
            The instances which could be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        return_utilities : bool,
            If true, also return the utilities based on the query strategy.
        reset : bool, optional (default=True)
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        **check_candidates_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        candidates: np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        return_utilities : bool,
            Checked boolean value of `return_utilities`.
        """
        # check if counting of instances has begun
        if not hasattr(self, "observed_instances_"):
            self.observed_instances_ = 0
        if not hasattr(self, "queried_instances_"):
            self.queried_instances_ = 0

        check_scalar(
            self.allow_exceeding_budget, "allow_exceeding_budget", bool
        )

        candidates, return_utilities = super()._validate_data(
            candidates,
            return_utilities,
            reset=reset,
            **check_candidates_params
        )

        self._validate_random_state()

        return candidates, return_utilities


class PeriodicSampling(SingleAnnotatorStreamQueryStrategy):
    """The PeriodicSampling samples instances periodically. The length of that
    period is determined by the specified budget. For instance, a budget of
    25% would result in the PeriodicSampling sampling every fourth instance.
    The main idea behind this query strategy is to exhaust a given budget as
    soon as it is available. Instances are queried regardless of their
    position in the feature space. As this query strategy disregards any
    information about the instance, it should only be used as a baseline
    strategy.

    Parameters
    ----------
    budget : float, optional (default=None)
        The budget which models the budgeting constraint used in
        the stream-based active learning setting.

    random_state : int, RandomState instance, optional (default=None)
        Controls the randomness of the estimator.
    """

    def __init__(self, budget=None, random_state=None):
        super().__init__(budget=budget, random_state=random_state)

    def query(self, candidates, return_utilities=False):
        """Ask the query strategy which instances in candidates to acquire.

        This query strategy only evaluates the time each instance arrives at.
        The utilities returned, when return_utilities is set to True, are
        either 0 (the instance is not queried) or 1 (the instance is queried).
        The decision is only simulated: the instance counters are evaluated on
        temporary copies. Call update(...) with the final sampling to commit
        the decision. This is especially helpful when developing wrapper
        query strategies, whose final sampling may differ from the decisions
        returned here.

        Parameters
        ----------
        candidates : array-like or sparse matrix of shape
        (n_samples, n_features)
            The instances which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.

        return_utilities : bool, optional (default=False)
            If true, also return the utilities based on the query strategy.
            The default is False.

        Returns
        -------
        queried_indices : ndarray of shape (n_queried_instances,)
            The indices of instances in candidates which should be queried,
            with 0 <= n_queried_instances <= n_samples.

        utilities: ndarray of shape (n_samples,), optional
            The utilities based on the query strategy. Only provided if
            return_utilities is True.
        """
        candidates, return_utilities = self._validate_data(
            candidates, return_utilities
        )

        # only the number of candidates matters; their features are ignored
        n_candidates = candidates.shape[0]

        # keep record if the instance is queried and if there was budget left,
        # when assessing the corresponding utilities
        queried = np.full(n_candidates, False)

        # work on local copies of the counters so that the internal state is
        # not modified by this method
        tmp_observed_instances = self.observed_instances_
        tmp_queried_instances = self.queried_instances_
        for i in range(n_candidates):
            tmp_observed_instances += 1
            remaining_budget = (
                tmp_observed_instances * self.budget_ - tmp_queried_instances
            )
            # query as soon as the accumulated budget covers a full query
            queried[i] = remaining_budget >= 1
            tmp_queried_instances += queried[i]

        # the utility is 1 for queried instances and 0 otherwise
        utilities = queried.astype(float)

        # get the indices of the instances that should be queried
        queried_indices = np.where(queried)[0]

        if return_utilities:
            return queried_indices, utilities
        else:
            return queried_indices

    def update(self, candidates, queried_indices):
        """Updates the counts for seen and queried instances.

        Parameters
        ----------
        candidates : array-like or sparse matrix of shape
        (n_samples, n_features)
            The instances which could be queried. Only the number of
            instances is used.

        queried_indices : array-like
            Indicates which instances from candidates have been queried.

        Returns
        -------
        self : PeriodicSampling
            The PeriodicSampler returns itself, after it is updated.
        """
        # make sure the counters and the random state are initialized
        self._validate_data(np.array([[0]]), False)
        # Determine the number of candidates. `shape[0]` is preferred because
        # `len` is not defined for sparse matrices, while plain sequences such
        # as lists have no `shape` attribute.
        if hasattr(candidates, "shape"):
            n_candidates = candidates.shape[0]
        else:
            n_candidates = len(candidates)
        queried = np.zeros(n_candidates)
        queried[queried_indices] = 1
        self.observed_instances_ += n_candidates
        self.queried_instances_ += np.sum(queried)
        return self

    def _validate_data(
        self,
        candidates,
        return_utilities,
        reset=True,
        **check_candidates_params
    ):
        """Validate input data and set or check the `n_features_in_` attribute.

        Additionally validates the random state and initializes the counters
        `observed_instances_` and `queried_instances_` if they do not exist
        yet.

        Parameters
        ----------
        candidates: array-like of shape (n_candidates, n_features)
            The instances which could be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        return_utilities : bool,
            If true, also return the utilities based on the query strategy.
        reset : bool, optional (default=True)
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        **check_candidates_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        candidates: np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        return_utilities : bool,
            Checked boolean value of `return_utilities`.
        """
        candidates, return_utilities = super()._validate_data(
            candidates,
            return_utilities,
            reset=reset,
            **check_candidates_params
        )

        self._validate_random_state()

        # check if counting of instances has begun
        if not hasattr(self, "observed_instances_"):
            self.observed_instances_ = 0
        if not hasattr(self, "queried_instances_"):
            self.queried_instances_ = 0

        return candidates, return_utilities