Source code for skactiveml.stream._density_uncertainty

from collections import deque

from copy import copy, deepcopy
import warnings
import numpy as np
from sklearn.utils import check_array, check_consistent_length, check_scalar
from sklearn.base import clone
from sklearn.metrics.pairwise import pairwise_distances

from skactiveml.base import (
    BudgetManager,
    SingleAnnotatorStreamQueryStrategy,
    SkactivemlClassifier,
)
from skactiveml.utils import (
    check_type,
    call_func,
    check_budget_manager,
)
from skactiveml.stream.budgetmanager import (
    FixedUncertaintyBudgetManager,
    DensityBasedSplitBudgetManager,
    VariableUncertaintyBudgetManager,
    RandomBudgetManager,
    RandomVariableUncertaintyBudgetManager,
)


class StreamDensityBasedAL(SingleAnnotatorStreamQueryStrategy):
    """StreamDensityBasedAL

    The StreamDensityBasedAL [1]_ query strategy is an extension of the
    uncertainty-based query strategies proposed by Žliobaitė et al. [2]_.
    In addition to the uncertainty assessment, StreamDensityBasedAL assesses
    the local density and only allows querying the label for a candidate if
    that local density is sufficiently high. The local density is represented
    by the number of samples within a sliding window for which the new sample
    becomes the new nearest neighbor.

    Parameters
    ----------
    dist_func : callable, default=None
        The distance function used to calculate the distances within the
        local density window. If `None`,
        `sklearn.metrics.pairwise.pairwise_distances` will be used by default.
    dist_func_dict : dict, default=None
        Additional parameters for `dist_func`.
    window_size : int, default=100
        The sliding window size for the local density estimation.
    budget_manager : BudgetManager, default=None
        The BudgetManager which models the budgeting constraint used in the
        stream-based active learning setting. If set to `None`,
        `DensityBasedSplitBudgetManager` will be used by default. The budget
        manager will be initialized based on the following conditions:

        - If only a `budget` is given, the default budget manager is
          initialized with the given budget.
        - If only a budget manager is given, that budget manager is used.
        - If neither is given, the default budget manager with the default
          budget is used.
        - If both are given and the budget differs from
          `budget_manager.budget`, a warning is thrown and the budget manager
          is used as is.
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled, with
        `0 <= budget <= 1`. If `budget` is `None`, it is replaced with the
        default budget 0.1.
    random_state : int or RandomState instance, default=None
        Controls the randomness of the estimator.

    References
    ----------
    .. [1] D. Ienco, I. Žliobaitė, and B. Pfahringer. High density-focused
       uncertainty sampling for active learning over evolving stream data. In
       Int. Workshop Big Data Streams Heterog. Source Min. Algorithms Syst.
       Program. Models Appl., pages 133–148, 2014.
    .. [2] I. Žliobaitė, A. Bifet, B. Pfahringer, and G. Holmes. Active
       Learning With Drifting Streaming Data. IEEE Trans. Neural Netw. Learn.
       Syst., 25(1):27–39, 2014.
    """

    def __init__(
        self,
        dist_func=None,
        dist_func_dict=None,
        window_size=100,
        budget_manager=None,
        budget=None,
        random_state=None,
    ):
        super().__init__(budget=budget, random_state=random_state)
        self.dist_func = dist_func
        self.dist_func_dict = dist_func_dict
        self.window_size = window_size
        self.budget_manager = budget_manager
    def query(
        self,
        candidates,
        clf,
        X=None,
        y=None,
        sample_weight=None,
        fit_clf=False,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        The query strategy determines the most useful samples in candidates,
        which can be acquired within the budgeting constraint specified by
        `budget`. Please note that this method does not change the internal
        state of the query strategy. To adapt the query strategy to the
        selected candidates, use `update(...)`.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        X : array-like of shape (n_samples, n_features), default=None
            Training data set used to fit the classifier.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        fit_clf : bool, default=False
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        return_utilities : bool, default=False
            If `True`, also return the `utilities` based on the query
            strategy.

        Returns
        -------
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices <= n_candidates`.
        utilities : np.ndarray of shape (n_candidates,)
            The utilities based on the query strategy. Only provided if
            `return_utilities` is `True`.
        """
        (
            candidates,
            clf,
            X,
            y,
            sample_weight,
            fit_clf,
            return_utilities,
        ) = self._validate_data(
            candidates,
            clf=clf,
            X=X,
            y=y,
            sample_weight=sample_weight,
            fit_clf=fit_clf,
            return_utilities=return_utilities,
        )
        # calculate the margin and use it as utilities
        predict_proba = clf.predict_proba(candidates)
        utilities_index = np.argpartition(predict_proba, -2)[:, -2:]
        confidence = (
            np.take_along_axis(predict_proba, utilities_index[:, [1]], 1)
            - np.take_along_axis(predict_proba, utilities_index[:, [0]], 1)
        ).reshape([-1])
        utilities = 1 - confidence

        tmp_min_dist = copy(self.min_dist_)
        tmp_window = copy(self.window_)
        queried_indices = []
        for t, (u, x_cand) in enumerate(zip(utilities, candidates)):
            local_density_factor = self._calculate_ldf([x_cand])
            if local_density_factor > 0:
                queried_indice = self.budget_manager_.query_by_utility(
                    np.array([u])
                )
                if len(queried_indice) > 0:
                    queried_indices.append(t)
            else:
                self.budget_manager_.query_by_utility(np.array([np.nan]))
            self.window_.append(x_cand)
        # restore the copied state (query must not change the internal state)
        self.min_dist_ = tmp_min_dist
        self.window_ = tmp_window

        if return_utilities:
            return queried_indices, utilities
        else:
            return queried_indices
    def update(
        self, candidates, queried_indices, budget_manager_param_dict=None
    ):
        """Updates the budget manager and the count for seen and queried
        labels. This function should be used in conjunction with the `query`
        function.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices <= n_candidates`.
        budget_manager_param_dict : dict, default=None
            Optional kwargs for `budget_manager`.

        Returns
        -------
        self : SingleAnnotatorStreamQueryStrategy
            The query strategy returns itself, after it is updated.
        """
        # check if a budget_manager is set
        if not hasattr(self, "budget_manager_"):
            self._validate_random_state()
            random_seed = deepcopy(self.random_state_).randint(2**31 - 1)
            check_type(
                self.budget_manager,
                "budget_manager_",
                BudgetManager,
                type(None),
            )
            self.budget_manager_ = check_budget_manager(
                self.budget,
                self.budget_manager,
                self._get_default_budget_manager(),
                {"random_state": random_seed},
            )
        if not hasattr(self, "window_"):
            self.window_ = deque(maxlen=self.window_size)
        if not hasattr(self, "min_dist_"):
            self.min_dist_ = deque(maxlen=self.window_size)
        if self.dist_func is None:
            self.dist_func_ = pairwise_distances
        else:
            self.dist_func_ = self.dist_func
        if not callable(self.dist_func_):
            raise TypeError("'dist_func' needs to be a callable.")
        self.dist_func_dict_ = (
            self.dist_func_dict if self.dist_func_dict is not None else {}
        )
        if not isinstance(self.dist_func_dict_, dict):
            raise TypeError("'dist_func_dict' must be a Python dictionary.")
        budget_manager_param_dict = (
            {}
            if budget_manager_param_dict is None
            else budget_manager_param_dict
        )
        new_candidates = []
        for x_cand in candidates:
            local_density_factor = self._calculate_ldf([x_cand])
            if local_density_factor > 0:
                new_candidates.append(x_cand)
            else:
                new_candidates.append(np.nan)
            self.window_.append(x_cand)
        call_func(
            self.budget_manager_.update,
            candidates=new_candidates,
            queried_indices=queried_indices,
            **budget_manager_param_dict,
        )
        return self
    def _calculate_ldf(self, candidates):
        """Calculate the number of new nearest neighbors for candidates in
        the sliding window.

        Parameters
        ----------
        candidates : array-like of shape (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.

        Returns
        -------
        ldf : np.ndarray of shape (n_candidates,)
            Numbers of new nearest neighbors for `candidates`.
        """
        ldf = 0
        if len(self.window_) >= 1:
            distances = self.dist_func_(self.window_, candidates).ravel()
            is_new_nn = distances < np.array(self.min_dist_)
            ldf = np.sum(is_new_nn)
            for i in np.where(is_new_nn)[0]:
                self.min_dist_[i] = distances[i]
            self.min_dist_.append(np.min(distances))
        else:
            self.min_dist_.append(np.inf)
        return ldf

    def _validate_data(
        self,
        candidates,
        clf,
        X,
        y,
        sample_weight,
        fit_clf,
        return_utilities,
        reset=True,
        **check_candidates_params,
    ):
        """Validate input data and set or check the `n_features_in_`
        attribute.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        X : array-like of shape (n_samples, n_features), default=None
            Training data set used to fit the classifier.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        fit_clf : bool, default=False
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        return_utilities : bool, default=False
            If `True`, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute. If False, the
            input will be checked for consistency with data provided when
            reset was last True.
        **check_candidates_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        candidates : np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        clf : SkactivemlClassifier
            Checked model implementing the methods `fit` and `predict_proba`.
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples,)
            Checked training labels.
        sample_weight : np.ndarray of shape (n_samples,)
            Checked training sample weights.
        fit_clf : bool
            Checked boolean value of `fit_clf`.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        candidates, return_utilities = super()._validate_data(
            candidates,
            return_utilities,
            reset=reset,
            **check_candidates_params,
        )
        X, y, sample_weight = self._validate_X_y_sample_weight(
            X=X, y=y, sample_weight=sample_weight
        )
        clf = self._validate_clf(clf, X, y, sample_weight, fit_clf)
        # check if a budget_manager is set
        if not hasattr(self, "budget_manager_"):
            random_seed = deepcopy(self.random_state_).randint(2**31 - 1)
            check_type(
                self.budget_manager,
                "budget_manager_",
                BudgetManager,
                type(None),
            )
            self.budget_manager_ = check_budget_manager(
                self.budget,
                self.budget_manager,
                self._get_default_budget_manager(),
                {"random_state": random_seed},
            )
        if self.dist_func is None:
            self.dist_func_ = pairwise_distances
        else:
            self.dist_func_ = self.dist_func
        if not callable(self.dist_func_):
            raise TypeError("'dist_func' needs to be a callable.")
        self.dist_func_dict_ = (
            self.dist_func_dict if self.dist_func_dict is not None else {}
        )
        if not isinstance(self.dist_func_dict_, dict):
            raise TypeError("'dist_func_dict' must be a Python dictionary.")
        # check window_size
        check_scalar(self.window_size, "window_size", int, min_val=1)
        if not hasattr(self, "window_"):
            self.window_ = deque(maxlen=self.window_size)
        if not hasattr(self, "min_dist_"):
            self.min_dist_ = deque(maxlen=self.window_size)

        return candidates, clf, X, y, sample_weight, fit_clf, return_utilities

    def _validate_clf(self, clf, X, y, sample_weight, fit_clf):
        """Validate if `clf` is a valid `SkactivemlClassifier`. If `clf` is
        untrained and `fit_clf=True`, `clf` is trained using X, y and
        sample_weight.

        Parameters
        ----------
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        X : array-like of shape (n_samples, n_features), default=None
            Training data set used to fit the classifier.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        fit_clf : bool, default=False
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.

        Returns
        -------
        clf : skactiveml.base.SkactivemlClassifier
            Checked model implementing the methods `fit` and `predict_proba`.
        """
        # Check if the classifier and its arguments are valid.
        check_type(clf, "clf", SkactivemlClassifier)
        check_type(fit_clf, "fit_clf", bool)
        if fit_clf:
            if sample_weight is None:
                clf = clone(clf).fit(X, y)
            else:
                clf = clone(clf).fit(X, y, sample_weight)
        return clf

    def _validate_X_y_sample_weight(self, X, y, sample_weight):
        """Validate if X, y and sample_weight are numeric and of equal
        length.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set used to fit the classifier.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,)
            Weights of training samples in `X`.

        Returns
        -------
        X : array-like of shape (n_samples, n_features)
            Checked training data set.
        y : array-like of shape (n_samples,)
            Checked labels of the input samples `X`. Converts `y` to a numpy
            array.
        sample_weight : array-like of shape (n_samples,)
            Checked weights of training samples in `X`.
        """
        if sample_weight is not None:
            sample_weight = np.array(sample_weight)
            check_consistent_length(sample_weight, y)
        if X is not None and y is not None:
            X = check_array(X)
            y = np.array(y)
            check_consistent_length(X, y)
        return X, y, sample_weight

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The `BudgetManager` that should be used by default.
        """
        return DensityBasedSplitBudgetManager
class CognitiveDualQueryStrategy(SingleAnnotatorStreamQueryStrategy):
    """CognitiveDualQueryStrategy

    This class is the base for the CognitiveDualQueryStrategy query strategy
    proposed in [1]_. To use this strategy, refer to
    `CognitiveDualQueryStrategyRan`, `CognitiveDualQueryStrategyRanVarUn`,
    `CognitiveDualQueryStrategyVarUn`, and `CognitiveDualQueryStrategyFixUn`.
    The CognitiveDualQueryStrategy strategy is an extension of the
    uncertainty-based query strategies proposed by Žliobaitė et al. [2]_ and
    follows the same idea as StreamDensityBasedAL [3]_, where querying a
    label is only allowed if the local density around the corresponding
    sample is sufficiently high. The authors propose the use of a cognitive
    window that monitors the most representative samples within a data
    stream.

    Parameters
    ----------
    force_full_budget : bool, default=False
        If `True`, tries to utilize the full budget. The article does not
        update the budget manager if the local density factor is 0.
    dist_func : callable, default=None
        The distance function used to calculate the distances within the
        local density window. If it is `None`,
        `sklearn.metrics.pairwise.pairwise_distances` will be used by default.
    dist_func_dict : dict, default=None
        Additional parameters for `dist_func`.
    density_threshold : int, default=1
        Determines the local density factor size that needs to be reached in
        order to query the candidate's label.
    cognition_window_size : int, default=10
        Determines the size of the cognition window.
    budget_manager : BudgetManager, default=None
        The BudgetManager which models the budgeting constraint used in the
        stream-based active learning setting. If set to `None`,
        `RandomVariableUncertaintyBudgetManager` will be used by default. The
        budget manager will be initialized based on the following conditions:

        - If only a `budget` is given, the default budget manager is
          initialized with the given budget.
        - If only a budget manager is given, that budget manager is used.
        - If neither is given, the default budget manager with the default
          budget is used.
        - If both are given and the budget differs from
          `budget_manager.budget`, a warning is thrown and the budget manager
          is used as is.
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled, with
        `0 <= budget <= 1`. If `budget` is `None`, it is replaced with the
        default budget 0.1.
    random_state : int or RandomState instance, default=None
        Controls the randomness of the estimator.

    See Also
    --------
    CognitiveDualQueryStrategyRan : CognitiveDualQueryStrategy using the
        RandomBudgetManager.
    CognitiveDualQueryStrategyFixUn : CognitiveDualQueryStrategy using the
        FixedUncertaintyBudgetManager.
    CognitiveDualQueryStrategyVarUn : CognitiveDualQueryStrategy using the
        VariableUncertaintyBudgetManager.
    CognitiveDualQueryStrategyRanVarUn : CognitiveDualQueryStrategy using the
        RandomVariableUncertaintyBudgetManager.

    References
    ----------
    .. [1] S. Liu, S. Xue, J. Wu, C. Zhou, J. Yang, Z. Li, and J. Cao. Online
       Active Learning for Drifting Data Streams. IEEE Trans. Neural Netw.
       Learn. Syst., 34(1):186–200, 2023.
    .. [2] I. Žliobaitė, A. Bifet, B. Pfahringer, and G. Holmes. Active
       Learning With Drifting Streaming Data. IEEE Trans. Neural Netw. Learn.
       Syst., 25(1):27–39, 2014.
    .. [3] D. Ienco, I. Žliobaitė, and B. Pfahringer. High density-focused
       uncertainty sampling for active learning over evolving stream data. In
       Int. Workshop Big Data Streams Heterog. Source Min. Algorithms Syst.
       Program. Models Appl., pages 133–148, 2014.
    """

    def __init__(
        self,
        force_full_budget=False,
        dist_func=None,
        dist_func_dict=None,
        density_threshold=1,
        cognition_window_size=10,
        budget_manager=None,
        budget=None,
        random_state=None,
    ):
        super().__init__(budget=budget, random_state=random_state)
        self.budget_manager = budget_manager
        self.density_threshold = density_threshold
        self.dist_func = dist_func
        self.dist_func_dict = dist_func_dict
        self.cognition_window_size = cognition_window_size
        self.force_full_budget = force_full_budget
    def query(
        self,
        candidates,
        clf,
        X=None,
        y=None,
        sample_weight=None,
        fit_clf=False,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        The query strategy determines the most useful samples in candidates,
        which can be acquired within the budgeting constraint specified by
        `budget`. Please note that this method does not change the internal
        state of the query strategy. To adapt the query strategy to the
        selected candidates, use `update(...)`.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        X : array-like of shape (n_samples, n_features), default=None
            Training data set used to fit the classifier.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        fit_clf : bool, default=False
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        return_utilities : bool, default=False
            If `True`, also return the `utilities` based on the query
            strategy.

        Returns
        -------
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices <= n_candidates`.
        utilities : np.ndarray of shape (n_candidates,)
            The utilities based on the query strategy. Only provided if
            `return_utilities` is `True`.
        """
        (
            candidates,
            clf,
            X,
            y,
            sample_weight,
            fit_clf,
            return_utilities,
        ) = self._validate_data(
            candidates,
            clf=clf,
            X=X,
            y=y,
            sample_weight=sample_weight,
            fit_clf=fit_clf,
            return_utilities=return_utilities,
        )
        # use one minus the maximum class probability as utilities
        predict_proba = clf.predict_proba(candidates)
        confidence = np.max(predict_proba, axis=1)
        utilities = 1 - confidence

        # copy variables
        tmp_cognition_window = copy(self.cognition_window_)
        tmp_theta = copy(self.theta_)
        tmp_s = copy(self.s_)
        tmp_t_x = copy(self.t_x_)
        f = copy(self.f_)
        min_dist = copy(self.min_dist_)
        t = copy(self.t_)

        queried_indices = []
        for i, (u, x_cand) in enumerate(zip(utilities, candidates)):
            local_density_factor = self._calculate_ldf([x_cand])
            if local_density_factor >= self.density_threshold:
                queried_indice = self.budget_manager_.query_by_utility(
                    np.array([u])
                )
                if len(queried_indice) > 0:
                    queried_indices.append(i)
            elif self.force_full_budget:
                self.budget_manager_.query_by_utility(np.array([np.nan]))
            self.t_ += 1

        # restore the copied state (query must not change the internal state)
        self.cognition_window_ = tmp_cognition_window
        self.theta_ = tmp_theta
        self.s_ = tmp_s
        self.t_x_ = tmp_t_x
        self.f_ = f
        self.min_dist_ = min_dist
        self.t_ = t

        if return_utilities:
            return queried_indices, utilities
        else:
            return queried_indices
    def update(
        self, candidates, queried_indices, budget_manager_param_dict=None
    ):
        """Updates the budget manager and the count for seen and queried
        labels. This function should be used in conjunction with the `query`
        function.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        queried_indices : np.ndarray of shape (n_queried_indices,)
            The indices of samples in candidates whose labels are queried,
            with `0 <= queried_indices <= n_candidates`.
        budget_manager_param_dict : dict, default=None
            Optional kwargs for `budget_manager`.

        Returns
        -------
        self : CognitiveDualQueryStrategy
            The query strategy returns itself, after it is updated.
        """
        self._validate_force_full_budget()
        # check if a budget_manager is set
        if not hasattr(self, "budget_manager_"):
            self._validate_random_state()
            random_seed = deepcopy(self.random_state_).randint(2**31 - 1)
            check_type(
                self.budget_manager,
                "budget_manager_",
                BudgetManager,
                type(None),
            )
            default_budget_manager_kwargs = (
                self._get_default_budget_manager_kwargs()
            )
            default_budget_manager_kwargs["random_state"] = random_seed
            self.budget_manager_ = check_budget_manager(
                self.budget,
                self.budget_manager,
                self._get_default_budget_manager(),
                default_budget_manager_kwargs,
            )
        # _init_members
        if self.dist_func is None:
            self.dist_func_ = pairwise_distances
        else:
            self.dist_func_ = self.dist_func
        if not callable(self.dist_func_):
            raise TypeError("'dist_func' needs to be a callable.")
        self.dist_func_dict_ = (
            self.dist_func_dict if self.dist_func_dict is not None else {}
        )
        if not isinstance(self.dist_func_dict_, dict):
            raise TypeError("'dist_func_dict' must be a Python dictionary.")
        if not hasattr(self, "min_dist_"):
            self.min_dist_ = []
        if not hasattr(self, "t_"):
            self.t_ = 0
        if not hasattr(self, "cognition_window_"):
            self.cognition_window_ = []
        if not hasattr(self, "f_"):
            self.f_ = []
        if not hasattr(self, "theta_"):
            self.theta_ = []
        if not hasattr(self, "s_"):
            self.s_ = []
        if not hasattr(self, "t_x_"):
            self.t_x_ = []
        budget_manager_param_dict = (
            {}
            if budget_manager_param_dict is None
            else budget_manager_param_dict
        )
        new_candidates = []
        for x_cand in candidates:
            local_density_factor = self._calculate_ldf([x_cand])
            if local_density_factor >= self.density_threshold:
                new_candidates.append(x_cand)
            elif self.force_full_budget:
                new_candidates.append(np.nan)
            self.t_ += 1
        call_func(
            self.budget_manager_.update,
            candidates=new_candidates,
            queried_indices=queried_indices,
            **budget_manager_param_dict,
        )
        return self
    def _calculate_ldf(self, candidates):
        """Calculate the number of new nearest neighbors for candidates in
        the cognition window.

        Parameters
        ----------
        candidates : array-like of shape (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.

        Returns
        -------
        ldf : np.ndarray of shape (n_candidates,)
            Numbers of new nearest neighbors for `candidates`.
        """
        ldf = 0
        f = 1
        t_x = self.t_
        s = 1
        theta = 0
        if len(self.cognition_window_) >= 1:
            distances = self.dist_func_(
                self.cognition_window_, candidates
            ).ravel()
            is_new_nn = distances < np.array(self.min_dist_)
            ldf = np.sum(is_new_nn)
            for i in np.where(is_new_nn)[0]:
                self.t_x_[i] = t_x
                self.theta_[i] += 1
                self.min_dist_[i] = distances[i]
            self.min_dist_.append(np.min(distances))
        else:
            self.min_dist_.append(np.inf)
        for t, _ in enumerate(self.cognition_window_):
            self.f_[t] = 1 / (self.theta_[t] + 1)
            tmp = -self.f_[t] * (t_x - self.t_x_[t])
            self.s_[t] = np.exp(tmp)
        if len(self.cognition_window_) > self.cognition_window_size:
            # remove element with the smallest memory strength
            remove_index = np.argmin(self.s_)
            self.cognition_window_.pop(remove_index)
            self.theta_.pop(remove_index)
            self.s_.pop(remove_index)
            self.t_x_.pop(remove_index)
            self.f_.pop(remove_index)
            self.min_dist_.pop(remove_index)
        self.cognition_window_.extend(candidates)
        self.theta_.append(theta)
        self.s_.append(s)
        self.t_x_.append(t_x)
        self.f_.append(f)
        return ldf

    def _validate_data(
        self,
        candidates,
        clf,
        X,
        y,
        sample_weight,
        fit_clf,
        return_utilities,
        reset=True,
        **check_candidates_params,
    ):
        """Validate input data and set or check the `n_features_in_`
        attribute.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape\
                (n_candidates, n_features)
            The samples which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        X : array-like of shape (n_samples, n_features), default=None
            Training data set used to fit the classifier.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        fit_clf : bool, default=False
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        return_utilities : bool, default=False
            If `True`, also return the utilities based on the query strategy.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute. If False, the
            input will be checked for consistency with data provided when
            reset was last True.
        **check_candidates_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        candidates : np.ndarray of shape (n_candidates, n_features)
            Checked candidate samples.
        clf : SkactivemlClassifier
            Checked model implementing the methods `fit` and `predict_proba`.
        X : np.ndarray of shape (n_samples, n_features)
            Checked training data set.
        y : np.ndarray of shape (n_samples,)
            Checked training labels.
        sample_weight : np.ndarray of shape (n_samples,)
            Checked training sample weights.
        fit_clf : bool
            Checked boolean value of `fit_clf`.
        return_utilities : bool
            Checked boolean value of `return_utilities`.
        """
        candidates, return_utilities = super()._validate_data(
            candidates,
            return_utilities,
            reset=reset,
            **check_candidates_params,
        )
        self._validate_random_state()
        X, y, sample_weight = self._validate_X_y_sample_weight(
            X=X, y=y, sample_weight=sample_weight
        )
        clf = self._validate_clf(clf, X, y, sample_weight, fit_clf)
        # check density_threshold
        check_scalar(
            self.density_threshold, "density_threshold", int, min_val=0
        )
        # check cognition_window_size
        check_scalar(
            self.cognition_window_size,
            "cognition_window_size",
            int,
            min_val=1,
        )
        self._validate_force_full_budget()
        # check if a budget_manager is set
        if not hasattr(self, "budget_manager_"):
            random_seed = deepcopy(self.random_state_).randint(2**31 - 1)
            check_type(
                self.budget_manager,
                "budget_manager_",
                BudgetManager,
                type(None),
            )
            default_budget_manager_kwargs = (
                self._get_default_budget_manager_kwargs()
            )
            default_budget_manager_kwargs["random_state"] = random_seed
            self.budget_manager_ = check_budget_manager(
                self.budget,
                self.budget_manager,
                self._get_default_budget_manager(),
                default_budget_manager_kwargs,
            )
        if self.dist_func is None:
            self.dist_func_ = pairwise_distances
        else:
            self.dist_func_ = self.dist_func
        if not callable(self.dist_func_):
            raise TypeError("'dist_func' needs to be a callable.")
        self.dist_func_dict_ = (
            self.dist_func_dict if self.dist_func_dict is not None else {}
        )
        if not isinstance(self.dist_func_dict_, dict):
            raise TypeError("'dist_func_dict' must be a Python dictionary.")
        if not hasattr(self, "min_dist_"):
            self.min_dist_ = []
        if not hasattr(self, "t_"):
            self.t_ = 0
        if not hasattr(self, "cognition_window_"):
            self.cognition_window_ = []
        if not hasattr(self, "f_"):
            self.f_ = []
        if not hasattr(self, "theta_"):
            self.theta_ = []
        if not hasattr(self, "s_"):
            self.s_ = []
        if not hasattr(self, "t_x_"):
            self.t_x_ = []

        return candidates, clf, X, y, sample_weight, fit_clf, return_utilities

    def _get_default_budget_manager_kwargs(self):
        """Provide the kwargs for the budget manager that will be used as
        default.

        Returns
        -------
        default_budget_manager_kwargs : dict
            The arguments necessary to initialize the budget manager.
        """
        return {}

    def _validate_clf(self, clf, X, y, sample_weight, fit_clf):
        """Validate if `clf` is a valid `SkactivemlClassifier`. If `clf` is
        untrained and `fit_clf=True`, `clf` is trained using X, y and
        sample_weight.

        Parameters
        ----------
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        X : array-like of shape (n_samples, n_features), default=None
            Training data set used to fit the classifier.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        fit_clf : bool, default=False
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.

        Returns
        -------
        clf : skactiveml.base.SkactivemlClassifier
            Checked model implementing the methods `fit` and `predict_proba`.
        """
        # Check if the classifier and its arguments are valid.
        check_type(clf, "clf", SkactivemlClassifier)
        check_type(fit_clf, "fit_clf", bool)
        if fit_clf:
            if sample_weight is None:
                clf = clone(clf).fit(X, y)
            else:
                clf = clone(clf).fit(X, y, sample_weight)
        return clf

    def _validate_force_full_budget(self):
        # check force_full_budget
        check_type(self.force_full_budget, "force_full_budget", bool)
        if not hasattr(self, "budget_manager_") and not self.force_full_budget:
            warnings.warn(
                "force_full_budget is set to False. "
                "Therefore the full budget may not be utilised."
            )

    def _validate_X_y_sample_weight(self, X, y, sample_weight):
        """Validate if X, y and sample_weight are numeric and of equal
        length.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set used to fit the classifier.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        sample_weight : array-like of shape (n_samples,)
            Weights of training samples in `X`.

        Returns
        -------
        X : array-like of shape (n_samples, n_features)
            Checked training data set.
        y : array-like of shape (n_samples,)
            Checked labels of the input samples `X`. Converts `y` to a numpy
            array.
        sample_weight : array-like of shape (n_samples,)
            Checked weights of training samples in `X`.
        """
        if sample_weight is not None:
            sample_weight = np.array(sample_weight)
            check_consistent_length(sample_weight, y)
        if X is not None and y is not None:
            X = check_array(X)
            y = np.array(y)
            check_consistent_length(X, y)
        return X, y, sample_weight

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The `BudgetManager` that should be used by default.
        """
        return RandomVariableUncertaintyBudgetManager
class CognitiveDualQueryStrategyRan(CognitiveDualQueryStrategy):
    """CognitiveDualQueryStrategyRan

    This class implements the CognitiveDualQueryStrategy [1]_ strategy with
    Random Sampling. The CognitiveDualQueryStrategy strategy is an extension
    of the uncertainty-based query strategies proposed by Žliobaitė et
    al. [2]_ and follows the same idea as StreamDensityBasedAL [3]_, where
    querying a label is only allowed if the local density around the
    corresponding sample is sufficiently high. The authors propose the use of
    a cognitive window that monitors the most representative samples within a
    data stream.

    Parameters
    ----------
    force_full_budget : bool, default=False
        If `True`, tries to utilize the full budget. The article does not
        update the budget manager if the local density factor is 0.
    dist_func : callable, default=None
        The distance function used to calculate the distances within the
        local density window. If it is `None`,
        `sklearn.metrics.pairwise.pairwise_distances` will be used by default.
    dist_func_dict : dict, default=None
        Additional parameters for `dist_func`.
    density_threshold : int, default=1
        Determines the local density factor size that needs to be reached in
        order to query the candidate's label.
    cognition_window_size : int, default=10
        Determines the size of the cognition window.
    budget_manager : BudgetManager, default=None
        The BudgetManager which models the budgeting constraint used in the
        stream-based active learning setting. If set to `None`,
        `RandomBudgetManager` will be used by default. The budget manager
        will be initialized based on the following conditions:

        - If only a `budget` is given, the default budget manager is
          initialized with the given budget.
        - If only a budget manager is given, that budget manager is used.
        - If neither is given, the default budget manager with the default
          budget is used.
        - If both are given and the budget differs from
          `budget_manager.budget`, a warning is thrown and the budget manager
          is used as is.
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled, with
        `0 <= budget <= 1`. If `budget` is `None`, it is replaced with the
        default budget 0.1.
    random_state : int or RandomState instance, default=None
        Controls the randomness of the estimator.

    See Also
    --------
    .budgetmanager.RandomBudgetManager : The default budget manager.
    CognitiveDualQueryStrategy : The base class for this strategy.

    References
    ----------
    .. [1] S. Liu, S. Xue, J. Wu, C. Zhou, J. Yang, Z. Li, and J. Cao. Online
       Active Learning for Drifting Data Streams. IEEE Trans. Neural Netw.
       Learn. Syst., 34(1):186–200, 2023.
    .. [2] I. Žliobaitė, A. Bifet, B. Pfahringer, and G. Holmes. Active
       Learning With Drifting Streaming Data. IEEE Trans. Neural Netw. Learn.
       Syst., 25(1):27–39, 2014.
    .. [3] D. Ienco, I. Žliobaitė, and B. Pfahringer. High density-focused
       uncertainty sampling for active learning over evolving stream data. In
       Int. Workshop Big Data Streams Heterog. Source Min. Algorithms Syst.
       Program. Models Appl., pages 133–148, 2014.
    """

    def __init__(
        self,
        force_full_budget=False,
        dist_func=None,
        dist_func_dict=None,
        density_threshold=1,
        cognition_window_size=10,
        budget=None,
        random_state=None,
    ):
        super().__init__(
            budget=budget,
            random_state=random_state,
            budget_manager=None,
            density_threshold=density_threshold,
            dist_func=dist_func,
            dist_func_dict=dist_func_dict,
            cognition_window_size=cognition_window_size,
            force_full_budget=force_full_budget,
        )

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The BudgetManager that should be used by default.
        """
        return RandomBudgetManager
class CognitiveDualQueryStrategyFixUn(CognitiveDualQueryStrategy):
    """CognitiveDualQueryStrategyFixUn

    This class implements the CognitiveDualQueryStrategy [1]_ strategy with
    FixedUncertainty. The CognitiveDualQueryStrategy strategy is an extension
    of the uncertainty-based query strategies proposed by Žliobaitė et
    al. [2]_ and follows the same idea as StreamDensityBasedAL [3]_, where
    querying a label is only allowed if the local density around the
    corresponding sample is sufficiently high. The authors propose the use of
    a cognitive window that monitors the most representative samples within a
    data stream.

    Parameters
    ----------
    classes : array-like of shape (n_classes,)
        Holds the label for each class.
    force_full_budget : bool, default=False
        If `True`, tries to utilize the full budget. The article does not
        update the budget manager if the local density factor is 0.
    dist_func : callable, default=None
        The distance function used to calculate the distances within the
        local density window. If it is `None`,
        `sklearn.metrics.pairwise.pairwise_distances` will be used by default.
    dist_func_dict : dict, default=None
        Additional parameters for `dist_func`.
    density_threshold : int, default=1
        Determines the local density factor size that needs to be reached in
        order to query the candidate's label.
    cognition_window_size : int, default=10
        Determines the size of the cognition window.
    budget_manager : BudgetManager, default=None
        The BudgetManager which models the budgeting constraint used in the
        stream-based active learning setting. If set to `None`,
        `FixedUncertaintyBudgetManager` will be used by default. The budget
        manager will be initialized based on the following conditions:

        - If only a `budget` is given, the default budget manager is
          initialized with the given budget.
        - If only a budget manager is given, that budget manager is used.
        - If neither is given, the default budget manager with the default
          budget is used.
        - If both are given and the budget differs from
          `budget_manager.budget`, a warning is thrown and the budget manager
          is used as is.
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled, with
        `0 <= budget <= 1`. If `budget` is `None`, it is replaced with the
        default budget 0.1.
    random_state : int or RandomState instance, default=None
        Controls the randomness of the estimator.

    See Also
    --------
    .budgetmanager.FixedUncertaintyBudgetManager : The default budget manager.
    CognitiveDualQueryStrategy : The base class for this strategy.

    References
    ----------
    .. [1] S. Liu, S. Xue, J. Wu, C. Zhou, J. Yang, Z. Li, and J. Cao. Online
       Active Learning for Drifting Data Streams. IEEE Trans. Neural Netw.
       Learn. Syst., 34(1):186–200, 2023.
    .. [2] I. Žliobaitė, A. Bifet, B. Pfahringer, and G. Holmes. Active
       Learning With Drifting Streaming Data. IEEE Trans. Neural Netw. Learn.
       Syst., 25(1):27–39, 2014.
    .. [3] D. Ienco, I. Žliobaitė, and B. Pfahringer. High density-focused
       uncertainty sampling for active learning over evolving stream data. In
       Int. Workshop Big Data Streams Heterog. Source Min. Algorithms Syst.
       Program. Models Appl., pages 133–148, 2014.
    """

    def __init__(
        self,
        classes,
        force_full_budget=False,
        dist_func=None,
        dist_func_dict=None,
        density_threshold=1,
        cognition_window_size=10,
        budget=None,
        random_state=None,
    ):
        super().__init__(
            budget=budget,
            random_state=random_state,
            budget_manager=None,
            density_threshold=density_threshold,
            dist_func=dist_func,
            dist_func_dict=dist_func_dict,
            cognition_window_size=cognition_window_size,
            force_full_budget=force_full_budget,
        )
        self.classes = classes

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The BudgetManager that should be used by default.
        """
        return FixedUncertaintyBudgetManager

    def _get_default_budget_manager_kwargs(self):
        """Provide the kwargs for the budget manager that will be used as
        default.

        Returns
        -------
        default_budget_manager_kwargs : dict
            The arguments necessary to initialize the budget manager.
        """
        return {"classes": self.classes}
class CognitiveDualQueryStrategyVarUn(CognitiveDualQueryStrategy):
    """CognitiveDualQueryStrategyVarUn

    This class implements the CognitiveDualQueryStrategy [1]_ strategy with
    VariableUncertainty. The CognitiveDualQueryStrategy strategy is an
    extension of the uncertainty-based query strategies proposed by Žliobaitė
    et al. [2]_ and follows the same idea as StreamDensityBasedAL [3]_, where
    querying a label is only allowed if the local density around the
    corresponding sample is sufficiently high. The authors propose the use of
    a cognitive window that monitors the most representative samples within a
    data stream.

    Parameters
    ----------
    force_full_budget : bool, default=False
        If `True`, tries to utilize the full budget. The article does not
        update the budget manager if the local density factor is 0.
    dist_func : callable, default=None
        The distance function used to calculate the distances within the
        local density window. If it is `None`,
        `sklearn.metrics.pairwise.pairwise_distances` will be used by default.
    dist_func_dict : dict, default=None
        Additional parameters for `dist_func`.
    density_threshold : int, default=1
        Determines the local density factor size that needs to be reached in
        order to query the candidate's label.
    cognition_window_size : int, default=10
        Determines the size of the cognition window.
    budget_manager : BudgetManager, default=None
        The BudgetManager which models the budgeting constraint used in the
        stream-based active learning setting. If set to `None`,
        `VariableUncertaintyBudgetManager` will be used by default. The
        budget manager will be initialized based on the following conditions:

        - If only a `budget` is given, the default budget manager is
          initialized with the given budget.
        - If only a budget manager is given, that budget manager is used.
        - If neither is given, the default budget manager with the default
          budget is used.
        - If both are given and the budget differs from
          `budget_manager.budget`, a warning is thrown and the budget manager
          is used as is.
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled, with
        `0 <= budget <= 1`. If `budget` is `None`, it is replaced with the
        default budget 0.1.
    random_state : int or RandomState instance, default=None
        Controls the randomness of the estimator.

    See Also
    --------
    .budgetmanager.VariableUncertaintyBudgetManager : The default budget
        manager.
    CognitiveDualQueryStrategy : The base class for this strategy.

    References
    ----------
    .. [1] S. Liu, S. Xue, J. Wu, C. Zhou, J. Yang, Z. Li, and J. Cao. Online
       Active Learning for Drifting Data Streams. IEEE Trans. Neural Netw.
       Learn. Syst., 34(1):186–200, 2023.
    .. [2] I. Žliobaitė, A. Bifet, B. Pfahringer, and G. Holmes. Active
       Learning With Drifting Streaming Data. IEEE Trans. Neural Netw. Learn.
       Syst., 25(1):27–39, 2014.
    .. [3] D. Ienco, I. Žliobaitė, and B. Pfahringer. High density-focused
       uncertainty sampling for active learning over evolving stream data. In
       Int. Workshop Big Data Streams Heterog. Source Min. Algorithms Syst.
       Program. Models Appl., pages 133–148, 2014.
    """

    def __init__(
        self,
        force_full_budget=False,
        dist_func=None,
        dist_func_dict=None,
        density_threshold=1,
        cognition_window_size=10,
        budget=None,
        random_state=None,
    ):
        super().__init__(
            budget=budget,
            random_state=random_state,
            budget_manager=None,
            density_threshold=density_threshold,
            dist_func=dist_func,
            dist_func_dict=dist_func_dict,
            cognition_window_size=cognition_window_size,
            force_full_budget=force_full_budget,
        )

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The BudgetManager that should be used by default.
        """
        return VariableUncertaintyBudgetManager
class CognitiveDualQueryStrategyRanVarUn(CognitiveDualQueryStrategy):
    """CognitiveDualQueryStrategyRanVarUn

    This class implements the CognitiveDualQueryStrategy [1]_ strategy with
    RandomVariableUncertainty. The CognitiveDualQueryStrategy strategy is an
    extension of the uncertainty-based query strategies proposed by Žliobaitė
    et al. [2]_ and follows the same idea as StreamDensityBasedAL [3]_, where
    querying a label is only allowed if the local density around the
    corresponding sample is sufficiently high. The authors propose the use of
    a cognitive window that monitors the most representative samples within a
    data stream.

    Parameters
    ----------
    force_full_budget : bool, default=False
        If `True`, tries to utilize the full budget. The article does not
        update the budget manager if the local density factor is 0.
    dist_func : callable, default=None
        The distance function used to calculate the distances within the
        local density window. If it is `None`,
        `sklearn.metrics.pairwise.pairwise_distances` will be used by default.
    dist_func_dict : dict, default=None
        Additional parameters for `dist_func`.
    density_threshold : int, default=1
        Determines the local density factor size that needs to be reached in
        order to query the candidate's label.
    cognition_window_size : int, default=10
        Determines the size of the cognition window.
    budget_manager : BudgetManager, default=None
        The BudgetManager which models the budgeting constraint used in the
        stream-based active learning setting. If set to `None`,
        `RandomVariableUncertaintyBudgetManager` will be used by default. The
        budget manager will be initialized based on the following conditions:

        - If only a `budget` is given, the default budget manager is
          initialized with the given budget.
        - If only a budget manager is given, that budget manager is used.
        - If neither is given, the default budget manager with the default
          budget is used.
        - If both are given and the budget differs from
          `budget_manager.budget`, a warning is thrown and the budget manager
          is used as is.
    budget : float, default=None
        Specifies the ratio of samples which are allowed to be sampled, with
        `0 <= budget <= 1`. If `budget` is `None`, it is replaced with the
        default budget 0.1.
    random_state : int or RandomState instance, default=None
        Controls the randomness of the estimator.

    See Also
    --------
    .budgetmanager.RandomVariableUncertaintyBudgetManager : The default
        budget manager.
    CognitiveDualQueryStrategy : The base class for this strategy.

    References
    ----------
    .. [1] S. Liu, S. Xue, J. Wu, C. Zhou, J. Yang, Z. Li, and J. Cao. Online
       Active Learning for Drifting Data Streams. IEEE Trans. Neural Netw.
       Learn. Syst., 34(1):186–200, 2023.
    .. [2] I. Žliobaitė, A. Bifet, B. Pfahringer, and G. Holmes. Active
       Learning With Drifting Streaming Data. IEEE Trans. Neural Netw. Learn.
       Syst., 25(1):27–39, 2014.
    .. [3] D. Ienco, I. Žliobaitė, and B. Pfahringer. High density-focused
       uncertainty sampling for active learning over evolving stream data. In
       Int. Workshop Big Data Streams Heterog. Source Min. Algorithms Syst.
       Program. Models Appl., pages 133–148, 2014.
    """

    def __init__(
        self,
        force_full_budget=False,
        dist_func=None,
        dist_func_dict=None,
        density_threshold=1,
        cognition_window_size=10,
        budget=None,
        random_state=None,
    ):
        super().__init__(
            budget=budget,
            random_state=random_state,
            budget_manager=None,
            density_threshold=density_threshold,
            dist_func=dist_func,
            dist_func_dict=dist_func_dict,
            cognition_window_size=cognition_window_size,
            force_full_budget=force_full_budget,
        )

    def _get_default_budget_manager(self):
        """Provide the budget manager that will be used as default.

        Returns
        -------
        budget_manager : BudgetManager
            The BudgetManager that should be used by default.
        """
        return RandomVariableUncertaintyBudgetManager