# Source code for skactiveml.stream._stream_probabilistic_al

import numpy as np
from sklearn import clone
from sklearn.utils import check_array, check_consistent_length

from ..classifier import ParzenWindowClassifier
from .budgetmanager import BalancedIncrementalQuantileFilter
from ..base import (
    SingleAnnotatorStreamQueryStrategy,
    SkactivemlClassifier,
    BudgetManager,
)
from ..pool import cost_reduction
from ..utils import (
    check_type,
    check_random_state,
    check_scalar,
    call_func,
    check_budget_manager,
)


class StreamProbabilisticAL(SingleAnnotatorStreamQueryStrategy):
    """StreamProbabilisticAL

    Probabilistic Active Learning in Datastreams (StreamProbabilisticAL) is an
    extension to Multi-Class Probabilistic Active Learning (McPAL)
    (see pool.ProbabilisticAL). It uses McPAL to assess the spatial
    utility. The Balanced Incremental Quantile Filter
    (BalancedIncrementalQuantileFilter), that is implemented within the
    default budget manager, is used to evaluate the temporal utility
    (see stream.budgetmanager.BalancedIncrementalQuantileFilter).

    Parameters
    ----------
    budget_manager : BudgetManager, optional (default=None)
        The BudgetManager which models the budgeting constraint used in
        the stream-based active learning setting. If set to None,
        BalancedIncrementalQuantileFilter will be used by default. The budget
        manager will be initialized based on the following conditions:
            If only a budget is given the default budget manager is initialized
            with the given budget.
            If only a budget manager is given use the budget manager.
            If both are not given the default budget manager with the
            default budget.
            If both are given and the budget differs from budgetmanager.budget
            a warning is thrown.
    budget : float, optional (default=None)
        The budget which models the budgeting constraint used in
        the stream-based active learning setting.
    metric : str or callable, optional (default=None)
        The metric must be None or a valid kernel as defined by the function
        `sklearn.metrics.pairwise.pairwise_kernels`. The kernel is used to
        calculate the frequency of labels near the candidates and multiplied
        with the probabilities returned by the `clf` to get a kernel frequency
        estimate for each class.
        If metric is set to None, the `predict_freq` function of the `clf` will
        be used instead. If this is not defined, an Exception is raised.
    metric_dict : dict, optional (default=None)
        Any further parameters are passed directly to the kernel function.
        If metric_dict is None and metric is 'rbf', {'gamma': 'mean'} is used
        for the kernel; the `metric_dict` attribute itself is not modified.
    random_state : int, RandomState instance, optional (default=None)
        Controls the randomness of the query strategy.
    prior : float, optional (default=1.0e-3)
        The prior value that is passed onto ProbabilisticAL
        (see pool.ProbabilisticAL). Must be greater than 0.
    m_max : int, optional (default=2)
        The m_max value that is passed onto ProbabilisticAL
        (see pool.ProbabilisticAL). Must be greater than 0.

    References
    ----------
    [1] Kottke, M. (2015). Probabilistic Active Learning in Datastreams. In
        Advances in Intelligent Data Analysis XIV (pp. 145–157). Springer.
    """

    def __init__(
        self,
        budget_manager=None,
        budget=None,
        metric=None,
        metric_dict=None,
        random_state=None,
        prior=1.0e-3,
        m_max=2,
    ):
        super().__init__(budget=budget, random_state=random_state)
        # Constructor parameters are stored untouched; validation and the
        # creation of derived state (`budget_manager_`, `random_state_`)
        # happen lazily in `query` / `update`.
        self.budget_manager = budget_manager
        self.prior = prior
        self.m_max = m_max
        self.metric = metric
        self.metric_dict = metric_dict

    def query(
        self,
        candidates,
        clf,
        X=None,
        y=None,
        sample_weight=None,
        fit_clf=False,
        utility_weight=None,
        return_utilities=False,
    ):
        """Ask the query strategy which instances in candidates to acquire.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape
        (n_samples, n_features)
            The instances which may be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.

        clf : SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`. If
            `self.metric` is None, the `clf` must also implement
            `predict_freq`.

        X : array-like of shape (n_samples, n_features), optional
        (default=None)
            Input samples used to fit the classifier.

        y : array-like of shape (n_samples), optional (default=None)
            Labels of the input samples 'X'. There may be missing labels.

        sample_weight : array-like of shape (n_samples,), optional
        (default=None)
            Sample weights for X, used to fit the clf.

        fit_clf : bool, optional (default=False)
            If True, a clone of `clf` is fitted on `X` and `y` (and
            `sample_weight` if given) before the utilities are computed.
            This requires X and y to be given.

        utility_weight : array-like of shape (n_candidate_samples), optional
        (default=None)
            Densities for each sample in `candidates`. The utilities are
            multiplied by these weights. If None, all weights are 1.

        return_utilities : bool, optional (default=False)
            If true, also return the utilities based on the query strategy.
            The default is False.

        Returns
        -------
        queried_indices : ndarray of shape (n_queried_instances,)
            The indices of instances in candidates which should be queried,
            with 0 <= n_queried_instances <= n_samples.
        utilities: ndarray of shape (n_samples,), optional
            The utilities based on the query strategy. Only provided if
            return_utilities is True.
        """
        (
            candidates,
            clf,
            X,
            y,
            sample_weight,
            fit_clf,
            utility_weight,
            return_utilities,
        ) = self._validate_data(
            candidates=candidates,
            clf=clf,
            X=X,
            y=y,
            sample_weight=sample_weight,
            fit_clf=fit_clf,
            utility_weight=utility_weight,
            return_utilities=return_utilities,
        )
        if self.metric is not None:
            # Resolve the default kernel parameters into a local variable so
            # that the constructor parameter `self.metric_dict` is never
            # overwritten as a side effect of querying (keeps `get_params`
            # and `clone` faithful to what the user passed in).
            metric_dict = self.metric_dict
            if metric_dict is None and self.metric == "rbf":
                metric_dict = {"gamma": "mean"}
            pwc = ParzenWindowClassifier(
                metric=self.metric,
                metric_dict=metric_dict,
                missing_label=clf.missing_label,
                classes=clf.classes,
            )
            pwc.fit(X=X, y=y, sample_weight=sample_weight)
            # Total kernel frequency per candidate, kept as a column so it
            # broadcasts over the class probabilities of `clf`.
            n = pwc.predict_freq(candidates).sum(axis=1, keepdims=True)
            pred_proba = clf.predict_proba(candidates)
            k_vec = n * pred_proba
        else:
            # Without a kernel, the classifier itself must supply the
            # per-class frequency estimates.
            k_vec = clf.predict_freq(candidates)

        utilities = cost_reduction(k_vec, prior=self.prior, m_max=self.m_max)

        utilities *= utility_weight

        queried_indices = self.budget_manager_.query_by_utility(utilities)

        if return_utilities:
            return queried_indices, utilities
        else:
            return queried_indices

    def update(
        self, candidates, queried_indices, budget_manager_param_dict=None
    ):
        """Updates the budget manager.

        Parameters
        ----------
        candidates : {array-like, sparse matrix} of shape
        (n_samples, n_features)
            The instances which could be queried. Sparse matrices are accepted
            only if they are supported by the base query strategy.

        queried_indices : array-like of shape (n_samples,)
            Indicates which instances from candidates have been queried.

        budget_manager_param_dict : kwargs, optional (default=None)
            Optional kwargs for budgetmanager.

        Returns
        -------
        self : StreamProbabilisticAL
            PALS returns itself, after it is updated.
        """
        # `update` may be called before any `query`, so make sure the budget
        # manager exists.
        self._ensure_budget_manager()
        if budget_manager_param_dict is None:
            budget_manager_param_dict = {}
        call_func(
            self.budget_manager_.update,
            candidates=candidates,
            queried_indices=queried_indices,
            **budget_manager_param_dict
        )
        return self

    def _ensure_budget_manager(self):
        """Create the attribute `budget_manager_` if it does not exist yet.

        The constructor parameter `budget_manager` is type-checked (it must
        be a BudgetManager or None) and `budget_manager_` is then obtained
        from `check_budget_manager` using `self.budget`,
        `self.budget_manager` and BalancedIncrementalQuantileFilter as the
        default budget manager class. If `budget_manager_` already exists,
        nothing is done.
        """
        if hasattr(self, "budget_manager_"):
            return
        check_type(
            self.budget_manager,
            "budget_manager_",
            BudgetManager,
            type(None),
        )
        self.budget_manager_ = check_budget_manager(
            self.budget,
            self.budget_manager,
            BalancedIncrementalQuantileFilter,
        )

    def _validate_data(
        self,
        candidates,
        clf,
        X,
        y,
        sample_weight,
        fit_clf,
        utility_weight,
        return_utilities,
        reset=True,
        **check_candidates_params
    ):
        """Validate input data and set or check the `n_features_in_` attribute.

        Parameters
        ----------
        candidates: array-like, shape (n_candidates, n_features)
            Candidate samples.
        clf : SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`. If
            `self.metric` is None, the `clf` must also implement
            `predict_freq`.
        X : array-like of shape (n_samples, n_features)
            Input samples used to fit the classifier.
        y : array-like of shape (n_samples)
            Labels of the input samples 'X'. There may be missing labels.
        sample_weight : array-like of shape (n_samples,)
            Sample weights for X, used to fit the clf.
        fit_clf : bool,
            If true, refit the classifier also requires X and y to be given.
        utility_weight: array-like of shape (n_candidate_samples)
            Densities for each sample in `candidates`.
        return_utilities : bool,
            If true, also return the utilities based on the query strategy.
        reset : bool, optional (default=True)
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        **check_candidates_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array`.

        Returns
        -------
        candidates: np.ndarray, shape (n_candidates, n_features)
            Checked candidate samples
        clf : SkactivemlClassifier
            Checked model. If `fit_clf` is True, this is a fitted clone of
            the given `clf`.
        X: np.ndarray, shape (n_samples, n_features)
            Checked training samples
        y: np.ndarray, shape (n_samples)
            Checked training labels
        sample_weight: np.ndarray, shape (n_samples)
            Checked training sample weight
        fit_clf : bool,
            Checked boolean value of `fit_clf`
        utility_weight: array-like of shape (n_candidate_samples)
            Checked densities for each sample in `candidates`.
        return_utilities : bool,
            Checked boolean value of `return_utilities`.

        Raises
        ------
        TypeError
            If `self.metric` is None and `clf` has no `predict_freq`.
        """
        candidates, return_utilities = super()._validate_data(
            candidates,
            return_utilities,
            reset=reset,
            **check_candidates_params
        )
        self._ensure_budget_manager()

        X, y, sample_weight = self._validate_X_y_sample_weight(
            X, y, sample_weight
        )
        clf = self._validate_clf(clf, X, y, sample_weight, fit_clf)
        utility_weight = self._validate_utility_weight(
            utility_weight, candidates
        )

        # Without a kernel metric, `query` relies on `clf.predict_freq`.
        if self.metric is None and not hasattr(clf, "predict_freq"):
            raise TypeError(
                "clf has no predict_freq and metric was set to None"
            )

        check_scalar(
            self.prior, "prior", float, min_val=0, min_inclusive=False
        )
        check_scalar(self.m_max, "m_max", int, min_val=0, min_inclusive=False)
        self._validate_random_state()

        return (
            candidates,
            clf,
            X,
            y,
            sample_weight,
            fit_clf,
            utility_weight,
            return_utilities,
        )

    def _validate_X_y_sample_weight(self, X, y, sample_weight):
        """Convert X, y and sample_weight to arrays and check that their
        lengths are consistent.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples used to fit the classifier.
        y : array-like of shape (n_samples)
            Labels of the input samples 'X'. There may be missing labels.
        sample_weight : array-like of shape (n_samples,)
            Sample weights for X, used to fit the clf.

        Returns
        -------
        X : array-like of shape (n_samples, n_features)
            Checked input samples. Only checked if both X and y are given.
        y : array-like of shape (n_samples)
            Labels of the input samples 'X', converted to a numpy array if
            both X and y are given.
        sample_weight : np.ndarray of shape (n_samples,) or None
            Sample weights converted to a numpy array, or None if no sample
            weights were given.
        """
        if sample_weight is not None:
            sample_weight = np.array(sample_weight)
            check_consistent_length(sample_weight, y)
        if X is not None and y is not None:
            X = check_array(X)
            y = np.array(y)
            check_consistent_length(X, y)
        return X, y, sample_weight

    def _validate_clf(self, clf, X, y, sample_weight, fit_clf):
        """Validate if clf is a valid SkactivemlClassifier. If `fit_clf` is
        True, a clone of clf is trained using X, y and sample_weight.

        Parameters
        ----------
        clf : SkactivemlClassifier
            Model implementing the methods `fit` and `predict_freq`.
        X : array-like of shape (n_samples, n_features)
            Input samples used to fit the classifier.
        y : array-like of shape (n_samples)
            Labels of the input samples 'X'. There may be missing labels.
        sample_weight : array-like of shape (n_samples,)
            Sample weights for X, used to fit the clf.
        fit_clf : bool
            If True, a clone of `clf` is fitted and returned; the given
            `clf` object itself is not refitted.

        Returns
        -------
        clf : SkactivemlClassifier
            The given `clf` if `fit_clf` is False, otherwise a fitted clone.
        """
        # Check if the classifier and its arguments are valid.
        check_type(clf, "clf", SkactivemlClassifier)
        check_type(fit_clf, "fit_clf", bool)
        if fit_clf:
            # Only pass sample_weight when it is given.
            if sample_weight is None:
                clf = clone(clf).fit(X, y)
            else:
                clf = clone(clf).fit(X, y, sample_weight)
        return clf

    def _validate_utility_weight(self, utility_weight, candidates):
        """Validate if utility_weight is numeric and of equal length as
        candidates.

        Parameters
        ----------
        utility_weight: array-like of shape (n_candidate_samples)
            Densities for each sample in `candidates`. If None, a weight of
            1 is used for every candidate.
        candidates: np.ndarray, shape (n_candidates, n_features)
            Checked candidate samples

        Returns
        -------
        utility_weight : array-like of shape (n_candidate_samples)
            Checked densities for each sample in `candidates`.
        """
        if utility_weight is None:
            utility_weight = np.ones(len(candidates))
        utility_weight = check_array(utility_weight, ensure_2d=False)
        check_consistent_length(utility_weight, candidates)
        return utility_weight

    def _validate_random_state(self):
        """Creates a copy 'random_state_' if random_state is an instance of
        np.random_state. If not create a new random state. See also
        :func:`~sklearn.utils.check_random_state`
        """
        # Seed `random_state_` from the constructor parameter only once, so
        # that repeated validation keeps using the same random state.
        if not hasattr(self, "random_state_"):
            self.random_state_ = self.random_state
        self.random_state_ = check_random_state(self.random_state_)