# Source code for skactiveml.pool._query_by_committee

"""
Query-by-committee strategies.
"""

# Author: Pascal Mergard <Pascal.Mergard@student.uni-kassel.de>
#         Marek Herde <marek.herde@uni-kassel.de>
import copy

import numpy as np
from sklearn import clone
from sklearn.utils.validation import check_array, check_is_fitted
from iteration_utilities import flatten

from ..base import (
    SingleAnnotatorPoolQueryStrategy,
    SkactivemlClassifier,
    SkactivemlRegressor,
)
from ..utils import (
    simple_batch,
    check_type,
    compute_vote_vectors,
    MISSING_LABEL,
    check_equal_missing_label,
    check_scalar,
)


class QueryByCommittee(SingleAnnotatorPoolQueryStrategy):
    """Query-by-Committee (QBC)

    The Query-by-Committee (QBC) strategy uses an ensemble of estimators to
    identify on which samples many estimators disagree.

    Parameters
    ----------
    method : "KL_divergence" or "vote_entropy" or "variation_ratios", \
            default='KL_divergence'
        The method to calculate the disagreement in the case of classification.
        'KL_divergence', 'vote_entropy', and 'variation_ratios' are possible.
        In the case of regression, this parameter is ignored and the empirical
        standard deviation of the committee members' predictions is used.
    eps : float > 0, default=1e-7
        Minimum probability threshold to compute log-probabilities (only
        relevant for `method='KL_divergence'`).
    sample_predictions_method_name : str, default=None
        Certain estimators may offer methods enabling to construct a committee
        by sampling predictions of committee members. This parameter is to
        indicate the name of such a method.
            - If `sample_predictions_method_name=None` no sampling is
              performed.
            - If `sample_predictions_method_name` is not `None` and in the
              case of classification, the method is expected to take samples of
              the shape `(n_samples, *)` as input and to output probabilities
              of the shape `(n_members, n_samples, n_classes)`, e.g.,
              `sample_proba` in `skactiveml.base.ClassFrequencyEstimator`.
            - If `sample_predictions_method_name` is not `None` and in the
              case of regression, the method is expected to take samples of the
              shape `(n_samples, *)` as input and to output numerical values of
              the shape `(n_samples, n_members)` (the output is transposed
              before the disagreement is computed), e.g., `sample_y` in
              `sklearn.gaussian_process.GaussianProcessRegressor`.
    sample_predictions_dict : dict, default=None
        Parameters (excluding the samples) that are passed to the method with
        the name `sample_predictions_method_name`.
            - This parameter must be `None`, if
              `sample_predictions_method_name` is `None`.
            - Otherwise, it may be used to define the number of sampled
              members, e.g., by defining `n_samples` as parameter to the method
              `sample_proba` of `skactiveml.base.ClassFrequencyEstimator` or
              `sample_y` of
              `sklearn.gaussian_process.GaussianProcessRegressor`.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or np.random.RandomState or None, default=None
        The random state to use.

    References
    ----------
    .. [1] H.S. Seung, M. Opper, and H. Sompolinsky. Query by committee.
       In ACM Workshop on Computational Learning Theory, pages 287-294, 1992.

    .. [2] N. Abe and H. Mamitsuka. Query Learning Strategies Using Boosting
       and Bagging. In International Conference on Machine Learning, pages 1-9,
       1998.

    .. [3] Burbidge, Robert and Rowland, Jem J and King, Ross D. Active
       Learning for Regression Based on Query by Committee. In International
       Conference on Intelligent Data Engineering and Automated Learning,
       pages 209-218, 2007.

    .. [4] Beluch, W. H., Genewein, T., Nürnberger, A., and Köhler, J. M.
       The Power of Ensembles for Active Learning in Image Classification. In
       Conference on Computer Vision and Pattern Recognition, pages 9368-9377,
       2018
    """

    def __init__(
        self,
        method="KL_divergence",
        eps=1e-7,
        sample_predictions_method_name=None,
        sample_predictions_dict=None,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )
        # Parameters are stored unvalidated; validation happens in `query`.
        self.method = method
        self.eps = eps
        self.sample_predictions_method_name = sample_predictions_method_name
        self.sample_predictions_dict = sample_predictions_dict

    def query(
        self,
        X,
        y,
        ensemble,
        fit_ensemble=True,
        sample_weight=None,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e. including the labeled and
            unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`.)
        ensemble : array-like of SkactivemlClassifier or array-like of \
                SkactivemlRegressor or SkactivemlClassifier or \
                SkactivemlRegressor
            - If `ensemble` is a `SkactivemlClassifier` or a
              `SkactivemlRegressor` and has `n_estimators` plus
              `estimators_` after fitting as attributes, its estimators will
              be used as committee.
            - If `ensemble` is array-like, each element of this list must be
              `SkactivemlClassifier` or a `SkactivemlRegressor` and will be
              used as committee member.
            - If `ensemble` is a `SkactivemlClassifier` or a
              `SkactivemlRegressor` and implements a method with the name
              `sample_predictions_method_name`, this method is used to sample
              predictions of committee members.
        fit_ensemble : bool, default=True
            Defines whether the ensemble should be fitted on `X`, `y`, and
            `sample_weight`.
        sample_weight: array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        candidates : None or array-like of shape (n_candidates), dtype=int or \
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from
              `(X,y)` are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If true, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size)
            The query_indices indicate for which candidate sample a label is
            to queried, e.g., `query_indices[0]` indicates the first selected
            sample.
                - If `candidates` is `None` or of shape
                  `(n_candidates,)`, the indexing refers to the samples in
                  `X`.
                - If `candidates` is of shape `(n_candidates, n_features)`,
                  the indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or \
                numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of samples after each selected sample of the batch,
            e.g., `utilities[0]` indicates the utilities used for selecting
            the first sample (with index `query_indices[0]`) of the batch.
            Utilities for labeled samples will be set to np.nan.
                - If `candidates` is `None` or of shape
                  `(n_candidates,)`, the indexing refers to the samples in
                  `X`.
                - If `candidates` is of shape `(n_candidates, n_features)`,
                  the indexing refers to the samples in `candidates`.
        """
        # Validate input parameters.
        X, y, candidates, batch_size, return_utilities = self._validate_data(
            X, y, candidates, batch_size, return_utilities, reset=True
        )

        # `mapping` is `None` when the candidates are given as samples rather
        # than as indices into `X` (see the handling of `mapping` below).
        X_cand, mapping = self._transform_candidates(candidates, X, y)
        check_type(fit_ensemble, "fit_ensemble", bool)
        ensemble, est_arr, classes, sample_func, sample_dict = _check_ensemble(
            ensemble=ensemble,
            X=X,
            y=y,
            sample_weight=sample_weight,
            fit_ensemble=fit_ensemble,
            missing_label=self.missing_label_,
            estimator_types=[SkactivemlClassifier, SkactivemlRegressor],
            sample_predictions_method_name=self.sample_predictions_method_name,
            sample_predictions_dict=self.sample_predictions_dict,
        )
        check_type(
            self.method,
            "method",
            target_vals=["KL_divergence", "vote_entropy", "variation_ratios"],
        )

        # `classes` is `None`, if `ensemble` is a regressor.
        if classes is not None:
            # Compute utilities.
            if self.method == "KL_divergence":
                if sample_func is None:
                    probas = self._aggregate_predict_probas(
                        X_cand, ensemble, est_arr
                    )
                else:
                    probas = sample_func(X_cand, **sample_dict)
                utilities_cand = average_kl_divergence(probas, self.eps)
            else:
                # Both vote-based measures expect `votes` of shape
                # `(n_candidates, n_members)`.
                from_sampling = sample_func is not None
                if from_sampling:
                    probas = np.asarray(sample_func(X_cand, **sample_dict))
                    # Column indices of the most probable class per member.
                    votes = probas.argmax(axis=-1).T
                else:
                    votes = np.array(
                        [est.predict(X_cand) for est in est_arr]
                    ).T
                if self.method == "vote_entropy":
                    if from_sampling:
                        # `vote_entropy` counts votes per entry of `classes`,
                        # so the argmax column indices must be translated
                        # into class labels first (assumes the sampled
                        # probability columns follow the order of `classes`,
                        # as for `predict_proba`).
                        votes = np.asarray(classes)[votes]
                    utilities_cand = vote_entropy(votes, classes)
                else:
                    # Variation ratios only depend on how the votes are
                    # partitioned, not on the label values themselves.
                    utilities_cand = variation_ratios(votes)
        else:
            if sample_func is None:
                results = np.array(
                    [learner.predict(X_cand) for learner in est_arr]
                )
            else:
                # Transposed so that the members end up on axis 0.
                results = sample_func(X_cand, **sample_dict).T
            # Disagreement: standard deviation across committee members.
            utilities_cand = np.std(results, axis=0)

        if mapping is None:
            utilities = utilities_cand
        else:
            # Scatter candidate utilities back to positions in `X`; all
            # non-candidate samples keep `np.nan`.
            utilities = np.full(len(X), np.nan)
            utilities[mapping] = utilities_cand

        return simple_batch(
            utilities,
            self.random_state_,
            batch_size=batch_size,
            return_utilities=return_utilities,
        )

    def _aggregate_predict_probas(self, X_cand, ensemble, est_arr):
        """Aggregate the predicted probabilities across all ensemble members
        and ensure that all classes are mapped correctly.

        Parameters
        ----------
        X_cand : array-like of shape (n_samples, n_features)
            Samples whose probabilities are to be predicted.
        ensemble : SkactivemlClassifier or list or tuple of \
                SkactivemlClassifier
            - If `ensemble` is a `SkactivemlClassifier`, it must have
              `n_estimators` and `estimators_` after fitting as attribute.
              Then, its estimators will be used as committee.
            - If `ensemble` is array-like, each element of this list must be
              `SkactivemlClassifier` and will be used as committee member.
        est_arr : list or tuple of SkactivemlClassifier
            List of ensemble members contained in `ensemble`.

        Returns
        -------
        probas : np.ndarray of shape (n_estimators, n_samples, n_classes)
            The predicted probabilities of every member, with the class axis
            aligned to the classes of the ensemble. Classes unknown to a
            member keep a probability of zero.
        """
        if hasattr(ensemble, "classes_"):
            ensemble_classes = ensemble.classes_
        else:
            # No ensemble-level classes: use the sorted union of the classes
            # of all members.
            ensemble_classes = np.unique(
                list(flatten([est.classes_ for est in est_arr]))
            )
        probas = np.zeros((len(est_arr), len(X_cand), len(ensemble_classes)))
        for i, est in enumerate(est_arr):
            est_proba = est.predict_proba(X_cand)
            est_classes = np.asarray(est.classes_)

            if len(est_classes) == len(ensemble_classes):
                # Same number of classes is treated as identical classes.
                indices_ensemble = np.arange(len(ensemble_classes))
            else:
                # Columns of this member that belong to an ensemble class ...
                indices_est = np.where(np.isin(est_classes, ensemble_classes))[
                    0
                ]
                # ... and their positions on the ensemble's class axis
                # (`searchsorted` relies on `ensemble_classes` being sorted).
                indices_ensemble = np.searchsorted(
                    ensemble_classes, est_classes[indices_est]
                )
                # Keep only the selected columns so that the assigned block
                # matches `indices_ensemble`; previously all columns were
                # assigned, which failed whenever a member knew a class that
                # is not part of `ensemble_classes`.
                est_proba = est_proba[:, indices_est]
            # Mixing the integer `i` and the index array around a slice moves
            # the indexed axis to the front, hence the transpose.
            probas[i, :, indices_ensemble] = est_proba.T
        return probas


def average_kl_divergence(probas, eps=1e-7):
    """Calculates the average Kullback-Leibler (KL) divergence for measuring
    the level of disagreement in QueryByCommittee.

    For every sample, the KL divergence between each estimator's class
    distribution and the mean distribution over all estimators is computed
    and averaged over the estimators. The input array is left unmodified.

    Parameters
    ----------
    probas : array-like of shape (n_estimators, n_samples, n_classes)
        The probability estimates of all estimators, samples, and classes.
    eps : float  > 0, optional (default=1e-7)
        Minimum probability threshold to compute log-probabilities. Must be
        greater than 0 and at most 0.1.

    Returns
    -------
    scores : np.ndarray, shape (n_samples,)
        The Kullback-Leibler (KL) divergences.

    Raises
    ------
    ValueError
        If `probas` is not a 3D array.

    References
    ----------
    .. [1] A. McCallum and K. Nigam. Employing EM in pool-based active learning
       for text classification. In International Conference on Machine
       Learning, pages 359-367, 1998.
    """
    # Check parameters.
    check_scalar(
        eps,
        "eps",
        min_val=0,
        max_val=0.1,
        target_type=(float, int),
        min_inclusive=False,
    )
    probas = check_array(probas, allow_nd=True)
    if probas.ndim != 3:
        raise ValueError(
            f"Expected 3D array, got {probas.ndim}D array instead."
        )
    n_estimators = probas.shape[0]

    # Clip into a NEW array: `check_array` may hand back the caller's own
    # array, so in-place clipping/normalizing would silently alter the
    # caller's data (and fail outright for integer input).
    probas = np.clip(probas, a_min=eps, a_max=1)
    # Re-normalize, since clipping can push the class sum away from one.
    probas = probas / probas.sum(axis=2, keepdims=True)

    # Calculate the average KL divergence.
    probas_mean = np.mean(probas, axis=0)
    with np.errstate(divide="ignore", invalid="ignore"):
        # Sum over classes (axis 2), then over estimators (axis 0).
        scores = np.nansum(
            np.nansum(probas * np.log(probas / probas_mean), axis=2), axis=0
        )
    scores = scores / n_estimators

    return scores


def vote_entropy(votes, classes):
    """Calculates the vote entropy for measuring the level of disagreement in
    QueryByCommittee.

    Parameters
    ----------
    votes : array-like, shape (n_samples, n_estimators)
        The class predicted by the estimators for each sample.
    classes : array-like, shape (n_classes)
        A list of all possible classes.

    Returns
    -------
    vote_entropy : np.ndarray of shape (n_samples,)
        The vote entropy of each row in `votes`.

    References
    ----------
    .. [1] Engelson, Sean P., and Ido Dagan. "Minimizing Manual Annotation Cost
       in Supervised Training from Corpora." In Annual Meeting of the
       Association for Computational Linguistics, pages 319-326, 1996.
    """
    # Check `votes` array.
    votes = check_array(votes)
    n_estimators = votes.shape[1]

    # Count the votes.
    vote_count = compute_vote_vectors(
        y=votes, classes=classes, missing_label=None
    )

    # Relative vote frequencies.
    v = vote_count / n_estimators

    # Compute vote entropy. A frequency of zero yields `0 * log(0) = nan`;
    # the warnings are suppressed and `nansum` counts such terms as zero.
    with np.errstate(divide="ignore", invalid="ignore"):
        scores = np.nansum(-v * np.log(v), axis=1)
    return scores


def variation_ratios(votes):
    """Calculates the variation ratios for measuring the level of disagreement
    in `QueryByCommittee`.

    The variation ratio of a sample is one minus the fraction of estimators
    that voted for the most frequently predicted class.

    Parameters
    ----------
    votes : array-like of shape (n_samples, n_estimators)
        The class predicted by the estimators for each sample.

    Returns
    -------
    scores : np.ndarray of shape (n_samples,)
        The variation ratios of each row in `votes`.

    References
    ----------
    .. [1] Beluch, W. H., Genewein, T., Nürnberger, A., and Köhler, J. M.
       The Power of Ensembles for Active Learning in Image Classification. In
       Conference on Computer Vision and Pattern Recognition, pages 9368-9377,
       2018.
    """
    # Check `votes` array.
    votes = check_array(votes)
    n_estimators = votes.shape[1]

    # Count the votes; no `classes` are passed to `compute_vote_vectors` here.
    vote_count = compute_vote_vectors(y=votes, missing_label=None)

    # Share of the estimators that did NOT vote for the modal class.
    modal_share = vote_count.max(axis=-1) / n_estimators
    scores = 1 - modal_share

    return scores


def _check_ensemble(
    ensemble,
    estimator_types,
    X,
    y,
    sample_weight,
    fit_ensemble=True,
    missing_label=MISSING_LABEL,
    sample_predictions_method_name=None,
    sample_predictions_dict=None,
):
    """Validate `ensemble`, optionally fit it, and extract its members.

    Parameters
    ----------
    ensemble : estimator or list or tuple of estimators
        Either a single estimator being an instance of one of
        `estimator_types` or a non-empty list/tuple of such estimators.
    estimator_types : iterable of types
        Accepted estimator classes; they are tried in the given order.
    X : array-like
        Training samples passed to `fit`, if `fit_ensemble=True`.
    y : array-like
        Training targets passed to `fit`, if `fit_ensemble=True`.
    sample_weight : array-like or None
        Passed as third positional argument to `fit`, if it is not `None`.
    fit_ensemble : bool, default=True
        If `True`, a clone (single estimator) or deep copies (list/tuple) are
        fitted, i.e., the passed objects themselves are not fitted. If
        `False`, the estimators are only checked for being fitted.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Missing label that every estimator's `missing_label` is checked
        against.
    sample_predictions_method_name : str or None, default=None
        Name of a method of a single-estimator `ensemble` used to sample
        predictions of committee members. Must be `None` for a list/tuple.
    sample_predictions_dict : dict or None, default=None
        Keyword arguments for the sampling method. Must be `None`, if
        `sample_predictions_method_name` is `None`.

    Returns
    -------
    ensemble : estimator or list or tuple
        The (possibly cloned and fitted) single estimator, or the list/tuple
        exactly as it was passed.
    est_arr : list or None
        The committee members; `None`, if a sampling method is used.
    classes : array-like or None
        The `classes_` attribute of the ensemble (or of its first member for
        a list/tuple); `None`, if there is no such attribute.
    sample_func : callable or None
        The bound sampling method or `None`.
    sample_predictions_dict : dict or None
        Keyword arguments for `sample_func` (`{}` instead of `None`, if a
        sampling method is used); `None` for a list/tuple.

    Raises
    ------
    TypeError
        If `ensemble` is neither a supported estimator nor a non-empty
        list/tuple of supported estimators, or if no committee members can
        be derived from a single estimator.
    ValueError
        If the sampling parameters are inconsistent.
    AssertionError
        If the members of a classifier list/tuple disagree on `classes_`.
    """
    error_msg = (
        f"`ensemble` must either be a `{estimator_types}` "
        f"with the attribute `n_estimators` or `estimators_` or an array-like "
        f"of {estimator_types} objects or implement a method with the name "
        f"`{sample_predictions_method_name}`."
    )

    # Check if the parameter `ensemble` is valid.
    for estimator_type in estimator_types:
        if isinstance(ensemble, estimator_type):
            check_equal_missing_label(ensemble.missing_label, missing_label)
            # Fit a clone of the ensemble, so the passed object stays as is.
            if fit_ensemble:
                if sample_weight is None:
                    ensemble = clone(ensemble).fit(X, y)
                else:
                    ensemble = clone(ensemble).fit(X, y, sample_weight)
            else:
                check_is_fitted(ensemble)

            if sample_predictions_method_name is not None:
                check_type(
                    sample_predictions_method_name,
                    "sample_predictions_method_name",
                    str,
                )
                if not hasattr(ensemble, sample_predictions_method_name):
                    raise ValueError(
                        "If `sample_predictions_method_name` is not `None`, "
                        "`ensemble` must implement a method with this name."
                    )
                sample_func = getattr(ensemble, sample_predictions_method_name)
                if sample_predictions_dict is None:
                    sample_predictions_dict = {}
                if not isinstance(sample_predictions_dict, dict):
                    raise ValueError(
                        "`sample_predictions_dict` must be a `dict`, if "
                        "`sample_predictions_method_name` is not `None`."
                    )
            else:
                sample_func = None
                if sample_predictions_dict is not None:
                    raise ValueError(
                        "`sample_predictions_dict` must be `None`, if "
                        "`sample_predictions_method_name` is `None`."
                    )

            if sample_func is not None:
                # Predictions are sampled, so no explicit members are needed.
                est_arr = None
            elif hasattr(ensemble, "estimators_"):
                est_arr = ensemble.estimators_
            # NOTE(review): the two fallbacks below repeat the ensemble
            # itself as every committee member -- confirm this is intended.
            elif hasattr(ensemble, "estimators"):
                est_arr = [ensemble] * len(ensemble.estimators)
            elif hasattr(ensemble, "n_estimators"):
                est_arr = [ensemble] * ensemble.n_estimators
            else:
                raise TypeError(error_msg)

            cls = getattr(ensemble, "classes_", None)
            return ensemble, est_arr, cls, sample_func, sample_predictions_dict

        elif (
            isinstance(ensemble, (list, tuple))
            # Guard against `ensemble[0]` raising an `IndexError` for an
            # empty list/tuple; it ends in the `TypeError` below instead.
            and len(ensemble) > 0
            and isinstance(ensemble[0], estimator_type)
        ):
            if (
                sample_predictions_dict is not None
                or sample_predictions_method_name is not None
            ):
                raise ValueError(
                    "`sample_predictions_method_name` and "
                    "`sample_predictions_dict` must be `None`, if `ensemble` "
                    "is array-like."
                )
            # Work on deep copies to keep the passed estimators untouched.
            # Converting to a list allows item assignment, which would fail
            # for a tuple.
            est_arr = list(copy.deepcopy(ensemble))
            for i, est in enumerate(est_arr):
                check_type(
                    est, f"ensemble[{i}]", estimator_type
                )  # better error message
                check_equal_missing_label(est.missing_label, missing_label)
                # Fit the ensemble.
                if fit_ensemble:
                    if sample_weight is None:
                        est_arr[i] = est.fit(X, y)
                    else:
                        est_arr[i] = est.fit(X, y, sample_weight)
                else:
                    check_is_fitted(est)

                # Compare the classes of neighboring members (both are
                # fitted/checked at this point).
                if i > 0 and estimator_type is SkactivemlClassifier:
                    np.testing.assert_array_equal(
                        est_arr[i - 1].classes_,
                        est_arr[i].classes_,
                        err_msg=f"The inferred classes of the {i - 1}-th and "
                        f"{i}-th are not equal. Set the `classes` "
                        f"parameter of each ensemble member to avoid "
                        f"this error.",
                    )
            cls = getattr(est_arr[0], "classes_", None)
            return ensemble, est_arr, cls, None, None

    raise TypeError(error_msg)