Source code for skactiveml.pool._bald

"""
Code is based on https://blackhc.github.io/batchbald_redux/ distributed under
the Apache-2.0 license and the associated query strategy is presented in [1].

[1] Kirsch, Andreas, Joost Van Amersfoort, and Yarin Gal. "BatchBALD:
Efficient and Diverse Batch Acquisition for Deep Bayesian Active
Learning." Advances in Neural Information Processing Systems 32 (2019).
"""

import numpy as np
from sklearn.utils import check_array

from ..base import SkactivemlClassifier
from ..pool._query_by_committee import _check_ensemble, QueryByCommittee
from ..utils import (
    MISSING_LABEL,
    rand_argmax,
    check_type,
    check_scalar,
    check_random_state,
    simple_batch,
)


class _GeneralBALD(QueryByCommittee):
    """General Bayesian Active Learning by Disagreement (_GeneralBALD)

    The Bayesian Active Learning by Disagreement (BALD) [1] strategy
    reduces the number of possible hypotheses maximally fast to minimize the
    uncertainty about the parameters using Shannon's entropy. It seeks the
    data point that maximizes the decrease in expected posterior entropy. For
    the batch case, the advanced strategy BatchBALD [2] is used by default.
    If desired, a greedy (top-k) selection can be applied by setting
    `greedy_selection=True`.

    Parameters
    ----------
    n_MC_samples : int > 0, default=n_estimators
        The number of Monte Carlo samples used for label estimation.
    greedy_selection : bool, default=False
        Flag to either use BatchBALD (`greedy_selection=False`) or a greedy
        (top-k) selection (`greedy_selection=True`) if `batch_size>1`.
    eps : float > 0, default=1e-7
        Minimum probability threshold to compute log-probabilities.
    sample_predictions_method_name : str, default=None
        Certain estimators may offer methods that enable constructing a
        committee by sampling predictions of committee members. This
        parameter indicates the name of such a method.
            - If `sample_predictions_method_name=None` no sampling is
              performed.
            - If `sample_predictions_method_name` is not `None` and in the
              case of classification, the method is expected to take samples of
              the shape `(n_samples, *)` as input and to output probabilities
              of the shape `(n_members, n_samples, n_classes)`, e.g.,
              `sample_proba` in `skactiveml.base.ClassFrequencyEstimator`.
    sample_predictions_dict : dict, default=None
        Parameters (excluding the samples) that are passed to the method with
        the name `sample_predictions_method_name`.
            - This parameter must be `None`, if
              `sample_predictions_method_name` is `None`.
            - Otherwise, it may be used to define the number of sampled
              members, e.g., by defining `n_samples` as parameter to the method
              `sample_proba` of `skactiveml.base.ClassFrequencyEstimator`.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or None or np.random.RandomState, default=None
        The random state to use.

    References
    ----------
    .. [1] Houlsby, Neil, Ferenc Huszár, Zoubin Ghahramani, and Máté Lengyel.
       "Bayesian Active Learning for Classification and Preference
       Learning." arXiv preprint arXiv:1112.5745 (2011).
    .. [2] Kirsch, Andreas, Joost Van Amersfoort, and Yarin Gal. "BatchBALD:
       Efficient and Diverse Batch Acquisition for Deep Bayesian Active
       Learning." Advances in Neural Information Processing Systems 32 (2019).
    """

    def __init__(
        self,
        n_MC_samples=None,
        greedy_selection=False,
        eps=1e-7,
        sample_predictions_method_name=None,
        sample_predictions_dict=None,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            eps=eps,
            sample_predictions_method_name=sample_predictions_method_name,
            sample_predictions_dict=sample_predictions_dict,
            missing_label=missing_label,
            random_state=random_state,
        )
        self.n_MC_samples = n_MC_samples
        self.greedy_selection = greedy_selection

    def query(
        self,
        X,
        y,
        ensemble,
        fit_ensemble=True,
        sample_weight=None,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e. including the labeled and
            unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled ones
            indicated by `self.missing_label`).
        ensemble : array-like of SkactivemlClassifier or SkactivemlClassifier
            - If `ensemble` is a `SkactivemlClassifier` and has
              `n_estimators` plus `estimators_` after fitting as
              attributes, its estimators will be used as committee.
            - If `ensemble` is array-like, each element of this list must be
              a `SkactivemlClassifier` and will be used as a committee
              member.
            - If `ensemble` is a `SkactivemlClassifier` and implements a
              method with the name `sample_predictions_method_name`, this
              method is used to sample predictions of committee members.
        fit_ensemble : bool, default=True
            Defines whether the ensemble should be fitted on `X`, `y`, and
            `sample_weight`.
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        candidates : None or array-like of shape (n_candidates), dtype=int or \
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from
              `(X,y)` are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, *)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`). This is not supported by all
              query strategies.
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If `True`, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size)
            The query_indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.
                - If `candidates` is `None` or of shape
                  `(n_candidates,)`, the indexing refers to the samples in
                  `X`.
                - If `candidates` is of shape `(n_candidates, n_features)`,
                  the indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or \
                numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of samples after each selected sample of the batch,
            e.g., `utilities[0]` indicates the utilities used for selecting
            the first sample (with index `query_indices[0]`) of the batch.
            Utilities for labeled samples will be set to np.nan.
                - If `candidates` is `None` or of shape
                  `(n_candidates,)`, the indexing refers to the samples in
                  `X`.
                - If `candidates` is of shape `(n_candidates, n_features)`,
                  the indexing refers to the samples in `candidates`.
        """
        # Validate input parameters.
        X, y, candidates, batch_size, return_utilities = self._validate_data(
            X, y, candidates, batch_size, return_utilities, reset=True
        )
        check_scalar(
            self.greedy_selection, "greedy_selection", target_type=bool
        )
        X_cand, mapping = self._transform_candidates(candidates, X, y)

        # Validate the `fit_ensemble` flag.
        check_type(fit_ensemble, "fit_ensemble", bool)

        ensemble, est_arr, _, sample_func, sample_dict = _check_ensemble(
            ensemble=ensemble,
            X=X,
            y=y,
            sample_weight=sample_weight,
            fit_ensemble=fit_ensemble,
            missing_label=self.missing_label_,
            estimator_types=[SkactivemlClassifier],
            sample_predictions_method_name=self.sample_predictions_method_name,
            sample_predictions_dict=self.sample_predictions_dict,
        )

        if sample_func is None:
            probas = self._aggregate_predict_probas(X_cand, ensemble, est_arr)
        else:
            probas = sample_func(X_cand, **sample_dict)

        if self.n_MC_samples is None:
            n_MC_samples_ = len(probas)
        else:
            n_MC_samples_ = self.n_MC_samples
        check_scalar(n_MC_samples_, "n_MC_samples", int, min_val=1)

        utils_batch_size = 1 if self.greedy_selection else batch_size
        batch_utilities_cand = batch_bald(
            probas=probas,
            batch_size=utils_batch_size,
            n_MC_samples=n_MC_samples_,
            eps=self.eps,
            random_state=self.random_state_,
        )

        if mapping is None:
            batch_utilities = batch_utilities_cand
        else:
            batch_utilities = np.full((utils_batch_size, len(X)), np.nan)
            batch_utilities[:, mapping] = batch_utilities_cand

        if self.greedy_selection:
            return simple_batch(
                batch_utilities[0],
                self.random_state_,
                batch_size=batch_size,
                return_utilities=return_utilities,
            )
        else:
            best_indices = rand_argmax(
                batch_utilities, axis=1, random_state=self.random_state_
            )
            if return_utilities:
                return best_indices, batch_utilities
            else:
                return best_indices



class BatchBALD(_GeneralBALD):
    """Batch Bayesian Active Learning by Disagreement (BatchBALD)

    The Batch Bayesian Active Learning by Disagreement (BatchBALD) [1]
    strategy reduces the number of possible hypotheses maximally fast to
    minimize the uncertainty about the parameters using Shannon's entropy. It
    seeks the data point that maximizes the decrease in expected posterior
    entropy.

    Parameters
    ----------
    n_MC_samples : int > 0, default=n_estimators
        The number of Monte Carlo samples used for label estimation.
    eps : float > 0, default=1e-7
        Minimum probability threshold to compute log-probabilities.
    sample_predictions_method_name : str, default=None
        Certain estimators may offer methods that enable constructing a
        committee by sampling predictions of committee members. This
        parameter indicates the name of such a method.
            - If `sample_predictions_method_name=None`, no sampling is
              performed.
            - If `sample_predictions_method_name` is not `None` and in the
              case of classification, the method is expected to take samples
              of the shape `(n_samples, *)` as input and to output
              probabilities of the shape `(n_members, n_samples, n_classes)`,
              e.g., `sample_proba` in
              `skactiveml.base.ClassFrequencyEstimator`.
    sample_predictions_dict : dict, default=None
        Parameters (excluding the samples) that are passed to the method with
        the name `sample_predictions_method_name`.
            - This parameter must be `None`, if
              `sample_predictions_method_name` is `None`.
            - Otherwise, it may be used to define the number of sampled
              members, e.g., by defining `n_samples` as parameter to the
              method `sample_proba` of
              `skactiveml.base.ClassFrequencyEstimator`.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or None or np.random.RandomState, default=None
        The random state to use.

    References
    ----------
    .. [1] Kirsch, Andreas, Joost Van Amersfoort, and Yarin Gal. "BatchBALD:
       Efficient and Diverse Batch Acquisition for Deep Bayesian Active
       Learning." Advances in Neural Information Processing Systems 32
       (2019).
    """

    def __init__(
        self,
        n_MC_samples=None,
        eps=1e-7,
        sample_predictions_method_name=None,
        sample_predictions_dict=None,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            n_MC_samples=n_MC_samples,
            greedy_selection=False,
            eps=eps,
            sample_predictions_method_name=sample_predictions_method_name,
            sample_predictions_dict=sample_predictions_dict,
            missing_label=missing_label,
            random_state=random_state,
        )
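
# --- Hedged usage sketch (illustration only, not part of the original
# module): query two labels with `BatchBALD` via the `sample_proba` route
# described in the docstring above. `ParzenWindowClassifier` is assumed
# here as an example of a `skactiveml.base.ClassFrequencyEstimator`.
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    from skactiveml.classifier import ParzenWindowClassifier

    X_demo, y_true = make_classification(n_samples=100, random_state=0)
    y_demo = np.full(len(X_demo), MISSING_LABEL)
    y_demo[:10] = y_true[:10]  # start with ten labeled samples

    clf = ParzenWindowClassifier(classes=[0, 1], random_state=0)
    qs = BatchBALD(
        sample_predictions_method_name="sample_proba",
        sample_predictions_dict={"n_samples": 25},  # 25 committee members
        random_state=0,
    )
    print(qs.query(X_demo, y_demo, ensemble=clf, batch_size=2))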

class GreedyBALD(_GeneralBALD):
    """Greedy Bayesian Active Learning by Disagreement (GreedyBALD)

    The Bayesian Active Learning by Disagreement (BALD) [1] strategy reduces
    the number of possible hypotheses maximally fast to minimize the
    uncertainty about the parameters using Shannon's entropy. It seeks the
    data point that maximizes the decrease in expected posterior entropy. For
    the batch case, a greedy (top-k) selection is applied.

    Parameters
    ----------
    n_MC_samples : int > 0, default=n_estimators
        The number of Monte Carlo samples used for label estimation.
    eps : float > 0, default=1e-7
        Minimum probability threshold to compute log-probabilities.
    sample_predictions_method_name : str, default=None
        Certain estimators may offer methods that enable constructing a
        committee by sampling predictions of committee members. This
        parameter indicates the name of such a method.
            - If `sample_predictions_method_name=None`, no sampling is
              performed.
            - If `sample_predictions_method_name` is not `None` and in the
              case of classification, the method is expected to take samples
              of the shape `(n_samples, *)` as input and to output
              probabilities of the shape `(n_members, n_samples, n_classes)`,
              e.g., `sample_proba` in
              `skactiveml.base.ClassFrequencyEstimator`.
    sample_predictions_dict : dict, default=None
        Parameters (excluding the samples) that are passed to the method with
        the name `sample_predictions_method_name`.
            - This parameter must be `None`, if
              `sample_predictions_method_name` is `None`.
            - Otherwise, it may be used to define the number of sampled
              members, e.g., by defining `n_samples` as parameter to the
              method `sample_proba` of
              `skactiveml.base.ClassFrequencyEstimator`.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or None or np.random.RandomState, default=None
        The random state to use.

    References
    ----------
    .. [1] Houlsby, Neil, Ferenc Huszár, Zoubin Ghahramani, and Máté Lengyel.
       "Bayesian Active Learning for Classification and Preference
       Learning." arXiv preprint arXiv:1112.5745 (2011).
    """

    def __init__(
        self,
        n_MC_samples=None,
        eps=1e-7,
        sample_predictions_method_name=None,
        sample_predictions_dict=None,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            n_MC_samples=n_MC_samples,
            greedy_selection=True,
            eps=eps,
            sample_predictions_method_name=sample_predictions_method_name,
            sample_predictions_dict=sample_predictions_dict,
            missing_label=missing_label,
            random_state=random_state,
        )
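
# `GreedyBALD` shares the `query` interface sketched after `BatchBALD`
# above; with `batch_size > 1` it computes a single BALD ranking and takes
# the top-k candidates instead of re-scoring the batch jointly. A minimal
# sketch under the same assumptions as the `BatchBALD` example:
#
#     qs = GreedyBALD(
#         sample_predictions_method_name="sample_proba",
#         sample_predictions_dict={"n_samples": 25},
#         random_state=0,
#     )
#     query_indices = qs.query(X_demo, y_demo, ensemble=clf, batch_size=5)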

def batch_bald(
    probas,
    batch_size,
    n_MC_samples=None,
    random_state=None,
    eps=1e-7,
):
    """BatchBALD: Efficient and Diverse Batch Acquisition for Deep Bayesian
    Active Learning

    BatchBALD [1] is an extension of BALD [2] (Bayesian Active Learning by
    Disagreement) whereby points are jointly scored by estimating the mutual
    information between a joint of multiple data points and the model
    parameters.

    Parameters
    ----------
    probas : array-like of shape (n_estimators, n_samples, n_classes)
        The probability estimates of all estimators, samples, and classes.
    batch_size : int, default=1
        The number of samples to be selected in one AL cycle.
    n_MC_samples : int > 0, default=n_estimators
        The number of Monte Carlo samples used for label estimation.
    eps : float > 0, default=1e-7
        Minimum probability threshold to compute log-probabilities.
    random_state : int or np.random.RandomState, default=None
        The random state to use.

    Returns
    -------
    utilities : numpy.ndarray of shape (batch_size, n_samples)
        Sample utilities computed according to BatchBALD [1].

    References
    ----------
    .. [1] Kirsch, Andreas, Joost Van Amersfoort, and Yarin Gal. "BatchBALD:
       Efficient and Diverse Batch Acquisition for Deep Bayesian Active
       Learning." Advances in Neural Information Processing Systems 32
       (2019).
    .. [2] Houlsby, Neil, Ferenc Huszár, Zoubin Ghahramani, and Máté Lengyel.
       "Bayesian Active Learning for Classification and Preference
       Learning." arXiv preprint arXiv:1112.5745 (2011).
    """
    # Validate input parameters.
    probs_K_N_C = check_array(probas, ensure_2d=False, allow_nd=True)
    if probs_K_N_C.ndim != 3:
        raise ValueError(
            f"'probas' must be a 3-dimensional array, but "
            f"{probs_K_N_C.ndim} dimensions were given."
        )
    check_scalar(batch_size, "batch_size", int, min_val=1)
    check_scalar(
        eps,
        "eps",
        min_val=0,
        max_val=0.1,
        target_type=(float, int),
        min_inclusive=False,
    )
    if n_MC_samples is None:
        n_MC_samples = len(probs_K_N_C)
    check_scalar(n_MC_samples, "n_MC_samples", int, min_val=1)
    random_state = check_random_state(random_state)

    # Re-order the axes to (n_samples, n_estimators, n_classes) and compute
    # clipped log-probabilities.
    probs_N_K_C = probs_K_N_C.swapaxes(0, 1)
    np.clip(probs_N_K_C, a_min=eps, a_max=1, out=probs_N_K_C)
    log_probs_N_K_C = np.log(probs_N_K_C)
    N, K, C = log_probs_N_K_C.shape
    batch_size = min(batch_size, N)

    conditional_entropies_N = _compute_conditional_entropy(log_probs_N_K_C)

    batch_joint_entropy = _DynamicJointEntropy(
        n_MC_samples, batch_size - 1, K, C, random_state
    )

    utilities = np.zeros((batch_size, N))
    query_indices = []

    for i in range(batch_size):
        if i > 0:
            latest_index = query_indices[-1]
            batch_joint_entropy.add_variables(
                log_probs_N_K_C[latest_index : latest_index + 1]
            )

        shared_conditional_entropies = conditional_entropies_N[
            query_indices
        ].sum()

        utilities[i] = batch_joint_entropy.compute_batch(log_probs_N_K_C)
        utilities[i] -= conditional_entropies_N + shared_conditional_entropies
        utilities[i, query_indices] = np.nan

        # Ties are broken with a fixed seed, so the greedy trajectory is
        # deterministic given `probas`; the caller draws the final selection
        # with its own random state.
        query_idx = rand_argmax(utilities[i], random_state=0)[0]
        query_indices.append(query_idx)

    return utilities
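
# --- Hedged sanity check (illustration only, not part of the original
# module): `batch_bald` can be called directly on committee probabilities
# of shape (n_estimators, n_samples, n_classes).
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    # 10 committee members, 50 candidates, 3 classes; class probabilities
    # per member and candidate sum to one by construction.
    probas_demo = rng.dirichlet(np.ones(3), size=(10, 50))
    utils_demo = batch_bald(probas_demo, batch_size=2, random_state=0)
    # `utils_demo[0]` holds the marginal BALD scores of all candidates;
    # `utils_demo[1]` holds the joint scores after fixing the first greedy
    # pick, whose own entry is set to np.nan.
    print(utils_demo.shape)  # (2, 50)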

class _ExactJointEntropy:
    def __init__(self, joint_probs_M_K):
        self.joint_probs_M_K = joint_probs_M_K

    @staticmethod
    def empty(K):
        return _ExactJointEntropy(np.ones((1, K)))

    def add_variables(self, log_probs_N_K_C):
        N, K, C = log_probs_N_K_C.shape
        joint_probs_K_M_1 = self.joint_probs_M_K.T[:, :, None]

        probs_N_K_C = np.exp(log_probs_N_K_C)

        # Using lots of memory.
        for i in range(N):
            probs_i__K_1_C = probs_N_K_C[i][:, None, :]
            joint_probs_K_M_C = joint_probs_K_M_1 * probs_i__K_1_C
            joint_probs_K_M_1 = joint_probs_K_M_C.reshape((K, -1, 1))

        self.joint_probs_M_K = joint_probs_K_M_1.squeeze(2).T
        return self

    def compute_batch(self, log_probs_B_K_C):
        B, K, C = log_probs_B_K_C.shape
        M = self.joint_probs_M_K.shape[0]

        probs_b_K_C = np.exp(log_probs_B_K_C)
        b = probs_b_K_C.shape[0]
        probs_b_M_C = np.empty((b, M, C))
        for i in range(b):
            np.matmul(
                self.joint_probs_M_K,
                probs_b_K_C[i],
                out=probs_b_M_C[i],
            )
        probs_b_M_C /= K

        output_entropies_B = np.sum(
            -np.log(probs_b_M_C) * probs_b_M_C, axis=(1, 2)
        )

        return output_entropies_B


def _batch_multi_choices(probs_b_C, M, random_state):
    """
    probs_b_C: Ni... x C

    Returns:
        choices: Ni... x M
    """
    probs_B_C = probs_b_C.reshape((-1, probs_b_C.shape[-1]))
    B = probs_B_C.shape[0]
    C = probs_B_C.shape[1]

    # samples: Ni... x draw_per_xx
    choices = [
        random_state.choice(
            C, size=M, p=probs_B_C[b] / np.sum(probs_B_C[b]), replace=True
        )
        for b in range(B)
    ]
    choices = np.array(choices, dtype=int)

    choices_b_M = choices.reshape(list(probs_b_C.shape[:-1]) + [M])
    return choices_b_M


def _gather_expand(data, axis, index):
    max_shape = [max(dr, ir) for dr, ir in zip(data.shape, index.shape)]
    new_data_shape = list(max_shape)
    new_data_shape[axis] = data.shape[axis]
    new_index_shape = list(max_shape)
    new_index_shape[axis] = index.shape[axis]

    data = np.broadcast_to(data, new_data_shape)
    index = np.broadcast_to(index, new_index_shape)

    return np.take_along_axis(data, index, axis=axis)


class _SampledJointEntropy:
    """Random variables (all with the same # of categories $C$) can be added
    via `_SampledJointEntropy.add_variables`.

    `_SampledJointEntropy.compute` computes the joint entropy.

    `_SampledJointEntropy.compute_batch` computes the joint entropy of the
    added variables with each of the variables in the provided batch
    probabilities in turn.
    """

    def __init__(self, sampled_joint_probs_M_K):
        self.sampled_joint_probs_M_K = sampled_joint_probs_M_K

    @staticmethod
    def sample(probs_N_K_C, M, random_state):
        K = probs_N_K_C.shape[1]

        # S: number of samples per w.
        S = M // K

        choices_N_K_S = _batch_multi_choices(probs_N_K_C, S, random_state)

        expanded_choices_N_1_K_S = choices_N_K_S[:, None, :, :]
        expanded_probs_N_K_1_C = probs_N_K_C[:, :, None, :]

        probs_N_K_K_S = _gather_expand(
            expanded_probs_N_K_1_C, axis=-1, index=expanded_choices_N_1_K_S
        )
        # exp-sum-log seems necessary to avoid 0s?
        probs_K_K_S = np.exp(
            np.sum(np.log(probs_N_K_K_S), axis=0, keepdims=False)
        )
        samples_K_M = probs_K_K_S.reshape((K, -1))

        samples_M_K = samples_K_M.T
        return _SampledJointEntropy(samples_M_K)

    def compute_batch(self, log_probs_B_K_C):
        B, K, C = log_probs_B_K_C.shape
        M = self.sampled_joint_probs_M_K.shape[0]

        b = log_probs_B_K_C.shape[0]
        probs_b_M_C = np.empty((b, M, C))
        for i in range(b):
            np.matmul(
                self.sampled_joint_probs_M_K,
                np.exp(log_probs_B_K_C[i]),
                out=probs_b_M_C[i],
            )
        probs_b_M_C /= K

        q_1_M_1 = self.sampled_joint_probs_M_K.mean(axis=1, keepdims=True)[
            None
        ]

        output_entropies_B = (
            np.sum(-np.log(probs_b_M_C) * probs_b_M_C / q_1_M_1, axis=(1, 2))
            / M
        )

        return output_entropies_B


class _DynamicJointEntropy:
    def __init__(self, M, max_N, K, C, random_state):
        self.M = M
        self.N = 0
        self.max_N = max_N

        self.inner = _ExactJointEntropy.empty(K)
        self.log_probs_max_N_K_C = np.empty((max_N, K, C))
        self.random_state = random_state

    def add_variables(self, log_probs_N_K_C):
        C = self.log_probs_max_N_K_C.shape[2]
        add_N = log_probs_N_K_C.shape[0]

        self.log_probs_max_N_K_C[self.N : self.N + add_N] = log_probs_N_K_C
        self.N += add_N

        num_exact_samples = C**self.N
        if num_exact_samples > self.M:
            self.inner = _SampledJointEntropy.sample(
                np.exp(self.log_probs_max_N_K_C[: self.N]),
                self.M,
                self.random_state,
            )
        else:
            self.inner.add_variables(log_probs_N_K_C)

        return self

    def compute_batch(self, log_probs_B_K_C):
        """Computes the joint entropy of the added variables together with
        the batch (one by one).
        """
        return self.inner.compute_batch(log_probs_B_K_C)


def _compute_conditional_entropy(log_probs_N_K_C):
    N, K, C = log_probs_N_K_C.shape
    nats_N_K_C = log_probs_N_K_C * np.exp(log_probs_N_K_C)
    nats_N_K_C[np.isnan(nats_N_K_C)] = 0
    entropies_N = -np.sum(nats_N_K_C, axis=(1, 2)) / K
    return entropies_N
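
# --- Hedged sanity check (illustration only, not part of the original
# module): `_compute_conditional_entropy` averages the Shannon entropy of
# each committee member's prediction over the K members. For members that
# all predict uniformly over C=2 classes, the result is log(2) per
# candidate.
if __name__ == "__main__":
    uniform_log_probs = np.log(np.full((1, 4, 2), 0.5))  # N=1, K=4, C=2
    print(_compute_conditional_entropy(uniform_log_probs))  # ~[0.6931]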