"""
Classifier Ensemble for Multiple Annotators
"""
# Author: Marek Herde <marek.herde@uni-kassel.de>
from copy import deepcopy
import numpy as np
from sklearn.ensemble._base import _BaseHeterogeneousEnsemble
from sklearn.utils.validation import check_array, check_is_fitted
from ...base import SkactivemlClassifier
from ...utils import MISSING_LABEL, is_labeled, compute_vote_vectors
class AnnotatorEnsembleClassifier(
_BaseHeterogeneousEnsemble, SkactivemlClassifier
):
"""AnnotatorEnsembleClassifier
This strategy consists of fitting one classifier per annotator.
Parameters
----------
estimators : list of (str, estimator) tuples
The estimators forming the ensemble. Each element of the list is a
tuple of a string (i.e., the name of the estimator) and an estimator
instance.
voting : 'hard' or 'soft', default='hard'
If 'hard', uses predicted class labels for majority rule voting.
Else if 'soft', predicts the class label based on the argmax of
the sums of the predicted probabilities, which is recommended for
an ensemble of well-calibrated classifiers.
classes : array-like of shape (n_classes,), default=None
Holds the label for each class. If `None`, the classes are determined
during the fit.
missing_label : scalar or str or np.nan or None, default=np.nan
Value to represent a missing label.
cost_matrix : array-like of shape (n_classes, n_classes), default=None
Cost matrix with `cost_matrix[i,j]` indicating cost of predicting class
`classes[j]` for a sample of class `classes[i]`. Can only be set if
`classes` is not `None`.
random_state : int or RandomState instance or None, default=None
Determines random number generation for the `predict` method. Pass an int for
reproducible results across multiple method calls.
Attributes
----------
classes_ : np.ndarray of shape (n_classes,)
Holds the label for each class after fitting.
cost_matrix_ : np.ndarray of shape (n_classes, n_classes)
Cost matrix with `cost_matrix_[i,j]` indicating cost of predicting
class `classes_[j]` for a sample of class `classes_[i]`.
estimators_ : list of (str, estimator) tuples
The elements of the `estimators` parameter, having been fitted on the
training data.
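Examples
--------
A minimal usage sketch; it assumes `ParzenWindowClassifier` from
`skactiveml.classifier` as member estimator, but any
`SkactivemlClassifier` with matching `classes` and `missing_label`
should work.

>>> import numpy as np
>>> from skactiveml.classifier import ParzenWindowClassifier
>>> from skactiveml.classifier.multiannotator import (
...     AnnotatorEnsembleClassifier,
... )
>>> X = np.array([[0.0], [1.0], [2.0], [3.0]])
>>> # One label column per annotator; np.nan marks a missing label.
>>> y = np.array([[0, 0], [0, np.nan], [1, 1], [1, 1]])
>>> ensemble = AnnotatorEnsembleClassifier(
...     estimators=[
...         ("pwc_1", ParzenWindowClassifier(classes=[0, 1])),
...         ("pwc_2", ParzenWindowClassifier(classes=[0, 1])),
...     ],
...     classes=[0, 1],
...     voting="soft",
... )
>>> ensemble.fit(X, y).predict_proba(X).shape
(4, 2)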
"""
def __init__(
self,
estimators,
voting="hard",
classes=None,
missing_label=MISSING_LABEL,
cost_matrix=None,
random_state=None,
):
_BaseHeterogeneousEnsemble.__init__(self, estimators=estimators)
SkactivemlClassifier.__init__(
self,
classes=classes,
missing_label=missing_label,
cost_matrix=cost_matrix,
random_state=random_state,
)
self.voting = voting
def fit(self, X, y, sample_weight=None):
"""Fit the model using X as training data and y as class labels.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The sample matrix X is the feature matrix representing the samples.
y : array-like of shape (n_samples, n_estimators)
It contains the class labels of the training samples. The number of
class labels may vary across samples, where missing labels are
represented by the attribute `missing_label`.
sample_weight : array-like of shape (n_samples, n_estimators), default=None
It contains the weights of the training samples' class labels.
It must have the same shape as `y`.
Returns
-------
self : skactiveml.classifier.multiannotator.AnnotatorEnsembleClassifier
The `AnnotatorEnsembleClassifier` object fitted on the training
data.
"""
# Check estimators.
self._validate_estimators()
# Check input parameters.
self.check_X_dict_ = {
"ensure_min_samples": 0,
"ensure_min_features": 0,
"allow_nd": True,
"dtype": None,
}
X, y, sample_weight = self._validate_data(
X=X,
y=y,
sample_weight=sample_weight,
check_X_dict=self.check_X_dict_,
y_ensure_1d=False,
)
self._check_n_features(X, reset=True)
# Copy estimators
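# Deep copies keep the user-provided estimator instances unfitted.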
self.estimators_ = deepcopy(self.estimators)
# Check for empty training data.
if self.n_features_in_ is None:
return self
# Check number of estimators.
error_msg = (
f"'y' must have shape (n_samples={len(y)}, n_estimators="
f"{len(self.estimators)}) but has shape {y.shape}."
)
if y.ndim <= 1 or y.shape[1] != len(self.estimators):
raise ValueError(error_msg)
# Check voting scheme.
if self.voting not in ("soft", "hard"):
raise ValueError(
f"Voting must be 'soft' or 'hard'; "
f"got `voting='{self.voting}'`)"
)
# Fit each estimator
for i, est in enumerate(self.estimators_):
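# `est` is a (name, estimator) tuple; `_validate_data` re-encoded the
# labels in `y` as integers with -1 marking missing labels, which is
# why each member estimator is fitted with `missing_label=-1`.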
est[1].set_params(missing_label=-1)
if self.classes is None or est[1].classes is None:
est[1].set_params(classes=np.arange(len(self.classes_)))
if sample_weight is None:
est[1].fit(X=X, y=y[:, i])
else:
est[1].fit(X=X, y=y[:, i], sample_weight=sample_weight[:, i])
return self
def predict_proba(self, X):
"""Return probability estimates for the test data X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Test samples.
Returns
-------
P : np.ndarray of shape (n_samples, n_classes)
The class probabilities of the test samples. Classes are ordered
according to `classes_`.
"""
check_is_fitted(self)
X = check_array(X, **self.check_X_dict_)
self._check_n_features(X, reset=False)
if self.n_features_in_ is None:
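# No training data was seen during fit, so predict a uniform
# distribution over the classes.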
return np.ones((len(X), len(self.classes_))) / len(self.classes_)
elif self.voting == "hard":
y_pred = np.array(
[est.predict(X) for _, est in self.estimators_]
).T
V = compute_vote_vectors(y=y_pred, classes=self.classes_)
P = V / np.sum(V, axis=1, keepdims=True)
elif self.voting == "soft":
P = np.array([est.predict_proba(X) for _, est in self.estimators_])
P = np.sum(P, axis=0)
P /= np.sum(P, axis=1, keepdims=True)
return P
def _validate_estimators(self):
_BaseHeterogeneousEnsemble._validate_estimators(self)
for name, est in self.estimators:
if not isinstance(est, SkactivemlClassifier):
raise TypeError(f"'{est}' is not a 'SkactivemlClassifier'.")
if self.voting == "soft" and not hasattr(est, "predict_proba"):
raise ValueError(
f"If 'voting' is soft, each classifier must "
f"implement 'predict_proba' method. However, "
f"{est} does not do so."
)
error_msg = (
f"{est} of 'estimators' has the attribute 'missing_label="
f"{est.missing_label}', which is unequal to the given "
f"parameter 'missing_label={self.missing_label}'."
)
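# `is_labeled` treats the ensemble's `missing_label` as a regular label
# if it differs from the estimator's `missing_label`, and it raises a
# TypeError itself for incompatible types; both cases are reported as
# a mismatch via `error_msg`.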
try:
if is_labeled([self.missing_label], est.missing_label)[0]:
raise TypeError(error_msg)
except TypeError:
raise TypeError(error_msg)
error_msg = (
f"{est} of 'estimators' has the attribute "
f"'classes={est.classes}', which is unequal to the given "
f"parameter 'classes={self.classes}'."
)
classes_none = self.classes is None
est_classes_none = est.classes is None
if classes_none and not est_classes_none:
raise ValueError(error_msg)
if (
not classes_none
and not est_classes_none
and not np.array_equal(self.classes, est.classes)
):
raise ValueError(error_msg)