Source code for skactiveml.utils._multi_annot

import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.utils.validation import (
    check_consistent_length,
    column_or_1d,
    check_array,
)

from ._label import MISSING_LABEL, is_labeled, is_unlabeled
from ._label_encoder import ExtLabelEncoder


def ext_confusion_matrix(
    y_true, y_pred, classes=None, missing_label=MISSING_LABEL, normalize=None
):
    """Compute confusion matrix [1]_ to evaluate the accuracy of a
    classification.

    This is an extension of the `sklearn.metrics.confusion_matrix` [2]_
    function that allows missing labels and labels predicted by multiple
    annotators. By definition, a confusion matrix :math:`C` is such that
    :math:`C_{i, j}` is equal to the number of observations known to be in
    group :math:`i` and predicted to be in group :math:`j`. Thus, in binary
    classification, the count of true negatives is :math:`C_{0,0}`, false
    negatives is :math:`C_{1,0}`, true positives is :math:`C_{1,1}`, and
    false positives is :math:`C_{0,1}`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Array of true labels. It must not contain any missing labels.
    y_pred : array-like of shape (n_samples,) or (n_samples, n_annotators)
        Estimated targets as returned by multiple annotators.
    classes : array-like of shape (n_classes,), default=None
        List of class labels to index the matrix. This may be used to
        reorder or select a subset of labels. If `None` is given, the labels
        that appear at least once in `y_true` or `y_pred` are used in sorted
        order.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    normalize : 'true' or 'pred' or 'all', default=None
        Normalizes the confusion matrix over the true (rows) or predicted
        (columns) conditions or over the whole population. If `None`, the
        confusion matrix will not be normalized.

    Returns
    -------
    conf_matrices : numpy.ndarray of shape (n_annotators, n_classes, n_classes)
        Confusion matrices, one per annotator, whose entry in the i-th row
        and j-th column indicates the number of samples with true label
        being the i-th class and predicted label being the j-th class.

    References
    ----------
    .. [1] `Wikipedia entry for the Confusion matrix
       <https://en.wikipedia.org/wiki/Confusion_matrix>`_
       (Wikipedia and other references may use a different convention for
       axes).
    .. [2] `Scikit-learn Confusion Matrix
       <https://scikit-learn.org/stable/modules/generated/
       sklearn.metrics.confusion_matrix.html>`_
    """
    # Check input.
    y_true = column_or_1d(y_true)
    y_pred = check_array(
        y_pred, ensure_all_finite=False, ensure_2d=False, dtype=None
    )
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape(-1, 1)
    check_consistent_length(y_true, y_pred)
    if normalize not in ["true", "pred", "all", None]:
        raise ValueError(
            "'normalize' must be one of {'true', 'pred', 'all', None}."
        )

    # Encode true and predicted labels jointly; missing labels become -1.
    le = ExtLabelEncoder(classes=classes, missing_label=missing_label)
    y = np.column_stack((y_true, y_pred))
    y = le.fit_transform(y)
    if np.sum(is_unlabeled(y[:, 0], missing_label=-1)):
        raise ValueError("'y_true' is not allowed to contain missing labels.")
    n_classes = len(le.classes_)
    n_annotators = y_pred.shape[1]

    # Determine the confusion matrix for each annotator.
    conf_matrices = np.zeros((n_annotators, n_classes, n_classes))
    for a in range(n_annotators):
        # Use only the samples that annotator `a` actually labeled.
        is_not_nan_a = is_labeled(y[:, a + 1], missing_label=-1)
        if np.sum(is_not_nan_a) > 0:
            cm = confusion_matrix(
                y_true=y[is_not_nan_a, 0],
                y_pred=y[is_not_nan_a, a + 1],
                labels=np.arange(n_classes),
            )
        else:
            cm = np.zeros((n_classes, n_classes))
        # Normalize if requested; divisions by zero (empty rows/columns)
        # are replaced by a uniform distribution.
        with np.errstate(all="ignore"):
            if normalize == "true":
                cm = cm / cm.sum(axis=1, keepdims=True)
                conf_matrices[a] = np.nan_to_num(cm, nan=1 / n_classes)
            elif normalize == "pred":
                cm = cm / cm.sum(axis=0, keepdims=True)
                conf_matrices[a] = np.nan_to_num(cm, nan=1 / n_classes)
            elif normalize == "all":
                cm = cm / cm.sum()
                conf_matrices[a] = np.nan_to_num(cm, nan=1 / cm.size)
            else:
                conf_matrices[a] = cm
    return conf_matrices
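

# A minimal usage sketch (not part of the original module), assuming labels
# from two annotators where np.nan marks a missing annotation. The label
# values and the expected output shape are illustrative assumptions.
if __name__ == "__main__":
    y_true = np.array([0, 1, 1, 0])
    y_pred = np.array(
        [
            [0, 1],        # both annotators labeled this sample
            [1, np.nan],   # annotator 2 did not label this sample
            [0, 1],
            [0, 0],
        ]
    )
    # One confusion matrix per annotator; annotator 2's matrix is computed
    # only on the three samples it actually labeled.
    conf = ext_confusion_matrix(y_true, y_pred)
    print(conf.shape)  # (2, 2, 2): n_annotators x n_classes x n_classes
    # Row-normalized variant, where empty rows fall back to a uniform
    # distribution over the classes.
    conf_norm = ext_confusion_matrix(y_true, y_pred, normalize="true")
    print(conf_norm[0])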