import numpy as np
from sklearn.utils import check_array, check_consistent_length
from ._label import is_labeled, is_unlabeled
from ._label_encoder import ExtLabelEncoder
from ._selection import rand_argmax
def compute_vote_vectors(y, w=None, classes=None, missing_label=np.nan):
"""Counts number of votes per class label for each sample.
Parameters
----------
y : array-like, shape (n_samples) or (n_samples, n_annotators)
Class labels.
w : array-like, shape (n_samples) or (n_samples, n_annotators),
default=np.ones_like(y)
Class label weights.
classes : array-like, shape (n_classes), default=None
Holds the label for each class.
missing_label : scalar|string|np.nan|None, default=np.nan
Value to represent a missing label.
Returns
-------
v : array-like, shape (n_samples, n_classes)
V[i,j] counts number of votes per class j for sample i.
"""
# check input parameters
le = ExtLabelEncoder(classes=classes, missing_label=missing_label)
y = le.fit_transform(y)
n_classes = len(le.classes_)
y = y if y.ndim == 2 else y.reshape((-1, 1))
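    # `ExtLabelEncoder` encodes missing labels as -1; they are temporarily
    # mapped to class 0 so that the bincount below works, and their weights
    # are zeroed later so they do not contribute to the vote counts.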
is_unlabeled_y = is_unlabeled(y, missing_label=-1)
y[is_unlabeled_y] = 0
y = y.astype(int)
    if n_classes == 0:
        raise ValueError(
            "Number of classes cannot be inferred. "
            "There must be at least one assigned label or classes must not "
            "be None."
        )
w = (
np.ones_like(y)
if w is None
else check_array(
w, ensure_2d=False, force_all_finite=False, dtype=float, copy=True
)
)
w = w if w.ndim == 2 else w.reshape((-1, 1))
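    # `y` and `w` must agree in both the sample and the annotator dimension.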
check_consistent_length(y, w)
check_consistent_length(y.T, w.T)
w[is_unlabeled_y] = 1
# count class labels per class and weight by confidence scores
w[np.logical_or(np.isnan(w), is_unlabeled_y)] = 0
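    # Offset each sample's labels so that (sample i, class j) maps to the
    # unique bin i * n_classes + j; a single flat `np.bincount` then yields
    # the per-sample, per-class (weighted) vote counts.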
y_off = y + np.arange(y.shape[0])[:, None] * n_classes
v = np.bincount(
y_off.ravel(), minlength=y.shape[0] * n_classes, weights=w.ravel()
)
v = v.reshape(-1, n_classes)
return v
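
# Illustrative usage of `compute_vote_vectors` (a sketch; the toy labels below
# are invented for demonstration and are not taken from the library's docs):
#
#   >>> y = np.array([[0, 1, 1], [1, np.nan, 1]])  # 2 samples, 3 annotators
#   >>> compute_vote_vectors(y)
#   array([[1., 2.],
#          [0., 2.]])
#
# Columns follow the sorted class order inferred from `y` (here 0 and 1); the
# second sample has one missing label, so only two votes are counted for it.
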
def majority_vote(
y, w=None, classes=None, missing_label=np.nan, random_state=None
):
"""Assigns a label to each sample based on weighted voting.
Samples with no labels are assigned with `missing_label`.
Parameters
----------
y : array-like, shape (n_samples) or (n_samples, n_annotators)
Class labels.
w : array-like, shape (n_samples) or (n_samples, n_annotators),
default=np.ones_like(y)
Class label weights.
classes : array-like, shape (n_classes), default=None
Holds the label for each class.
missing_label : scalar|string|np.nan|None, default=np.nan
Value to represent a missing label.
random_state : int, RandomState instance or None, optional (default=None)
Determines random number generation for shuffling the data. Pass an int
for reproducible results across multiple function calls.
Returns
-------
y_aggregated : array-like, shape (n_samples)
Assigned labels for each sample.
"""
# check input parameters
y = check_array(y, ensure_2d=False, dtype=None, force_all_finite=False)
y = y if y.ndim == 2 else y.reshape((-1, 1))
n_samples = y.shape[0]
w = (
np.ones_like(y)
if w is None
else check_array(
w, ensure_2d=False, force_all_finite=False, dtype=None, copy=True
)
)
# extract labeled samples
is_labeled_y = np.any(is_labeled(y, missing_label), axis=1)
y_labeled = y[is_labeled_y]
# infer encoding
le = ExtLabelEncoder(classes=classes, missing_label=missing_label)
le.fit(y)
y_aggregated = np.full((n_samples,), missing_label, dtype=le._dtype)
if np.any(is_labeled_y):
# transform labels
y_labeled_transformed = le.transform(y_labeled)
# perform voting
vote_matrix = compute_vote_vectors(
y_labeled_transformed,
w=w[is_labeled_y],
missing_label=-1,
classes=np.arange(len(le.classes_)),
)
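        # `rand_argmax` selects the class with the most (weighted) votes per
        # sample and breaks ties uniformly at random using `random_state`.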
vote_vector = rand_argmax(vote_matrix, random_state, axis=1)
# inverse transform labels
y_labeled_inverse_transformed = le.inverse_transform(vote_vector)
# assign labels
y_aggregated[is_labeled_y] = y_labeled_inverse_transformed
return y_aggregated
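
# Illustrative usage of `majority_vote` (a sketch; the toy labels below are
# invented for demonstration and are not taken from the library's docs):
#
#   >>> y = np.array([[0, 1, 1], [np.nan, np.nan, np.nan], [1, 1, 0]])
#   >>> majority_vote(y, random_state=0)
#   array([ 1., nan,  1.])
#
# The first and third samples receive the majority class 1, while the fully
# unlabeled second sample keeps the missing label (np.nan).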