Source code for skactiveml.utils._aggregation

import numpy as np
from sklearn.utils import check_array, check_consistent_length

from ._label import is_labeled, is_unlabeled
from ._label_encoder import ExtLabelEncoder
from ._selection import rand_argmax


def compute_vote_vectors(y, w=None, classes=None, missing_label=np.nan):
    """Counts number of votes per class label for each sample.

    Parameters
    ----------
    y : array-like, shape (n_samples) or (n_samples, n_annotators)
        Class labels.
    w : array-like, shape (n_samples) or (n_samples, n_annotators),
    default=np.ones_like(y)
        Class label weights. A weight that is NaN, or that belongs to a
        missing label, contributes nothing to the vote counts.
    classes : array-like, shape (n_classes), default=None
        Holds the label for each class.
    missing_label : scalar|string|np.nan|None, default=np.nan
        Value to represent a missing label.

    Returns
    -------
    v : array-like, shape (n_samples, n_classes)
        V[i,j] counts number of votes per class j for sample i.

    Raises
    ------
    ValueError
        If the fitted label encoder ends up with zero classes, i.e. the
        number of classes cannot be inferred.
    """
    # check input parameters
    le = ExtLabelEncoder(classes=classes, missing_label=missing_label)
    y = le.fit_transform(y)
    n_classes = len(le.classes_)
    y = y if y.ndim == 2 else y.reshape((-1, 1))

    # In the encoded array missing entries are looked up as -1. They are
    # parked on class index 0 so that every entry is a valid bin index; their
    # weight is forced to zero below, so they never count as a vote.
    is_unlabeled_y = is_unlabeled(y, missing_label=-1)
    y[is_unlabeled_y] = 0
    y = y.astype(int)

    if n_classes == 0:
        raise ValueError(
            "Number of classes can not be inferred. "
            "There must be at least one assigned label or classes must not be "
            "None. "
        )

    w = (
        np.ones_like(y)
        if w is None
        else check_array(
            w, ensure_2d=False, force_all_finite=False, dtype=float, copy=True
        )
    )
    w = w if w.ndim == 2 else w.reshape((-1, 1))
    # Comparing both orientations enforces that y and w agree in the number
    # of samples and in the number of annotators.
    check_consistent_length(y, w)
    check_consistent_length(y.T, w.T)

    # count class labels per class and weight by confidence scores
    w[np.logical_or(np.isnan(w), is_unlabeled_y)] = 0
    # Shift the class indices of sample i into its own block of n_classes
    # bins, so that one flat bincount yields the counts of all samples.
    y_off = y + np.arange(y.shape[0])[:, None] * n_classes
    v = np.bincount(
        y_off.ravel(), minlength=y.shape[0] * n_classes, weights=w.ravel()
    )
    v = v.reshape(-1, n_classes)

    return v
def majority_vote(
    y, w=None, classes=None, missing_label=np.nan, random_state=None
):
    """Assigns a label to each sample based on weighted voting.
    Samples with no labels are assigned with `missing_label`.

    Parameters
    ----------
    y : array-like, shape (n_samples) or (n_samples, n_annotators)
        Class labels.
    w : array-like, shape (n_samples) or (n_samples, n_annotators),
    default=np.ones_like(y)
        Class label weights. If None, every label gets the weight 1.
    classes : array-like, shape (n_classes), default=None
        Holds the label for each class.
    missing_label : scalar|string|np.nan|None, default=np.nan
        Value to represent a missing label.
    random_state : int, RandomState instance or None, optional (default=None)
        Forwarded to `rand_argmax` when the winning class of each sample is
        selected from its vote vector. Pass an int for reproducible results
        across multiple function calls.

    Returns
    -------
    y_aggregated : array-like, shape (n_samples)
        Assigned labels for each sample.
    """
    # check input parameters
    y = check_array(y, ensure_2d=False, dtype=None, force_all_finite=False)
    y = y if y.ndim == 2 else y.reshape((-1, 1))
    n_samples = y.shape[0]
    if w is None:
        # Build numeric unit weights explicitly: `np.ones_like(y)` would
        # inherit the dtype of the labels (e.g. strings) instead.
        w = np.ones(y.shape, dtype=float)
    else:
        w = check_array(
            w, ensure_2d=False, force_all_finite=False, dtype=None, copy=True
        )

    # extract labeled samples (rows with at least one assigned label)
    is_labeled_y = np.any(is_labeled(y, missing_label), axis=1)
    y_labeled = y[is_labeled_y]

    # infer encoding
    le = ExtLabelEncoder(classes=classes, missing_label=missing_label)
    le.fit(y)

    # Start with `missing_label` everywhere; only rows that received at
    # least one label are overwritten below.
    y_aggregated = np.full((n_samples,), missing_label, dtype=le._dtype)

    if np.any(is_labeled_y):
        # transform labels
        y_labeled_transformed = le.transform(y_labeled)

        # perform voting on the encoded labels, where -1 marks a missing
        # label and the classes are the indices 0, ..., n_classes - 1
        vote_matrix = compute_vote_vectors(
            y_labeled_transformed,
            w=w[is_labeled_y],
            missing_label=-1,
            classes=np.arange(len(le.classes_)),
        )
        vote_vector = rand_argmax(vote_matrix, random_state, axis=1)

        # inverse transform labels
        y_labeled_inverse_transformed = le.inverse_transform(vote_vector)

        # assign labels
        y_aggregated[is_labeled_y] = y_labeled_inverse_transformed

    return y_aggregated