Source code for skactiveml.pool._expected_error_reduction

from copy import deepcopy

import numpy as np

from .utils import IndexClassifierWrapper
from ..base import SingleAnnotatorPoolQueryStrategy, SkactivemlClassifier
from ..utils import (
    check_type,
    is_labeled,
    simple_batch,
    check_cost_matrix,
    MISSING_LABEL,
    check_equal_missing_label,
    unlabeled_indices,
    is_unlabeled,
)


class ExpectedErrorReduction(SingleAnnotatorPoolQueryStrategy):
    """Expected Error Reduction (EER)

    This class implements the basic workflow of EER algorithms by:
    - determining each candidates-label pair and simulating its utility by
      retraining the classifier with it,
    - determining some kind of risk for the retrained classifier.

    This workflow has been used by [1]_, [2]_, [3]_, and [4]_.

    Parameters
    ----------
    enforce_mapping : bool
        If True, an exception is raised when no exact mapping between
        samples in `X` and samples in `candidates` can be determined.
    cost_matrix : array-like, shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` defining the cost of predicting
        class `j` for a sample with the actual class `i`. Used for
        misclassification loss and ignored for log loss.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or np.random.RandomState or None, default=None
        The random state to use.

    References
    ----------
    .. [1] Roy, N., & McCallum, A. (2001). Toward optimal active learning
       through monte carlo estimation of error reduction. ICML,
       (pp. 441-448).
    .. [2] Joshi, A. J., Porikli, F., & Papanikolopoulos, N. P. (2012).
       Scalable active learning for multiclass image classification.
       IEEE TrPAMI, 34(11), pp. 2259-2273.
    .. [3] Margineantu, D. D. (2005). Active cost-sensitive learning.
       In IJCAI (Vol. 5, pp. 1622-1623).
    .. [4] Kapoor, Ashish, Eric Horvitz, and Sumit Basu. "Selective
       Supervision: Guiding Supervised Learning with Decision-Theoretic
       Active Learning." IJCAI. Vol. 7. 2007.
    """

    def __init__(
        self,
        enforce_mapping,
        cost_matrix=None,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )
        self.cost_matrix = cost_matrix
        self.enforce_mapping = enforce_mapping

    def query(
        self,
        X,
        y,
        clf,
        fit_clf=True,
        ignore_partial_fit=True,
        sample_weight=None,
        candidates=None,
        sample_weight_candidates=None,
        X_eval=None,
        sample_weight_eval=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        fit_clf : bool, default=True
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        ignore_partial_fit : bool, default=True
            Relevant in cases where `clf` implements `partial_fit`. If
            `True`, the `partial_fit` function is ignored and `fit` is used
            instead.
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        candidates : None or array-like of shape (n_candidates), dtype=int or \
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from `(X,y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, ...)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`).
        sample_weight_candidates : array-like of shape (n_candidates,), \
                default=None
            Weights of candidates samples in `candidates` if candidates are
            directly given (i.e., `candidates.ndim > 1`). Otherwise, weights
            for `candidates` are given in `sample_weight`.
        X_eval : array-like of shape (n_eval_samples, n_features), \
                default=None
            Unlabeled evaluation data set that is used for estimating the
            risk. Not applicable for all EER methods.
        sample_weight_eval : array-like of shape (n_eval_samples,), \
                default=None
            Weights of evaluation samples in `X_eval` if given. Used to
            weight the importance of samples when estimating the risk.
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If `True`, also return the utilities based on the query
            strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size,)
            The query indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or \
                numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of samples after each selected sample of the
            batch, e.g., `utilities[0]` indicates the utilities used for
            selecting the first sample (with index `query_indices[0]`) of
            the batch. Utilities for labeled samples will be set to np.nan.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        """
        (
            X,
            y,
            sample_weight,
            clf,
            candidates,
            sample_weight_candidates,
            X_eval,
            sample_weight_eval,
            batch_size,
            return_utilities,
        ) = self._validate_data(
            X,
            y,
            sample_weight,
            clf,
            candidates,
            sample_weight_candidates,
            X_eval,
            sample_weight_eval,
            batch_size,
            return_utilities,
            reset=True,
            check_X_dict=None,
        )

        _, mapping = self._transform_candidates(
            candidates, X, y, enforce_mapping=self.enforce_mapping
        )

        (
            X_full,
            y_full,
            w_full,
            w_eval,
            idx_train,
            idx_cand,
            idx_eval,
        ) = self._concatenate_samples(
            X,
            y,
            sample_weight,
            candidates,
            sample_weight_candidates,
            X_eval,
            sample_weight_eval,
        )

        # Check fit_clf
        check_type(fit_clf, "fit_clf", bool)

        # Initialize classifier that works with indices to improve
        # readability.
        id_clf = IndexClassifierWrapper(
            deepcopy(clf),
            X_full,
            y_full,
            w_full,
            set_base_clf=not fit_clf,
            ignore_partial_fit=ignore_partial_fit,
            enforce_unique_samples=True,
            use_speed_up=True,
            missing_label=self.missing_label_,
        )

        # Fit the classifier (subclasses may precompute kernels first).
        id_clf = self._precompute_and_fit_clf(
            id_clf,
            X_full,
            y_full,
            idx_train,
            idx_cand,
            idx_eval,
            fit_clf=fit_clf,
        )

        # Compute class-membership probabilities of candidate samples.
        probs_cand = id_clf.predict_proba(idx_cand)

        # Check cost matrix.
        classes = id_clf.classes_
        self._validate_cost_matrix(len(classes))

        # Precompute the current error (baseline before simulating labels).
        current_error = self._estimate_current_error(
            id_clf, idx_train, idx_cand, idx_eval, w_eval
        )

        # Storage for computed errors per candidate sample.
        errors = np.zeros([len(idx_cand), len(classes)])

        # Iterate over candidate samples.
        for i_cx, idx_cx in enumerate(idx_cand):
            # Simulate acquisition of label for each candidate sample and
            # class.
            for i_cy, cy in enumerate(classes):
                errors[i_cx, i_cy] = self._estimate_error_for_candidate(
                    id_clf,
                    [idx_cx],
                    [cy],
                    idx_train,
                    idx_cand,
                    idx_eval,
                    w_eval,
                )

        # Expected future error is the probability-weighted sum over the
        # simulated labels. Utilities are maximized while errors are
        # minimized: hence, multiply by (-1).
        future_error = np.sum(probs_cand * errors, axis=1)
        utilities_cand = -1 * (future_error - current_error)

        if mapping is None:
            utilities = np.array(utilities_cand)
        else:
            utilities = np.full(len(X), np.nan)
            utilities[mapping] = utilities_cand

        return simple_batch(
            utilities,
            self.random_state_,
            batch_size=batch_size,
            return_utilities=return_utilities,
        )

    def _validate_data(
        self,
        X,
        y,
        sample_weight,
        clf,
        candidates,
        sample_weight_candidates,
        X_eval,
        sample_weight_eval,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate `query` inputs and the classifier type; returns the
        (possibly converted) inputs in the same order."""
        # Validate input parameters.
        (
            X,
            y,
            candidates,
            batch_size,
            return_utilities,
        ) = super()._validate_data(
            X,
            y,
            candidates,
            batch_size,
            return_utilities,
            reset=reset,
            check_X_dict=check_X_dict,
        )

        # Validate classifier type.
        check_type(clf, "clf", SkactivemlClassifier)
        check_equal_missing_label(clf.missing_label, self.missing_label_)

        # Validate subclass-specific `__init__` parameters.
        self._validate_init_params()

        return (
            X,
            y,
            sample_weight,
            clf,
            candidates,
            sample_weight_candidates,
            X_eval,
            sample_weight_eval,
            batch_size,
            return_utilities,
        )

    def _validate_init_params(self):
        """Function used to evaluate parameters of the `__init__` function
        that are not part of the abstract class to avoid redundancies.
        """
        pass

    def _precompute_and_fit_clf(
        self,
        id_clf,
        X_full,
        y_full,
        idx_train,
        idx_cand,
        idx_eval,
        fit_clf=True,
    ):
        """Fit `id_clf` on the training indices; subclasses override this to
        precompute probabilities/kernels for speed-up before fitting."""
        if fit_clf:
            id_clf.fit(idx_train, set_base_clf=True)
        return id_clf

    def _estimate_current_error(
        self, id_clf, idx_train, idx_cand, idx_eval, w_eval
    ):
        """Estimate the error of the current (unmodified) classifier.

        Result must be of float or of shape `(len(idx_eval))`. The default
        baseline is 0.0, i.e., utilities equal the negated future error.
        """
        return 0.0

    def _estimate_error_for_candidate(
        self, uclf, idx_cx, cy, idx_train, idx_cand, idx_eval, w_eval
    ):
        """Estimate the error after simulating label `cy` for candidate
        `idx_cx`; must be implemented by concrete EER strategies."""
        raise NotImplementedError(
            "Error estimation method must be implemented "
            "by the query strategy."
        )

    def _validate_cost_matrix(self, n_classes):
        """Set `self.cost_matrix_`, defaulting to zero-one loss."""
        cost_matrix = (
            1 - np.eye(n_classes)
            if self.cost_matrix is None
            else self.cost_matrix
        )
        self.cost_matrix_ = check_cost_matrix(cost_matrix, n_classes)

    def _concatenate_samples(
        self,
        X,
        y,
        sample_weight,
        candidates,
        sample_weight_candidates,
        X_eval,
        sample_weight_eval,
    ):
        """Build one combined sample matrix `X_full` (train + external
        candidates + external eval samples) with aligned labels and weights,
        and index arrays `idx_train`, `idx_cand`, `idx_eval` into it."""
        # Check if candidates are samples if sample_weight_candidates is set.
        if (
            candidates is None or candidates.ndim == 1
        ) and sample_weight_candidates is not None:
            raise ValueError(
                "Attribute `sample_weight_candidates` can only "
                "be set if `candidates` consists of samples."
            )

        # TODO: test sample weight_eval - length + column
        if sample_weight is not None and len(X) != len(sample_weight):
            raise ValueError(
                "If `sample_weight` is set, it must have same "
                "length as `X`."
            )

        if sample_weight_candidates is not None and len(candidates) != len(
            sample_weight_candidates
        ):
            # BUG FIX: the message previously referred to `X` although the
            # check compares against `candidates`.
            raise ValueError(
                "If `sample_weight_candidates` is set, it must have same "
                "length as `candidates`."
            )

        # Concatenate samples.
        X_full = X
        y_full = y
        w_full = sample_weight
        idx_train = np.arange(len(X))
        idx_unld = unlabeled_indices(y, self.missing_label_)

        if candidates is None:
            idx_cand = idx_unld
        elif candidates.ndim == 1:
            idx_cand = candidates
        else:
            # External candidate samples: append them unlabeled.
            # NOTE(review): the unlabeled marker is hard-coded as np.nan
            # here; presumably consistent with `self.missing_label_` —
            # confirm for non-nan missing labels.
            X_full = np.concatenate([X_full, candidates], axis=0)
            y_full = np.concatenate(
                [y_full, np.full(len(candidates), np.nan)], axis=0
            )
            if not (w_full is None and sample_weight_candidates is None):
                # Fill missing weight vectors with ones so lengths match.
                if w_full is None:
                    w_full = np.ones(len(X))
                if sample_weight_candidates is None:
                    sample_weight_candidates = np.ones(len(candidates))
                w_full = np.concatenate(
                    [w_full, sample_weight_candidates], axis=0
                )
            idx_cand = np.arange(len(X), len(X_full))

        if X_eval is None:
            # Evaluate the risk on the training samples themselves.
            idx_eval = idx_train
            if sample_weight_eval is None:
                w_eval = np.ones(len(X_full))
            else:
                if len(sample_weight_eval) != len(idx_eval):
                    raise ValueError(
                        "If `sample_weight_eval` is set but "
                        "`X_eval` is None, then it should have "
                        "same size as `X`"
                    )
                w_eval = np.zeros(len(X_full))
                w_eval[idx_eval] = sample_weight_eval
        else:
            # External evaluation samples: append them unlabeled.
            X_full = np.concatenate([X_full, X_eval], axis=0)
            y_full = np.concatenate(
                [y_full, np.full(len(X_eval), np.nan)], axis=0
            )
            idx_eval = np.arange(len(X_full) - len(X_eval), len(X_full))
            w_eval = np.ones(len(X_full))
            if sample_weight_eval is not None:
                if len(sample_weight_eval) != len(idx_eval):
                    raise ValueError(
                        "If `sample_weight_eval` and `X_eval` "
                        "are set, then `sample_weight_eval` "
                        "should have len(X_eval)"
                    )
                w_eval[idx_eval] = sample_weight_eval
            # BUG FIX: previously `w_full` was only extended when
            # `sample_weight_eval` was given, leaving `w_full` shorter than
            # `X_full` otherwise. Always keep `w_full` aligned with
            # `X_full` (ones for unweighted eval samples).
            if w_full is not None:
                w_full = np.concatenate(
                    [w_full, w_eval[idx_eval]], axis=0
                )

        return X_full, y_full, w_full, w_eval, idx_train, idx_cand, idx_eval

    def _risk_estimation(
        self, prob_true, prob_pred, cost_matrix, sample_weight
    ):
        """Compute the weighted risk for true/predicted labels.

        `prob_true`/`prob_pred` are either 1D class-index arrays or 2D
        class-probability matrices; every combination is supported.
        """
        if prob_true.ndim == 1 and prob_pred.ndim == 1:
            # Both are hard labels: pick the cost of each (true, pred) pair.
            cost_est = cost_matrix[prob_true, :][
                range(len(prob_true)), prob_pred
            ]
            return np.sum(sample_weight * cost_est)
        elif prob_true.ndim == 1 and prob_pred.ndim == 2:
            # True labels hard, predictions soft: expectation over preds.
            cost_est = cost_matrix[prob_true, :]
            return np.sum(
                sample_weight[:, np.newaxis]
                * prob_pred
                * cost_est[np.newaxis, :]
            )
        elif prob_true.ndim == 2 and prob_pred.ndim == 1:
            # True labels soft, predictions hard: expectation over truths.
            cost_est = cost_matrix[:, prob_pred].T
            return np.sum(
                sample_weight[:, np.newaxis]
                * prob_true
                * cost_est[np.newaxis, :]
            )
        else:
            # Both soft: full outer product per sample, weighted by costs.
            prob_mat = (
                prob_true[:, :, np.newaxis] @ prob_pred[:, np.newaxis, :]
            )
            return np.sum(
                sample_weight[:, np.newaxis, np.newaxis]
                * prob_mat
                * cost_matrix[np.newaxis, :, :]
            )

    def _logloss_estimation(self, prob_true, prob_pred):
        """Weighted-free log-loss (cross entropy); eps avoids log(0)."""
        return -np.sum(prob_true * np.log(prob_pred + np.finfo(float).eps))
class MonteCarloEER(ExpectedErrorReduction):
    """Monte Carlo Expected Error Reduction (EER)

    This class implements the expected error method from [1]_ that uses a
    Monte-Carlo approach to estimate the error. Therefore, it implements
    the following two steps:
    - determining each candidates-label pair and simulating its utility by
      retraining the classifier with it,
    - determining some kind of risk for the retrained classifier.

    Parameters
    ----------
    method : string, default='misclassification_loss'
        The optimization method. Possible values are
        'misclassification_loss' and 'log_loss'.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` defining the cost of predicting
        class `j` for a sample with the actual class `i`. Used for
        misclassification loss and ignored for log loss.
    subtract_current : bool, default=False
        If `True`, the current error estimate is subtracted from the
        simulated score. This might be helpful to define a stopping
        criterion.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or np.random.RandomState or None, default=None
        The random state to use.

    References
    ----------
    .. [1] Roy, N., & McCallum, A. (2001). Toward optimal active learning
       through monte carlo estimation of error reduction. ICML,
       (pp. 441-448).
    """

    def __init__(
        self,
        method="misclassification_loss",
        cost_matrix=None,
        subtract_current=False,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        # Candidates may be external samples, hence no mapping is enforced.
        super().__init__(
            enforce_mapping=False,
            cost_matrix=cost_matrix,
            missing_label=missing_label,
            random_state=random_state,
        )
        self.method = method
        self.subtract_current = subtract_current

    def _validate_init_params(self):
        """Validate `method`, `subtract_current`, and their interplay with
        `cost_matrix`."""
        super()._validate_init_params()

        # Validate method.
        if not isinstance(self.method, str):
            raise TypeError(
                "{} is an invalid type for method. Type {} is "
                "expected".format(type(self.method), str)
            )
        if self.method not in ["misclassification_loss", "log_loss"]:
            # BUG FIX: the previous message concatenated to
            # "...or`log_loss` the given one..." (missing separators).
            raise ValueError(
                f"Supported methods are `misclassification_loss` or "
                f"`log_loss`; the given one is: {self.method}"
            )

        check_type(self.subtract_current, "subtract_current", bool)

        if self.method == "log_loss" and self.cost_matrix is not None:
            raise ValueError(
                "`cost_matrix` must be None if `method` is set to `log_loss`"
            )

    def _estimate_current_error(
        self, id_clf, idx_train, idx_cand, idx_eval, w_eval
    ):
        """Risk of the currently fitted classifier on the evaluation set;
        only computed when `subtract_current` is enabled."""
        if self.subtract_current:
            probs = id_clf.predict_proba(idx_eval)
            if self.method == "misclassification_loss":
                # Bayes-optimal prediction w.r.t. the cost matrix.
                preds = np.argmin(np.dot(probs, self.cost_matrix_), axis=1)
                err = self._risk_estimation(
                    probs, preds, self.cost_matrix_, w_eval[idx_eval]
                )
            elif self.method == "log_loss":
                err = self._logloss_estimation(probs, probs)
            return err
        else:
            # Default baseline of 0.0 from the base class.
            return super()._estimate_current_error(
                id_clf, idx_train, idx_cand, idx_eval, w_eval
            )

    def _estimate_error_for_candidate(
        self, id_clf, idx_cx, cy, idx_train, idx_cand, idx_eval, w_eval
    ):
        """Risk after simulating label `cy` for candidate `idx_cx` via an
        incremental refit from the stored base classifier."""
        id_clf.partial_fit(idx_cx, cy, use_base_clf=True, set_base_clf=False)
        probs = id_clf.predict_proba(idx_eval)
        if self.method == "misclassification_loss":
            preds = np.argmin(np.dot(probs, self.cost_matrix_), axis=1)
            err = self._risk_estimation(
                probs, preds, self.cost_matrix_, w_eval[idx_eval]
            )
        elif self.method == "log_loss":
            err = self._logloss_estimation(probs, probs)
        return err

    def _precompute_and_fit_clf(
        self, id_clf, X_full, y_full, idx_train, idx_cand, idx_eval, fit_clf
    ):
        """Precompute all fit/predict index pairs needed by this strategy
        before fitting, enabling the wrapper's speed-up."""
        id_clf.precompute(idx_train, idx_cand)
        id_clf.precompute(idx_train, idx_eval)
        id_clf.precompute(idx_cand, idx_eval)
        id_clf = super()._precompute_and_fit_clf(
            id_clf,
            X_full,
            y_full,
            idx_train,
            idx_cand,
            idx_eval,
            fit_clf=fit_clf,
        )
        return id_clf
class ValueOfInformationEER(ExpectedErrorReduction):
    """Value of Information (VOI)

    This class implements the expected error method from [1]_ that
    estimates the "Value of Information" (VOI). This method can be extended
    in a way that it also implements [2]_ and [3]_. The default parameters
    described in [1]_. Therefore, it implements the following two steps:
    - determining each candidates-label pair and simulating its utility by
      retraining the classifier with it,
    - determining some kind of risk for the retrained classifier.

    Parameters
    ----------
    cost_matrix : array-like, shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` defining the cost of predicting
        class `j` for a sample with the actual class `i`. Used for
        misclassification loss and ignored for log loss.
    consider_unlabeled : bool, default=True
        If `True`, the error is estimated on the unlabeled samples.
    consider_labeled : bool, default=True
        If `True`, the error is estimated on the labeled samples.
    candidate_to_labeled : bool, default=True
        If `True`, the candidate with the simulated label is added to the
        labeled set. As this label is considered to be correct, it will be
        evaluated under the `consider_labeled` flag then.
    subtract_current : bool, default=False
        If `True`, the current error estimate is subtracted from the
        simulated score. This might be helpful to define a stopping
        criterion as in [2]_.
    normalize : bool, default=False
        If `True` the error terms are normalized by the number of
        evaluation samples such that the errors represent the average error
        instead of the summed error. This will be done independently for
        the simulated and the current error.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or np.random.RandomState or None, default=None
        The random state to use.

    References
    ----------
    .. [1] Kapoor, Ashish, Eric Horvitz, and Sumit Basu. "Selective
       Supervision: Guiding Supervised Learning with Decision-Theoretic
       Active Learning." IJCAI. Vol. 7. 2007.
    .. [2] Joshi, A. J., Porikli, F., & Papanikolopoulos, N. P. (2012).
       Scalable active learning for multiclass image classification.
       IEEE TrPAMI, 34(11), pp. 2259-2273.
    .. [3] Margineantu, D. D. (2005). Active cost-sensitive learning.
       In IJCAI (Vol. 5, pp. 1622-1623).
    """

    def __init__(
        self,
        cost_matrix=None,
        consider_unlabeled=True,
        consider_labeled=True,
        candidate_to_labeled=True,
        subtract_current=False,
        normalize=False,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        # VOI evaluates the risk on the training samples, hence candidates
        # must map onto `X` (enforce_mapping=True).
        super().__init__(
            enforce_mapping=True,
            cost_matrix=cost_matrix,
            missing_label=missing_label,
            random_state=random_state,
        )
        self.consider_unlabeled = consider_unlabeled
        self.consider_labeled = consider_labeled
        self.candidate_to_labeled = candidate_to_labeled
        self.subtract_current = subtract_current
        self.normalize = normalize

    def _validate_init_params(self):
        """Type-check the VOI-specific boolean flags."""
        super()._validate_init_params()
        check_type(self.consider_unlabeled, "consider_unlabeled", bool)
        check_type(self.consider_labeled, "consider_labeled", bool)
        check_type(self.candidate_to_labeled, "candidate_to_labeled", bool)
        check_type(self.subtract_current, "subtract_current", bool)
        check_type(self.normalize, "normalize", bool)

    def query(
        self,
        X,
        y,
        clf,
        sample_weight=None,
        fit_clf=True,
        ignore_partial_fit=True,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e., including the labeled
            and unlabeled samples.
        y : array-like of shape (n_samples,)
            Labels of the training data set (possibly including unlabeled
            ones indicated by `self.missing_label`).
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        fit_clf : bool, default=True
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        ignore_partial_fit : bool, default=True
            Relevant in cases where `clf` implements `partial_fit`. If True,
            the `partial_fit` function is ignored and `fit` is used instead.
        sample_weight : array-like of shape (n_samples,), default=None
            Weights of training samples in `X`.
        candidates : None or array-like of shape (n_candidates), dtype=int or \
                array-like of shape (n_candidates, n_features), default=None
            - If `candidates` is `None`, the unlabeled samples from `(X,y)`
              are considered as `candidates`.
            - If `candidates` is of shape `(n_candidates,)` and of type
              `int`, `candidates` is considered as the indices of the
              samples in `(X,y)`.
            - If `candidates` is of shape `(n_candidates, ...)`, the
              candidate samples are directly given in `candidates` (not
              necessarily contained in `X`).
        batch_size : int, default=1
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, default=False
            If `True`, also return the utilities based on the query
            strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size,)
            The query indices indicate for which candidate sample a label is
            to be queried, e.g., `query_indices[0]` indicates the first
            selected sample.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or \
                numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of samples after each selected sample of the
            batch, e.g., `utilities[0]` indicates the utilities used for
            selecting the first sample (with index `query_indices[0]`) of
            the batch. Utilities for labeled samples will be set to np.nan.

            - If `candidates` is `None` or of shape `(n_candidates,)`, the
              indexing refers to the samples in `X`.
            - If `candidates` is of shape `(n_candidates, n_features)`, the
              indexing refers to the samples in `candidates`.
        """
        # TODO check if candidates are only unlabeled ones if given
        # Delegate to the base class; external eval samples and candidate
        # weights are not supported by VOI, hence fixed to None.
        return super().query(
            X,
            y,
            clf,
            sample_weight=sample_weight,
            fit_clf=fit_clf,
            ignore_partial_fit=ignore_partial_fit,
            candidates=candidates,
            sample_weight_candidates=None,
            X_eval=None,
            sample_weight_eval=None,
            batch_size=batch_size,
            return_utilities=return_utilities,
        )

    def _estimate_error_for_candidate(
        self, id_clf, idx_cx, cy, idx_train, idx_cand, idx_eval, w_eval
    ):
        """Risk over labeled and/or unlabeled training samples after
        simulating label `cy` for candidate `idx_cx`."""
        id_clf.partial_fit(idx_cx, cy, use_base_clf=True, set_base_clf=False)

        # Handle problem that if only one candidate is remaining, this
        # should be the one to be selected although the error cannot be
        # estimated as there are no samples left for estimating.
        # NOTE(review): the labeled/unlabeled masks over `y_eval` are used
        # to index `idx_train` — this assumes `idx_eval == idx_train`,
        # which holds here since VOI enforces mapping and passes
        # `X_eval=None`.
        le = id_clf._le
        y_eval = id_clf.y[idx_eval]
        idx_labeled = idx_train[
            is_labeled(y_eval, missing_label=self.missing_label_)
        ]
        y_labeled = id_clf.y[idx_labeled]
        idx_unlabeled = idx_train[
            is_unlabeled(y_eval, missing_label=self.missing_label_)
        ]

        if self.candidate_to_labeled:
            # Treat the simulated label as correct: move the candidate from
            # the unlabeled to the labeled evaluation set.
            idx_labeled = np.concatenate([idx_labeled, idx_cx], axis=0)
            y_labeled = np.concatenate([y_labeled, cy], axis=0)
            idx_unlabeled = np.setdiff1d(
                idx_unlabeled, idx_cx, assume_unique=True
            )

        y_labeled_c_id = None
        if len(idx_labeled) > 0:
            y_labeled_c_id = le.transform(y_labeled)

        err = 0
        norm = 0
        if self.consider_labeled and len(idx_labeled) > 0:
            norm += len(idx_labeled)
            probs = id_clf.predict_proba(idx_labeled)
            err += self._risk_estimation(
                y_labeled_c_id, probs, self.cost_matrix_, w_eval[idx_labeled]
            )

        if self.consider_unlabeled and len(idx_unlabeled) > 0:
            norm += len(idx_unlabeled)
            probs = id_clf.predict_proba(idx_unlabeled)
            # No true labels available: use the predicted distribution as
            # both truth and prediction (expected risk).
            err += self._risk_estimation(
                probs, probs, self.cost_matrix_, w_eval[idx_unlabeled]
            )

        if self.normalize:
            if norm == 0:
                return 0.0
            else:
                return err / norm
        else:
            return err

    def _estimate_current_error(
        self, id_clf, idx_train, idx_cand, idx_eval, w_eval
    ):
        """Risk of the currently fitted classifier; only computed when
        `subtract_current` is enabled."""
        # TODO: maybe use function for code below to reduce redundancies
        if self.subtract_current:
            le = id_clf._le
            y_eval = id_clf.y[idx_eval]
            # BUG FIX: pass `missing_label=self.missing_label_` as in
            # `_estimate_error_for_candidate`; previously the default was
            # used, which is wrong for non-nan missing labels.
            idx_labeled = idx_train[
                is_labeled(y_eval, missing_label=self.missing_label_)
            ]
            y_labeled = id_clf.y[idx_labeled]
            idx_unlabeled = idx_train[
                is_unlabeled(y_eval, missing_label=self.missing_label_)
            ]

            y_labeled_c_id = None
            if len(idx_labeled) > 0:
                y_labeled_c_id = le.transform(y_labeled)

            err = 0
            norm = 0
            if self.consider_labeled and len(idx_labeled) > 0:
                norm += len(idx_labeled)
                probs = id_clf.predict_proba(idx_labeled)
                err += self._risk_estimation(
                    y_labeled_c_id,
                    probs,
                    self.cost_matrix_,
                    w_eval[idx_labeled],
                )

            if self.consider_unlabeled and len(idx_unlabeled) > 0:
                norm += len(idx_unlabeled)
                probs = id_clf.predict_proba(idx_unlabeled)
                err += self._risk_estimation(
                    probs, probs, self.cost_matrix_, w_eval[idx_unlabeled]
                )

            if self.normalize:
                # BUG FIX: guard against division by zero as done in
                # `_estimate_error_for_candidate`.
                if norm == 0:
                    return 0.0
                return err / norm
            else:
                return err
        else:
            return super()._estimate_current_error(
                id_clf, idx_train, idx_cand, idx_eval, w_eval
            )

    def _precompute_and_fit_clf(
        self, id_clf, X_full, y_full, idx_train, idx_cand, idx_eval, fit_clf
    ):
        """Precompute all train-train pairs before fitting.

        TODO: replace the blanket `all`/`all` precomputation with the
        selective fit/pred index pairs actually needed for the conditional
        probabilities and the risk estimation (labeled/unlabeled subsets,
        plus candidate pairs when `candidate_to_labeled` is set).
        """
        id_clf.precompute(
            idx_train, idx_train, fit_params="all", pred_params="all"
        )
        id_clf = super()._precompute_and_fit_clf(
            id_clf,
            X_full,
            y_full,
            idx_train,
            idx_cand,
            idx_eval,
            fit_clf=fit_clf,
        )
        return id_clf