"""Source code for skactiveml.pool._expected_error_reduction."""
from copy import deepcopy
import numpy as np
from .utils import IndexClassifierWrapper
from ..base import SingleAnnotatorPoolQueryStrategy, SkactivemlClassifier
from ..utils import (
check_type,
is_labeled,
simple_batch,
check_cost_matrix,
MISSING_LABEL,
check_equal_missing_label,
unlabeled_indices,
is_unlabeled,
)
class ExpectedErrorReduction(SingleAnnotatorPoolQueryStrategy):
    """Abstract class for Expected Error Reduction (EER).

    This class implements the basic workflow of EER algorithms containing:
    - simulating the acquisition of every candidate x label pair by
      refitting the classifier with that pair added
    - determining some kind of risk for the new classifier

    This structure has been used by e.g.:
    - Roy, N., & McCallum, A. (2001). Toward optimal active learning through
    monte carlo estimation of error reduction. ICML, pp. 441-448.
    - Kapoor, A., Horvitz, E., & Basu, S. (2007). Selective Supervision:
    Guiding Supervised Learning with Decision-Theoretic Active Learning.
    IJCAI, pp. 877-882.
    - Margineantu, D. D. (2005). Active cost-sensitive learning.
    IJCAI, pp. 1622-1623.
    - Joshi, A. J., Porikli, F., & Papanikolopoulos, N. P. (2012). Scalable
    active learning for multiclass image classification.
    IEEE TrPAMI, 34(11), pp. 2259-2273.

    Parameters
    ----------
    enforce_mapping : bool
        If True, an exception is raised when no exact mapping between
        instances in `X` and instances in `candidates` can be determined.
    cost_matrix: array-like, shape (n_classes, n_classes), optional
    (default=None)
        Cost matrix with `cost_matrix[i,j]` defining the cost of predicting
        class `j` for a sample with the actual class `i`.
        Used for misclassification loss and ignored for log loss.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or np.random.RandomState
        The random state to use.

    References
    ----------
    [1] Roy, N., & McCallum, A. (2001). Toward optimal active learning through
    monte carlo estimation of error reduction. ICML, (pp. 441-448).
    [2] Joshi, A. J., Porikli, F., & Papanikolopoulos, N. P. (2012).
    Scalable active learning for multiclass image classification.
    IEEE TrPAMI, 34(11), pp. 2259-2273.
    [3] Margineantu, D. D. (2005). Active cost-sensitive learning.
    In IJCAI (Vol. 5, pp. 1622-1623).
    [4] Kapoor, Ashish, Eric Horvitz, and Sumit Basu. "Selective Supervision:
    Guiding Supervised Learning with Decision-Theoretic Active Learning."
    IJCAI. Vol. 7. 2007.
    """

    def __init__(
        self,
        enforce_mapping,
        cost_matrix=None,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        super().__init__(
            missing_label=missing_label, random_state=random_state
        )
        self.cost_matrix = cost_matrix
        self.enforce_mapping = enforce_mapping

    def query(
        self,
        X,
        y,
        clf,
        fit_clf=True,
        ignore_partial_fit=True,
        sample_weight=None,
        candidates=None,
        sample_weight_candidates=None,
        X_eval=None,
        sample_weight_eval=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e. including the labeled and
            unlabeled samples.
        y : array-like of shape (n_samples)
            Labels of the training data set (possibly including unlabeled ones
            indicated by self.MISSING_LABEL.
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        fit_clf : bool, optional (default=True)
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        ignore_partial_fit : bool, optional (default=True)
            Relevant in cases where `clf` implements `partial_fit`. If True,
            the `partial_fit` function is ignored and `fit` is used instead.
        sample_weight : array-like of shape (n_samples), optional
        (default=None)
            Weights of training samples in `X`.
        candidates : None or array-like of shape (n_candidates), dtype=int or
        array-like of shape (n_candidates, n_features),
        optional (default=None)
            If candidates is None, the unlabeled samples from (X,y) are
            considered as candidates.
            If candidates is of shape (n_candidates) and of type int,
            candidates is considered as the indices of the samples in (X,y).
            If candidates is of shape (n_candidates, n_features), the
            candidates are directly given in candidates (not necessarily
            contained in X). This is not supported by all query strategies.
        sample_weight_candidates : array-like of shape (n_candidates),
        optional (default=None)
            Weights of candidates samples in `candidates` if candidates are
            directly given (i.e., candidates.ndim > 1). Otherwise weights for
            candidates are given in `sample_weight`.
        X_eval : array-like of shape (n_eval_samples, n_features),
        optional (default=None).
            Unlabeled evaluation data set that is used for estimating the risk.
            Not applicable for all EER methods.
        sample_weight_eval : array-like of shape (n_eval_samples),
        optional (default=None)
            Weights of evaluation samples in `X_eval` if given. Used to weight
            the importance of samples when estimating the risk.
        batch_size : int, optional (default=1)
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, optional (default=False)
            If true, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size)
            The query_indices indicate for which candidate sample a label is
            to queried, e.g., `query_indices[0]` indicates the first selected
            sample.
            If candidates is None or of shape (n_candidates), the indexing
            refers to samples in X.
            If candidates is of shape (n_candidates, n_features), the indexing
            refers to samples in candidates.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or
        numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of samples after each selected sample of the batch,
            e.g., `utilities[0]` indicates the utilities used for selecting
            the first sample (with index `query_indices[0]`) of the batch.
            Utilities for labeled samples will be set to np.nan.
            If candidates is None or of shape (n_candidates), the indexing
            refers to samples in X.
            If candidates is of shape (n_candidates, n_features), the indexing
            refers to samples in candidates.
        """
        (
            X,
            y,
            sample_weight,
            clf,
            candidates,
            sample_weight_candidates,
            X_eval,
            sample_weight_eval,
            batch_size,
            return_utilities,
        ) = self._validate_data(
            X,
            y,
            sample_weight,
            clf,
            candidates,
            sample_weight_candidates,
            X_eval,
            sample_weight_eval,
            batch_size,
            return_utilities,
            reset=True,
            check_X_dict=None,
        )
        _, mapping = self._transform_candidates(
            candidates, X, y, enforce_mapping=self.enforce_mapping
        )
        # Build one concatenated sample set (train + external candidates +
        # external evaluation samples) and work with indices into it.
        (
            X_full,
            y_full,
            w_full,
            w_eval,
            idx_train,
            idx_cand,
            idx_eval,
        ) = self._concatenate_samples(
            X,
            y,
            sample_weight,
            candidates,
            sample_weight_candidates,
            X_eval,
            sample_weight_eval,
        )
        # Check fit_clf
        check_type(fit_clf, "fit_clf", bool)
        # Initialize classifier that works with indices to improve readability
        id_clf = IndexClassifierWrapper(
            deepcopy(clf),
            X_full,
            y_full,
            w_full,
            set_base_clf=not fit_clf,
            ignore_partial_fit=ignore_partial_fit,
            enforce_unique_samples=True,
            use_speed_up=True,
            missing_label=self.missing_label_,
        )
        # Fit the classifier.
        id_clf = self._precompute_and_fit_clf(
            id_clf,
            X_full,
            y_full,
            idx_train,
            idx_cand,
            idx_eval,
            fit_clf=fit_clf,
        )
        # Compute class-membership probabilities of candidate samples
        probs_cand = id_clf.predict_proba(idx_cand)
        # Check cost matrix.
        classes = id_clf.classes_
        self._validate_cost_matrix(len(classes))
        # precomputating current error
        current_error = self._estimate_current_error(
            id_clf, idx_train, idx_cand, idx_eval, w_eval
        )
        # Storage for computed errors per candidate sample
        errors = np.zeros([len(idx_cand), len(classes)])
        # Iterate over candidate samples
        for i_cx, idx_cx in enumerate(idx_cand):
            # Simulate acquisition of label for each candidate sample and class
            for i_cy, cy in enumerate(classes):
                errors[i_cx, i_cy] = self._estimate_error_for_candidate(
                    id_clf,
                    [idx_cx],
                    [cy],
                    idx_train,
                    idx_cand,
                    idx_eval,
                    w_eval,
                )
        # Expected future error weights the simulated per-class errors by the
        # current class-membership probabilities of each candidate.
        # utils are maximized, errors minimized: hence multiply by (-1)
        future_error = np.sum(probs_cand * errors, axis=1)
        utilities_cand = -1 * (future_error - current_error)
        if mapping is None:
            utilities = np.array(utilities_cand)
        else:
            utilities = np.full(len(X), np.nan)
            utilities[mapping] = utilities_cand
        return simple_batch(
            utilities,
            self.random_state_,
            batch_size=batch_size,
            return_utilities=return_utilities,
        )

    def _validate_data(
        self,
        X,
        y,
        sample_weight,
        clf,
        candidates,
        sample_weight_candidates,
        X_eval,
        sample_weight_eval,
        batch_size,
        return_utilities,
        reset=True,
        check_X_dict=None,
    ):
        """Validate `query` inputs and the strategy's init parameters.

        Returns the (possibly converted) inputs in the same order as given.
        """
        # Validate input parameters.
        (
            X,
            y,
            candidates,
            batch_size,
            return_utilities,
        ) = super()._validate_data(
            X,
            y,
            candidates,
            batch_size,
            return_utilities,
            reset=reset,
            check_X_dict=check_X_dict,
        )
        # Validate classifier type.
        check_type(clf, "clf", SkactivemlClassifier)
        check_equal_missing_label(clf.missing_label, self.missing_label_)
        self._validate_init_params()
        return (
            X,
            y,
            sample_weight,
            clf,
            candidates,
            sample_weight_candidates,
            X_eval,
            sample_weight_eval,
            batch_size,
            return_utilities,
        )

    def _validate_init_params(self):
        """Function used to evaluate parameters of the `__init__` function that
        are not part of the abstract class to avoid redundancies.
        """
        pass

    def _precompute_and_fit_clf(
        self,
        id_clf,
        X_full,
        y_full,
        idx_train,
        idx_cand,
        idx_eval,
        fit_clf=True,
    ):
        """Fit `id_clf` on the training indices (subclasses may precompute
        probability caches before delegating here)."""
        if fit_clf:
            id_clf.fit(idx_train, set_base_clf=True)
        return id_clf

    def _estimate_current_error(
        self, id_clf, idx_train, idx_cand, idx_eval, w_eval
    ):
        """Estimate the error of the current (unmodified) classifier.

        Result must be of float or of shape (len(idx_eval)). The default
        implementation returns 0.0, i.e., no current error is subtracted.
        """
        return 0.0

    def _estimate_error_for_candidate(
        self, uclf, idx_cx, cy, idx_train, idx_cand, idx_eval, w_eval
    ):
        """Estimate the error after simulating label `cy` for candidate
        `idx_cx`. Must be implemented by concrete EER strategies."""
        raise NotImplementedError(
            "Error estimation method must be implemented "
            "by the query strategy."
        )

    def _validate_cost_matrix(self, n_classes):
        """Validate `self.cost_matrix` (defaulting to 1 - identity, i.e.,
        zero-one loss) and store the checked result in `self.cost_matrix_`."""
        cost_matrix = (
            1 - np.eye(n_classes)
            if self.cost_matrix is None
            else self.cost_matrix
        )
        self.cost_matrix_ = check_cost_matrix(cost_matrix, n_classes)

    def _concatenate_samples(
        self,
        X,
        y,
        sample_weight,
        candidates,
        sample_weight_candidates,
        X_eval,
        sample_weight_eval,
    ):
        """Concatenate training, candidate, and evaluation samples.

        Returns `X_full`, `y_full`, `w_full` (training weights, possibly
        extended), `w_eval` (evaluation weights over `X_full`), and the index
        arrays `idx_train`, `idx_cand`, `idx_eval` into `X_full`.
        """
        # Check if candidates are samples if sample_weight_candidates is set
        if (
            candidates is None or candidates.ndim == 1
        ) and sample_weight_candidates is not None:
            raise ValueError(
                "Attribute `sample_weight_candidates` can only "
                "be set if `candidates` consists of samples."
            )
        # TODO: test sample weight_eval - length + column
        if sample_weight is not None and len(X) != len(sample_weight):
            raise ValueError(
                "If `sample_weight` is set, it must have same "
                "length as `X`."
            )
        if sample_weight_candidates is not None and len(candidates) != len(
            sample_weight_candidates
        ):
            raise ValueError(
                "If `sample_weight_candidates` is set, it must have same "
                "length as `candidates`."
            )
        # Concatenate samples
        X_full = X
        y_full = y
        w_full = sample_weight
        idx_train = np.arange(len(X))
        idx_unld = unlabeled_indices(y, self.missing_label_)
        if candidates is None:
            # Candidates default to the unlabeled training samples.
            idx_cand = idx_unld
        elif candidates.ndim == 1:
            # Candidates given as indices into (X, y).
            idx_cand = candidates
        else:
            # Candidates given as samples; append them (unlabeled) to X_full.
            X_full = np.concatenate([X_full, candidates], axis=0)
            y_full = np.concatenate(
                [y_full, np.full(len(candidates), np.nan)], axis=0
            )
            if not (w_full is None and sample_weight_candidates is None):
                if w_full is None:
                    w_full = np.ones(len(X))
                if sample_weight_candidates is None:
                    sample_weight_candidates = np.ones(len(candidates))
                w_full = np.concatenate(
                    [w_full, sample_weight_candidates], axis=0
                )
            idx_cand = np.arange(len(X), len(X_full))
        if X_eval is None:
            # Evaluate the risk on the training samples.
            idx_eval = idx_train
            if sample_weight_eval is None:
                w_eval = np.ones(len(X_full))
            else:
                if len(sample_weight_eval) != len(idx_eval):
                    raise ValueError(
                        "If `sample_weight_eval` is set but "
                        "`X_eval` is None, then it should have "
                        "same size as `X`"
                    )
                w_eval = np.zeros(len(X_full))
                w_eval[idx_eval] = sample_weight_eval
        else:
            # Append the (unlabeled) evaluation samples to X_full.
            X_full = np.concatenate([X_full, X_eval], axis=0)
            y_full = np.concatenate(
                [y_full, np.full(len(X_eval), np.nan)], axis=0
            )
            idx_eval = np.arange(len(X_full) - len(X_eval), len(X_full))
            w_eval = np.ones(len(X_full))
            if sample_weight_eval is not None:
                if len(sample_weight_eval) != len(idx_eval):
                    raise ValueError(
                        "If `sample_weight_eval` and `X_eval` "
                        "are set, then `sample_weight_eval` "
                        "should have len(X_eval)"
                    )
                w_eval[idx_eval] = sample_weight_eval
                if w_full is not None:
                    w_full = np.concatenate([w_full, sample_weight_eval], axis=0)
        return X_full, y_full, w_full, w_eval, idx_train, idx_cand, idx_eval

    def _risk_estimation(
        self, prob_true, prob_pred, cost_matrix, sample_weight
    ):
        """Compute the weighted risk given true/predicted class information.

        `prob_true` and `prob_pred` may each be either a 1d array of class
        indices or a 2d array of class-membership probabilities; all four
        combinations are supported and reduced to a weighted sum of costs.
        """
        if prob_true.ndim == 1 and prob_pred.ndim == 1:
            # Both are hard class labels: look up the cost per sample.
            cost_est = cost_matrix[prob_true, :][
                range(len(prob_true)), prob_pred
            ]
            return np.sum(sample_weight * cost_est)
        elif prob_true.ndim == 1 and prob_pred.ndim == 2:
            # True labels hard, predictions soft: expected cost over preds.
            cost_est = cost_matrix[prob_true, :]
            return np.sum(
                sample_weight[:, np.newaxis]
                * prob_pred
                * cost_est[np.newaxis, :]
            )
        elif prob_true.ndim == 2 and prob_pred.ndim == 1:
            # True labels soft, predictions hard: expected cost over truth.
            cost_est = cost_matrix[:, prob_pred].T
            return np.sum(
                sample_weight[:, np.newaxis]
                * prob_true
                * cost_est[np.newaxis, :]
            )
        else:
            # Both soft: outer product yields the joint class probabilities.
            prob_mat = (
                prob_true[:, :, np.newaxis] @ prob_pred[:, np.newaxis, :]
            )
            return np.sum(
                sample_weight[:, np.newaxis, np.newaxis]
                * prob_mat
                * cost_matrix[np.newaxis, :, :]
            )

    def _logloss_estimation(self, prob_true, prob_pred):
        """Compute the (unnormalized) cross-entropy between `prob_true` and
        `prob_pred`; eps guards against log(0)."""
        return -np.sum(prob_true * np.log(prob_pred + np.finfo(float).eps))
class MonteCarloEER(ExpectedErrorReduction):
    """This class implements the expected error method from [1] that uses a
    Monte-Carlo approach to estimate the error.

    Therefore, it implements the following two steps:
    - simulating the acquisition of every candidate x label pair by
      refitting the classifier with that pair added
    - determining some kind of risk for the new classifier

    Parameters
    ----------
    method : string, optional (default='misclassification_loss')
        The optimization method. Possible values are 'misclassification_loss'
        and 'log_loss'.
    cost_matrix: array-like, shape (n_classes, n_classes), optional
    (default=None)
        Cost matrix with `cost_matrix[i,j]` defining the cost of predicting
        class `j` for a sample with the actual class `i`.
        Used for misclassification loss and ignored for log loss.
    subtract_current : bool, optional (default=False)
        If True, the current error estimate is subtracted from the simulated
        score. This might be helpful to define a stopping criterion.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or np.random.RandomState
        The random state to use.

    References
    ----------
    [1] Roy, N., & McCallum, A. (2001). Toward optimal active learning through
    monte carlo estimation of error reduction. ICML, (pp. 441-448)."""

    def __init__(
        self,
        method="misclassification_loss",
        cost_matrix=None,
        subtract_current=False,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        # A mapping between candidates and X is not required as the risk is
        # evaluated on a separate evaluation set.
        super().__init__(
            enforce_mapping=False,
            cost_matrix=cost_matrix,
            missing_label=missing_label,
            random_state=random_state,
        )
        self.method = method
        self.subtract_current = subtract_current

    def _validate_init_params(self):
        """Validate `method`, `subtract_current`, and their interplay with
        `cost_matrix`."""
        super()._validate_init_params()
        # Validate method.
        if not isinstance(self.method, str):
            raise TypeError(
                "{} is an invalid type for method. Type {} is "
                "expected".format(type(self.method), str)
            )
        if self.method not in ["misclassification_loss", "log_loss"]:
            raise ValueError(
                f"Supported methods are `misclassification_loss`, or "
                f"`log_loss`, the given one is: {self.method}"
            )
        check_type(self.subtract_current, "subtract_current", bool)
        if self.method == "log_loss" and self.cost_matrix is not None:
            raise ValueError(
                "`cost_matrix` must be None if `method` is set to `log_loss`"
            )

    def _estimate_current_error(
        self, id_clf, idx_train, idx_cand, idx_eval, w_eval
    ):
        """Estimate the error of the current classifier on the evaluation
        samples (only if `subtract_current` is set)."""
        if self.subtract_current:
            probs = id_clf.predict_proba(idx_eval)
            if self.method == "misclassification_loss":
                # Cost-sensitive Bayes-optimal predictions.
                preds = np.argmin(np.dot(probs, self.cost_matrix_), axis=1)
                err = self._risk_estimation(
                    probs, preds, self.cost_matrix_, w_eval[idx_eval]
                )
            elif self.method == "log_loss":
                err = self._logloss_estimation(probs, probs)
            return err
        else:
            return super()._estimate_current_error(
                id_clf, idx_train, idx_cand, idx_eval, w_eval
            )

    def _estimate_error_for_candidate(
        self, id_clf, idx_cx, cy, idx_train, idx_cand, idx_eval, w_eval
    ):
        """Estimate the error after simulating label `cy` for candidate
        `idx_cx` via a partial refit of the base classifier."""
        id_clf.partial_fit(idx_cx, cy, use_base_clf=True, set_base_clf=False)
        probs = id_clf.predict_proba(idx_eval)
        if self.method == "misclassification_loss":
            preds = np.argmin(np.dot(probs, self.cost_matrix_), axis=1)
            err = self._risk_estimation(
                probs, preds, self.cost_matrix_, w_eval[idx_eval]
            )
        elif self.method == "log_loss":
            err = self._logloss_estimation(probs, probs)
        return err

    def _precompute_and_fit_clf(
        self, id_clf, X_full, y_full, idx_train, idx_cand, idx_eval, fit_clf
    ):
        """Precompute the probability caches needed by this strategy before
        fitting the classifier."""
        id_clf.precompute(idx_train, idx_cand)
        id_clf.precompute(idx_train, idx_eval)
        id_clf.precompute(idx_cand, idx_eval)
        id_clf = super()._precompute_and_fit_clf(
            id_clf,
            X_full,
            y_full,
            idx_train,
            idx_cand,
            idx_eval,
            fit_clf=fit_clf,
        )
        return id_clf
class ValueOfInformationEER(ExpectedErrorReduction):
    """This class implements the expected error method from [1] that estimates
    the value of information. This method can be extended in a way that it also
    implements [2] and [3]. The default parameters describe [1].

    Therefore, it implements the following two steps:
    - simulating the acquisition of every candidate x label pair by
      refitting the classifier with that pair added
    - determining some kind of risk for the new classifier

    Parameters
    ----------
    cost_matrix: array-like, shape (n_classes, n_classes), optional
    (default=None)
        Cost matrix with `cost_matrix[i,j]` defining the cost of predicting
        class `j` for a sample with the actual class `i`.
        Used for misclassification loss and ignored for log loss.
    consider_unlabeled : bool, optional (default=True)
        If True, the error is estimated on the unlabeled samples.
    consider_labeled : bool, optional (default=True)
        If True, the error is estimated on the labeled samples.
    candidate_to_labeled : bool, optional (default=True)
        If True, the candidate with the simulated label is added to the labeled
        set. As this label is considered to be correct, it will be evaluated
        under the `consider_labeled` flag then.
    subtract_current : bool, optional (default=False)
        If True, the current error estimate is subtracted from the simulated
        score. This might be helpful to define a stopping criterion as in [2].
    normalize : bool, optional (default=False)
        If True the error terms are normalized by the number of evaluation
        samples such that the errors represent the average error instead of the
        summed error. This will be done independently for the simulated and the
        current error.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    random_state : int or np.random.RandomState
        The random state to use.

    References
    ----------
    [1] Kapoor, Ashish, Eric Horvitz, and Sumit Basu. "Selective Supervision:
    Guiding Supervised Learning with Decision-Theoretic Active Learning."
    IJCAI. Vol. 7. 2007.
    [2] Joshi, A. J., Porikli, F., & Papanikolopoulos, N. P. (2012).
    Scalable active learning for multiclass image classification.
    IEEE TrPAMI, 34(11), pp. 2259-2273.
    [3] Margineantu, D. D. (2005). Active cost-sensitive learning.
    In IJCAI (Vol. 5, pp. 1622-1623).
    """

    def __init__(
        self,
        cost_matrix=None,
        consider_unlabeled=True,
        consider_labeled=True,
        candidate_to_labeled=True,
        subtract_current=False,
        normalize=False,
        missing_label=MISSING_LABEL,
        random_state=None,
    ):
        # The risk is evaluated on the training samples themselves, hence an
        # exact mapping between candidates and X is required.
        super().__init__(
            enforce_mapping=True,
            cost_matrix=cost_matrix,
            missing_label=missing_label,
            random_state=random_state,
        )
        self.consider_unlabeled = consider_unlabeled
        self.consider_labeled = consider_labeled
        self.candidate_to_labeled = candidate_to_labeled
        self.subtract_current = subtract_current
        self.normalize = normalize

    def _validate_init_params(self):
        """Validate the boolean configuration flags of this strategy."""
        super()._validate_init_params()
        check_type(self.consider_unlabeled, "consider_unlabeled", bool)
        check_type(self.consider_labeled, "consider_labeled", bool)
        check_type(self.candidate_to_labeled, "candidate_to_labeled", bool)
        check_type(self.subtract_current, "subtract_current", bool)
        check_type(self.normalize, "normalize", bool)

    def query(
        self,
        X,
        y,
        clf,
        sample_weight=None,
        fit_clf=True,
        ignore_partial_fit=True,
        candidates=None,
        batch_size=1,
        return_utilities=False,
    ):
        """Determines for which candidate samples labels are to be queried.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data set, usually complete, i.e. including the labeled and
            unlabeled samples.
        y : array-like of shape (n_samples)
            Labels of the training data set (possibly including unlabeled ones
            indicated by self.MISSING_LABEL.
        clf : skactiveml.base.SkactivemlClassifier
            Model implementing the methods `fit` and `predict_proba`.
        fit_clf : bool, optional (default=True)
            Defines whether the classifier should be fitted on `X`, `y`, and
            `sample_weight`.
        ignore_partial_fit : bool, optional (default=True)
            Relevant in cases where `clf` implements `partial_fit`. If True,
            the `partial_fit` function is ignored and `fit` is used instead.
        sample_weight : array-like of shape (n_samples), optional
        (default=None)
            Weights of training samples in `X`.
        candidates : None or array-like of shape (n_candidates), dtype=int or
        array-like of shape (n_candidates, n_features),
        optional (default=None)
            If candidates is None, the unlabeled samples from (X,y) are
            considered as candidates.
            If candidates is of shape (n_candidates) and of type int,
            candidates is considered as the indices of the samples in (X,y).
            If candidates is of shape (n_candidates, n_features), the
            candidates are directly given in candidates (not necessarily
            contained in X). This is not supported by all query strategies.
        batch_size : int, optional (default=1)
            The number of samples to be selected in one AL cycle.
        return_utilities : bool, optional (default=False)
            If true, also return the utilities based on the query strategy.

        Returns
        -------
        query_indices : numpy.ndarray of shape (batch_size)
            The query_indices indicate for which candidate sample a label is
            to queried, e.g., `query_indices[0]` indicates the first selected
            sample.
            If candidates is None or of shape (n_candidates), the indexing
            refers to samples in X.
            If candidates is of shape (n_candidates, n_features), the indexing
            refers to samples in candidates.
        utilities : numpy.ndarray of shape (batch_size, n_samples) or
        numpy.ndarray of shape (batch_size, n_candidates)
            The utilities of samples after each selected sample of the batch,
            e.g., `utilities[0]` indicates the utilities used for selecting
            the first sample (with index `query_indices[0]`) of the batch.
            Utilities for labeled samples will be set to np.nan.
            If candidates is None or of shape (n_candidates), the indexing
            refers to samples in X.
            If candidates is of shape (n_candidates, n_features), the indexing
            refers to samples in candidates.
        """
        # TODO check if candidates are only unlabeled ones if given
        # This strategy evaluates the risk on the training samples, so no
        # separate evaluation set or candidate/evaluation weights are passed.
        return super().query(
            X,
            y,
            clf,
            sample_weight=sample_weight,
            fit_clf=fit_clf,
            ignore_partial_fit=ignore_partial_fit,
            candidates=candidates,
            sample_weight_candidates=None,
            X_eval=None,
            sample_weight_eval=None,
            batch_size=batch_size,
            return_utilities=return_utilities,
        )

    def _estimate_error_for_candidate(
        self, id_clf, idx_cx, cy, idx_train, idx_cand, idx_eval, w_eval
    ):
        """Estimate the error after simulating label `cy` for candidate
        `idx_cx`, split into labeled and unlabeled evaluation terms."""
        id_clf.partial_fit(idx_cx, cy, use_base_clf=True, set_base_clf=False)
        # Handle problem that if only one candidate is remaining, this should
        # be the one to be selected although the error cannot be estimated
        # as there are no instances left for estimating
        le = id_clf._le
        y_eval = id_clf.y[idx_eval]
        idx_labeled = idx_train[
            is_labeled(y_eval, missing_label=self.missing_label_)
        ]
        y_labeled = id_clf.y[idx_labeled]
        idx_unlabeled = idx_train[
            is_unlabeled(y_eval, missing_label=self.missing_label_)
        ]
        if self.candidate_to_labeled:
            # Treat the simulated label as correct: move the candidate from
            # the unlabeled to the labeled evaluation set.
            idx_labeled = np.concatenate([idx_labeled, idx_cx], axis=0)
            y_labeled = np.concatenate([y_labeled, cy], axis=0)
            idx_unlabeled = np.setdiff1d(
                idx_unlabeled, idx_cx, assume_unique=True
            )
        y_labeled_c_id = le.transform(y_labeled)
        err = 0
        norm = 0
        if self.consider_labeled and len(idx_labeled) > 0:
            norm += len(idx_labeled)
            probs = id_clf.predict_proba(idx_labeled)
            err += self._risk_estimation(
                y_labeled_c_id, probs, self.cost_matrix_, w_eval[idx_labeled]
            )
        if self.consider_unlabeled and len(idx_unlabeled) > 0:
            norm += len(idx_unlabeled)
            probs = id_clf.predict_proba(idx_unlabeled)
            err += self._risk_estimation(
                probs, probs, self.cost_matrix_, w_eval[idx_unlabeled]
            )
        if self.normalize:
            if norm == 0:
                return 0.0
            else:
                return err / norm
        else:
            return err

    def _estimate_current_error(
        self, id_clf, idx_train, idx_cand, idx_eval, w_eval
    ):
        """Estimate the error of the current classifier analogously to
        `_estimate_error_for_candidate` (only if `subtract_current`)."""
        # estimate current utility score if required
        # TODO: maybe use function for code below to reduce redundancies
        if self.subtract_current:
            le = id_clf._le
            y_eval = id_clf.y[idx_eval]
            # Pass the strategy's missing_label explicitly (consistent with
            # `_estimate_error_for_candidate`) so non-NaN missing labels work.
            idx_labeled = idx_train[
                is_labeled(y_eval, missing_label=self.missing_label_)
            ]
            y_labeled = id_clf.y[idx_labeled]
            idx_unlabeled = idx_train[
                is_unlabeled(y_eval, missing_label=self.missing_label_)
            ]
            y_labeled_c_id = le.transform(y_labeled)
            err = 0
            norm = 0
            if self.consider_labeled and len(idx_labeled) > 0:
                norm += len(idx_labeled)
                probs = id_clf.predict_proba(idx_labeled)
                err += self._risk_estimation(
                    y_labeled_c_id,
                    probs,
                    self.cost_matrix_,
                    w_eval[idx_labeled],
                )
            if self.consider_unlabeled and len(idx_unlabeled) > 0:
                norm += len(idx_unlabeled)
                probs = id_clf.predict_proba(idx_unlabeled)
                err += self._risk_estimation(
                    probs, probs, self.cost_matrix_, w_eval[idx_unlabeled]
                )
            if self.normalize:
                # Guard against an empty evaluation set (consistent with
                # `_estimate_error_for_candidate`).
                if norm == 0:
                    return 0.0
                else:
                    return err / norm
            else:
                return err
        else:
            return super()._estimate_current_error(
                id_clf, idx_train, idx_cand, idx_eval, w_eval
            )

    def _precompute_and_fit_clf(
        self, id_clf, X_full, y_full, idx_train, idx_cand, idx_eval, fit_clf
    ):
        """Precompute the probability cache for all train-train pairs before
        fitting the classifier."""
        # TODO: replace the following line by more selective precomputation
        # (restrict fit/pred index pairs per consider_labeled /
        # consider_unlabeled / candidate_to_labeled) for efficiency.
        id_clf.precompute(
            idx_train, idx_train, fit_params="all", pred_params="all"
        )
        id_clf = super()._precompute_and_fit_clf(
            id_clf,
            X_full,
            y_full,
            idx_train,
            idx_cand,
            idx_eval,
            fit_clf=fit_clf,
        )
        return id_clf