Source code for skactiveml.visualization._feature_space

import warnings

import numpy as np
from matplotlib import lines, pyplot as plt
from matplotlib.axes import Axes
from sklearn.base import ClassifierMixin
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import (
    check_array,
    check_consistent_length,
    column_or_1d,
)

from ._misc import (
    mesh,
    check_bound,
    _get_boundary_args,
    _get_confidence_args,
    _get_contour_args,
    _get_cmap,
)
from ..base import (
    QueryStrategy,
    SingleAnnotatorPoolQueryStrategy,
    MultiAnnotatorPoolQueryStrategy,
)
from ..exceptions import MappingError
from ..utils import (
    check_scalar,
    unlabeled_indices,
    call_func,
    check_type,
    check_indices,
)


[docs]def plot_utilities(qs, X, y, candidates=None, **kwargs): """Plot the utility for the given single-annotator query strategy. Parameters ---------- qs : skactiveml.base.SingleAnnotatorPoolQueryStrategy The query strategy for which the utility is plotted. X : array-like of shape (n_samples, n_features) Training data set, usually complete, i.e. including the labeled and unlabeled samples. y : array-like of shape (n_samples, ) or (n_samples, n_annotators) Labels of the training data set (possibly including unlabeled ones indicated by self.MISSING_LABEL). candidates : None or array-like of shape (n_candidates,), dtype=int or array-like of shape (n_candidates, n_features), optional (default=None) If `candidates` is None, the unlabeled samples from (X,y) are considered as candidates. If `candidates` is of shape (n_candidates,) and of type int, candidates is considered as the indices of the samples in (X,y). If `candidates` is of shape (n_candidates, n_features), the candidates are directly given in candidates (not necessarily contained in X). This is not supported by all query strategies. Other Parameters ---------------- replace_nan : numeric or None, optional (default=0.0) Only used if plotting with mesh instances is not possible. If numeric, the utility of labeled instances will be plotted with value `replace_nan`. If None, these samples will be ignored. ignore_undefined_query_params : bool, optional (default=False) If True, query parameters that are not defined in the query function are ignored and will not raise an exception. feature_bound : array-like of shape [[xmin, ymin], [xmax, ymax]], optional (default=None) Determines the area in which the boundary is plotted. If candidates is not given, bound must not be None. Otherwise, the bound is determined based on the data. ax : matplotlib.axes.Axes, optional (default=None) The axis on which the utility is plotted. Only if y.ndim = 1 (single annotator). res : int, optional (default=21) The resolution of the plot. contour_dict : dict, optional (default=None) Additional parameters for the utility contour. **kwargs Remaining keyword arguments are passed the query function of the query strategy. Returns ------- ax : matplotlib.axes.Axes The axis on which the utilities were plotted. """ check_type(qs, "qs", SingleAnnotatorPoolQueryStrategy) return _general_plot_utilities( qs=qs, X=X, y=y, candidates=candidates, **kwargs )
[docs]def plot_annotator_utilities(qs, X, y, candidates=None, **kwargs): """Plot the utility for the given query strategy. Parameters ---------- qs : skactiveml.base.QueryStrategy The query strategy for which the utility is plotted. X : array-like of shape (n_samples, n_features) Training data set, usually complete, i.e. including the labeled and unlabeled samples. y : array-like of shape (n_samples, ) or (n_samples, n_annotators) Labels of the training data set (possibly including unlabeled ones indicated by self.MISSING_LABEL). candidates : None or array-like of shape (n_candidates,), dtype=int or array-like of shape (n_candidates, n_features), optional (default=None) If `candidates` is None, the unlabeled samples from (X,y) are considered as candidates. If `candidates` is of shape (n_candidates,) and of type int, candidates is considered as the indices of the samples in (X,y). If `candidates` is of shape (n_candidates, n_features), the candidates are directly given in candidates (not necessarily contained in X). This is not supported by all query strategies. Other Parameters ---------------- replace_nan : numeric or None, optional (default=0.0) Only used if plotting with mesh instances is not possible. If numeric, the utility of labeled instances will be plotted with value `replace_nan`. If None, these samples will be ignored. ignore_undefined_query_params : bool, optional (default=False) If True, query parameters that are not defined in the query function are ignored and will not raise an exception. feature_bound : array-like of shape [[xmin, ymin], [xmax, ymax]], optional (default=None) Determines the area in which the boundary is plotted. If candidates is not given, bound must not be None. Otherwise, the bound is determined based on the data. axes : array-like of matplotlib.axes.Axes, optional (default=None) The axes on which the utilities for the annotators are plotted. Only supported for y.ndim = 2 (multi annotator). res : int, optional (default=21) The resolution of the plot. contour_dict : dict, optional (default=None) Additional parameters for the utility contour. plot_annotators : None or array-like of shape (n_annotators_to_plot,), optional (default=None) Contains the indices of the annotators to be plotted. If it is None, all annotators are plotted. Only supported for y.ndim = 2 (multi annotator). **kwargs Remaining keyword arguments are passed the query function of the query strategy. Returns ------- axes : array-like of shape (n_annotators_to_plot,) The axes on which the utilities were plotted. """ check_type(qs, "qs", MultiAnnotatorPoolQueryStrategy) return _general_plot_utilities( qs=qs, X=X, y=y, candidates=candidates, **kwargs )
[docs]def plot_decision_boundary( clf, feature_bound, ax=None, res=21, boundary_dict=None, confidence=0.75, cmap="coolwarm", confidence_dict=None, ): """Plot the decision boundary of the given classifier. Parameters ---------- clf: Sklearn classifier The fitted classifier whose decision boundary is plotted. If confidence is not None, the classifier must implement the predict_proba function. feature_bound: array-like, [[xmin, ymin], [xmax, ymax]] Determines the area in which the boundary is plotted. ax: matplotlib.axes.Axes or List, optional (default=None) The axis on which the decision boundary is plotted. If ax is a List, each entry has to be an `matplotlib.axes.Axes`. res: int, optional (default=21) The resolution of the plot. boundary_dict: dict, optional (default=None) Additional parameters for the boundary contour. confidence: scalar | None, optional (default=0.75) The confidence interval plotted with dashed lines. It is not plotted if confidence is None. Must be in the open interval (0.5, 1). The value stands for the ratio best class / second best class. cmap: str | matplotlib.colors.Colormap, optional (default='coolwarm_r') The colormap for the confidence levels. confidence_dict: dict, optional (default=None) Additional parameters for the confidence contour. Must not contain a colormap because cmap is used. Returns ------- ax: matplotlib.axes.Axes or List The axis on which the boundary was plotted or the list of axis if ax was a list. """ check_type(clf, "clf", ClassifierMixin) check_scalar(res, "res", int, min_val=1) if ax is None: ax = plt.gca() check_type(ax, "ax", Axes) feature_bound = check_bound(bound=feature_bound) # Check and convert the colormap cmap = _get_cmap(cmap) if confidence is not None: check_scalar( confidence, "confidence", float, min_inclusive=False, max_inclusive=False, min_val=0.5, max_val=1, ) # Update additional arguments boundary_args = _get_boundary_args(boundary_dict) confidence_args = _get_confidence_args(confidence_dict) # Create mesh for plotting X_mesh, Y_mesh, mesh_instances = mesh(feature_bound, res) # Calculate predictions if hasattr(clf, "predict_proba"): predictions = clf.predict_proba(mesh_instances) classes = np.arange(predictions.shape[1]) elif hasattr(clf, "predict"): if confidence is not None: warnings.warn( "The given classifier does not implement " "'predict_proba'. Thus, the confidence cannot be " "plotted." ) confidence = None predicted_classes = clf.predict(mesh_instances) classes = np.arange(len(np.unique(predicted_classes))) predictions = np.zeros((len(predicted_classes), len(classes))) for idx, y in enumerate(predicted_classes): predictions[idx, y] = 1 else: raise AttributeError( "'clf' must implement 'predict' or " "'predict_proba'" ) posterior_list = [] for y in classes: posteriors = predictions[:, y].reshape(X_mesh.shape) posterior_list.append(posteriors) norm = plt.Normalize(vmin=min(classes), vmax=max(classes)) for y in classes: posteriors = posterior_list[y] posteriors_best_alternative = np.zeros_like(posteriors) for y2 in np.setdiff1d(classes, [y]): posteriors_best_alternative = np.max( [posteriors_best_alternative, posterior_list[y2]], axis=0 ) posteriors = posteriors / (posteriors + posteriors_best_alternative) ax.contour(X_mesh, Y_mesh, posteriors, [0.5], **boundary_args) if confidence is not None: ax.contour( X_mesh, Y_mesh, posteriors, [confidence], colors=[cmap(norm(y))], **confidence_args, ) return ax
[docs]def plot_contour_for_samples( X, values, replace_nan=0.0, feature_bound=None, ax=None, res=21, contour_dict=None, ): """Plot the utility for the given query strategy. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data set, usually complete, i.e. including the labeled and unlabeled samples. values : array-like of shape (n_samples) Values to plot for samples `X` (may contain np.nan, can be replaced or ignored, see `replace_nan`). replace_nan : numeric or None, optional (default=0.0) If numeric, nan-values in `values` will be replaced by this number. If None, these samples will be ignored. feature_bound : array-like, [[xmin, ymin], [xmax, ymax]] Determines the area in which the boundary is plotted. If candidates is not given, bound must not be None. Otherwise, the bound is determined based on the data. ax : matplotlib.axes.Axes, optional (default=None) The axis on which the utility is plotted. If no axis is given, the current axis (`plt.gca()`) will be used instead. res : int, optional (default=21) The resolution of the plot. contour_dict : dict, optional (default=None) Additional parameters for the utility contour. Returns ------- matplotlib.axes.Axes: The axis on which the utility was plotted. """ check_array(X, ensure_2d=True) values = check_array( values, ensure_2d=False, force_all_finite=False, copy=True ) values[np.isinf(values)] = np.nan feature_bound = check_bound(bound=feature_bound, X=X) X_mesh, Y_mesh, mesh_instances = mesh(feature_bound, res) if ax is None: ax = plt.gca() if replace_nan is None: valid_idx = ~np.isnan(values) X = X[valid_idx] values = values[valid_idx] else: values = np.nan_to_num(values, nan=replace_nan) contour_args = _get_contour_args(contour_dict) neighbors = KNeighborsRegressor(n_neighbors=1) neighbors.fit(X, values) scores = neighbors.predict(mesh_instances).reshape(X_mesh.shape) ax.contourf(X_mesh, Y_mesh, scores, **contour_args) return ax
[docs]def plot_stream_training_data( ax, X, y, queried_indices, classes, feature_bound, unlabeled_color="grey", cmap="coolwarm", alpha=0.2, linewidth=3, plot_cand_highlight=True, ): """Plot the utility for the given query strategy. Parameters ---------- ax : matplotlib.axes.Axes The axis on which the utility is plotted. Only if y.ndim = 1 (single annotator). X : array-like of shape (n_samples, 1) Training data set, usually complete, i.e. including the labeled and unlabeled samples. y : array-like of shape (n_samples, ) Labels of the training data set (possibly including unlabeled ones indicated by self.MISSING_LABEL). queried_indices : array-like of shape (n_samples,) Indicates which instances from candidates have been queried. classes : array-like of shape (n_classes) Holds the label for each class. feature_bound : array-like of shape [[xmin, ymin], [xmax, ymax]] Determines the area in which the boundary is plotted. If candidates is not given, bound must not be None. Otherwise, the bound is determined based on the data. unlabeled_color: str | matplotlib.colors.Colormap, optional (default='grey') The color for the unlabled samples. cmap: str | matplotlib.colors.Colormap, optional (default='coolwarm_r') The colormap for the confidence levels. alpha: scalar Set the alpha value used for blending - not supported on all backends. linewidth: float Set the line width in points. plot_cand_highlight: bool The indicator to higlight the current candidate. Returns ------- axes : array-like of shape (n_annotators_to_plot,) The axes on which the utilities were plotted. """ column_or_1d(X) check_array(y, ensure_2d=False, force_all_finite="allow-nan") check_consistent_length(X, y) check_array(queried_indices, ensure_2d=False) check_array(classes, ensure_2d=False) check_type(unlabeled_color, "unlabeled_color", str) check_type(plot_cand_highlight, "plot_cand_highlight", bool) check_type(ax, "ax", Axes) data_lines = [] cmap = _get_cmap(cmap) norm = plt.Normalize(vmin=min(classes), vmax=max(classes)) highlight_color = ( cmap(norm(y[-1])) if queried_indices[-1] else unlabeled_color ) if plot_cand_highlight: data_lines.append( lines.Line2D( [0, feature_bound[0][1]], [X[-1], X[-1]], c=highlight_color, alpha=alpha, linewidth=linewidth * 2, ) ) for t, (x_t, a, y_t) in enumerate(zip(X, queried_indices, y)): line_color = cmap(norm(y_t)) if a else unlabeled_color zorder = 3 if a else 2 alpha_tmp = alpha * 2 if a else alpha data_lines.append( lines.Line2D( [t, len(X) - 1], [x_t, x_t], zorder=zorder, color=line_color, alpha=alpha_tmp, linewidth=linewidth, ) ) for d_line in data_lines: ax.add_line(d_line) return data_lines
[docs]def plot_stream_decision_boundary( ax, t_x, plot_step, clf, X, pred_list, color="k", res=25, ): """Plot the decision boundary of the given classifier. Parameters ---------- ax: matplotlib.axes.Axes or List The axis on which the decision boundary is plotted. If ax is a List, each entry has to be an `matplotlib.axes.Axes`. t_x: int The position of the newest instance for the x axies. plot_step: int The interval in which the clf should predict new samples. clf: Sklearn classifier The fitted classifier whose decision boundary is plotted. X : array-like of shape (n_samples, 1) Training data set, usually complete, i.e. including the labeled and unlabeled samples. pred_list: array-like of shape (n_samples, ) The list containing classifier prediction for the last steps. color: str | matplotlib.colors.Colormap, optional (default='k') The color for the decision boundary. res : int, optional (default=25) The resolution of the plot. Returns ------- ax: matplotlib.axes.Axes or List The axis on which the boundary was plotted or the list of axis if ax was a list. pred_list: array-like of shape (n_samples, ) The list containing classifier prediction for the last steps. """ X = column_or_1d(X) check_array(pred_list, ensure_2d=False, ensure_min_samples=0) check_scalar(t_x, "t_x", int, min_val=0) check_scalar(plot_step, "plot_step", int, min_val=1) check_type(ax, "ax", Axes) check_type(clf, "clf", ClassifierMixin) x_vec = np.linspace(np.min(X), np.max(X), res) t_vec = np.arange(1, t_x // plot_step + 1) * plot_step t_mesh, x_mesh = np.meshgrid(t_vec, x_vec) predictions = np.array([clf.predict(x_vec.reshape([-1, 1]))]) pred_list.extend(predictions) if len(pred_list) > 2 and np.sum(pred_list) > 0: ax.contour( t_mesh, x_mesh, np.array(pred_list[1:]).T, levels=[0.5], colors=color, ) return ax, pred_list
def _general_plot_utilities(qs, X, y, candidates=None, **kwargs): """Plot the utility for the given query strategy. Parameters ---------- qs : skactiveml.base.QueryStrategy The query strategy for which the utility is plotted. X : array-like of shape (n_samples, n_features) Training data set, usually complete, i.e. including the labeled and unlabeled samples. y : array-like of shape (n_samples, ) or (n_samples, n_annotators) Labels of the training data set (possibly including unlabeled ones indicated by self.MISSING_LABEL). candidates : None or array-like of shape (n_candidates,), dtype=int or array-like of shape (n_candidates, n_features), optional (default=None) If `candidates` is None, a mesh with the specified resolution is generated and considered as candidates. If `candidates` is of shape (n_candidates,) and of type int, candidates is considered as the indices of the samples in (X,y). If `candidates` is of shape (n_candidates, n_features), the candidates are directly given in candidates (not necessarily contained in X). This is not supported by all query strategies. Other Parameters ---------------- replace_nan : numeric or None, optional (default=0.0) Only used if plotting with mesh instances is not possible. If numeric, the utility of labeled instances will be plotted with value `replace_nan`. If None, these samples will be ignored. ignore_undefined_query_params : bool, optional (default=False) If True, query parameters that are not defined in the query function are ignored and will not raise an exception. feature_bound : array-like of shape [[xmin, ymin], [xmax, ymax]], optional (default=None) Determines the area in which the boundary is plotted. If candidates is not given, bound must not be None. Otherwise, the bound is determined based on the data. ax : matplotlib.axes.Axes, optional (default=None) The axis on which the utility is plotted. Only if y.ndim = 1 (single annotator). axes : array-like of matplotlib.axes.Axes, optional (default=None) The axes on which the utilities for the annotators are plotted. Only supported for y.ndim = 2 (multi annotator). res : int, optional (default=21) The resolution of the plot. contour_dict : dict, optional (default=None) Additional parameters for the utility contour. plot_annotators : None or array-like of shape (n_annotators_to_plot,), optional (default=None) Contains the indices of the annotators to be plotted. If it is None, all annotators are plotted. Only supported for y.ndim = 2 (multi annotator). **kwargs Remaining keyword arguments are passed the query function of the query strategy. Returns ------- axes : array-like of shape (n_annotators_to_plot,) The axes on which the utilities were plotted. """ replace_nan = kwargs.pop("replace_nan", 0.0) ignore_undefined_query_params = kwargs.pop( "ignore_undefined_query_params", False ) feature_bound = kwargs.pop("feature_bound", None) ax = kwargs.pop("ax", None) axes = kwargs.pop("axes", None) res = kwargs.pop("res", 21) contour_dict = kwargs.pop("contour_dict", None) plot_annotators = kwargs.pop("plot_annotators", None) check_type(qs, "qs", QueryStrategy) X = check_array(X, allow_nd=False, ensure_2d=True) if X.shape[1] != 2: raise ValueError("Samples in `X` must have 2 features.") # Check labels y = check_array(y, ensure_2d=False, force_all_finite="allow-nan") check_consistent_length(X, y) if y.ndim == 2: if plot_annotators is None: n_annotators = y.shape[1] plot_annotators = np.arange(n_annotators) else: plot_annotators = column_or_1d(plot_annotators) check_indices(plot_annotators, y, dim=1) n_annotators = len(plot_annotators) else: n_annotators = None if plot_annotators is not None: raise TypeError( "`plot_annotator` can be only used in the multi-annotator " "setting." ) else: plot_annotators = np.arange(1) if n_annotators is None: if axes is not None: raise TypeError( "`axes` can be only used in the multi-annotator setting. " "Use `ax` instead." ) if ax is None: axes = np.array([plt.subplots(1, 1)[1]]) else: check_type(ax, "ax", Axes) axes = np.array([ax]) else: if ax is not None: raise ValueError( "`ax` can be only used in the single-annotator setting. " "Use `axes` instead." ) if axes is None: axes = plt.subplots(1, n_annotators)[1] else: [check_type(ax_, "ax", Axes) for ax_ in axes] if n_annotators is not None and len(axes) != n_annotators: raise ValueError( "`axes` must contain one `Axes` object for each " "annotator to be plotted (indicated by `plot_annotators`)." ) # ensure that utilities are returned kwargs["return_utilities"] = True if candidates is None: # plot mesh try: check_scalar(res, "res", int, min_val=1) feature_bound = check_bound(bound=feature_bound, X=X) X_mesh, Y_mesh, mesh_instances = mesh(feature_bound, res) contour_args = _get_contour_args(contour_dict) if ignore_undefined_query_params: _, utilities = call_func( qs.query, X=X, y=y, candidates=mesh_instances, **kwargs ) else: _, utilities = qs.query( X=X, y=y, candidates=mesh_instances, **kwargs ) for a_idx, ax_ in zip(plot_annotators, axes): if n_annotators is not None: utilities_a_idx = utilities[0, :, a_idx] else: utilities_a_idx = utilities[0, :] utilities_a_idx = utilities_a_idx.reshape(X_mesh.shape) ax_.contourf(X_mesh, Y_mesh, utilities_a_idx, **contour_args) if n_annotators is None: return axes[0] else: return axes except MappingError: candidates = unlabeled_indices(y, missing_label=qs.missing_label) except BaseException as err: warnings.warn( f"Unable to create utility plot with mesh because " f"of the following error. Trying plotting over " f"candidates. \n\n Unexpected {err.__repr__()}" ) candidates = unlabeled_indices(y, missing_label=qs.missing_label) candidates = check_array( candidates, allow_nd=False, ensure_2d=False, force_all_finite="allow-nan", ) if candidates.ndim == 1: X_utils = X candidates = check_indices(candidates, X) else: X_utils = candidates if ignore_undefined_query_params: _, utilities = call_func( qs.query, X=X, y=y, candidates=candidates, **kwargs ) else: _, utilities = qs.query(X=X, y=y, candidates=candidates, **kwargs) for a_idx, ax_ in zip(plot_annotators, axes): if n_annotators is not None: utilities_a_idx = utilities[0, :, a_idx] else: utilities_a_idx = utilities[0, :] plot_contour_for_samples( X_utils, utilities_a_idx, replace_nan=replace_nan, feature_bound=feature_bound, ax=ax_, res=res, contour_dict=contour_dict, ) if n_annotators is None: return axes[0] else: return axes