Source code for skactiveml.visualization._feature_space

import warnings

import numpy as np
from matplotlib import lines, pyplot as plt
from matplotlib.axes import Axes
from sklearn.base import ClassifierMixin
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import (
    check_array,
    check_consistent_length,
    column_or_1d,
)

from ._misc import (
    mesh,
    check_bound,
    _get_boundary_args,
    _get_confidence_args,
    _get_contour_args,
    _get_cmap,
)
from ..base import (
    QueryStrategy,
    SingleAnnotatorPoolQueryStrategy,
    MultiAnnotatorPoolQueryStrategy,
)
from ..exceptions import MappingError
from ..utils import (
    check_scalar,
    unlabeled_indices,
    call_func,
    check_type,
    check_indices,
)



[docs]
def plot_utilities(qs, X, y, candidates=None, **kwargs):
    """Draw the utilities of a single-annotator pool query strategy.

    The query strategy type is validated and the actual plotting is delegated
    to `_general_plot_utilities`.

    Parameters
    ----------
    qs : skactiveml.base.SingleAnnotatorPoolQueryStrategy
        Query strategy whose utilities are visualized.
    X : array-like of shape (n_samples, n_features)
        Training data set, usually complete, i.e., containing labeled as well
        as unlabeled samples.
    y : array-like of shape (n_samples,) or (n_samples, n_annotators)
        Labels belonging to `X`; unlabeled entries are marked by
        `qs.missing_label`.
    candidates : None or array-like of shape (n_candidates), dtype=int or \
            array-like of shape (n_candidates, n_features), default=None
        - `None`: the unlabeled samples of `(X,y)` serve as `candidates`.
        - Shape `(n_candidates,)` and type `int`: `candidates` holds the
          indices of samples in `(X,y)`.
        - Shape `(n_candidates, ...)`: `candidates` holds the candidate
          samples themselves (they need not be part of `X`). Not every
          query strategy supports this.

    Other Parameters
    ----------------
    replace_nan : numeric or None, default=0.0
        Only relevant if plotting over mesh samples is not possible.
        A numeric value is used as the plotted utility of labeled samples;
        with None these samples are ignored.
    ignore_undefined_query_params : bool, default=False
        If True, query parameters unknown to the query function are ignored
        instead of raising an exception.
    feature_bound : array-like of shape [[xmin, ymin], [xmax, ymax]],\
            default=None
        Area in which the utilities are plotted. If candidates is not given,
        bound must not be None. Otherwise, the bound is determined based on
        the data.
    ax : matplotlib.axes.Axes, default=None
        Axis to draw on. Only for y.ndim = 1 (single annotator).
    res : int, default=21
        Resolution of the plot.
    contour_dict : dict, default=None
        Extra parameters for the utility contour.
    **kwargs
        All remaining keyword arguments are forwarded to the query function
        of the query strategy.

    Returns
    -------
    ax : matplotlib.axes.Axes
        The axis on which the utilities were plotted.
    """
    # This entry point accepts single-annotator pool strategies only.
    check_type(qs, "qs", SingleAnnotatorPoolQueryStrategy)
    plotted_ax = _general_plot_utilities(
        qs=qs, X=X, y=y, candidates=candidates, **kwargs
    )
    return plotted_ax




[docs]
def plot_annotator_utilities(qs, X, y, candidates=None, **kwargs):
    """Draw the utilities of a multi-annotator pool query strategy.

    The query strategy type is validated and the actual plotting is delegated
    to `_general_plot_utilities`.

    Parameters
    ----------
    qs : skactiveml.base.MultiAnnotatorPoolQueryStrategy
        Query strategy whose utilities are visualized.
    X : array-like of shape (n_samples, n_features)
        Training data set, usually complete, i.e., containing labeled as well
        as unlabeled samples.
    y : array-like of shape (n_samples,) or (n_samples, n_annotators)
        Labels belonging to `X`; unlabeled entries are marked by
        `qs.missing_label`.
    candidates : None or array-like of shape (n_candidates), dtype=int or \
            array-like of shape (n_candidates, n_features), default=None
        - `None`: the unlabeled samples of `(X,y)` serve as `candidates`.
        - Shape `(n_candidates,)` and type `int`: `candidates` holds the
          indices of samples in `(X,y)`.
        - Shape `(n_candidates, ...)`: `candidates` holds the candidate
          samples themselves (they need not be part of `X`). Not every
          query strategy supports this.

    Other Parameters
    ----------------
    replace_nan : numeric or None, default=0.0
        Only relevant if plotting over mesh samples is not possible.
        A numeric value is used as the plotted utility of labeled samples;
        with None these samples are ignored.
    ignore_undefined_query_params : bool, default=False
        If True, query parameters unknown to the query function are ignored
        instead of raising an exception.
    feature_bound : array-like of shape [[xmin, ymin], [xmax, ymax]],\
            default=None
        Area in which the utilities are plotted. If candidates is not given,
        bound must not be None. Otherwise, the bound is determined based on
        the data.
    axes : array-like of matplotlib.axes.Axes, default=None
        One axis per plotted annotator. Only supported for y.ndim = 2, i.e.,
        a setting with multiple annotators.
    res : int, default=21
        Resolution of the plot.
    contour_dict : dict, default=None
        Extra parameters for the utility contour.
    plot_annotators : None or array-like of shape (n_annotators_to_plot,),\
            default=None
        Indices of the annotators to be plotted; None plots all annotators.
        Only supported for y.ndim = 2, i.e., a setting with multiple
        annotators.
    **kwargs
        All remaining keyword arguments are forwarded to the query function
        of the query strategy.

    Returns
    -------
    axes : array-like of shape (n_annotators_to_plot,)
        The axes on which the utilities were plotted.
    """
    # This entry point accepts multi-annotator pool strategies only.
    check_type(qs, "qs", MultiAnnotatorPoolQueryStrategy)
    plotted_axes = _general_plot_utilities(
        qs=qs, X=X, y=y, candidates=candidates, **kwargs
    )
    return plotted_axes




[docs]
def plot_decision_boundary(
    clf,
    feature_bound,
    ax=None,
    res=21,
    boundary_dict=None,
    confidence=0.75,
    cmap="coolwarm",
    confidence_dict=None,
):
    """Plot the decision boundary of the given classifier.

    Parameters
    ----------
    clf : sklearn.base.ClassifierMixin
        The fitted classifier whose decision boundary is plotted. It must
        implement `predict_proba` or `predict`. If only `predict` is
        available, the confidence contours cannot be drawn and a warning is
        issued when `confidence` is not None.
    feature_bound : array-like of shape [[xmin, ymin], [xmax, ymax]]
        Determines the area in which the boundary is plotted.
    ax : matplotlib.axes.Axes, default=None
        The axis on which the decision boundary is plotted. If None, the
        current axis (`plt.gca()`) is used.
    res : int, default=21
        The resolution of the plot.
    boundary_dict : dict, default=None
        Additional parameters for the boundary contour.
    confidence : scalar or None, default=0.75
        Level of the confidence contours. They are not plotted if confidence
        is None. Must be in the open interval (0.5, 1). For each class, the
        contoured quantity is `p / (p + p_alt)`, where `p` is the posterior
        of that class and `p_alt` the largest posterior among all other
        classes.
    cmap : str or matplotlib.colors.Colormap, default='coolwarm'
        The colormap for the confidence levels.
    confidence_dict : dict, default=None
        Additional parameters for the confidence contour. Must not contain a
        colormap because cmap is used.

    Returns
    -------
    ax : matplotlib.axes.Axes
        The axis on which the boundary was plotted.

    Raises
    ------
    AttributeError
        If `clf` implements neither `predict_proba` nor `predict`.
    """
    check_type(clf, "clf", ClassifierMixin)
    check_scalar(res, "res", int, min_val=1)
    if ax is None:
        ax = plt.gca()
    check_type(ax, "ax", Axes)
    feature_bound = check_bound(bound=feature_bound)

    # Check and convert the colormap
    cmap = _get_cmap(cmap)

    if confidence is not None:
        check_scalar(
            confidence,
            "confidence",
            float,
            min_inclusive=False,
            max_inclusive=False,
            min_val=0.5,
            max_val=1,
        )

    # Update additional arguments
    boundary_args = _get_boundary_args(boundary_dict)
    confidence_args = _get_confidence_args(confidence_dict)

    # Create mesh for plotting
    X_mesh, Y_mesh, mesh_samples = mesh(feature_bound, res)

    # Calculate predictions
    if hasattr(clf, "predict_proba"):
        predictions = clf.predict_proba(mesh_samples)
        classes = np.arange(predictions.shape[1])
    elif hasattr(clf, "predict"):
        if confidence is not None:
            warnings.warn(
                "The given classifier does not implement "
                "'predict_proba'. Thus, the confidence cannot be "
                "plotted."
            )
            confidence = None
        predicted_classes = np.ravel(clf.predict(mesh_samples))
        # Map the predicted labels onto contiguous column indices. Using the
        # raw labels as indices breaks as soon as the labels observed on the
        # mesh are not exactly 0, ..., n_observed - 1 (e.g., only class 1 is
        # predicted, labels are {0, 2}, or labels are not integers).
        observed_labels, column_idx = np.unique(
            predicted_classes, return_inverse=True
        )
        column_idx = np.ravel(column_idx)
        classes = np.arange(len(observed_labels))
        # One-hot encode the hard predictions to mimic class posteriors.
        predictions = np.zeros((len(predicted_classes), len(classes)))
        predictions[np.arange(len(predicted_classes)), column_idx] = 1
    else:
        raise AttributeError(
            "'clf' must implement 'predict' or " "'predict_proba'"
        )

    # Posterior of each class, arranged like the mesh.
    posterior_list = [
        predictions[:, y].reshape(X_mesh.shape) for y in classes
    ]

    norm = plt.Normalize(vmin=min(classes), vmax=max(classes))

    for y in classes:
        posteriors = posterior_list[y]
        # Element-wise maximum of the posteriors of all other classes.
        posteriors_best_alternative = np.zeros_like(posteriors)
        for y2 in np.setdiff1d(classes, [y]):
            posteriors_best_alternative = np.maximum(
                posteriors_best_alternative, posterior_list[y2]
            )

        # Normalize against the strongest competitor such that 0.5 marks
        # the boundary between class `y` and its best alternative.
        posteriors = posteriors / (posteriors + posteriors_best_alternative)
        ax.contour(X_mesh, Y_mesh, posteriors, [0.5], **boundary_args)
        if confidence is not None:
            ax.contour(
                X_mesh,
                Y_mesh,
                posteriors,
                [confidence],
                colors=[cmap(norm(y))],
                **confidence_args,
            )
    return ax




[docs]
def plot_contour_for_samples(
    X,
    values,
    replace_nan=0.0,
    feature_bound=None,
    ax=None,
    res=21,
    contour_dict=None,
):
    """Plot given per-sample values as a filled contour over a mesh.

    The values are extended to the mesh by a 1-nearest-neighbor regressor
    fitted on `(X, values)`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples for which `values` are given.
    values : array-like of shape (n_samples,)
        Values to plot for samples `X` (may contain np.nan, can be replaced
        or ignored, see `replace_nan`). Infinite entries are treated like
        np.nan.
    replace_nan : numeric or None, default=0.0
        If numeric, nan-values in `values` will be replaced by this number.
        If None, these samples will be ignored.
    feature_bound : array-like of shape [[xmin, ymin], [xmax, ymax]],\
            default=None
        Determines the area in which the contour is plotted. It is validated
        by `check_bound` together with `X`.
    ax : matplotlib.axes.Axes, default=None
        The axis on which the values are plotted. If no axis is given, the
        current axis (`plt.gca()`) will be used instead.
    res : int, default=21
        The resolution of the plot.
    contour_dict : dict, default=None
        Additional parameters for the contour.

    Returns
    -------
    ax : matplotlib.axes.Axes
        The axis on which the values were plotted.
    """
    # Keep the validated array: `X` is indexed with a boolean mask below,
    # which fails for plain (nested) lists.
    X = check_array(X, ensure_2d=True)
    values = check_array(
        values, ensure_2d=False, ensure_all_finite=False, copy=True
    )
    # Handle +/-inf exactly like nan (the array is a copy, see above).
    values[np.isinf(values)] = np.nan

    feature_bound = check_bound(bound=feature_bound, X=X)

    X_mesh, Y_mesh, mesh_samples = mesh(feature_bound, res)

    if ax is None:
        ax = plt.gca()

    if replace_nan is None:
        # Drop all samples without a valid value.
        valid_idx = ~np.isnan(values)
        X = X[valid_idx]
        values = values[valid_idx]
    else:
        values = np.nan_to_num(values, nan=replace_nan)

    contour_args = _get_contour_args(contour_dict)

    # Each mesh point takes the value of its nearest sample.
    neighbors = KNeighborsRegressor(n_neighbors=1)
    neighbors.fit(X, values)

    scores = neighbors.predict(mesh_samples).reshape(X_mesh.shape)
    ax.contourf(X_mesh, Y_mesh, scores, **contour_args)
    return ax




[docs]
def plot_stream_training_data(
    ax,
    X,
    y,
    queried_indices,
    classes,
    feature_bound,
    unlabeled_color="grey",
    cmap="coolwarm",
    alpha=0.2,
    linewidth=3,
    plot_cand_highlight=True,
):
    """Draw the samples of a data stream as horizontal lines over time.

    Sample `t` is drawn as a horizontal line at height `X[t]` that starts at
    position `t` and ends at position `len(X) - 1`. Queried samples are
    colored according to their label, all other samples use
    `unlabeled_color`.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        The axis to which the lines are added.
    X : array-like of shape (n_samples, 1)
        Samples of the stream in order of arrival.
    y : array-like of shape (n_samples,)
        Labels of the samples (possibly including unlabeled ones).
    queried_indices : array-like of shape (n_samples,)
        Indicates which samples in `X` have been queried.
    classes : array-like of shape (n_classes,)
        Holds the label for each class. Its minimum and maximum define the
        normalization of the colormap.
    feature_bound : array-like of shape [[xmin, ymin], [xmax, ymax]]
        Bound of the plotted area. This function only reads
        `feature_bound[0][1]`, which is used as the right end of the
        candidate highlight line.
    unlabeled_color : str, default='grey'
        The color for the samples that have not been queried.
    cmap : str or matplotlib.colors.Colormap, default='coolwarm'
        The colormap used to color queried samples by their label.
    alpha : scalar, default=0.2
        Alpha value used for blending. Queried samples are drawn with twice
        this value.
    linewidth : float, default=3
        Line width in points. The highlight line is twice as wide.
    plot_cand_highlight : bool, default=True
        Whether the newest sample (`X[-1]`) is highlighted by an additional
        line.

    Returns
    -------
    data_lines : list of matplotlib.lines.Line2D
        The lines that were added to `ax`.
    """
    # Input validation; the validated copies are not needed afterwards.
    column_or_1d(X)
    check_array(y, ensure_2d=False, ensure_all_finite="allow-nan")
    check_consistent_length(X, y)
    check_array(queried_indices, ensure_2d=False)
    check_array(classes, ensure_2d=False)
    check_type(unlabeled_color, "unlabeled_color", str)
    check_type(plot_cand_highlight, "plot_cand_highlight", bool)
    check_type(ax, "ax", Axes)

    cmap = _get_cmap(cmap)
    norm = plt.Normalize(vmin=min(classes), vmax=max(classes))

    # Color of the newest sample, which is the one that may be highlighted.
    if queried_indices[-1]:
        highlight_color = cmap(norm(y[-1]))
    else:
        highlight_color = unlabeled_color

    data_lines = []
    if plot_cand_highlight:
        highlight = lines.Line2D(
            [0, feature_bound[0][1]],
            [X[-1], X[-1]],
            c=highlight_color,
            alpha=alpha,
            linewidth=linewidth * 2,
        )
        data_lines.append(highlight)

    last_t = len(X) - 1
    for t, (x_t, was_queried, label) in enumerate(
        zip(X, queried_indices, y)
    ):
        # Queried samples are drawn on top and less transparent.
        if was_queried:
            color_t, zorder_t, alpha_t = cmap(norm(label)), 3, alpha * 2
        else:
            color_t, zorder_t, alpha_t = unlabeled_color, 2, alpha
        sample_line = lines.Line2D(
            [t, last_t],
            [x_t, x_t],
            zorder=zorder_t,
            color=color_t,
            alpha=alpha_t,
            linewidth=linewidth,
        )
        data_lines.append(sample_line)

    for line in data_lines:
        ax.add_line(line)
    return data_lines




[docs]
def plot_stream_decision_boundary(
    ax,
    t_x,
    plot_step,
    clf,
    X,
    pred_list,
    color="k",
    res=25,
):
    """Draw the decision boundary of a stream classifier over time.

    The classifier is evaluated on `res` equally spaced values between the
    minimum and maximum of `X`. The predictions are appended to `pred_list`
    and, if enough predictions are available, the 0.5 contour of the
    collected predictions is drawn.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        The axis on which the decision boundary is plotted.
    t_x : int
        The position of the newest instance on the x axis.
    plot_step : int
        The interval in which the clf should predict new samples.
    clf : sklearn.base.ClassifierMixin
        The fitted classifier whose decision boundary is plotted.
    X : array-like of shape (n_samples, 1)
        Training data set, usually complete, i.e., including the labeled and
        unlabeled samples.
    pred_list : list
        Predictions of the classifier collected in previous steps. The list
        is extended in place by the predictions of this call; its first
        entry is skipped when drawing.
    color : str, default='k'
        The color for the decision boundary.
    res : int, default=25
        The resolution of the plot.

    Returns
    -------
    ax : matplotlib.axes.Axes
        The axis on which the boundary was plotted.
    pred_list : list
        The (extended) list of classifier predictions.
    """
    X = column_or_1d(X)
    check_array(pred_list, ensure_2d=False, ensure_min_samples=0)
    check_scalar(t_x, "t_x", int, min_val=0)
    check_scalar(plot_step, "plot_step", int, min_val=1)
    check_type(ax, "ax", Axes)
    check_type(clf, "clf", ClassifierMixin)

    # Grid over the feature values and over the time steps plotted so far.
    feature_grid = np.linspace(np.min(X), np.max(X), res)
    n_steps = t_x // plot_step
    time_grid = np.arange(1, n_steps + 1) * plot_step
    t_mesh, x_mesh = np.meshgrid(time_grid, feature_grid)

    # Record the predictions of the current classifier on the feature grid.
    newest_predictions = np.array(
        [clf.predict(feature_grid.reshape([-1, 1]))]
    )
    pred_list.extend(newest_predictions)

    # Too few recorded predictions: nothing is drawn.
    if len(pred_list) <= 2:
        return ax, pred_list

    if np.sum(pred_list) > 0:
        ax.contour(
            t_mesh,
            x_mesh,
            np.array(pred_list[1:]).T,
            levels=[0.5],
            colors=color,
        )
    return ax, pred_list



def _general_plot_utilities(qs, X, y, candidates=None, **kwargs):
    """Plot the utility for the given query strategy.

    If `candidates` is None, the utilities are first queried for the samples
    of a mesh over the feature space. If that fails, or if `candidates` is
    given, the utilities are queried for the candidates and plotted via
    `plot_contour_for_samples`.

    Parameters
    ----------
    qs : skactiveml.base.QueryStrategy
        The query strategy for which the utility is plotted.
    X : array-like of shape (n_samples, 2)
        Training data set, usually complete, i.e. including the labeled and
        unlabeled samples. Must have exactly two features.
    y : array-like of shape (n_samples, ) or (n_samples, n_annotators)
        Labels of the training data set (possibly including unlabeled ones
        indicated by `qs.missing_label`).
    candidates : None or array-like of shape (n_candidates), dtype=int or \
            array-like of shape (n_candidates, n_features), default=None
        - If `candidates` is `None`, the unlabeled samples from
          `(X,y)` are considered as `candidates`.
        - If `candidates` is of shape `(n_candidates,)` and of type
          `int`, `candidates` is considered as the indices of the
          samples in `(X,y)`.
        - If `candidates` is of shape `(n_candidates, ...)`, the
          candidate samples are directly given in `candidates` (not
          necessarily contained in `X`). This is not supported by all
          query strategies.

    Other Parameters
    ----------------
    replace_nan : numeric or None, default=0.0
        Only used if plotting with mesh samples is not possible.
        If numeric, the utility of labeled samples will be plotted with
        value `replace_nan`. If None, these samples will be ignored.
    ignore_undefined_query_params : bool, default=False
        If True, query parameters that are not defined in the query function
        are ignored and will not raise an exception.
    feature_bound : array-like of shape [[xmin, ymin], [xmax, ymax]],\
            default=None
        Determines the area in which the boundary is plotted. If candidates is
        not given, bound must not be None. Otherwise, the bound is determined
        based on the data.
    ax : matplotlib.axes.Axes, default=None
        The axis on which the utility is plotted. Only if y.ndim = 1 (single
        annotator).
    axes : array-like of matplotlib.axes.Axes, default=None
        The axes on which the utilities for the annotators are plotted. Only
        supported for y.ndim = 2 (multi annotator).
    res : int, default=21
        The resolution of the plot.
    contour_dict : dict, default=None
        Additional parameters for the utility contour.
    plot_annotators : None or array-like of shape (n_annotators_to_plot,),\
            default=None
        Contains the indices of the annotators to be plotted. If it is None,
        all annotators are plotted. Only supported for y.ndim = 2
        (multi annotator).
    **kwargs
        Remaining keyword arguments are passed to the query function of the
        query strategy.

    Returns
    -------
    axes : matplotlib.axes.Axes or array-like of shape (n_annotators_to_plot,)
        The single axis in the single-annotator setting, otherwise the axes
        on which the utilities were plotted.

    Raises
    ------
    ValueError
        If `X` does not have two features, if `ax` is used in the
        multi-annotator setting, or if the number of `axes` does not match
        the number of annotators to be plotted.
    TypeError
        If `axes` or `plot_annotators` is used in the single-annotator
        setting.
    """
    replace_nan = kwargs.pop("replace_nan", 0.0)
    ignore_undefined_query_params = kwargs.pop(
        "ignore_undefined_query_params", False
    )
    feature_bound = kwargs.pop("feature_bound", None)
    ax = kwargs.pop("ax", None)
    axes = kwargs.pop("axes", None)
    res = kwargs.pop("res", 21)
    contour_dict = kwargs.pop("contour_dict", None)
    plot_annotators = kwargs.pop("plot_annotators", None)

    check_type(qs, "qs", QueryStrategy)
    X = check_array(X, allow_nd=False, ensure_2d=True)
    if X.shape[1] != 2:
        raise ValueError("Samples in `X` must have 2 features.")

    # Check labels
    y = check_array(y, ensure_2d=False, ensure_all_finite="allow-nan")
    check_consistent_length(X, y)

    # `n_annotators is None` marks the single-annotator setting below.
    if y.ndim == 2:
        if plot_annotators is None:
            n_annotators = y.shape[1]
            plot_annotators = np.arange(n_annotators)
        else:
            plot_annotators = column_or_1d(plot_annotators)
            check_indices(plot_annotators, y, dim=1)
            n_annotators = len(plot_annotators)
    else:
        n_annotators = None
        if plot_annotators is not None:
            raise TypeError(
                "`plot_annotator` can be only used in the multi-annotator "
                "setting."
            )
        else:
            plot_annotators = np.arange(1)
    if n_annotators is None:
        if axes is not None:
            raise TypeError(
                "`axes` can be only used in the multi-annotator setting. "
                "Use `ax` instead."
            )
        if ax is None:
            axes = np.array([plt.subplots(1, 1)[1]])
        else:
            check_type(ax, "ax", Axes)
            axes = np.array([ax])
    else:
        if ax is not None:
            raise ValueError(
                "`ax` can be only used in the single-annotator setting. "
                "Use `axes` instead."
            )
        if axes is None:
            # `squeeze=False` guarantees an array of axes even if only one
            # annotator is plotted; otherwise a bare `Axes` is returned,
            # which supports neither `len` nor iteration.
            axes = plt.subplots(1, n_annotators, squeeze=False)[1].ravel()
        else:
            for ax_ in axes:
                check_type(ax_, "ax", Axes)

    if n_annotators is not None and len(axes) != n_annotators:
        raise ValueError(
            "`axes` must contain one `Axes` object for each "
            "annotator to be plotted (indicated by `plot_annotators`)."
        )

    # ensure that utilities are returned
    kwargs["return_utilities"] = True

    if candidates is None:
        # plot mesh
        try:
            check_scalar(res, "res", int, min_val=1)
            feature_bound = check_bound(bound=feature_bound, X=X)

            X_mesh, Y_mesh, mesh_samples = mesh(feature_bound, res)

            contour_args = _get_contour_args(contour_dict)

            if ignore_undefined_query_params:
                _, utilities = call_func(
                    qs.query, X=X, y=y, candidates=mesh_samples, **kwargs
                )
            else:
                _, utilities = qs.query(
                    X=X, y=y, candidates=mesh_samples, **kwargs
                )

            for a_idx, ax_ in zip(plot_annotators, axes):
                if n_annotators is not None:
                    utilities_a_idx = utilities[0, :, a_idx]
                else:
                    utilities_a_idx = utilities[0, :]
                utilities_a_idx = utilities_a_idx.reshape(X_mesh.shape)
                ax_.contourf(X_mesh, Y_mesh, utilities_a_idx, **contour_args)

            if n_annotators is None:
                return axes[0]
            else:
                return axes

        except MappingError:
            # The mesh samples cannot be used as candidates: silently fall
            # back to the unlabeled samples.
            candidates = unlabeled_indices(y, missing_label=qs.missing_label)
        except Exception as err:
            # Only regular errors trigger the fallback; `KeyboardInterrupt`
            # and `SystemExit` must propagate instead of being swallowed.
            warnings.warn(
                f"Unable to create utility plot with mesh because "
                f"of the following error. Trying plotting over "
                f"candidates. \n\n Unexpected {err!r}"
            )
            candidates = unlabeled_indices(y, missing_label=qs.missing_label)

    candidates = check_array(
        candidates,
        allow_nd=False,
        ensure_2d=False,
        ensure_all_finite="allow-nan",
    )
    if candidates.ndim == 1:
        # Candidates are indices: the utilities refer to samples in `X`.
        X_utils = X
        candidates = check_indices(candidates, X)
    else:
        # Candidates are samples: the utilities refer to the candidates.
        X_utils = candidates

    if ignore_undefined_query_params:
        _, utilities = call_func(
            qs.query, X=X, y=y, candidates=candidates, **kwargs
        )
    else:
        _, utilities = qs.query(X=X, y=y, candidates=candidates, **kwargs)

    for a_idx, ax_ in zip(plot_annotators, axes):
        if n_annotators is not None:
            utilities_a_idx = utilities[0, :, a_idx]
        else:
            utilities_a_idx = utilities[0, :]
        plot_contour_for_samples(
            X_utils,
            utilities_a_idx,
            replace_nan=replace_nan,
            feature_bound=feature_bound,
            ax=ax_,
            res=res,
            contour_dict=contour_dict,
        )

    if n_annotators is None:
        return axes[0]
    else:
        return axes