"""
Parzen Window Classifier
"""
# Author: Marek Herde <marek.herde@uni-kassel.de>
import numpy as np
import warnings
from sklearn.metrics.pairwise import pairwise_kernels, KERNEL_PARAMS
from sklearn.utils import check_array
from sklearn.utils.validation import (
check_is_fitted,
check_scalar,
)
from ..base import ClassFrequencyEstimator
from ..utils import MISSING_LABEL, compute_vote_vectors, is_labeled
class ParzenWindowClassifier(ClassFrequencyEstimator):
    """ParzenWindowClassifier

    The Parzen window classifier (PWC) is a simple and
    probabilistic classifier. This classifier is based on a non-parametric
    density estimation obtained by applying a kernel function.

    Parameters
    ----------
    classes : array-like of shape (n_classes), default=None
        Holds the label for each class. If none, the classes are determined
        during the fit.
    missing_label : scalar or string or np.nan or None, default=np.nan
        Value to represent a missing label.
    cost_matrix : array-like of shape (n_classes, n_classes), default=None
        Cost matrix with `cost_matrix[i,j]` indicating cost of predicting class
        `classes[j]` for a sample of class `classes[i]`. Can be only set, if
        `classes` is not none.
    class_prior : float or array-like of shape (n_classes,), default=0
        Prior observations of the class frequency estimates. If `class_prior`
        is an array, the entry `class_prior[i]` indicates the non-negative
        prior number of samples belonging to class `classes_[i]`. If
        `class_prior` is a float, `class_prior` indicates the non-negative
        prior number of samples per class.
    metric : str or callable, default='rbf'
        The metric must be a valid kernel defined by the function
        `sklearn.metrics.pairwise.pairwise_kernels`.
    n_neighbors : int or None, default=None
        Number of nearest neighbours. Default is None, which means all
        available samples are considered.
    metric_dict : dict, default=None
        Any further parameters are passed directly to the kernel function.
        For the kernel 'rbf' we allow the use of mean kernel [2] and use
        it when gamma is set to 'mean' (i.e., {'gamma': 'mean'}). While N is
        defined as the labeled data the variance is calculated over all X.
    random_state : int or RandomState instance or None, default=None
        Determines random number for 'predict' method. Pass an int for
        reproducible results across multiple method calls.

    Attributes
    ----------
    classes_ : array-like of shape (n_classes,)
        Holds the label for each class after fitting.
    class_prior : np.ndarray of shape (n_classes)
        Prior observations of the class frequency estimates. The entry
        `class_prior_[i]` indicates the non-negative prior number of samples
        belonging to class `classes_[i]`.
    cost_matrix_ : np.ndarray of shape (classes, classes)
        Cost matrix with `cost_matrix_[i,j]` indicating cost of predicting
        class `classes_[j]` for a sample of class `classes_[i]`.
    X_ : np.ndarray of shape (n_samples, n_features)
        The sample matrix `X` is the feature matrix representing the samples.
    V_ : np.ndarray of shape (n_samples, classes)
        The class labels are represented by counting vectors. An entry `V[i,j]`
        indicates how many class labels of `classes[j]` were provided for
        training sample `X_[i]`.

    References
    ----------
    .. [1] O. Chapelle, "Active Learning for Parzen Window Classifier",
       Proceedings of the Tenth International Workshop Artificial Intelligence
       and Statistics, 2005.
    .. [2] Chaudhuri, A., Kakde, D., Sadek, C., Gonzalez, L., & Kong, S.,
       "The Mean and Median Criteria for Kernel Bandwidth Selection for Support
       Vector Data Description" IEEE International Conference on Data
       Mining Workshops (ICDMW), 2017.
    """

    # Valid values for the `metric` parameter: every kernel supported by
    # `sklearn.metrics.pairwise.pairwise_kernels` plus 'precomputed'.
    METRICS = list(KERNEL_PARAMS.keys()) + ["precomputed"]

    def __init__(
        self,
        n_neighbors=None,
        metric="rbf",
        metric_dict=None,
        classes=None,
        missing_label=MISSING_LABEL,
        cost_matrix=None,
        class_prior=0.0,
        random_state=None,
    ):
        super().__init__(
            classes=classes,
            class_prior=class_prior,
            missing_label=missing_label,
            cost_matrix=cost_matrix,
            random_state=random_state,
        )
        self.metric = metric
        self.n_neighbors = n_neighbors
        self.metric_dict = metric_dict

    def fit(self, X, y, sample_weight=None):
        """Fit the model using X as training data and y as class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The sample matrix `X` is the feature matrix representing the
            samples.
        y : array-like of shape (n_samples)
            It contains the class labels of the training samples.
        sample_weight : array-like of shape (n_samples), default=None
            It contains the weights of the training samples' class labels.
            It must have the same shape as `y`.

        Returns
        -------
        self: ParzenWindowClassifier,
            The ParzenWindowClassifier is fitted on the training data.
        """
        # Check input parameters. After this call, `y` is label-encoded and
        # missing labels are represented by -1.
        X, y, sample_weight = self._validate_data(X, y, sample_weight)

        # Check whether metric is available. The error message lists the full
        # set of valid metrics, including 'precomputed'.
        if self.metric not in ParzenWindowClassifier.METRICS and not callable(
            self.metric
        ):
            raise ValueError(
                "The parameter 'metric' must be callable or "
                "in {}".format(ParzenWindowClassifier.METRICS)
            )

        # Check number of neighbors which must be a positive integer.
        if self.n_neighbors is not None:
            check_scalar(
                self.n_neighbors,
                name="n_neighbors",
                min_val=1,
                target_type=int,
            )

        # Ensure that metric_dict is a Python dictionary. The type check must
        # happen before the dictionary is used below; otherwise a non-dict
        # value would raise a confusing error (or pass a membership test).
        self.metric_dict_ = (
            self.metric_dict if self.metric_dict is not None else {}
        )
        if not isinstance(self.metric_dict_, dict):
            raise TypeError("'metric_dict' must be a Python dictionary.")

        # For the 'rbf' kernel, {'gamma': 'mean'} requests the mean-kernel
        # bandwidth heuristic of [2].
        if (
            "gamma" in self.metric_dict_
            and self.metric_dict_["gamma"] == "mean"
            and self.metric == "rbf"
        ):
            # `y` is encoded, so -1 marks missing labels here (consistent
            # with the `compute_vote_vectors` call below).
            is_lbld = is_labeled(y, missing_label=-1)
            # Use at least N=2 labeled samples to keep the heuristic
            # well-defined (log((N-1)/delta^2) with N >= 2).
            N = np.max([2, np.sum(is_lbld)])
            variance = np.var(X, axis=0)
            n_features = X.shape[1]
            gamma = ParzenWindowClassifier._calculate_mean_gamma(
                N, variance, n_features
            )
            self.metric_dict_["gamma"] = gamma

        self._check_n_features(X, reset=True)

        # Store train samples.
        self.X_ = X.copy()

        # Convert labels to count vectors.
        if self.n_features_in_ is None:
            # No training data available; `predict_freq` handles this case.
            self.V_ = 0
        else:
            self.V_ = compute_vote_vectors(
                y=y,
                w=sample_weight,
                classes=np.arange(len(self.classes_)),
                missing_label=-1,
            )
        return self

    def predict_freq(self, X):
        """Return class frequency estimates for the input samples 'X'.

        Parameters
        ----------
        X: array-like or shape (n_samples, n_features) or shape \
                (n_samples, m_samples) if metric == 'precomputed'
            Input samples.

        Returns
        -------
        F: array-like of shape (n_samples, classes)
            The class frequency estimates of the input samples. Classes are
            ordered according to `classes_`.
        """
        check_is_fitted(self)
        X = check_array(X, force_all_finite=(self.metric != "precomputed"))

        # Predict zeros because of missing training data.
        if self.n_features_in_ is None:
            return np.zeros((len(X), len(self.classes_)))

        # Compute kernel (metric) matrix.
        if self.metric == "precomputed":
            # `X` itself is the kernel matrix between test and train samples.
            K = X
            if np.size(K, 0) != np.size(X, 0) or np.size(K, 1) != np.size(
                self.X_, 0
            ):
                raise ValueError(
                    "The kernel matrix 'X' must have the shape "
                    "(n_test_samples, n_train_samples)."
                )
        else:
            self._check_n_features(X, reset=False)
            K = pairwise_kernels(
                X, self.X_, metric=self.metric, **self.metric_dict_
            )

        # Computing class frequency estimates: either use all training
        # samples or restrict each estimate to the `n_neighbors` training
        # samples with the largest kernel values.
        if self.n_neighbors is None or np.size(self.X_, 0) <= self.n_neighbors:
            F = K @ self.V_
        else:
            # `argpartition` selects the top-`n_neighbors` columns per row
            # in O(n) without a full sort.
            indices = np.argpartition(K, -self.n_neighbors, axis=1)
            indices = indices[:, -self.n_neighbors :]
            F = np.empty((np.size(X, 0), len(self.classes_)))
            for i in range(np.size(X, 0)):
                F[i, :] = K[i, indices[i]] @ self.V_[indices[i], :]
        return F

    @staticmethod
    def _calculate_mean_gamma(
        N, variance, n_features, delta=(np.sqrt(2) * 1e-6)
    ):
        """Compute the mean-criterion bandwidth `gamma` for the RBF kernel.

        Implements the mean criterion of [2] (see class docstring):
        gamma = 0.5 * (N - 1) * ln((N - 1) / delta^2) / (2 * N * sum(variance)).

        Parameters
        ----------
        N : int
            Number of labeled samples (at least 2).
        variance : np.ndarray of shape (n_features,)
            Per-feature variance of the samples.
        n_features : int
            Number of features; used for the zero-variance fallback.
        delta : float, default=sqrt(2)*1e-6
            Tolerance parameter of the mean criterion.

        Returns
        -------
        gamma : float
            The computed kernel bandwidth.
        """
        denominator = 2 * N * np.sum(variance)
        numerator = (N - 1) * np.log((N - 1) / delta**2)
        if denominator <= 0:
            # Degenerate data (zero variance): fall back to the common
            # 1/n_features default and warn the user.
            gamma = 1 / n_features
            warnings.warn(
                "The variance of the provided data is 0. Bandwidth of "
                + f"1/n_features={gamma} is used instead."
            )
        else:
            gamma = 0.5 * numerator / denominator
        return gamma