# Source code for lightkde.lightkde

"""
Reliable and extremely fast kernel density estimator for one and two-dimensional
samples.

The kernel density estimations here are kept as simple and as separated from the rest
of the code as possible. They do nothing but kernel density estimation. The
motivation for their partial reimplementation is that the existing kernel density
estimators are:
* suboptimal (like scipy where no kernel bandwidth optimization is done), or
* come with a gorilla holding a banana and the entire jungle although only the
    banana is needed.

Do one thing and do it well.

Botev's Matlab codes are the starting point of this implementation as those mostly
follow the above principle.

TODO:
 - [low] add cdf estimate as in ``kde_1d.m``.
 - [high] more thorough input check, mostly shape and type.
 - [high] check the details of ``histc`` in Matlab and ``np.histogram`` to make sure
    that appending a zero to ``sample_hist`` is always valid.
"""

import copy
import logging
from typing import Iterable, Optional, Tuple, Union

import numpy as np
from scipy import fft, optimize
from scipy.stats import gaussian_kde

# Default number of grid points of the 1D estimator (a power of two: 16384).
N_X_VEC = 1 << 14
# Default number of grid points per axis of the 2D estimator (a power of two: 256).
N_ROW_MX = 1 << 8


# ======================================================================================
# 1D
# ======================================================================================
def kde_1d(
    sample_vec: Union[np.ndarray, list],
    n_x_vec: int = N_X_VEC,
    x_min: Optional[Union[int, float]] = None,
    x_max: Optional[Union[int, float]] = None,
    weight_vec: Optional[Union[np.ndarray, list]] = None,
    return_bandwidth: bool = False,
) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray, float]]:
    """
    Reliable and extremely fast kernel density estimator for one-dimensional sample.

    Gaussian kernel is assumed and the bandwidth is chosen automatically.
    Unlike many other implementations, this one is immune to problems caused by
    multimodal densities with widely separated modes. The estimation does not
    deteriorate for multimodal densities, because we never assume a parametric model
    for the sample.

    .. note::

        * The elements of ``sample_vec`` that fall between ``x_min`` and ``x_max`` will
          be treated as the full sample, i.e. the kernel density over ``[x_min, x_max]``
          will integrate to one.

        * If the search for the optimal bandwidth fails, the function falls
          back to ``scipy.stats.gaussian_kde``. In that case the returned bandwidth
          (if requested) is ``nan``.

    Args:
        sample_vec:
            A vector of sample points from which the density estimate is constructed.
            The input is flattened before use.
        n_x_vec:
            The number of ``x_vec`` points used in the uniform discretization of
            the interval ``[x_min, x_max]``. ``n_x_vec`` has to be a power of two. If
            ``n_x_vec`` is not a power of two, then ``n_x_vec`` is rounded up to the
            next power of two, i.e., ``n_x_vec`` is set to
            ``n_x_vec=2**ceil(log2(n_x_vec))``; the default value of ``n_x_vec`` is
            ``n_x_vec=2**14``.
        x_min:
            The lower boundary of the interval over which the density estimate is
            constructed. If ``None`` (default), the sample minimum minus one tenth of
            the sample range is used.
        x_max:
            The upper boundary of the interval over which the density estimate is
            constructed. If ``None`` (default), the sample maximum plus one tenth of
            the sample range is used.
        weight_vec:
            Weights of sample points. This must have the same shape as ``sample_vec``.
            If ``None`` (default), the samples are assumed to be equally weighted.
            Only the values of elements relative to each other matter,
            i.e. multiplying ``weight_vec`` by a positive scalar does not change
            the results.
        return_bandwidth:
            Should the used bandwidth be returned?

    Raises:
        ValueError: If ``weight_vec`` has at least one negative value.

    Warns:
        Root finding failed (Brent's method): Optimal bandwidth finding failed,
            falling back to the rule-of-thumb bandwidth of ``scipy.stats.gaussian_kde``.

    Returns:
        Kernel densities, a vector of length ``n_x_vec`` with the values of
        the density estimate at the grid points (``x_vec``).

        Kernel density grid (``x_vec``), a vector of grid points over which
        the kernel density estimate is computed.

        Optimal bandwidth (Gaussian kernel assumed), returned only if
        ``return_bandwidth`` is ``True``.


    Examples:
        .. code-block:: python

            import numpy as np
            import matplotlib.pyplot as plt
            from lightkde import kde_1d

        .. code-block:: python

            sample_vec = [
                -1.3145, -0.5197, 0.9326, 3.2358, 0.3814,
                -0.3226, 2.1121, 1.1357, 0.4376, -0.0332
            ]
            density_vec, x_vec = kde_1d(sample_vec)

        .. code-block:: python

            sample_vec = np.hstack((np.random.normal(loc=-8, size=100),
                np.random.normal(loc=-3, size=100),
                np.random.normal(loc=7, size=100)))
            density_vec, x_vec = kde_1d(sample_vec)

            plt.subplots()
            plt.plot(x_vec, density_vec)
            plt.show()

    The kde bandwidth selection method is outlined in [1]. This implementation is
    based on the implementation of Daniel B. Smith [2] who based his
    implementation on the Matlab implementation by Zdravko Botev [3].

    References:
        [1] Z. I. Botev, J. F. Grotowski, and D. P. Kroese (2010) Annals of
        Statistics, Volume 38, Number 5, pages 2916-2957.

        [2] https://github.com/Daniel-B-Smith/KDE-for-SciPy/blob/a9982909bbb92a7e243e5fc9a74f957d883f1c5d/kde.py # noqa: E501
        Updated on: 6 Feb 2013.

        [3] https://nl.mathworks.com/matlabcentral/fileexchange/14034-kernel-density-estimator # noqa: E501
        Updated on: 30 Dec 2015.
    """
    sample_vec = np.array(sample_vec).ravel()
    # the number of *distinct* sample points drives the bandwidth selection
    n_sample = len(np.unique(sample_vec))

    # Parameters to set up the x_vec on which to calculate
    n_x_vec = int(2 ** np.ceil(np.log2(n_x_vec)))
    if x_min is None or x_max is None:
        # pad the sample range by 10% on each side that was not specified
        sample_min = np.min(sample_vec)
        sample_max = np.max(sample_vec)
        sample_range = sample_max - sample_min
        x_min = sample_min - sample_range / 10 if x_min is None else x_min
        x_max = sample_max + sample_range / 10 if x_max is None else x_max

    # watch out, scaling of weight_vec
    if weight_vec is not None:
        weight_vec = np.atleast_1d(weight_vec).squeeze()
        if np.any(weight_vec < 0):
            raise ValueError("Argument: weight_vec cannot have negative elements!")
        # rescale so that the weights sum to n_sample (as unit weights would)
        weight_vec = weight_vec / np.sum(weight_vec) * n_sample

    # Range of x_vec
    x_range = x_max - x_min

    # Histogram the sample_vec to get a crude first approximation of the density
    step = x_range / (n_x_vec - 1)
    # the 0.1 * step overshoot makes sure that x_max itself is part of the grid
    x_vec = np.arange(start=x_min, stop=x_max + 0.1 * step, step=step)

    sample_hist, bin_edges = np.histogram(sample_vec, bins=x_vec, weights=weight_vec)
    # for easier comparison with Matlab, the count for [x_vec[-1], +Inf [ is also
    # added, i.e. 0
    sample_hist = np.append(sample_hist, 0)
    sample_hist = sample_hist / n_sample
    # discrete cosine transform of initial sample_vec
    dct_sample = fft.dct(sample_hist, norm=None)

    ic = np.arange(1, n_x_vec, dtype=float) ** 2
    sq_dct_sample = (dct_sample[1:] / 2) ** 2.0

    # The fixed point calculation finds the bandwidth = t_star
    guess = 0.1
    try:
        t_star = optimize.brentq(
            f=fixed_point, a=0, b=guess, args=(n_sample, ic, sq_dct_sample)
        )
    except (ValueError, RuntimeError) as e:
        logging.warning(
            "Failed to find the optimal bandwidth.\n\t"
            f"Root finding (Brent's method) failed with error: {e}.\n\t"
            "We fall back to use ``scipy.stats.gaussian_kde``).\n\t"
            "Please carefully check the results!"
        )
        # t_star = 0.28 * n_x_vec ** (-2 / 5)
        gkde = gaussian_kde(sample_vec, weights=weight_vec)
        density_vec = gkde.evaluate(x_vec)
        if return_bandwidth:
            # no Botev bandwidth is available on the fallback path
            return density_vec, x_vec, np.nan
        else:
            return density_vec, x_vec

    # Smooth the DCTransformed sample_vec using t_star
    sm_dct_sample = dct_sample * np.exp(
        -np.arange(n_x_vec) ** 2 * np.pi**2 * t_star / 2
    )
    # Inverse DCT to get density
    density_vec = fft.idct(sm_dct_sample, norm=None) / x_range
    # t_star lives on the unit interval, scale it back to the data units
    bandwidth = np.sqrt(t_star) * x_range

    # ``np.trapz`` is deprecated since NumPy 2.0 in favour of ``np.trapezoid``;
    # prefer the new name and only fall back to the old one on older NumPy.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    # renormalize so that the estimate integrates to one over [x_min, x_max]
    density_vec = density_vec / trapezoid(density_vec, x_vec)

    if return_bandwidth:
        return density_vec, x_vec, bandwidth
    else:
        return density_vec, x_vec


def fixed_point(t, n_sample, ic, sq_dct_sample):
    """
    Evaluate ``t - zeta * gamma**[l](t)``, the function whose root is the squared
    (unit-interval) bandwidth.

    Args:
        t: Candidate value of the squared bandwidth.
        n_sample: Number of sample points.
        ic: Squared frequency indices.
        sq_dct_sample: Squared (halved) DCT coefficients of the binned sample.

    Returns:
        The value of the fixed point residual at ``t``.
    """
    # NOTE(review): the recursion below starts at s = 7; Botev's ``kde.m`` appears
    # to start one level lower (s = l - 1) -- confirm which one is intended.
    top_order = 7
    # extended precision for the large powers of ``ic``
    ic = np.longdouble(ic)
    n_sample = np.longdouble(n_sample)
    sq_dct_sample = np.longdouble(sq_dct_sample)

    def weighted_norm(order, tau):
        # 2 * pi**(2 * order) * sum(ic**order * a2 * exp(-ic * pi**2 * tau))
        terms = ic**order * sq_dct_sample * np.exp(-ic * np.pi**2 * tau)
        return 2 * np.pi ** (2 * order) * np.sum(terms)

    norm_est = weighted_norm(top_order, t)
    order = top_order
    while order > 1:
        # product of the odd numbers 1, 3, ..., 2 * order - 1
        odd_product = np.prod(range(1, 2 * order, 2)) / np.sqrt(2 * np.pi)
        factor = (1 + (1 / 2) ** (order + 1 / 2)) / 3
        plug_in_t = (2 * factor * odd_product / n_sample / norm_est) ** (
            2 / (3 + 2 * order)
        )
        norm_est = weighted_norm(order, plug_in_t)
        order -= 1

    return t - (2 * n_sample * np.sqrt(np.pi) * norm_est) ** (-2 / 5)


# ======================================================================================
# 2D
# ======================================================================================
def kde_2d(
    sample_mx: Union[np.ndarray, list],
    n_row_mx: int = N_ROW_MX,
    xy_min: Optional[Union[np.ndarray, Iterable]] = None,
    xy_max: Optional[Union[np.ndarray, Iterable]] = None,
    weight_vec: Optional[Union[np.ndarray, list]] = None,
    return_bandwidth: bool = False,
) -> Union[
    Tuple[np.ndarray, np.ndarray, np.ndarray],
    Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray],
]:
    """
    Fast and accurate state-of-the-art bivariate kernel density estimator with
    diagonal bandwidth matrix.

    The kernel is assumed to be Gaussian. The two bandwidth parameters are chosen
    optimally without ever using/assuming a parametric model for the sample or
    any "rules of thumb". Unlike many other procedures, this one is immune to
    accuracy failures in the estimation of multimodal densities with widely separated
    modes.

    Args:
        sample_mx:
            A 2D matrix of sample points from which the density estimate is
            constructed, the matrix must have two columns that represent the two
            coordinates (x,y) of the 2D sample.
        n_row_mx:
            Number of points along each dimension (same for columns) where the
            estimate of the density will be returned, i.e. total number of points is
            ``n_row_mx**2``. Rounded up to the next power of two.
        xy_min:
            The lower x and y boundaries of the interval over which the density
            estimate is constructed. If ``None`` (default), the sample minimum minus
            half of the sample range is used in each dimension.
        xy_max:
            The upper x and y boundaries of the interval over which the density
            estimate is constructed. If ``None`` (default), the sample maximum plus
            half of the sample range is used in each dimension.
        weight_vec:
            Weights of sample points. This must have the same number of
            elements as rows in ``sample_mx``, the same weight is applied to both
            coordinates of the same sample point. If ``None`` (default),
            the samples are assumed to be equally weighted. The absolute value of the
            elements of ``weight_vec`` does not matter, only the values of elements
            relative to each other, i.e. multiplying ``weight_vec`` by a positive
            scalar does not change the results.
        return_bandwidth:
            Should the used bandwidth be returned?

    Raises:
        ValueError: If the number of columns in ``sample_mx`` is not two. If
            ``weight_vec`` has at least one negative value.

    Returns:
        Kernel densities, 2D matrix with the values of the density
        estimate at the grid points formed by ``x_mx`` and ``y_mx``.

        Kernel density grid (``x_mx``), the x coordinates of the grid points
        over which the density estimate is computed in the form of a 2D matrix
        that is the outcome of ``np.meshgrid``.

        Kernel density grid (``y_mx``), the y coordinates of the grid points
        over which the density estimate is computed in the form of a 2D matrix
        that is the outcome of ``np.meshgrid``.

        Optimal bandwidths (Gaussian kernel assumed) in the x and y direction as a
        two-element array, returned only if ``return_bandwidth`` is ``True``.

    .. note::
        To ease testing and debugging the implementation very closely follows [2],
        i.e. [2] is assumed to be correct.

    References:
        [1] Z. I. Botev, J. F. Grotowski, and D. P. Kroese (2010) Annals of
        Statistics, Volume 38, Number 5, pages 2916-2957.

        [2] https://nl.mathworks.com/matlabcentral/fileexchange/17204-kernel-density-estimation.  # noqa: E501
        Updated on: 30 Dec 2015.
    """

    sample_mx = np.atleast_2d(sample_mx)

    if sample_mx.shape[1] != 2:
        raise ValueError(
            f"``sample_mx`` should have exactly two columns but your input has:"
            f" {sample_mx.shape[1]}."
        )

    n_row_mx = int(2 ** np.ceil(np.log2(n_row_mx)))
    n_sample = sample_mx.shape[0]

    if xy_min is None or xy_max is None:
        # pad the sample range by 50% on each side that was not specified
        xy_sample_max = np.max(sample_mx, axis=0)
        xy_sample_min = np.min(sample_mx, axis=0)
        xy_sample_range = xy_sample_max - xy_sample_min
        xy_max = xy_sample_max + xy_sample_range / 2 if xy_max is None else xy_max
        xy_min = xy_sample_min - xy_sample_range / 2 if xy_min is None else xy_min

    # watch out, scaling of weight_vec
    if weight_vec is not None:
        weight_vec = np.atleast_1d(weight_vec).squeeze()
        if np.any(weight_vec < 0):
            raise ValueError("Argument: ``weight_vec`` cannot have negative elements!")
        weight_vec = weight_vec / np.sum(weight_vec) * n_sample

    xy_max = np.atleast_1d(xy_max)
    xy_min = np.atleast_1d(xy_min)

    # map the sample onto the unit square
    scaling = xy_max - xy_min
    transformed_sample = (sample_mx - xy_min) / scaling

    # bin the sample uniformly using regular grid
    initial_sample = hist_2d(
        sample_mx=transformed_sample, n_bin=n_row_mx, weight_vec=weight_vec
    )

    # discrete cosine transform of initial sample
    a = dct2d(initial_sample)

    # compute the optimal bandwidth**2
    ic = np.arange(start=0, stop=n_row_mx, step=1, dtype=float) ** 2
    ac2 = a**2
    t_star = root(
        lambda t: t - evolve(t, n_sample=n_sample, ic=ic, ac2=ac2)[0], n=n_sample
    )

    def func_(s, t):
        return func(s=s, t=t, n_sample=n_sample, ic=ic, ac2=ac2)

    # direction specific squared bandwidths (on the unit square)
    p_02 = func_([0, 2], t_star)
    p_20 = func_([2, 0], t_star)
    p_11 = func_([1, 1], t_star)
    t_y = (
        p_02 ** (3 / 4)
        / (4 * np.pi * n_sample * p_20 ** (3 / 4) * (p_11 + np.sqrt(p_20 * p_02)))
    ) ** (1 / 3)
    t_x = (
        p_20 ** (3 / 4)
        / (4 * np.pi * n_sample * p_02 ** (3 / 4) * (p_11 + np.sqrt(p_20 * p_02)))
    ) ** (1 / 3)

    # smooth the discrete cosine transform of initial sample using t_x and t_y
    n_range = np.arange(0, n_row_mx, dtype=float)
    v1 = np.atleast_2d(np.exp(-(n_range**2) * np.pi**2 * t_x / 2)).T
    v2 = np.atleast_2d(np.exp(-(n_range**2) * np.pi**2 * t_y / 2))
    a_t = np.matmul(v1, v2) * a

    # apply the inverse discrete cosine transform
    density_mx = idct2d(a_t) * (a_t.size / np.prod(scaling))
    # remove any negative density values
    density_mx[density_mx < 0] = np.finfo(float).eps
    x_step = scaling[0] / (n_row_mx - 1)
    y_step = scaling[1] / (n_row_mx - 1)
    # the 0.1 * step overshoot makes sure that the upper boundary is part of the grid
    x_vec = np.arange(start=xy_min[0], stop=xy_max[0] + 0.1 * x_step, step=x_step)
    y_vec = np.arange(start=xy_min[1], stop=xy_max[1] + 0.1 * y_step, step=y_step)
    x_mx, y_mx = np.meshgrid(x_vec, y_vec)
    # scale the unit-square bandwidths back to the data units
    bandwidth = np.sqrt([t_x, t_y]) * scaling

    # transpose so that the density layout matches that of ``np.meshgrid``
    density_mx = density_mx.T

    if return_bandwidth:
        return density_mx, x_mx, y_mx, bandwidth
    else:
        return density_mx, x_mx, y_mx


def evolve(t, n_sample: int, ic, ac2):
    """
    Compute the plug-in time implied by ``t`` and its relative distance from ``t``.

    Args:
        t: Candidate squared bandwidth.
        n_sample: Number of sample points.
        ic: Squared frequency indices.
        ac2: Squared 2D DCT coefficients of the binned sample.

    Returns:
        ``(t - time) / time`` and ``time`` where ``time`` is the plug-in value
        computed from the second order functionals evaluated at ``t``.
    """
    shared = dict(t=t, n_sample=n_sample, ic=ic, ac2=ac2)

    # second order functionals: (0, 2), (2, 0) and twice the mixed one (1, 1)
    total = (
        func(s=[0, 2], **shared)
        + func(s=[2, 0], **shared)
        + 2 * func(s=[1, 1], **shared)
    )
    plug_in_time = (2 * np.pi * n_sample * total) ** (-1 / 3)
    relative_gap = (t - plug_in_time) / plug_in_time
    return relative_gap, plug_in_time


def func(s, t, n_sample, ic, ac2):
    """
    Recursively estimate the functional of order ``s`` (a pair of integers).

    For ``sum(s) > 4`` the functional is evaluated directly at ``t`` with ``psi``.
    Otherwise the two functionals of the next order are computed first, turned into
    a plug-in time, and ``psi`` is evaluated at that time.
    """
    # top of the recursion: evaluate directly at the supplied time
    if sum(s) > 4:
        return psi(s, t, ic, ac2)

    passthrough = dict(n_sample=n_sample, ic=ic, ac2=ac2)
    higher_order_sum = func([s[0] + 1, s[1]], t, **passthrough) + func(
        [s[0], s[1] + 1], t, **passthrough
    )

    order = np.sum(s)
    const = (1 + 1 / 2 ** (order + 1)) / 3
    kernel_factor = k_fun(s[0]) * k_fun(s[1])
    plug_in_time = (-2 * const * kernel_factor / n_sample / higher_order_sum) ** (
        1 / (2 + order)
    )
    return psi(s, plug_in_time, ic, ac2)


def psi(s, time, ic, ac2):
    """
    Evaluate the functional of order ``s = [s_x, s_y]`` at ``time``.

    Args:
        s: Pair of non-negative integer orders.
        time: Smoothing time at which the functional is evaluated.
        ic: Squared frequency indices (1D array).
        ac2: Squared 2D DCT coefficients (square matrix matching ``ic``).

    Returns:
        The scalar value of the functional.
    """
    order = np.sum(s)

    # the zero frequency term gets full weight, all the others half
    half_weights = np.append(1, 0.5 * np.ones(len(ic) - 1))
    damping = np.exp(-ic * np.pi**2 * time) * half_weights

    weights_x = damping * (ic ** s[0])
    weights_y = damping * (ic ** s[1])

    bilinear = (weights_y @ ac2) @ weights_x.T
    return (-1) ** order * bilinear * np.pi ** (2 * order)


def k_fun(s):
    """
    Return ``(-1)**s * (1 * 3 * ... * (2 * s - 1)) / sqrt(2 * pi)``.

    For ``s = 0`` the product is empty, i.e. equals one.
    """
    # odd numbers 1, 3, ..., 2 * s - 1 (empty for s = 0)
    odd_numbers = np.arange(1, 2 * s, 2, dtype=float)
    odd_product = np.prod(odd_numbers)
    return (-1) ** s * odd_product / np.sqrt(2 * np.pi)


def dct2d(sample):
    """
    Two-dimensional discrete cosine transform with halved first row and column.

    The first column is halved first and the first row afterwards, hence the
    element ``[0, 0]`` ends up being quartered.
    """
    coefficients = fft.dctn(sample)
    coefficients[:, 0] /= 2
    coefficients[0, :] /= 2
    return coefficients


def idct2d(sample):
    """
    Two-dimensional inverse discrete cosine transform with doubled first row and
    column (first column doubled first, then the first row).

    The input is left untouched, the scaling is applied to a copy.
    """
    rescaled = copy.deepcopy(sample)
    rescaled[:, 0] *= 2
    rescaled[0, :] *= 2
    return fft.idctn(rescaled)


def hist_2d(sample_mx, n_bin, weight_vec: Union[np.ndarray, list] = None) -> np.ndarray:
    """
    Bin a two-column sample on a regular grid over the unit square.

    Args:
        sample_mx: Sample with one point per row; the first column holds the x and
            the second column the y coordinates.
        n_bin: Number of equally wide bins along each axis, the result has
            ``n_bin`` rows and ``n_bin`` columns.
        weight_vec: Optional weights of the sample points.

    Returns:
        The (weighted) bin counts divided by their total, i.e. the entries sum to
        one.

    :meta private:
    """
    bin_width = 1 / n_bin
    # the 0.1 * bin_width overshoot makes sure that the last edge (1) is included
    edges = np.arange(0, 1 + 0.1 * bin_width, step=bin_width)

    # the same edges are used along both axes
    counts, _, _ = np.histogram2d(
        sample_mx[:, 0], sample_mx[:, 1], bins=edges, weights=weight_vec
    )
    return counts / np.sum(counts)


def root(fun, n):
    """
    Try to find the smallest root of ``fun`` on ``[0, 0.1]``.

    Brent's method is tried on ``[0, tol]``, where the initial ``tol`` grows with the
    (clipped) sample size. Whenever the search fails the interval is doubled. Once
    the interval has grown to ``[0, 0.1]`` the minimizer of ``abs(fun)`` over that
    interval is returned instead.

    Args:
        fun: Scalar function of one scalar argument.
        n: Sample size, it is clipped to ``[50, 1050]`` and only used to set the
            initial search interval.

    Returns:
        The root found by ``scipy.optimize.brentq`` or, if that fails, the minimizer
        of ``abs(fun)`` found by ``scipy.optimize.fminbound``.
    """
    max_tol = 0.1
    # clip the sample size to [50, 1050]
    n = min(max(n, 50), 1050)
    # with the current numbers this is at maximum 0.01
    tol = 10**-12 + 0.01 * (n - 50) / 1000

    while True:
        try:
            return optimize.brentq(f=fun, a=0, b=tol)
        except (ValueError, RuntimeError):
            # ``brentq`` signals a missing sign change with ``ValueError`` and a
            # non-converged search with ``RuntimeError``; both are treated as a
            # failed attempt (as in ``kde_1d``): double the search interval
            tol = min(tol * 2, max_tol)

        # if all else fails
        if tol >= max_tol:
            return optimize.fminbound(func=lambda x: abs(fun(x)), x1=0, x2=max_tol)