Source code for icspylab.comp_select.unimodality

import numpy as np
from scipy.optimize import brent
from scipy.stats import norm, multivariate_normal
import numbers
from .base import ComponentSelect, _validate_nb_select


def _ftu_exact_statistics(X):
    """Reference: Siffer, A., Fouque, P.-A., Termier, A. and Largouët, C. (2018), Are your data gathered?, In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, 2210–2218. <doi:10.1145/3219819.3219994⟩>."""

    def _ftu_exact_ratio(X):
        a, b, c = X.min(), X.mean(), X.max()
        s, fmin, _, _ = brent(
            lambda s: np.var(np.abs(X - s)),
            brack=(a, b, c),
            full_output=True,
        )

        return float(s), float(fmin) / X.var()

    s, ratio = _ftu_exact_ratio(X)

    return s, ratio * 4


def _ftu_approx_statistics(X):
    """Reference: Siffer, A., Fouque, P.-A., Termier, A. and Largouët, C. (2018), Are your data gathered?, In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, 2210–2218. <doi:10.1145/3219819.3219994⟩>."""

    def _ftu_approx_ratio(X):
        if not isinstance(X, np.ndarray):
            X = np.array(X)

        a, b, c = X.min(), X.mean(), X.max()
        s, _, _, _ = brent(
            lambda s: np.var((X - s) ** 2),
            brack=(a, b, c),
            full_output=True,
        )
        return float(s), float(np.var(np.abs(X - s)) / X.var())

    s, ratio = _ftu_approx_ratio(X)

    return s, ratio * 4


def _p_val_hat_T(n, t):
    """Compute the p-value of the DFTU using the test statistic T=min(Phi1, Phi2)"""

    # Marginal distributions of Phi1 and Phi2
    a = 0.273
    d = 1
    sigma = a * (d + 1) ** 2 / np.sqrt(n)
    distrib_Phi = norm(loc=1, scale=sigma)
    p_hat_Phi = distrib_Phi.cdf(t)

    # Join distribution of Phi1 and Phi2
    rho = -0.37
    mean = [1, 1]
    cov = [
        [sigma ** 2, rho * sigma ** 2],
        [rho * sigma ** 2, sigma ** 2]
    ]
    distrib_Phi1_Phi2 = multivariate_normal(mean=mean, cov=cov)
    p_hat_Phi1_Phi2 = distrib_Phi1_Phi2.cdf([t, t])

    p_val_hat = 2 * p_hat_Phi - p_hat_Phi1_Phi2

    return p_val_hat



[docs]
def dftu(x):
    """
    Apply the Double Folding Test of Unimodality (DFTU), a two-step extension of Siffer's Folding Test of Unimodality (FTU). The null hypothesis states that the underlying distribution is unimodal. Small p-values provide evidence against unimodality.

    Parameters:
        x (ndarray): Data

    Returns:
        stat (float): Test statistic
        p_val (float): Associated p-value

    Details:
        The test is based on the statistic:

    .. math::

        T = \\min(\\Phi_1, \\Phi_2)

    :math:`\Phi_1` and :math:`\Phi_2` are obtained from two successive folding steps.

        Hypotheses:

        .. math::

            H_0: T \\geq 1 \\quad \\text{(the distribution is unimodal)}

            H_1: T < 1 \\quad \\text{(the distribution is not unimodal)}

        Small values of :math:`T` provide evidence against unimodality.

    Reference:
        - Becquart, C., Archimbaud, A., Ruiz, A.M., Smida, Z., A Note on the Folding Test of Unimodality: limitations and an improved alternative.
        - Siffer, A., Fouque, P.-A., Termier, A. and Largouët, C. (2018), Are your data gathered?, In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, 2210–2218. <doi:10.1145/3219819.3219994>.

    Example:
        >>> from sklearn.datasets import load_iris
        >>> from icspylab import dftu
        >>> iris = load_iris()
        >>> X = iris.data
        >>> stat, p = dftu(X[:,0])
        >>> print(round(stat, 2), round(p, 2))
        1.15 1.0
    """

    x = np.asarray(x, dtype=np.float64).reshape(-1, 1)
    n = x.shape[0]

    # Phi1
    _, Phi1 = _ftu_exact_statistics(x)
    # Phi2
    s_approx, _ = _ftu_approx_statistics(x)
    x_new = np.abs(x - s_approx)
    _, Phi2 = _ftu_exact_statistics(x_new)

    stat = np.min([Phi1, Phi2])
    p_val = _p_val_hat_T(n, stat)

    return stat, p_val




[docs]
def unimodal_crit(X, W, level=0.05, max_select=None, **kwargs):
    """
    Identifies invariant coordinates that are multimodal using the univariate Fouble Folding Test of Unimodality (DFTU).
    Only the first and last components are investigated.

    Parameters:
        X (ndarray): Data to fit the ICS model, where rows are samples and columns are features.
        W (ndarray): Transformation matrix in which each row contains the coefficients of the linear transformation to the corresponding invariant coordinate.
        level (float, default=0.05) The initial level used to make a decision based on the test p-values.
        max_select (int or None, default=None): Maximum number of components to select.

    Returns:
        dict: Summary of the component selection step

    Example:
        >>> from sklearn.datasets import load_iris
        >>> from icspylab import ICS, unimodal_crit
        >>> iris = load_iris()
        >>> X = iris.data
        >>> ics = ICS(S1="cov", S2="cov4")
        >>> selection_res = unimodal_crit(X=X, W=ics.components_)
        >>> print(selection_res.info)
        {'crit': 'unimodal', 'level': 0.05, 'max_select': 3, 'pvalues': array([9.99998344e-01, 9.97549948e-01, 9.99996927e-01, 2.85058691e-12]), 'adjusted_levels': [0.05, 0.025], 'component_names': ['IC_4']}

    """

    def test_fun(x):
        stat, p = dftu(x)
        return {"statistic": stat, "p_value": p}


    # level validation
    if not isinstance(level, numbers.Real):
        raise TypeError("level must be a real number.")

    if not (0.0 < float(level) < 1.0):
        raise ValueError("level must be between 0 and 1.")

    # max_select validation
    all_comp_names = [f"IC_{i+1}" for i in range(X.shape[1])]
    p = X.shape[1]
    max_select = _validate_nb_select(max_select, p)

    # Transform X to apply the normality tests
    Z = X @ W.T

    # Apply marginal normality tests to all components and select on p-values
    test_pvals = np.array([test_fun(Z[:, j])["p_value"] for j in range(Z.shape[1])])
    comp_signif = test_pvals <= level

    # If none of them are significative
    if comp_signif.sum() == 0:
        selected_component_names = []
        adjusted_levels = [level]

    # Else: we select the components on the extreme left and right while they are not gaussian
    else:
        temp = 1
        selected_component_names = []
        adjusted_levels = [level]

        pvals = test_pvals.copy()
        comps = all_comp_names.copy()

        while temp <= max_select:

            left_pval = pvals[0]
            right_pval = pvals[-1]

            if (left_pval < level) and (left_pval < right_pval):
                selected_component_names.append(comps[0])
                comps = comps[1:]
                pvals = pvals[1:]

            elif right_pval < level:
                selected_component_names.append(comps[-1])
                comps = comps[:-1]
                pvals = pvals[:-1]

            else:
                break

            temp += 1

            # Adjust the alpha level by dividing by the old weight and multiplying by the new one.
            pvals = pvals * temp / (temp - 1)
            adjusted_levels.append(level / temp)

    # Keep only the selected components
    name_to_idx = {name: i for i, name in enumerate(all_comp_names)}
    idx = [name_to_idx[name] for name in selected_component_names]
    components = W[idx, :]

    # ComponentSelect class
    n_components = len(selected_component_names)

    info = {
        "crit": "unimodal",
        "level": level,
        "max_select": max_select,
        "pvalues": test_pvals.copy(),
        "adjusted_levels": adjusted_levels,
        "component_names": selected_component_names
    }

    return ComponentSelect(label="unimodal", components=components, n_components=n_components, component_names=selected_component_names, info=info)