Source code for icspylab.comp_select.normal

import numpy as np
from scipy.stats import shapiro, normaltest, jarque_bera, kurtosistest, skewtest
import numbers
from .base import ComponentSelect, _validate_nb_select


def _normaltest(x):
    """D’Agostino and Pearson’s test."""
    stat, p = normaltest(x)
    return {"statistic": stat, "p_value": p}

def _agostino_test(x):
    """D'Agostino test of skewness in normally distributed data."""
    stat, p = skewtest(x)
    return {"statistic": stat, "p_value": p}

def _anscombe_test(x):
    """Anscombe-Glynn test of kurtosis in normally distributed data."""
    stat, p = kurtosistest(x)
    return {"statistic": stat, "p_value": p}

def _jarque_test(x):
    """Jarque-Bera normality test."""
    stat, p = jarque_bera(x)
    return {"statistic": stat, "p_value": p}

def _shapiro_test(x):
    """Shapiro-Wilk normality test."""
    stat, p = shapiro(x)
    return {"statistic": stat, "p_value": p}


[docs]
def normal_crit(X, W, level=0.05, test="agostino", max_select=None, **kwargs):
    """
    Identifies invariant coordinates that deviate from normality using univariate normality tests. Only the first and
    last components are investigated.

    SciPy implementations are used. The available tests are:
    `normal <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html>`_,
    `agostino <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.skewtest.html>`_,
    `jarque <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.jarque_bera.html>`_,
    `anscombe <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kurtosistest.html>`_, and
    `shapiro <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html>`_.

    Parameters:
        X (ndarray): Data to fit the ICS model, where rows are samples and columns are features.
        W (ndarray): Transformation matrix in which each row contains the coefficients of the linear transformation to the corresponding invariant coordinate.
        level (float, default=0.05) The initial level used to make a decision based on the test p-values.
        test ({'normal', 'agostino', 'jarque', 'anscombe', 'shapiro'}, default='agostino') Name of the normality test to be used. Refer to the SciPy documentation for more information about the tests.
        max_select (int or None, default=None): Maximum number of components to select.

    Returns:
        dict: Summary of the component selection step

    References:
        - Archimbaud, A., Alfons, A., Nordhausen, K., & Ruiz-Gazen, A. (2023). ICSClust: Tandem clustering with invariant coordinate selection.
        - Alfons, A., Archimbaud, A., Nordhausen, K., & Ruiz-Gazen, A. (2024). Tandem clustering with invariant coordinate selection. Econometrics and Statistics. doi:10.1016/j.ecosta.2024.03.002.

    Example:
        >>> from sklearn.datasets import load_iris
        >>> from icspylab import ICS, normal_crit
        >>> iris = load_iris()
        >>> X = iris.data
        >>> ics = ICS(S1="cov", S2="cov4")
        >>> selection_res = normal_crit(X=X, W=ics.components_)
        >>> print(selection_res.info)
        {'crit': 'normal', 'level': 0.05, 'max_select': 3, 'test': 'agostino', 'pvalues': array([0.07492811, 0.19460223, 0.9311222 , 0.00942277]), 'adjusted_levels': [0.05, 0.025], 'component_names': ['IC_4']}

    """

    # test validation
    tests = {
        "normal": _normaltest,
        "agostino": _agostino_test,
        "jarque": _jarque_test,
        "anscombe": _anscombe_test,
        "shapiro": _shapiro_test
    }

    if test not in tests:
        valid = ", ".join(tests.keys())
        raise ValueError(f"test must be one of: {valid}")

    test_fun = tests[test]

    # level validation
    if not isinstance(level, numbers.Real):
        raise TypeError("level must be a real number.")

    if not (0.0 < float(level) < 1.0):
        raise ValueError("level must be between 0 and 1.")

    # max_select validation
    all_comp_names = [f"IC_{i+1}" for i in range(X.shape[1])]
    p = X.shape[1]
    max_select = _validate_nb_select(max_select, p)

    # Transform X to apply the normality tests
    Z = X @ W.T

    # Apply marginal normality tests to all components and select on p-values
    test_pvals = np.array([test_fun(Z[:, j])["p_value"] for j in range(Z.shape[1])])
    comp_signif = test_pvals <= level

    # If none of them are significative
    if comp_signif.sum() == 0:
        selected_component_names = []
        adjusted_levels = [level]

    # Else: we select the components on the extreme left and right while they are not gaussian
    else:
        temp = 1
        selected_component_names = []
        adjusted_levels = [level]

        pvals = test_pvals.copy()
        comps = all_comp_names.copy()

        while temp <= max_select:

            left_pval = pvals[0]
            right_pval = pvals[-1]

            if (left_pval < level) and (left_pval < right_pval):
                selected_component_names.append(comps[0])
                comps = comps[1:]
                pvals = pvals[1:]

            elif right_pval < level:
                selected_component_names.append(comps[-1])
                comps = comps[:-1]
                pvals = pvals[:-1]

            else:
                break

            temp += 1

            # Adjust the alpha level by dividing by the old weight and multiplying by the new one.
            pvals = pvals * temp / (temp - 1)
            adjusted_levels.append(level / temp)

    # Keep only the selected components
    name_to_idx = {name: i for i, name in enumerate(all_comp_names)}
    idx = [name_to_idx[name] for name in selected_component_names]
    components = W[idx, :]

    # ComponentSelect class
    n_components = len(selected_component_names)

    info = {
        "crit": "normal",
        "level": level,
        "max_select": max_select,
        "test": test,
        "pvalues": test_pvals.copy(),
        "adjusted_levels": adjusted_levels,
        "component_names": selected_component_names
    }

    return ComponentSelect(label="normal", components=components, n_components=n_components, component_names=selected_component_names, info=info)