Source code for icspylab.comp_select.median

import numpy as np
from sklearn.utils.validation import check_array
from .base import ComponentSelect, _validate_nb_select


[docs] def median_crit(kurtosis, W, nb_select=None, **kwargs): """ Identifies as interesting the invariant coordinates whose generalized eigenvalues (kurtosis) are the furthermost away from the median of all generalized eigenvalues (kurtosis). Parameters: kurtosis (ndarray): Array of kurtosis values. W (ndarray): Transformation matrix in which each row contains the coefficients of the linear transformation to the corresponding invariant coordinate. nb_select (int or None, default=None): Exact number of components to select. If None (default), number of components to select is the number of variables minus one. Returns: dict: Summary of the component selection step References: - Archimbaud, A., Alfons, A., Nordhausen, K., & Ruiz-Gazen, A. (2023). ICSClust: Tandem clustering with invariant coordinate selection. - Alfons, A., Archimbaud, A., Nordhausen, K., & Ruiz-Gazen, A. (2024). Tandem clustering with invariant coordinate selection. Econometrics and Statistics. doi:10.1016/j.ecosta.2024.03.002. Example: >>> from sklearn.datasets import load_iris >>> from icspylab import ICS, median_crit >>> iris = load_iris() >>> X = iris.data >>> ics = ICS(S1="cov", S2="cov4") >>> X_new = ics.fit_transform(X) >>> selection_res = median_crit(kurtosis=ics.kurtosis_,W=ics.components_) >>> print(selection_res.info) {'crit': 'med', 'nb_select': 3, 'gen_kurtosis': array([1.20739878, 1.0269412 , 0.9292235 , 0.74046722]), 'med_gen_kurtosis': np.float64(0.9780823483964416), 'gen_kurtosis_diff_med': array([0.22931644, 0.04885885, 0.04885885, 0.23761513]), 'component_names': ['IC_4', 'IC_1', 'IC_2']} """ # gen_kurtosis validation gen_kurtosis = check_array( kurtosis, ensure_2d=False, dtype=float, ensure_all_finite=True, ) if gen_kurtosis.ndim != 1: raise ValueError("gen_kurtosis must be 1D.") # nb_select validation p = len(gen_kurtosis) nb_select = _validate_nb_select(nb_select, p) all_comp_names = [f"IC_{i + 1}" for i in range(p)] # Components associated with the furthest eigenvalues from the median med_gen_kurtosis = np.median(gen_kurtosis) gen_kurtosis_diff = np.abs(gen_kurtosis - med_gen_kurtosis) idx_sel = np.argsort(gen_kurtosis_diff)[::-1][: nb_select] selected_component_names = [all_comp_names[i] for i in idx_sel] # Keep only the selected components name_to_idx = {name: i for i, name in enumerate(all_comp_names)} idx = [name_to_idx[name] for name in selected_component_names] components = W[idx, :] # ComponentSelect class n_components = len(selected_component_names) info = { "crit": "med", "nb_select": nb_select, "gen_kurtosis": gen_kurtosis, "med_gen_kurtosis": med_gen_kurtosis, "gen_kurtosis_diff_med": gen_kurtosis_diff, "component_names": selected_component_names } return ComponentSelect(label="med", components=components, n_components=n_components, component_names=selected_component_names, info=info)