Source code for icspylab.scatter

import numpy as np
import pandas as pd
from scipy.spatial.distance import mahalanobis
from numpy.linalg import multi_dot
from sklearn.covariance import MinCovDet
from sklearn.utils.validation import check_array
from numba import njit
import warnings
try:
    from icspylab.tcov import tcov_module
except ImportError:
    tcov_module = None



[docs]
class Scatter:
    """
    A class to represent the scatter matrix and its related data.

    Attributes:
        location (ndarray): The mean location of the data.
        scatter (ndarray): The scatter matrix.
        label (str): A label describing the scatter matrix.
    """

    def __init__(self, location, scatter, label):
        """
        Initialize the Scatter object with specified parameters.

        Parameters:
            location (ndarray or None): The mean location of the data.
            scatter (ndarray): The scatter matrix.
            label (str): A label describing the scatter matrix.
        """
        self.location = location
        self.scatter = scatter
        self.label = label




[docs]
def cov(X, location=True):
    """
    Compute the covariance matrix.

    Parameters:
        X (array-like): The data matrix.
        location (bool, default=True) Whether to include the mean location.

    Returns:
        Scatter: An object containing the location and scatter matrix.

    Refernce:
        Refer to numpy.cov

    Example:
        >>> import numpy as np
        >>> from icspylab.scatter import cov
        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
        >>> cov_X = cov(X)
        >>> print(cov_X.scatter)
        [[5.6 3.6]
         [3.6 2.4]]
    """

    # Check inputs
    X = np.asarray(X)
    if X.ndim > 2:
        raise ValueError("X has more than 2 dimensions")
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    p = X.shape[1]

    # Compute the covariance matrix
    cov_cov = np.cov(X, rowvar=False).reshape(p, p)

    # Compute the mean location if required
    cov_loc = X.mean(0) if location else None

    return Scatter(cov_loc, cov_cov, "Covariance")




[docs]
def covW(X, location=True, alpha=1, cf=1):
    """
    Estimates the scatter matrix based on one-step M-estimator using mean and covariance matrix as starting point.
    For more details, check the R documentation of the package ICS (function covW).

    Parameters:
        X (array-like): Data matrix, must be at least bi-variate.
        location (bool, default=True) Whether to include the mean location.
        alpha (float, default=1) Parameter of the one-step M-estimator.
        cf (float, default=1) Consistency factor of the one-step M-estimator.

    Returns:
        Scatter: An object containing the location and weighted scatter matrix.

    Details:
        It is given for a :math:`n` x :math:`p` matrix :math:`X` by:

    :math:`CovW(X) = (1/n) cf \sum_{i=1}^{n} w(D^2(x_i)) (x_i - \overline{x})^T (x_i - \overline{x})`

    where:
        - :math:`n` is the number of observations,
        - :math:`x_i` is the i-th observation vector,
        - :math:`\overline{x}` is the mean vector of all observations,
        - :math:`w(d)= d^{α}` is a non-negative and continuous weight function applied to the squared Mahalanobis distance :math:`D^2(x_i)`.
        - :math:`cf` is a consistency factor

    References:
        - Tukey, J. W. (1977). Exploratory data analysis (Vol. 2, pp. 131-160). Reading, MA: Addison-wesley.
        - Archimbaud, A., Drmac, Z., Nordhausen, K., Radojicic, U. and Ruiz-Gazen, A. (2023). SIAM Journal on Mathematics of Data Science (SIMODS), Vol.5(1):97–121. doi:10.1137/22M1498759.
        - Nordhausen, K., Oja, H., & Tyler, D. E. (2008). Tools for exploring multivariate data: The package ICS. Journal of Statistical Software, 28, 1-31.

    Example:
        >>> import numpy as np
        >>> from icspylab.scatter import covW
        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
        >>> covW_X = covW(X)
        >>> print(covW_X.scatter)
        [[7.77777778 5.        ]
         [5.         3.33333333]]
    """

    # Check inputs
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("X must be a 2D array")
    n, p = X.shape
    if np.isnan(X).any():
        raise ValueError("X must not contain NaN values")
    if p <= 1:
        raise ValueError("X must be at least bi-variate")

    # Calculate the mean location and covariance matrix
    X_means = X.mean(axis=0)
    X_cov = np.cov(X.T)

    # Compute the Mahalanobis distance, square it, and apply the exponent alpha
    distance_maha = np.apply_along_axis(mahalanobis, 1, X, X_means, np.linalg.inv(X_cov))
    distance_maha_square = np.power(distance_maha, 2)
    distance_maha_square_alpha = np.power(distance_maha_square, alpha)

    # Center the data and compute the weighted covariance matrix
    X_centered = X - X_means
    X_covW = cf / n * multi_dot([X_centered.T, np.diag(distance_maha_square_alpha), X_centered])

    location_ = X.mean(0) if location else None
    return Scatter(location_, X_covW, "Weighted Covariance")




[docs]
def covAxis(X, location=True):
    """
    Compute the one-step Tyler shape matrix which internally uses covW with alpha=-1 and cf=p.

    Parameters:
        X (array-like): The data matrix.
        location (bool, default=True) Whether to include the mean location.

    Returns:
        Scatter: An object containing the location and scatter matrix.

    References:
        - Critchley , F., Pires, A. and Amado, C. (2006), Principal axis analysis, Technical Report, 06/14, The Open University Milton Keynes.
        - Tyler, D.E., Critchley, F., Dumbgen, L. and Oja, H. (2009), Invariant co-ordinate selection, Journal of the Royal Statistical Society,Series B, 71, 549–592. <doi:10.1111/j.1467-9868.2009.00706.x>.
        - Nordhausen, K., Oja, H., & Tyler, D. E. (2008). Tools for exploring multivariate data: The package ICS. Journal of Statistical Software, 28, 1-31.

    Example:
        >>> import numpy as np
        >>> from icspylab.scatter import covAxis
        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
        >>> covAxis_X = covAxis(X)
        >>> print(covAxis_X.scatter)
        [[5.6 3.6]
         [3.6 2.4]]
    """

    X = np.asarray(X)
    p = X.shape[1]
    # Directly call covW with given parameters
    covaxis_scatter = covW(X, location, alpha=-1, cf=p)
    covaxis_scatter.label = "CovAxis"

    return covaxis_scatter




[docs]
def cov4(X, location=True):
    """
    Compute a custom weighted covariance matrix (cov4) which internally uses covW with alpha=1 and cf=(1 / (p + 2)).

    Parameters:
        X (array-like): The data matrix.
        location (bool, default=True) Whether to include the mean location.

    Returns:
        Scatter: An object containing the location and custom weighted scatter matrix.

    References:
        - Cardoso, J.F. (1989), Source separation using higher order moments, in Proc. IEEE Conf. on Acoustics, Speech and Signal Processing (ICASSP’89), 2109–2112. <doi:10.1109/ICASSP.1989.266878>.
        - Oja, H., Sirkia, S. and Eriksson, J. (2006), Scatter matrices and independent component analysis, Austrian Journal of Statistics, 35, 175–189.
        - Nordhausen, K., Oja, H., & Tyler, D. E. (2008). Tools for exploring multivariate data: The package ICS. Journal of Statistical Software, 28, 1-31.

    Example:
        >>> import numpy as np
        >>> from icspylab.scatter import cov4
        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
        >>> cov4_X = cov4(X)
        >>> print(cov4_X.scatter)
        [[1.94444444 1.25      ]
         [1.25       0.83333333]]
    """

    X = np.asarray(X)
    p = X.shape[1]
    location_ = X.mean(0) if location else None

    # Directly call covW with given parameters
    cov4_scatter = covW(X, location, alpha=1, cf=(1 / (p + 2)))
    cov4_scatter.label = "Cov4"
    return cov4_scatter




[docs]
def mcd(X, support_fraction=0.25, reweighted=False, **kwargs):
    """
    Wrapper function around scikit learn's implementation of the MCD (Minimum Covariance Determinant) estimator,
    computed using the FastMCD algorithm. The MCD is a robust estimator of location and covariance. It is based on
    selecting a subset of observations (of given size) whose empirical covariance matrix has minimal determinant.
    The covariance estimate is then rescaled to ensure consistency. An optional reweighting step can be applied,
    where observations are weighted according to their Mahalanobis distance from the estimated location. This yields
    the reweighted MCD estimator. In scikit-learn's MinCovDet, both raw (non-reweighted) and reweighted estimates are
    available through the attributes `raw_location_`, `raw_covariance_`, `location_`, and `covariance_`.
    By default, this function returns the raw estimates.
    Fore more information, check out scikit learn's `documentation <https://scikit-learn.org/stable/modules/generated/sklearn.covariance.MinCovDet.html>`_.

    Parameters:
        X (array-like): The data matrix.
        support_fraction (float or None, default=0.25): The proportion of observations to be included in the support of the raw MCD estimate. This corresponds to the `support_fraction` parameter of sklearn.covariance.MinCovDet. By default, it is set to 0.25, which differs from the default used in sklearn.covariance.MinCovDet. If None, the original MinCovDet default value is used: (n_samples + n_features + 1) / (2 * n_samples).
        reweighted (bool, default=False) If True, use the reweighted version of the MCD estimator. Default is False.

    Returns:
        Scatter: An object containing the location and scatter matrix.

    References:
        - Refer to sklearn.covariance.MinCovDet
        - Rousseeuw, P.J. (1984) Least median of squares regression. J. Am Stat Ass, 79:871.
        - A Fast Algorithm for the Minimum Covariance Determinant Estimator, 1999, American Statistical Association and the American Society for Quality, TECHNOMETRICS.

    Example:
        >>> import numpy as np
        >>> from icspylab.scatter import mcd
        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
        >>> mcd_X = mcd(X, support_fraction=None)
        >>> print(mcd_X.scatter)
        [[4.66666667 3.        ]
         [3.         2.        ]]
    """

    # Check inputs
    X = np.asarray(X)
    if X.ndim > 2:
        raise ValueError("X has more than 2 dimensions")
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    n = X.shape[0]
    if n < 2:
        raise ValueError("X must have at least 2 observations")

    mcd_fit = MinCovDet(support_fraction=support_fraction, **kwargs).fit(X)

    if reweighted:
        mcd_loc = mcd_fit.location_
        mcd_cov = mcd_fit.covariance_
    else:
        mcd_loc = mcd_fit.raw_location_
        mcd_cov = mcd_fit.raw_covariance_

    return Scatter(location=mcd_loc, scatter=mcd_cov, label="MCD")



@njit
def _tcov_numba(X, cov_inv, b):
    """Loop over pairs of observations and add their weighted contribution."""

    n, p = X.shape
    V = np.zeros((p, p))
    denominator = 0.0

    for i in range(1, n):
        xi = X[i]
        for j in range(i):
            # Compute difference of current pair of observations
            diff = xi - X[j]
            # Compute squared pairwise Mahalanobis distance r_sq = diff^T @ cov_inv @ diff
            tmp = cov_inv @ diff
            r_sq = np.dot(diff, tmp)
            # Compute weight for current pair of observations
            w = np.exp(b * r_sq)
            # Add weighted contribution of current pair of observations
            for k in range(p):
                for l in range(p):
                    V[k, l] += w * diff[k] * diff[l]
            denominator += w

    return V / denominator



[docs]
def tcov(X, beta=2):
    """
    Computes a pairwise one-step M-estimate of scatter with weights based on pairwise Mahalanobis distances.
    It can be interpreted as a `local` covariance matrix. Without the exponential term (or when beta=0), :math:`T_n` is
    proportional to the classical covariance matrix. The weighting scheme increases the importance of close sample pairs
    and decreases the contribution of pairs that are far apart.
    Note that this estimator is based on pairwise differences and therefore no location estimate is returned.

    Parameters:
        X (array-like):  data
        beta (int or float > 0, default=2): positive numeric value specifying the tuning parameter of the tcov estimator. The optimal value depends on the data but it usually is between 1.5 and 2.5.

    Returns:
        Scatter: An object containing the location(=None) and scatter matrix.

    Details:
        It is given for a :math:`n` x :math:`p` matrix :math:`X` by:

    .. math::

        T_n(\\beta) =
        \\frac{
            \\sum_{i=1}^{n} \\sum_{j=i+1}^{n}
            \\exp\\left(
                -\\frac{\\beta}{2}
                \\lVert X_i - X_j \\rVert_{V_n^{-1}}^2
            \\right)
            (X_i - X_j)(X_i - X_j)^\\top
        }{
            \\sum_{i=1}^{n} \\sum_{j=i+1}^{n}
            \\exp\\left(
                -\\frac{\\beta}{2}
                \\lVert X_i - X_j \\rVert_{V_n^{-1}}^2
            \\right)
        }

    where

    .. math::

        V_n = \\frac{1}{n}
        \\sum_{i=1}^{n}
        (X_i - \\bar X_n)(X_i - \\bar X_n)^\\top

    .. math::

        \\bar X_n = \\frac{1}{n} \\sum_{i=1}^{n} X_i

    .. math::

        \\lVert x \\rVert_M^2 = x^\\top M x


    The computation is optimized with Numba. In the paper, we have :math:`w(x) = exp(-x/2)`. But since we always
    call :math:`w(beta * r^2)`, we instead set :math:`b = -beta/2` and use :math:`w(x) = exp(x)`.

    References:
        - Caussinus, H. and Ruiz-Gazen, A. (1993) Projection Pursuit and Generalized Principal Component Analysis. In Morgenthaler, S., Ronchetti, E., Stahel, W.A. (eds.) New Directions in Statistical Data Analysis and Robustness, 35-46. Monte Verita, Proceedings of the Centro Stefano Franciscini Ascona Series. Springer-Verlag.
        - Caussinus, H. and Ruiz-Gazen, A. (1995) Metrics for Finding Typical Structures by Means of Principal Component Analysis. In Data Science and its Applications, 177-192. Academic Press.

    Example:
        >>> import numpy as np
        >>> from icspylab.scatter import tcov
        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
        >>> tcov_X = tcov(X)
        >>> print(tcov_X.scatter)
        [[5.03250611 3.2351825 ]
         [3.2351825  2.15678833]]
    """

    # Check types
    X = np.asarray(X, dtype=np.float64)
    if X.ndim > 2:
        raise ValueError("X has more than 2 dimensions")
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    X = np.ascontiguousarray(X)

    n = X.shape[0]
    if n < 2:
        raise ValueError("X must have at least 2 observations")

    beta = float(beta)

    # Initialize b and the inverse covariance
    b = -beta / 2.0
    cov_inv = np.linalg.inv(np.cov(X, rowvar=False))

    tcov_X = _tcov_numba(X, cov_inv, b)

    return Scatter(location=None, scatter=tcov_X, label="Tcov")



def _norm_mu_V(a, B, A):
    """
    Computes the norm used to define convergence as in Arslan et al. (1995).
    """

    A_inv = np.linalg.inv(A)
    BA_inv = B @ A_inv
    square = a.T @ A_inv @ a + np.trace(BA_inv @ BA_inv)
    return np.sqrt(square)


def _alg3(X, df, mu_init, V_init, eps, maxiter):
    """
    Computes tM location and scatter according to Algorithm 3 in Kent and Tyler

    Parameters:
        X (np.ndarray(n,p)): data
        df (int >= 1): assumed degrees of freedom of the t-distribution. Default is 1 which corresponds
        to the Cauchy distribution.
        mu_init (np.ndarray(p)): initial value of the location mu
        V_init (np.ndarray(p,p)): initial value of the scatter V
        eps (float): convergence tolerance
        maxiter (int): maximum number of iterations

    Returns:
        tuple: location mu, scatter V, and number of iterations iter
    """

    n, p = X.shape

    # Init parameters
    mu_i = mu_init
    V_i = V_init
    gamma = 1
    differ = np.inf
    iter_count = 0

    while differ > eps:
        iter_count += 1

        # Update weights
        d = np.apply_along_axis(mahalanobis, 1, X, mu_i, np.linalg.inv(V_i))
        w = (df + p) / (df - 1 + (1 / gamma) + (1 / gamma) * (d ** 2))
        # Update estimate mu
        gamma_new = np.mean(w)
        mu_new = np.average(X, axis=0, weights=w) / gamma_new
        # Update estimate V
        X_centered = X - mu_new
        V_new = ((X_centered.T * w) @ X_centered / n) / gamma_new

        # Compute difference
        differ = _norm_mu_V(a=mu_new - mu_i, B=V_new - V_i, A=V_new)
        mu_i, V_i = mu_new, V_new

        if iter_count >= maxiter:
            raise RuntimeError("maxiter reached without convergence")

    return mu_new, V_new, iter_count



[docs]
def tM(X, df=1, mu_init=None, V_init=None, eps=1e-6, maxiter=100):
    """
    Computes the location and scatter for a multivariate t-distribution for a given degree of freedom.
    This function implements the third EM algorithm described in Kent et al. (1994). The norm used to define convergence
    is as in Arslan et al. (1995).

    Parameters:
        X (array-like): data matrix, must be at least bi-variate
        df (int >= 1, default=1) assumed degrees of freedom of the t-distribution. Default is 1 which corresponds
        to the Cauchy distribution.
        mu_init (ndarray(p) or None, default=None) initial value of the location mu
        V_init (ndarray(p,p) or None, default=None) initial value of the scatter V
        eps (float, default=1e-6) convergence tolerance
        maxiter (int, default=100) maximum number of iterations

    Returns:
        tuple: _alg3 output

    References:
        - Kent, J.T., Tyler, D.E. and Vardi, Y. (1994), A curious likelihood identity for the multivariate tdistribution, Communications in Statistics, Simulation and Computation, 23, 441–453. <doi:10.1080/03610919408813180>.
        - Arslan, O., Constable, P.D.L. and Kent, J.T. (1995), Convergence behaviour of the EM algorithm for the multivariate t-distribution, Communications in Statistics, Theory and Methods, 24, 2981–3000. <doi:10.1080/03610929508831664>.
        - Nordhausen, K., Oja, H., & Tyler, D. E. (2008). Tools for exploring multivariate data: The package ICS. Journal of Statistical Software, 28, 1-31.

    Example:
        >>> import numpy as np
        >>> from icspylab.scatter import tM
        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
        >>> tM_X = tM(X)
        >>> print(tM_X.scatter)
        [[4.66666667 3.        ]
         [3.         2.        ]]
    """

    # Check inputs
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("X must be a 2D array")
    n, p = X.shape
    if np.isnan(X).any():
        raise ValueError("X must not contain NaN values")
    if p <= 1:
        raise ValueError("X must be at least bi-variate")

    if df < 1:
        raise ValueError("df (degrees of freedom) must be >= 1")
    if mu_init is not None and mu_init.shape[0] != X.shape[1]:
        raise ValueError("mu_init must have length equal to the number of features in X")
    if V_init is not None and V_init.shape != (X.shape[1], X.shape[1]):
        raise ValueError("V_init must be square with shape (n_features, n_features)")
    if eps <= 0:
        raise ValueError("eps must be positive")
    if maxiter < 1:
        raise ValueError("maxiter must be >= 1")

    # Initialize values for location mu and scatter V if necessary
    if mu_init is None:
        mu_init = X.mean(axis=0)
    if V_init is None:
        V_init = np.cov(X, rowvar=False)

    mu, V, iter = _alg3(X, df, mu_init, V_init, eps, maxiter)

    return Scatter(location=mu, scatter=V, label="tM")



@njit
def _tcovAxis_numba(X, cov_inv):
    """Loop over pairs of observations and add their weighted contribution."""

    n, p = X.shape
    V = np.zeros((p, p))

    for i in range(1, n):
        xi = X[i]
        for j in range(i):
            # Compute difference of current pair of observations
            diff = xi - X[j]
            # Compute squared pairwise Mahalanobis distance r_sq = diff^T @ cov_inv @ diff
            tmp = cov_inv @ diff
            r_sq = np.dot(diff, tmp)

            V += np.outer(diff, diff) / r_sq ** 2

    return 2 / (n * (n-1)) * V


[docs]
def tcovAxis(X):
    """
    Computes a pairwise one-step M-estimate of scatter, similar to tcov, but using the same weights as covAxis.

    Parameters:
        X (array-like): data matrix, must be at least bi-variate.

    Returns:
        Scatter: An object containing the location(=None) and scatter matrix.

    Details:
        It is given for a :math:`n` x :math:`p` matrix :math:`X` by:

    .. math::

        T_n =
        \\frac{2}{n(n-1)}
        \\sum_{i=1}^{n-1} \\sum_{j=i+1}^{n}
        \\frac{
            (y_i - y_j)(y_i - y_j)^\\top
        }{
            \\left(
                (y_i - y_j)^\\top S_n^{-1} (y_i - y_j)
            \\right)^2
        }

    Reference:
        - Tyler, D.E., Critchley, F., Dumbgen, L. and Oja, H. (2009), Invariant co-ordinate selecetion, Journal of the Royal Statistical Society,Series B, 71, 549–592. <doi:10.1111/j.1467-9868.2009.00706.x>.

    Example:
        >>> import numpy as np
        >>> from icspylab.scatter import tcovAxis
        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
        >>> tcovAxis_X = tcovAxis(X)
        >>> print(tcovAxis_X.scatter)
        [[0.98 0.63]
         [0.63 0.42]]
    """

    # Check types
    X = np.asarray(X, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError("X must be a 2D array")
    X = np.ascontiguousarray(X)

    n, p = X.shape
    if n < 2:
        raise ValueError("X must have at least 2 observations")
    if p <= 1:
        raise ValueError("X must be at least bi-variate")

    cov_inv = np.linalg.inv(np.cov(X, rowvar=False))
    tcov_X = _tcovAxis_numba(X, cov_inv)

    return Scatter(location=None, scatter=tcov_X, label="TcovAxis")