Source code for icspylab.distributions

import numpy as np
from numpy.linalg import eigh
from scipy.stats import gamma, multivariate_t
import warnings


# --- Multivariate distributions ---


[docs]
def unifsphere(n, p):
    """
    Generate n vectors uniformly distributed on the unit sphere in dimension p.

    Parameters:
        n (int): Number of observations.
        p (int): Dimension of the sphere.

    Returns:
        ndarray (n, p)

    Example:
        >>> from icspylab.distributions import unifsphere
        >>> X = unifsphere(n=100, p=2)
        >>> print(X.shape)
    (100, 2)
    """
    x = np.random.normal(size=(n, p))
    norm = np.linalg.norm(x, axis=1, keepdims=True)
    return x / norm




[docs]
def multivariate_powerexp(n, scatter, location=None, beta=1):
    """
    Generate n observations from a multivariate power exponential distribution.

    Parameters:
        n (int): Number of observations.
        scatter (array-like): Symmetric positive definite scatter matrix (p x p).
        location (array-like): Mean vector of dimension p.
        beta (float):  Shape parameter (> 0). beta = 1 corresponds to the multivariate normal distribution, beta < 1 corresponds to heavier tails.

    Returns:
        ndarray (n, p)

    References:
        - Oja, H. (2010), Multivariate Nonparametric Methods with R, Springer.
        - Nordhausen, K., & Oja, H. (2011). Multivariate L1 statistical methods: The package MNM. Journal of Statistical Software, 43, 1-28.

    Example:
        >>> from icspylab.distributions import multivariate_powerexp
        >>> X = multivariate_powerexp(n=100, scatter=np.eye(3), beta=4)
        >>> print(X.shape)
        (100, 3)
    """

    scatter = np.asarray(scatter)
    p = scatter.shape[0]

    if location is None:
        location = np.zeros(p)
    else:
        location = np.asarray(location)

    # check symmetry
    if not np.allclose(scatter, scatter.T, atol=np.sqrt(np.finfo(float).eps)):
        raise ValueError("scatter must be a symmetric matrix")

    # check dimensions
    if len(location) != p:
        raise ValueError("location and scatter have non-conforming size")

    # check beta
    if beta <= 0:
        raise ValueError("beta must be positive")

    # eigen decomposition of scatter matrix
    eigenvalues, eigenvectors = eigh(scatter)

    # check numerical positive definiteness
    if not np.all(eigenvalues >= -np.sqrt(np.finfo(float).eps) * abs(eigenvalues[-1])):
        warnings.warn("warning: scatter is numerically not positive definite")

    # matrix square root of scatter
    scattersqrt = eigenvectors @ np.diag(np.sqrt(eigenvalues)) @ eigenvectors.T

    # generate radial component
    radius = gamma.rvs(a=p / (2 * beta), scale=2, size=n) ** (1 / (2 * beta))

    # generate directions uniformly on unit sphere
    un = unifsphere(n, p)

    # construct multivariate samples
    mvpowerexp = (radius[:, None] * un) @ scattersqrt

    # add location vector
    mvpowerexp += location

    return mvpowerexp



# --- Elliptical mixtures ---


[docs]
def generate_gaussian_mixture(eps, mu, sigma, n, p):
    """
    Generates a Gaussian Mixture Model (GMM) with the given parameters.

    Parameters:
        eps (list of float): Proportions of points assigned to each cluster (must sum to 1).
        mu (list of np.ndarray): List of mean vectors (centroids) for each cluster (size k).
        sigma (list of np.ndarray): List of covariance matrices (size k).
        n (int): Total number of data points to generate.
        p (int): Dimension of the data, including noise.

    Returns:
        tuple: A tuple containing:
            data_with_noise (ndarray): Matrix (n, p) of generated data points.
            labels (ndarray): Array of cluster labels (size n).

    Example:
        >>> eps = [0.5, 0.5]
        >>> mu = [np.ones(2), np.ones(2)*10]
        >>> sigma = [np.eye(2) for _ in range(2)]
        >>> X, labels = generate_gaussian_mixture(eps, mu, sigma, n=1000, p=6)
    """
    # Validate inputs
    assert np.isclose(sum(eps), 1.0), "The elements of eps must sum to 1."
    assert len(eps) > 0, "Proportions (eps) must contain at least one group."
    assert len(eps) == len(sigma), "Proportions (eps) and sigma must have the same length."
    assert len(eps) == len(mu), "Proportions (eps) and mu must have the same length."

    # Number of clusters k
    k = len(eps)

    # Determine the number of points for each group based on eps
    points_per_group = (np.array(eps) * n).astype(int)
    # Adjust to ensure the total points add up to n
    points_per_group[0] = n - np.sum(points_per_group[1:])

    # Generate data
    data = []

    for i in range(k):
        # Generate points from a multivariate normal distribution
        group_points = np.random.default_rng().multivariate_normal(mu[i], sigma[i], points_per_group[i])
        data.append(group_points)

    # Combine data points into a single array (n x r)
    data = np.vstack(data)

    # Add noise
    p_noise = p - data.shape[1]
    noise = np.random.normal(loc=0, scale=1, size=(n, p_noise))
    data_with_noise = np.hstack((data, noise))

    # Save group label for each point
    group_labels = ["Group_" + str(i + 1) for i in range(k)]
    labels = [val for val, count in zip(group_labels, points_per_group) for _ in range(count)]
    labels = np.array(labels)

    return data_with_noise, labels




[docs]
def generate_student_mixture(eps, mu, sigma, df, n, p):
    """
    Generates a Student-t Mixture Model (SMM) with the given parameters.

    Parameters:
        eps (list of float): Proportions of points assigned to each cluster (must sum to 1).
        mu (list of ndarray): List of mean vectors (centroids) for each cluster (size k).
        sigma (list of ndarray): List of covariance matrices (size k).
        df (int or list of int): Degrees of freedom (size k if list). Must be strictly positive integers.
        n (int): Total number of data points to generate.
        p (int): Dimension of the data, including noise.

    Returns:
        tuple: A tuple containing:
            data_with_noise (ndarray): Matrix (n, p) of generated data points.
            labels (ndarray): Array of cluster labels (size n).

    Example:
        >>> eps = [0.5, 0.5]
        >>> mu = [np.ones(2), np.ones(2)*10]
        >>> sigma = [np.eye(2) for _ in range(2)]
        >>> X, labels = generate_student_mixture(eps, mu, sigma, df=2, n=1000, p=6)
    """

    # Number of clusters k
    k = len(eps)

    if isinstance(df, int):
        df = [df for _ in range(k)]

    # Validate inputs
    assert np.isclose(sum(eps), 1.0), "The elements of eps must sum to 1."
    assert len(eps) > 0, "Proportions (eps) must contain at least one group."
    assert len(eps) == len(sigma), "Proportions (eps) and sigma must have the same length."
    assert len(eps) == len(mu), "Proportions (eps) and mu must have the same length."
    assert len(eps) == len(df), "Proportions (eps) and df must have the same length."

    # Determine the number of points for each group based on eps
    points_per_group = (np.array(eps) * n).astype(int)
    # Adjust to ensure the total points add up to n
    points_per_group[0] = n - np.sum(points_per_group[1:])

    # Generate data
    data = []

    for i in range(k):
        # Generate points from a multivariate Student-t distribution
        group_points = multivariate_t.rvs(loc=mu[i], shape=sigma[i], df=df[i], size=points_per_group[i])
        data.append(group_points)

    # Combine data points into a single array (n x r)
    data = np.vstack(data)

    # Add noise
    p_noise = p - data.shape[1]
    noise = np.random.normal(loc=0, scale=1, size=(n, p_noise))
    data_with_noise = np.hstack((data, noise))

    # Save group label for each point
    group_labels = ["Group_" + str(i + 1) for i in range(k)]
    labels = [val for val, count in zip(group_labels, points_per_group) for _ in range(count)]
    labels = np.array(labels)

    return data_with_noise, labels




[docs]
def generate_powerexp_mixture(eps, mu, sigma, beta, n, p):
    """
    Generates a mixture of multivariate power exponential distribution (PEM) with the given parameters.

    Parameters:
        eps (list of float): Proportions of points assigned to each cluster (must sum to 1).
        mu (list of np.ndarray): List of mean vectors (centroids) for each cluster (size k).
        sigma (list of np.ndarray): List of covariance matrices (size k).
        beta (float or list of float): Shape parameters (size k if list).
        n (int): Total number of data points to generate.
        p (int): Dimension of the data, including noise.

    Returns:
        tuple: A tuple containing:
            data_with_noise (ndarray): Matrix (n, p) of generated data points.
            labels (ndarray): Array of cluster labels (size n).

    Example:
        >>> eps = [0.5, 0.5]
        >>> mu = [np.ones(2), np.ones(2)*10]
        >>> sigma = [np.eye(2) for _ in range(2)]
        >>> X, labels = generate_powerexp_mixture(eps, mu, sigma, beta=0.8, n=1000, p=6)
    """

    # Number of clusters k
    k = len(eps)

    if isinstance(beta, float):
        beta = [beta for _ in range(k)]

    # Validate inputs
    assert np.isclose(sum(eps), 1.0), "The elements of eps must sum to 1."
    assert len(eps) > 0, "Proportions (eps) must contain at least one group."
    assert len(eps) == len(sigma), "Proportions (eps) and sigma must have the same length."
    assert len(eps) == len(mu), "Proportions (eps) and mu must have the same length."
    assert len(eps) == len(beta), "Proportions (eps) and beta must have the same length."

    # Determine the number of points for each group based on eps
    points_per_group = (np.array(eps) * n).astype(int)
    # Adjust to ensure the total points add up to n
    points_per_group[0] = n - np.sum(points_per_group[1:])

    # Generate data
    data = []

    for i in range(k):
        # Generate points from a multivariate normal distribution
        group_points = multivariate_powerexp(n=points_per_group[i], location=mu[i], scatter=sigma[i], beta=beta[i])
        data.append(group_points)

    # Combine data points into a single array (n x r)
    data = np.vstack(data)

    # Add noise
    p_noise = p - data.shape[1]
    noise = np.random.normal(loc=0, scale=1, size=(n, p_noise))
    data_with_noise = np.hstack((data, noise))

    # Save group label for each point
    group_labels = ["Group_" + str(i + 1) for i in range(k)]
    labels = [val for val, count in zip(group_labels, points_per_group) for _ in range(count)]
    labels = np.array(labels)

    return data_with_noise, labels



# --- RANDU ---


[docs]
def generate_randu(n=400, seed=1):
    """
    Generate a synthetic dataset based on the classical RANDU pseudo-random number generator.

    RANDU is an obsolete linear congruential generator that is widely used as a benchmark example of poor randomness properties.

    The implementation follows the standard definition described in the R datasets package manual.

    Parameters:
        n (int, default=400): Number of data points to generate.
        seed (int, default=1): Seed of the generator.

    Returns:
        ndarray (n, 3)

    References:
        - Fortran Language Reference Manual (1999), Compaq.
        - R Core Team (datasets package), https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/randu.html

    Example:
        >>> from icspylab.distributions import generate_randu
        >>> X = generate_randu(n=100)
        >>> print(X.shape)
        (100, 3)
    """

    def randu():
        nonlocal seed
        seed = ((2 ** 16 + 3) * seed) % (2 ** 31)
        return seed / (2 ** 31)

    x = np.empty((n, 3), dtype=float)

    for i in range(n):
        U = np.array([randu() for _ in range(5)])
        x[i, :] = np.round(U[:3], 6)

    return x