Source code for icspylab.distributions

import numpy as np
from numpy.linalg import eigh
from scipy.stats import gamma, multivariate_t
import warnings


# --- Multivariate distributions ---

[docs] def unifsphere(n, p): """ Generate n vectors uniformly distributed on the unit sphere in dimension p. Parameters: n (int): Number of observations. p (int): Dimension of the sphere. Returns: ndarray (n, p) Example: >>> from icspylab.distributions import unifsphere >>> X = unifsphere(n=100, p=2) >>> print(X.shape) (100, 2) """ x = np.random.normal(size=(n, p)) norm = np.linalg.norm(x, axis=1, keepdims=True) return x / norm
[docs] def multivariate_powerexp(n, scatter, location=None, beta=1): """ Generate n observations from a multivariate power exponential distribution. Parameters: n (int): Number of observations. scatter (array-like): Symmetric positive definite scatter matrix (p x p). location (array-like): Mean vector of dimension p. beta (float): Shape parameter (> 0). beta = 1 corresponds to the multivariate normal distribution, beta < 1 corresponds to heavier tails. Returns: ndarray (n, p) References: - Oja, H. (2010), Multivariate Nonparametric Methods with R, Springer. - Nordhausen, K., & Oja, H. (2011). Multivariate L1 statistical methods: The package MNM. Journal of Statistical Software, 43, 1-28. Example: >>> from icspylab.distributions import multivariate_powerexp >>> X = multivariate_powerexp(n=100, scatter=np.eye(3), beta=4) >>> print(X.shape) (100, 3) """ scatter = np.asarray(scatter) p = scatter.shape[0] if location is None: location = np.zeros(p) else: location = np.asarray(location) # check symmetry if not np.allclose(scatter, scatter.T, atol=np.sqrt(np.finfo(float).eps)): raise ValueError("scatter must be a symmetric matrix") # check dimensions if len(location) != p: raise ValueError("location and scatter have non-conforming size") # check beta if beta <= 0: raise ValueError("beta must be positive") # eigen decomposition of scatter matrix eigenvalues, eigenvectors = eigh(scatter) # check numerical positive definiteness if not np.all(eigenvalues >= -np.sqrt(np.finfo(float).eps) * abs(eigenvalues[-1])): warnings.warn("warning: scatter is numerically not positive definite") # matrix square root of scatter scattersqrt = eigenvectors @ np.diag(np.sqrt(eigenvalues)) @ eigenvectors.T # generate radial component radius = gamma.rvs(a=p / (2 * beta), scale=2, size=n) ** (1 / (2 * beta)) # generate directions uniformly on unit sphere un = unifsphere(n, p) # construct multivariate samples mvpowerexp = (radius[:, None] * un) @ scattersqrt # add location vector mvpowerexp += location return mvpowerexp
# --- Elliptical mixtures ---
[docs] def generate_gaussian_mixture(eps, mu, sigma, n, p): """ Generates a Gaussian Mixture Model (GMM) with the given parameters. Parameters: eps (list of float): Proportions of points assigned to each cluster (must sum to 1). mu (list of np.ndarray): List of mean vectors (centroids) for each cluster (size k). sigma (list of np.ndarray): List of covariance matrices (size k). n (int): Total number of data points to generate. p (int): Dimension of the data, including noise. Returns: tuple: A tuple containing: data_with_noise (ndarray): Matrix (n, p) of generated data points. labels (ndarray): Array of cluster labels (size n). Example: >>> eps = [0.5, 0.5] >>> mu = [np.ones(2), np.ones(2)*10] >>> sigma = [np.eye(2) for _ in range(2)] >>> X, labels = generate_gaussian_mixture(eps, mu, sigma, n=1000, p=6) """ # Validate inputs assert np.isclose(sum(eps), 1.0), "The elements of eps must sum to 1." assert len(eps) > 0, "Proportions (eps) must contain at least one group." assert len(eps) == len(sigma), "Proportions (eps) and sigma must have the same length." assert len(eps) == len(mu), "Proportions (eps) and mu must have the same length." # Number of clusters k k = len(eps) # Determine the number of points for each group based on eps points_per_group = (np.array(eps) * n).astype(int) # Adjust to ensure the total points add up to n points_per_group[0] = n - np.sum(points_per_group[1:]) # Generate data data = [] for i in range(k): # Generate points from a multivariate normal distribution group_points = np.random.default_rng().multivariate_normal(mu[i], sigma[i], points_per_group[i]) data.append(group_points) # Combine data points into a single array (n x r) data = np.vstack(data) # Add noise p_noise = p - data.shape[1] noise = np.random.normal(loc=0, scale=1, size=(n, p_noise)) data_with_noise = np.hstack((data, noise)) # Save group label for each point group_labels = ["Group_" + str(i + 1) for i in range(k)] labels = [val for val, count in zip(group_labels, points_per_group) for _ in range(count)] labels = np.array(labels) return data_with_noise, labels
[docs] def generate_student_mixture(eps, mu, sigma, df, n, p): """ Generates a Student-t Mixture Model (SMM) with the given parameters. Parameters: eps (list of float): Proportions of points assigned to each cluster (must sum to 1). mu (list of ndarray): List of mean vectors (centroids) for each cluster (size k). sigma (list of ndarray): List of covariance matrices (size k). df (int or list of int): Degrees of freedom (size k if list). Must be strictly positive integers. n (int): Total number of data points to generate. p (int): Dimension of the data, including noise. Returns: tuple: A tuple containing: data_with_noise (ndarray): Matrix (n, p) of generated data points. labels (ndarray): Array of cluster labels (size n). Example: >>> eps = [0.5, 0.5] >>> mu = [np.ones(2), np.ones(2)*10] >>> sigma = [np.eye(2) for _ in range(2)] >>> X, labels = generate_student_mixture(eps, mu, sigma, df=2, n=1000, p=6) """ # Number of clusters k k = len(eps) if isinstance(df, int): df = [df for _ in range(k)] # Validate inputs assert np.isclose(sum(eps), 1.0), "The elements of eps must sum to 1." assert len(eps) > 0, "Proportions (eps) must contain at least one group." assert len(eps) == len(sigma), "Proportions (eps) and sigma must have the same length." assert len(eps) == len(mu), "Proportions (eps) and mu must have the same length." assert len(eps) == len(df), "Proportions (eps) and df must have the same length." # Determine the number of points for each group based on eps points_per_group = (np.array(eps) * n).astype(int) # Adjust to ensure the total points add up to n points_per_group[0] = n - np.sum(points_per_group[1:]) # Generate data data = [] for i in range(k): # Generate points from a multivariate Student-t distribution group_points = multivariate_t.rvs(loc=mu[i], shape=sigma[i], df=df[i], size=points_per_group[i]) data.append(group_points) # Combine data points into a single array (n x r) data = np.vstack(data) # Add noise p_noise = p - data.shape[1] noise = np.random.normal(loc=0, scale=1, size=(n, p_noise)) data_with_noise = np.hstack((data, noise)) # Save group label for each point group_labels = ["Group_" + str(i + 1) for i in range(k)] labels = [val for val, count in zip(group_labels, points_per_group) for _ in range(count)] labels = np.array(labels) return data_with_noise, labels
[docs] def generate_powerexp_mixture(eps, mu, sigma, beta, n, p): """ Generates a mixture of multivariate power exponential distribution (PEM) with the given parameters. Parameters: eps (list of float): Proportions of points assigned to each cluster (must sum to 1). mu (list of np.ndarray): List of mean vectors (centroids) for each cluster (size k). sigma (list of np.ndarray): List of covariance matrices (size k). beta (float or list of float): Shape parameters (size k if list). n (int): Total number of data points to generate. p (int): Dimension of the data, including noise. Returns: tuple: A tuple containing: data_with_noise (ndarray): Matrix (n, p) of generated data points. labels (ndarray): Array of cluster labels (size n). Example: >>> eps = [0.5, 0.5] >>> mu = [np.ones(2), np.ones(2)*10] >>> sigma = [np.eye(2) for _ in range(2)] >>> X, labels = generate_powerexp_mixture(eps, mu, sigma, beta=0.8, n=1000, p=6) """ # Number of clusters k k = len(eps) if isinstance(beta, float): beta = [beta for _ in range(k)] # Validate inputs assert np.isclose(sum(eps), 1.0), "The elements of eps must sum to 1." assert len(eps) > 0, "Proportions (eps) must contain at least one group." assert len(eps) == len(sigma), "Proportions (eps) and sigma must have the same length." assert len(eps) == len(mu), "Proportions (eps) and mu must have the same length." assert len(eps) == len(beta), "Proportions (eps) and beta must have the same length." # Determine the number of points for each group based on eps points_per_group = (np.array(eps) * n).astype(int) # Adjust to ensure the total points add up to n points_per_group[0] = n - np.sum(points_per_group[1:]) # Generate data data = [] for i in range(k): # Generate points from a multivariate normal distribution group_points = multivariate_powerexp(n=points_per_group[i], location=mu[i], scatter=sigma[i], beta=beta[i]) data.append(group_points) # Combine data points into a single array (n x r) data = np.vstack(data) # Add noise p_noise = p - data.shape[1] noise = np.random.normal(loc=0, scale=1, size=(n, p_noise)) data_with_noise = np.hstack((data, noise)) # Save group label for each point group_labels = ["Group_" + str(i + 1) for i in range(k)] labels = [val for val, count in zip(group_labels, points_per_group) for _ in range(count)] labels = np.array(labels) return data_with_noise, labels
# --- RANDU ---
[docs] def generate_randu(n=400, seed=1): """ Generate a synthetic dataset based on the classical RANDU pseudo-random number generator. RANDU is an obsolete linear congruential generator that is widely used as a benchmark example of poor randomness properties. The implementation follows the standard definition described in the R datasets package manual. Parameters: n (int, default=400): Number of data points to generate. seed (int, default=1): Seed of the generator. Returns: ndarray (n, 3) References: - Fortran Language Reference Manual (1999), Compaq. - R Core Team (datasets package), https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/randu.html Example: >>> from icspylab.distributions import generate_randu >>> X = generate_randu(n=100) >>> print(X.shape) (100, 3) """ def randu(): nonlocal seed seed = ((2 ** 16 + 3) * seed) % (2 ** 31) return seed / (2 ** 31) x = np.empty((n, 3), dtype=float) for i in range(n): U = np.array([randu() for _ in range(5)]) x[i, :] = np.round(U[:3], 6) return x