From 85aca0f94971e6155d0a4f42774577fc32d20073 Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Sun, 12 Oct 2025 01:11:17 +0530 Subject: [PATCH 1/2] Add t-SNE algorithm for dimensionality reduction (#13432) --- DIRECTORY.md | 1 + machine_learning/tsne.py | 204 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+) create mode 100644 machine_learning/tsne.py diff --git a/DIRECTORY.md b/DIRECTORY.md index 36acb3b97f1e..5470491850f1 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -623,6 +623,7 @@ * [Sequential Minimum Optimization](machine_learning/sequential_minimum_optimization.py) * [Similarity Search](machine_learning/similarity_search.py) * [Support Vector Machines](machine_learning/support_vector_machines.py) + * [t-SNE] (machine_learning/tsne.py) * [Word Frequency Functions](machine_learning/word_frequency_functions.py) * [Xgboost Classifier](machine_learning/xgboost_classifier.py) * [Xgboost Regressor](machine_learning/xgboost_regressor.py) diff --git a/machine_learning/tsne.py b/machine_learning/tsne.py new file mode 100644 index 000000000000..6197265f3f26 --- /dev/null +++ b/machine_learning/tsne.py @@ -0,0 +1,204 @@ +""" +t-Distributed Stochastic Neighbor Embedding (t-SNE) +--------------------------------------------------- +t-SNE is a nonlinear dimensionality reduction algorithm used for visualizing +high-dimensional data in a lower-dimensional (usually 2D or 3D) space. + +It models pairwise similarities between points in both the high-dimensional +and low-dimensional spaces, and minimizes the difference between them +using gradient descent. + +This simplified implementation demonstrates the core idea of t-SNE for +educational purposes — it is **not optimized for large datasets**. + +This implementation: +- Computes pairwise similarities in the high-dimensional space. +- Computes pairwise similarities in the low-dimensional (embedding) space. +- Minimizes the Kullback–Leibler divergence between these distributions + using gradient descent. +- Follows the original t-SNE formulation by van der Maaten & Hinton (2008). + +References: +- van der Maaten, L. and Hinton, G. (2008). + "Visualizing Data using t-SNE". Journal of Machine Learning Research. +- https://lvdmaaten.github.io/tsne/ + +Key Steps: +1. Compute pairwise similarities (P) in high-dimensional space. +2. Initialize low-dimensional map (Y) randomly. +3. Compute pairwise similarities (Q) in low-dimensional space using + Student-t distribution. +4. Minimize KL-divergence between P and Q using gradient descent. +""" +import doctest +import numpy as np +from sklearn.datasets import load_iris + +def collect_dataset() -> tuple[np.ndarray, np.ndarray]: + """ + Collects the dataset (Iris dataset) and returns feature matrix and target values. + + :return: Tuple containing feature matrix (X) and target labels (y) + + Example: + >>> X, y = collect_dataset() + >>> X.shape + (150, 4) + >>> y.shape + (150,) + """ + data = load_iris() + return np.array(data.data), np.array(data.target) + +def compute_pairwise_affinities(X: np.ndarray, sigma: float = 1.0) -> np.ndarray: + """ + Computes pairwise affinities (P matrix) in high-dimensional space using Gaussian kernel. + + :param X: Input data of shape (n_samples, n_features) + :param sigma: Variance (Bandwidth) of the Gaussian kernel + :return: Symmetrized probability matrix P of shape (n_samples, n_samples)/ Pairwise affinity matrix P + + Example: + >>> import numpy as np + >>> X = np.array([[0.0, 0.0], [1.0, 0.0]]) + >>> P = compute_pairwise_affinities(X) + >>> float(round(P[0, 1], 3)) + 0.25 + """ + n = X.shape[0] + sum_X = np.sum(np.square(X), axis=1) + D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X) + P = np.exp(-D / (2 * sigma ** 2)) + np.fill_diagonal(P, 0) + P /= np.sum(P) + return (P + P.T) / (2 * n) + +def compute_low_dim_affinities(Y: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + """ + Computes low-dimensional similarities (Q matrix) using Student-t distribution. + + :param Y: Low-dimensional embeddings (n_samples, n_components) + :return: Tuple (Q, num) where Q is the probability matrix and num is numerator array + """ + sum_Y = np.sum(np.square(Y), axis=1) + num = 1 / (1 + np.add(np.add(-2 * np.dot(Y, Y.T), sum_Y).T, sum_Y)) + np.fill_diagonal(num, 0) + Q = num / np.sum(num) + return Q, num + + +def apply_tsne( + data_x: np.ndarray, + n_components: int = 2, + learning_rate: float = 200.0, + n_iter: int = 500, +) -> np.ndarray: + """ + Applies t-SNE to reduce data dimensionality for visualization. + + :param data_x: Original dataset (features) + :param n_components: Target dimension (2D or 3D) + :param learning_rate: Learning rate for gradient descent + :param n_iter: Number of iterations + :return: Transformed dataset (low-dimensional embedding) + + Example: + >>> X, _ = collect_dataset() + >>> Y = apply_tsne(X, n_components=2, n_iter=250) + >>> Y.shape + (150, 2) + """ + if n_components < 1: + raise ValueError("n_components must be >= 1") + if n_iter < 1: + raise ValueError("n_iter must be >= 1") + + n_samples = data_x.shape[0] + + # Initialize low-dimensional map randomly + Y = np.random.randn(n_samples, n_components) * 1e-4 + P = compute_pairwise_affinities(data_x) + P = np.maximum(P, 1e-12) + + # Initialize parameters + Y_inc = np.zeros_like(Y) + momentum = 0.5 + + for i in range(n_iter): + Q, num = compute_low_dim_affinities(Y) + Q = np.maximum(Q, 1e-12) + + PQ = P - Q + + # Compute gradient + dY = 4 * ( + np.dot((PQ * num), Y) + - np.multiply(np.sum(PQ * num, axis=1)[:, np.newaxis], Y) + ) + + # Update with momentum and learning rate + Y_inc = momentum * Y_inc - learning_rate * dY + Y += Y_inc + + # Adjust momentum halfway through + if i == int(n_iter / 4): + momentum = 0.8 + + return Y + + +def main() -> None: + """ + Driver function for t-SNE demonstration. + """ + X, y = collect_dataset() + + Y = apply_tsne(X, n_components=2, n_iter=300) + print("t-SNE embedding (first 5 points):") + print(Y[:5]) + + # Optional visualization (commented to avoid dependency) + # import matplotlib.pyplot as plt + # plt.scatter(Y[:, 0], Y[:, 1], c=y, cmap="viridis") + # plt.title("t-SNE Visualization of Iris Dataset") + # plt.xlabel("Component 1") + # plt.ylabel("Component 2") + # plt.show() + + +if __name__ == "__main__": + doctest.testmod() + main() + +""" +Explanation of t-SNE Implementation +----------------------------------- + +Input: +- data_x: numpy array of shape (n_samples, n_features) + Example: Iris dataset (150 samples × 4 features) +- n_components: target dimension (usually 2 or 3 for visualization) +- learning_rate: controls step size in gradient descent +- n_iter: number of iterations for optimization + +Output: +- Y: numpy array of shape (n_samples, n_components) + Each row is the low-dimensional embedding of the corresponding high-dimensional point. + +How it works: +1. Compute high-dimensional similarities (P matrix): + - Measures how likely points are neighbors in the original space. +2. Initialize low-dimensional map (Y) randomly. +3. Compute low-dimensional similarities (Q matrix) using Student-t distribution: + - Heavy tail prevents distant points from crowding together. +4. Compute gradient of KL divergence between P and Q: + - If points are too far in low-D (Q < P), pull them closer. + - If points are too close in low-D (Q > P), push them apart. +5. Update Y using gradient descent with momentum: + - Repeat for n_iter iterations until low-dimensional layout reflects high-dimensional structure. + +Why it works: +- t-SNE tries to preserve **local structure**: neighbors stay close in the embedding. +- Distant points may not be perfectly preserved (global structure is secondary). +- The algorithm minimizes the KL divergence between high-D and low-D similarity distributions. +""" From af84d7f70c0976f6e4cfdd5aa53a9b106ab23dbf Mon Sep 17 00:00:00 2001 From: Nikita-Kedari Date: Sun, 12 Oct 2025 01:19:27 +0530 Subject: [PATCH 2/2] Add t-SNE to DIRECTORY.md --- DIRECTORY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DIRECTORY.md b/DIRECTORY.md index 5470491850f1..e81c4fc4a7d9 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -623,7 +623,7 @@ * [Sequential Minimum Optimization](machine_learning/sequential_minimum_optimization.py) * [Similarity Search](machine_learning/similarity_search.py) * [Support Vector Machines](machine_learning/support_vector_machines.py) - * [t-SNE] (machine_learning/tsne.py) + * [t-SNE](machine_learning/tsne.py) * [Word Frequency Functions](machine_learning/word_frequency_functions.py) * [Xgboost Classifier](machine_learning/xgboost_classifier.py) * [Xgboost Regressor](machine_learning/xgboost_regressor.py)