|
| 1 | +""" |
| 2 | +t-Distributed Stochastic Neighbor Embedding (t-SNE) |
| 3 | +--------------------------------------------------- |
| 4 | +t-SNE is a nonlinear dimensionality reduction algorithm used for visualizing |
| 5 | +high-dimensional data in a lower-dimensional (usually 2D or 3D) space. |
| 6 | +
|
| 7 | +It models pairwise similarities between points in both the high-dimensional |
| 8 | +and low-dimensional spaces, and minimizes the difference between them |
| 9 | +using gradient descent. |
| 10 | +
|
| 11 | +This simplified implementation demonstrates the core idea of t-SNE for |
| 12 | +educational purposes — it is **not optimized for large datasets**. |
| 13 | +
|
| 14 | +This implementation: |
| 15 | +- Computes pairwise similarities in the high-dimensional space. |
| 16 | +- Computes pairwise similarities in the low-dimensional (embedding) space. |
| 17 | +- Minimizes the Kullback–Leibler divergence between these distributions |
| 18 | + using gradient descent. |
| 19 | +- Follows the original t-SNE formulation by van der Maaten & Hinton (2008). |
| 20 | +
|
| 21 | +References: |
| 22 | +- van der Maaten, L. and Hinton, G. (2008). |
| 23 | + "Visualizing Data using t-SNE". Journal of Machine Learning Research. |
| 24 | +- https://lvdmaaten.github.io/tsne/ |
| 25 | +
|
| 26 | +Key Steps: |
| 27 | +1. Compute pairwise similarities (P) in high-dimensional space. |
| 28 | +2. Initialize low-dimensional map (Y) randomly. |
| 29 | +3. Compute pairwise similarities (Q) in low-dimensional space using |
| 30 | + Student-t distribution. |
| 31 | +4. Minimize KL-divergence between P and Q using gradient descent. |
| 32 | +""" |
| 33 | +import doctest |
| 34 | +import numpy as np |
| 35 | +from sklearn.datasets import load_iris |
| 36 | + |
def collect_dataset() -> tuple[np.ndarray, np.ndarray]:
    """
    Load the Iris dataset and return its feature matrix and target labels.

    :return: Tuple ``(X, y)`` where ``X`` is the (150, 4) feature matrix and
        ``y`` is the (150,) vector of class labels.

    Example:
    >>> X, y = collect_dataset()
    >>> X.shape
    (150, 4)
    >>> y.shape
    (150,)
    """
    iris = load_iris()
    features = np.array(iris.data)
    labels = np.array(iris.target)
    return features, labels
| 52 | + |
def compute_pairwise_affinities(X: np.ndarray, sigma: float = 1.0) -> np.ndarray:
    """
    Compute symmetric pairwise affinities (P matrix) in the high-dimensional
    space using a Gaussian kernel with a single fixed bandwidth.

    NOTE: the original t-SNE computes a per-point bandwidth from a perplexity
    target; using one global ``sigma`` is an educational simplification.

    Bug fix: the previous version normalized P to sum to 1 and then divided
    the symmetrized matrix by 2n, so the result summed to 1/n instead of 1.
    Q in the low-dimensional space sums to 1, so P must too, otherwise the
    KL gradient (P - Q) is systematically biased toward repulsion.

    :param X: Input data of shape (n_samples, n_features)
    :param sigma: Bandwidth of the Gaussian kernel
    :return: Symmetric joint-probability matrix P of shape
        (n_samples, n_samples), with zero diagonal, summing to 1.

    Example:
    >>> import numpy as np
    >>> X = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> P = compute_pairwise_affinities(X)
    >>> float(round(P[0, 1], 3))
    0.5
    """
    # Squared Euclidean distances via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    sq_norms = np.sum(np.square(X), axis=1)
    dist_sq = np.add(np.add(-2 * np.dot(X, X.T), sq_norms).T, sq_norms)

    affinities = np.exp(-dist_sq / (2 * sigma**2))
    # A point is not its own neighbor.
    np.fill_diagonal(affinities, 0)

    # Symmetrize, then normalize the whole matrix so it is a proper joint
    # probability distribution (sums to 1, matching Q's normalization).
    affinities = affinities + affinities.T
    total = affinities.sum()
    if total == 0.0:
        # Degenerate case (e.g. a single sample): avoid 0/0 -> NaN.
        return affinities
    return affinities / total
| 75 | + |
def compute_low_dim_affinities(Y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute low-dimensional similarities (Q matrix) with a Student-t kernel.

    The Student-t distribution (one degree of freedom) has heavy tails, which
    keeps moderately distant points from being crushed together in the map.

    :param Y: Low-dimensional embeddings of shape (n_samples, n_components)
    :return: Tuple ``(Q, num)`` where ``Q`` is the normalized probability
        matrix and ``num`` holds the unnormalized kernel values, reused by
        the gradient computation.
    """
    # Squared distances via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    sq_norms = np.sum(np.square(Y), axis=1)
    dist_sq = np.add(np.add(-2 * np.dot(Y, Y.T), sq_norms).T, sq_norms)

    # Student-t kernel; a point has zero similarity with itself.
    num = 1 / (1 + dist_sq)
    np.fill_diagonal(num, 0)

    return num / np.sum(num), num
| 88 | + |
| 89 | + |
def apply_tsne(
    data_x: np.ndarray,
    n_components: int = 2,
    learning_rate: float = 200.0,
    n_iter: int = 500,
    random_state: int | None = None,
) -> np.ndarray:
    """
    Applies t-SNE to reduce data dimensionality for visualization.

    :param data_x: Original dataset of shape (n_samples, n_features)
    :param n_components: Target dimension (2D or 3D)
    :param learning_rate: Learning rate for gradient descent
    :param n_iter: Number of gradient-descent iterations
    :param random_state: Optional seed for the random initialization of the
        embedding; pass an int for reproducible output (default: ``None``)
    :return: Low-dimensional embedding of shape (n_samples, n_components)
    :raises ValueError: If ``n_components`` or ``n_iter`` is less than 1

    Example:
    >>> X, _ = collect_dataset()
    >>> Y = apply_tsne(X, n_components=2, n_iter=250)
    >>> Y.shape
    (150, 2)
    """
    if n_components < 1:
        raise ValueError("n_components must be >= 1")
    if n_iter < 1:
        raise ValueError("n_iter must be >= 1")

    n_samples = data_x.shape[0]
    rng = np.random.default_rng(random_state)

    # Small random initialization, as in the original t-SNE formulation.
    y = rng.standard_normal((n_samples, n_components)) * 1e-4

    # High-dimensional affinities; the floor avoids division/log problems
    # from zero entries in the KL-divergence terms.
    p = compute_pairwise_affinities(data_x)
    p = np.maximum(p, 1e-12)

    y_inc = np.zeros_like(y)  # momentum accumulator
    momentum = 0.5

    for i in range(n_iter):
        q, num = compute_low_dim_affinities(y)
        q = np.maximum(q, 1e-12)

        # Gradient of KL(P || Q) w.r.t. Y: attraction where P > Q,
        # repulsion where Q > P, weighted by the Student-t numerators.
        pq = p - q
        weights = pq * num
        grad = 4 * (np.dot(weights, y) - np.sum(weights, axis=1)[:, np.newaxis] * y)

        # Gradient step with momentum.
        y_inc = momentum * y_inc - learning_rate * grad
        y += y_inc

        # Raise momentum after the first quarter of iterations, mirroring
        # the warm-up schedule of the reference implementation.
        if i == n_iter // 4:
            momentum = 0.8

    return y
| 148 | + |
| 149 | + |
def main() -> None:
    """
    Demonstrate t-SNE on the Iris dataset by printing a few embedded points.
    """
    features, labels = collect_dataset()
    embedding = apply_tsne(features, n_components=2, n_iter=300)

    print("t-SNE embedding (first 5 points):")
    print(embedding[:5])

    # Optional visualization (commented to avoid dependency)
    # import matplotlib.pyplot as plt
    # plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis")
    # plt.title("t-SNE Visualization of Iris Dataset")
    # plt.xlabel("Component 1")
    # plt.ylabel("Component 2")
    # plt.show()
| 167 | + |
| 168 | + |
if __name__ == "__main__":
    # Verify the embedded doctest examples before running the demo.
    doctest.testmod()
    main()
| 172 | + |
| 173 | +""" |
| 174 | +Explanation of t-SNE Implementation |
| 175 | +----------------------------------- |
| 176 | +
|
| 177 | +Input: |
| 178 | +- data_x: numpy array of shape (n_samples, n_features) |
| 179 | + Example: Iris dataset (150 samples × 4 features) |
| 180 | +- n_components: target dimension (usually 2 or 3 for visualization) |
| 181 | +- learning_rate: controls step size in gradient descent |
| 182 | +- n_iter: number of iterations for optimization |
| 183 | +
|
| 184 | +Output: |
| 185 | +- Y: numpy array of shape (n_samples, n_components) |
| 186 | + Each row is the low-dimensional embedding of the corresponding high-dimensional point. |
| 187 | +
|
| 188 | +How it works: |
| 189 | +1. Compute high-dimensional similarities (P matrix): |
| 190 | + - Measures how likely points are neighbors in the original space. |
| 191 | +2. Initialize low-dimensional map (Y) randomly. |
| 192 | +3. Compute low-dimensional similarities (Q matrix) using Student-t distribution: |
| 193 | + - Heavy tail prevents distant points from crowding together. |
| 194 | +4. Compute gradient of KL divergence between P and Q: |
| 195 | + - If points are too far in low-D (Q < P), pull them closer. |
| 196 | + - If points are too close in low-D (Q > P), push them apart. |
| 197 | +5. Update Y using gradient descent with momentum: |
| 198 | + - Repeat for n_iter iterations until low-dimensional layout reflects high-dimensional structure. |
| 199 | +
|
| 200 | +Why it works: |
| 201 | +- t-SNE tries to preserve **local structure**: neighbors stay close in the embedding. |
| 202 | +- Distant points may not be perfectly preserved (global structure is secondary). |
| 203 | +- The algorithm minimizes the KL divergence between high-D and low-D similarity distributions. |
| 204 | +""" |