Skip to content

Commit 85aca0f

Browse files
committed
Add t-SNE algorithm for dimensionality reduction (#13432)
1 parent 788d95b commit 85aca0f

File tree

2 files changed

+205
-0
lines changed

2 files changed

+205
-0
lines changed

DIRECTORY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,7 @@
623623
* [Sequential Minimum Optimization](machine_learning/sequential_minimum_optimization.py)
624624
* [Similarity Search](machine_learning/similarity_search.py)
625625
* [Support Vector Machines](machine_learning/support_vector_machines.py)
626+
* [t-SNE](machine_learning/tsne.py)
626627
* [Word Frequency Functions](machine_learning/word_frequency_functions.py)
627628
* [Xgboost Classifier](machine_learning/xgboost_classifier.py)
628629
* [Xgboost Regressor](machine_learning/xgboost_regressor.py)

machine_learning/tsne.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
"""
2+
t-Distributed Stochastic Neighbor Embedding (t-SNE)
3+
---------------------------------------------------
4+
t-SNE is a nonlinear dimensionality reduction algorithm used for visualizing
5+
high-dimensional data in a lower-dimensional (usually 2D or 3D) space.
6+
7+
It models pairwise similarities between points in both the high-dimensional
8+
and low-dimensional spaces, and minimizes the difference between them
9+
using gradient descent.
10+
11+
This simplified implementation demonstrates the core idea of t-SNE for
12+
educational purposes — it is **not optimized for large datasets**.
13+
14+
This implementation:
15+
- Computes pairwise similarities in the high-dimensional space.
16+
- Computes pairwise similarities in the low-dimensional (embedding) space.
17+
- Minimizes the Kullback–Leibler divergence between these distributions
18+
using gradient descent.
19+
- Follows the original t-SNE formulation by van der Maaten & Hinton (2008).
20+
21+
References:
22+
- van der Maaten, L. and Hinton, G. (2008).
23+
"Visualizing Data using t-SNE". Journal of Machine Learning Research.
24+
- https://lvdmaaten.github.io/tsne/
25+
26+
Key Steps:
27+
1. Compute pairwise similarities (P) in high-dimensional space.
28+
2. Initialize low-dimensional map (Y) randomly.
29+
3. Compute pairwise similarities (Q) in low-dimensional space using
30+
Student-t distribution.
31+
4. Minimize KL-divergence between P and Q using gradient descent.
32+
"""
33+
import doctest
34+
import numpy as np
35+
from sklearn.datasets import load_iris
36+
37+
def collect_dataset() -> tuple[np.ndarray, np.ndarray]:
    """
    Load the Iris dataset and return its feature matrix and target labels.

    :return: Tuple (features, targets), where features has shape (150, 4)
             and targets has shape (150,)

    Example:
    >>> X, y = collect_dataset()
    >>> X.shape
    (150, 4)
    >>> y.shape
    (150,)
    """
    iris = load_iris()
    features = np.array(iris.data)
    targets = np.array(iris.target)
    return features, targets
53+
def compute_pairwise_affinities(X: np.ndarray, sigma: float = 1.0) -> np.ndarray:
    """
    Compute the symmetrized pairwise affinity matrix P in the
    high-dimensional space using a Gaussian kernel.

    :param X: Input data of shape (n_samples, n_features)
    :param sigma: Bandwidth of the Gaussian kernel
    :return: Symmetrized probability matrix P of shape (n_samples, n_samples)
    :raises ValueError: If ``X`` contains no samples

    Example:
    >>> import numpy as np
    >>> X = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> P = compute_pairwise_affinities(X)
    >>> float(round(P[0, 1], 3))
    0.25
    """
    n_samples = X.shape[0]
    if n_samples == 0:
        # An empty input would otherwise produce a 0/0 division below.
        raise ValueError("X must contain at least one sample")

    # Squared Euclidean distances: d_ij = |x_i|^2 + |x_j|^2 - 2 * x_i . x_j
    squared_norms = np.sum(np.square(X), axis=1)
    distances = np.add(np.add(-2 * np.dot(X, X.T), squared_norms).T, squared_norms)

    # Gaussian similarities; a point has zero affinity with itself.
    affinities = np.exp(-distances / (2 * sigma**2))
    np.fill_diagonal(affinities, 0)

    # Normalize into a probability distribution, then symmetrize as in the
    # original t-SNE formulation (van der Maaten & Hinton, 2008).
    affinities /= np.sum(affinities)
    return (affinities + affinities.T) / (2 * n_samples)
76+
def compute_low_dim_affinities(Y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute the low-dimensional similarity matrix Q using a Student-t kernel.

    :param Y: Low-dimensional embeddings of shape (n_samples, n_components)
    :return: Tuple (Q, num), where Q is the normalized probability matrix and
             num holds the unnormalized Student-t numerators (zero diagonal)
    """
    # Squared Euclidean distances between all pairs of embedded points.
    squared_norms = np.sum(np.square(Y), axis=1)
    cross = np.dot(Y, Y.T)
    pairwise_sq_dist = np.add(np.add(-2 * cross, squared_norms).T, squared_norms)

    # Student-t numerator with one degree of freedom (heavy tails keep
    # dissimilar points from crowding together in the embedding).
    num = 1 / (1 + pairwise_sq_dist)
    np.fill_diagonal(num, 0)

    # Normalize so the off-diagonal entries form a probability distribution.
    Q = num / np.sum(num)
    return Q, num
90+
def apply_tsne(
    data_x: np.ndarray,
    n_components: int = 2,
    learning_rate: float = 200.0,
    n_iter: int = 500,
    random_state: int | None = None,
) -> np.ndarray:
    """
    Applies t-SNE to reduce data dimensionality for visualization.

    :param data_x: Original dataset of shape (n_samples, n_features)
    :param n_components: Target dimension (2D or 3D)
    :param learning_rate: Learning rate for gradient descent
    :param n_iter: Number of iterations
    :param random_state: Optional seed for the random initialization of the
        embedding; pass an int for reproducible output
    :return: Transformed dataset (low-dimensional embedding)
    :raises ValueError: If ``n_components`` or ``n_iter`` is less than 1

    Example:
    >>> X = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
    >>> Y = apply_tsne(X, n_components=2, n_iter=50, random_state=0)
    >>> Y.shape
    (4, 2)
    """
    if n_components < 1:
        raise ValueError("n_components must be >= 1")
    if n_iter < 1:
        raise ValueError("n_iter must be >= 1")

    n_samples = data_x.shape[0]

    # Initialize the low-dimensional map with small random values so the
    # first gradient steps are well-conditioned.
    rng = np.random.default_rng(random_state)
    Y = rng.standard_normal((n_samples, n_components)) * 1e-4

    # High-dimensional affinities, floored to avoid division-by-zero /
    # log-of-zero issues during optimization.
    P = compute_pairwise_affinities(data_x)
    P = np.maximum(P, 1e-12)

    Y_inc = np.zeros_like(Y)  # momentum accumulator for gradient descent
    momentum = 0.5

    for i in range(n_iter):
        Q, num = compute_low_dim_affinities(Y)
        Q = np.maximum(Q, 1e-12)

        PQ = P - Q

        # Gradient of the KL divergence between P and Q
        # (van der Maaten & Hinton, 2008, Eq. 5).
        dY = 4 * (
            np.dot((PQ * num), Y)
            - np.multiply(np.sum(PQ * num, axis=1)[:, np.newaxis], Y)
        )

        # Gradient-descent step with momentum.
        Y_inc = momentum * Y_inc - learning_rate * dY
        Y += Y_inc

        # Increase momentum after the first quarter of the iterations,
        # once the layout has roughly stabilized.
        if i == n_iter // 4:
            momentum = 0.8

    return Y
150+
def main() -> None:
    """
    Demonstrate t-SNE on the Iris dataset and print a sample of the result.
    """
    features, labels = collect_dataset()

    embedding = apply_tsne(features, n_components=2, n_iter=300)
    print("t-SNE embedding (first 5 points):")
    print(embedding[:5])

    # Optional visualization (commented to avoid dependency)
    # import matplotlib.pyplot as plt
    # plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis")
    # plt.title("t-SNE Visualization of Iris Dataset")
    # plt.xlabel("Component 1")
    # plt.ylabel("Component 2")
    # plt.show()
169+
if __name__ == "__main__":
    doctest.testmod()
    main()

# Explanation of t-SNE Implementation
# -----------------------------------
# (Converted from a bare module-level string literal, which was a dead
# expression statement evaluated at import time, into comments.)
#
# Input:
#     - data_x: numpy array of shape (n_samples, n_features)
#       Example: Iris dataset (150 samples x 4 features)
#     - n_components: target dimension (usually 2 or 3 for visualization)
#     - learning_rate: controls step size in gradient descent
#     - n_iter: number of iterations for optimization
#
# Output:
#     - Y: numpy array of shape (n_samples, n_components)
#       Each row is the low-dimensional embedding of the corresponding
#       high-dimensional point.
#
# How it works:
#     1. Compute high-dimensional similarities (P matrix): measures how
#        likely points are to be neighbors in the original space.
#     2. Initialize the low-dimensional map (Y) randomly.
#     3. Compute low-dimensional similarities (Q matrix) using a Student-t
#        distribution; its heavy tail prevents distant points from crowding
#        together.
#     4. Compute the gradient of the KL divergence between P and Q:
#        if points are too far apart in low-D (Q < P) they are pulled
#        closer; if too close (Q > P) they are pushed apart.
#     5. Update Y using gradient descent with momentum, repeating for
#        n_iter iterations until the low-dimensional layout reflects the
#        high-dimensional structure.
#
# Why it works:
#     - t-SNE tries to preserve local structure: neighbors stay close in
#       the embedding, while global distances are only loosely preserved.
#     - The algorithm minimizes the KL divergence between the high-D and
#       low-D similarity distributions.

0 commit comments

Comments
 (0)