Skip to content

Commit 1aa6b33

Browse files
committed
Changed tsne.py
1 parent c235a71 commit 1aa6b33

File tree

1 file changed

+130
-93
lines changed

1 file changed

+130
-93
lines changed

machine_learning/tsne.py

Lines changed: 130 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -2,58 +2,33 @@
22
t-Distributed Stochastic Neighbor Embedding (t-SNE)
33
---------------------------------------------------
44
5-
t-SNE is a nonlinear dimensionality reduction algorithm for visualizing
6-
high-dimensional data in a low-dimensional space (2D or 3D).
7-
8-
It computes pairwise similarities in both spaces and minimizes the
9-
Kullback-Leibler divergence using gradient descent.
5+
Nonlinear dimensionality reduction for visualizing high-dimensional data
6+
in 2D or 3D. Computes pairwise similarities in high and low-dimensional
7+
spaces and minimizes Kullback-Leibler divergence using gradient descent.
108
119
References:
1210
- van der Maaten, L. & Hinton, G. (2008), JMLR.
1311
- https://lvdmaaten.github.io/tsne/
1412
"""
1513

16-
import doctest
17-
1814
import numpy as np
15+
from numpy import ndarray
1916
from sklearn.datasets import load_iris
2017

21-
22-
def collect_dataset() -> tuple[np.ndarray, np.ndarray]:
18+
def _compute_pairwise_affinities(data_x: ndarray, sigma: float = 1.0) -> ndarray:
2319
"""
24-
Load Iris dataset and return features and labels.
25-
26-
Returns:
27-
Tuple[np.ndarray, np.ndarray]: feature matrix and target labels
28-
29-
Example:
30-
>>> x, y = collect_dataset()
31-
>>> x.shape
32-
(150, 4)
33-
>>> y.shape
34-
(150,)
35-
"""
36-
data = load_iris()
37-
return np.array(data.data), np.array(data.target)
38-
39-
40-
def compute_pairwise_affinities(
41-
data_x: np.ndarray, sigma: float = 1.0
42-
) -> np.ndarray:
43-
"""
44-
Compute high-dimensional affinities (P matrix) using Gaussian kernel.
20+
Compute high-dimensional affinities using Gaussian kernel.
4521
4622
Args:
47-
data_x: Input data of shape (n_samples, n_features)
48-
sigma: Gaussian kernel bandwidth
23+
data_x (ndarray): shape (n_samples, n_features)
24+
sigma (float): Gaussian kernel bandwidth
4925
5026
Returns:
51-
np.ndarray: Symmetrized probability matrix
27+
ndarray: Symmetrized probability matrix
5228
5329
Example:
54-
>>> import numpy as np
5530
>>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
56-
>>> p = compute_pairwise_affinities(x)
31+
>>> p = _compute_pairwise_affinities(x)
5732
>>> float(round(p[0, 1], 3))
5833
0.25
5934
"""
@@ -66,97 +41,159 @@ def compute_pairwise_affinities(
6641
return (p + p.T) / (2 * n_samples)
6742

6843

69-
def compute_low_dim_affinities(
70-
y: np.ndarray,
71-
) -> tuple[np.ndarray, np.ndarray]:
44+
def _compute_low_dim_affinities(low_dim_embedding: ndarray) -> tuple[ndarray, ndarray]:
7245
"""
73-
Compute low-dimensional affinities (Q matrix) using Student-t distribution.
46+
Compute low-dimensional affinities using Student-t distribution.
7447
7548
Args:
76-
y: Low-dimensional embeddings of shape (n_samples, n_components)
49+
low_dim_embedding (ndarray): shape (n_samples, n_components)
7750
7851
Returns:
79-
Tuple[np.ndarray, np.ndarray]: Q probability matrix and numerator array
52+
tuple[ndarray, ndarray]: Q matrix and numerator
53+
54+
Example:
55+
>>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
56+
>>> q, num = _compute_low_dim_affinities(y)
57+
>>> q.shape
58+
(2, 2)
8059
"""
81-
sum_y = np.sum(np.square(y), axis=1)
82-
num = 1 / (1 + np.add(np.add(-2 * np.dot(y, y.T), sum_y).T, sum_y))
60+
sum_y = np.sum(np.square(low_dim_embedding), axis=1)
61+
num = 1 / (1 + np.add(np.add(-2 * np.dot(low_dim_embedding, low_dim_embedding.T), sum_y).T, sum_y))
8362
np.fill_diagonal(num, 0)
8463
q = num / np.sum(num)
8564
return q, num
8665

8766

88-
def apply_tsne(
89-
data_x: np.ndarray,
90-
n_components: int = 2,
91-
learning_rate: float = 200.0,
92-
n_iter: int = 500,
93-
) -> np.ndarray:
67+
class TSNE:
    """
    t-SNE for dimensionality reduction (van der Maaten & Hinton, 2008).

    Args:
        n_components (int): target dimension (default: 2)
        learning_rate (float): gradient descent step size (default: 200)
        n_iter (int): number of iterations (default: 500)
        random_state (int | None): seed for reproducible embeddings
            (default: None, i.e. a fresh non-deterministic seed per fit)

    Example:
        >>> x, _ = load_iris(return_X_y=True)
        >>> tsne = TSNE(n_components=2, n_iter=50)
        >>> _ = tsne.fit(x)
        >>> emb = tsne.embedding_
        >>> emb.shape
        (150, 2)
    """

    def __init__(
        self,
        *,
        n_components: int = 2,
        learning_rate: float = 200.0,
        n_iter: int = 500,
        random_state: int | None = None,
    ) -> None:
        if n_components < 1:
            raise ValueError("n_components must be >= 1")
        if n_iter < 1:
            raise ValueError("n_iter must be >= 1")
        self.n_components = n_components
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.random_state = random_state
        # Populated by fit(); None until then.
        self.embedding_: ndarray | None = None

    def fit(self, data_x: ndarray) -> "TSNE":
        """
        Fit t-SNE on data and compute the low-dimensional embedding.

        Args:
            data_x (ndarray): shape (n_samples, n_features)

        Returns:
            TSNE: self, to allow sklearn-style call chaining

        Example:
            >>> x, _ = load_iris(return_X_y=True)
            >>> tsne = TSNE(n_iter=10, random_state=0)
            >>> tsne.fit(x).embedding_.shape
            (150, 2)
        """
        n_samples = data_x.shape[0]
        # Seeded generator makes the embedding reproducible when
        # random_state is given; None keeps the previous behavior.
        rng = np.random.default_rng(self.random_state)
        # Small random initialization, as in the original paper.
        y = rng.standard_normal((n_samples, self.n_components)) * 1e-4

        p = _compute_pairwise_affinities(data_x)
        p = np.maximum(p, 1e-12)  # guard against zeros in the KL gradient

        y_inc = np.zeros_like(y)
        momentum = 0.5

        for iteration in range(self.n_iter):
            q, num = _compute_low_dim_affinities(y)
            q = np.maximum(q, 1e-12)
            pq = p - q

            # Gradient of the KL divergence for the Student-t kernel
            # (eq. 5 in van der Maaten & Hinton, 2008).
            d_y = 4 * (
                np.dot((pq * num), y)
                - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y)
            )

            # Gradient descent with momentum.
            y_inc = momentum * y_inc - self.learning_rate * d_y
            y += y_inc

            # Raise momentum once a quarter of the iterations are done.
            if iteration == self.n_iter // 4:
                momentum = 0.8

        self.embedding_ = y
        return self

    def transform(self, data_x: ndarray) -> ndarray:
        """
        Return the computed embedding after fitting.

        Note: t-SNE is non-parametric, so new points cannot be embedded;
        ``data_x`` exists only for API consistency and is ignored.

        Args:
            data_x (ndarray): unused, exists for API consistency

        Returns:
            ndarray: low-dimensional embedding computed by fit()

        Raises:
            ValueError: if fit() has not been called yet

        Example:
            >>> x, _ = load_iris(return_X_y=True)
            >>> tsne = TSNE(n_iter=10, random_state=0)
            >>> tsne.fit(x).transform(x).shape
            (150, 2)
        """
        if self.embedding_ is None:
            raise ValueError("Fit the model first using fit()")
        return self.embedding_
157+
158+
159+
def collect_dataset() -> tuple[ndarray, ndarray]:
    """
    Load the Iris dataset as (features, labels).

    Returns:
        tuple[ndarray, ndarray]: features and labels

    Example:
        >>> x, y = collect_dataset()
        >>> x.shape
        (150, 4)
        >>> y.shape
        (150,)
    """
    features, labels = load_iris(return_X_y=True)
    return np.array(features), np.array(labels)
142175

143176

144177
def main() -> None:
    """
    Run t-SNE on the Iris dataset and print the first 5 embedded points.

    No doctest here on purpose: main() prints a random embedding, so any
    doctest example would fail under doctest.testmod().
    """
    data_x, data_y = collect_dataset()
    tsne = TSNE(n_components=2, n_iter=300)
    tsne.fit(data_x)
    print("t-SNE embedding (first 5 points):")
    print(tsne.embedding_[:5])

    # Optional visualization (data_y holds the class labels):
    # import matplotlib.pyplot as plt
    # plt.scatter(tsne.embedding_[:, 0], tsne.embedding_[:, 1], c=data_y, cmap="viridis")
    # plt.show()
158194

159195

160196
if __name__ == "__main__":
    import doctest

    # Run the module doctests first, then the demo.
    doctest.testmod()
    main()

0 commit comments

Comments
 (0)