Skip to content

Commit c235a71

Browse files
committed
Changed tsne.py
1 parent 6cff5b8 commit c235a71

File tree

1 file changed

+61
-104
lines changed

1 file changed

+61
-104
lines changed

machine_learning/tsne.py

Lines changed: 61 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -2,92 +2,84 @@
22
t-Distributed Stochastic Neighbor Embedding (t-SNE)
33
---------------------------------------------------
44
5-
t-SNE is a nonlinear dimensionality reduction algorithm used for visualizing
6-
high-dimensional data in a lower-dimensional (usually 2D or 3D) space.
5+
t-SNE is a nonlinear dimensionality reduction algorithm for visualizing
6+
high-dimensional data in a low-dimensional space (2D or 3D).
77
8-
It models pairwise similarities between points in both the high-dimensional
9-
and low-dimensional spaces, and minimizes the difference between them using
10-
gradient descent.
11-
12-
This simplified implementation demonstrates the core idea of t-SNE for
13-
educational purposes — it is **not optimized for large datasets**.
14-
15-
This implementation:
16-
- Computes pairwise similarities in the high-dimensional space.
17-
- Computes pairwise similarities in the low-dimensional (embedding) space.
18-
- Minimizes the Kullback-Leibler divergence between these distributions
19-
using gradient descent.
20-
- Follows the original t-SNE formulation by van der Maaten & Hinton (2008).
8+
It computes pairwise similarities in both spaces and minimizes the
9+
Kullback-Leibler divergence using gradient descent.
2110
2211
References:
23-
- van der Maaten, L. and Hinton, G. (2008).
24-
"Visualizing Data using t-SNE". Journal of Machine Learning Research.
12+
- van der Maaten, L. & Hinton, G. (2008), JMLR.
2513
- https://lvdmaaten.github.io/tsne/
2614
"""
2715

2816
import doctest
17+
2918
import numpy as np
3019
from sklearn.datasets import load_iris
3120

3221

3322
def collect_dataset() -> tuple[np.ndarray, np.ndarray]:
    """
    Load the Iris dataset and return its features and labels.

    Returns:
        tuple[np.ndarray, np.ndarray]: feature matrix of shape (150, 4)
            and target label vector of shape (150,)

    Example:
        >>> features, targets = collect_dataset()
        >>> features.shape
        (150, 4)
        >>> targets.shape
        (150,)
    """
    iris = load_iris()
    features = np.array(iris.data)
    targets = np.array(iris.target)
    return features, targets
4838

4939

50-
def compute_pairwise_affinities(
    data_x: np.ndarray, sigma: float = 1.0
) -> np.ndarray:
    """
    Compute the symmetrized high-dimensional affinity matrix P.

    Pairwise similarities are modeled with a Gaussian kernel over squared
    Euclidean distances, normalized to a probability distribution and then
    symmetrized as in the original t-SNE formulation.

    Args:
        data_x: Input data of shape (n_samples, n_features)
        sigma: Gaussian kernel bandwidth

    Returns:
        np.ndarray: Symmetrized probability matrix of shape
            (n_samples, n_samples)

    Example:
        >>> import numpy as np
        >>> points = np.array([[0.0, 0.0], [1.0, 0.0]])
        >>> affinities = compute_pairwise_affinities(points)
        >>> float(round(affinities[0, 1], 3))
        0.25
    """
    count = data_x.shape[0]
    # Squared Euclidean distances via the ||a||^2 - 2ab + ||b||^2 expansion.
    sq_norms = np.sum(data_x**2, axis=1)
    sq_dists = sq_norms[:, np.newaxis] - 2.0 * (data_x @ data_x.T) + sq_norms[np.newaxis, :]

    affinities = np.exp(-sq_dists / (2.0 * sigma**2))
    np.fill_diagonal(affinities, 0)  # a point is never its own neighbor
    affinities = affinities / np.sum(affinities)

    # Symmetrize: p_ij = (p_j|i + p_i|j) / (2n)
    return (affinities + affinities.T) / (2 * count)
7267

7368

74-
def compute_low_dim_affinities(
    y: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute the low-dimensional affinity matrix Q using a Student-t kernel.

    The heavy-tailed Student-t distribution (one degree of freedom) is what
    lets t-SNE place dissimilar points far apart in the embedding.

    Args:
        y: Low-dimensional embeddings of shape (n_samples, n_components)

    Returns:
        tuple[np.ndarray, np.ndarray]: the Q probability matrix and the
            unnormalized numerator array (reused by the gradient computation)
    """
    sq_norms = np.sum(y**2, axis=1)
    # Student-t numerator: 1 / (1 + squared Euclidean distance).
    sq_dists = sq_norms[:, np.newaxis] - 2.0 * (y @ y.T) + sq_norms[np.newaxis, :]
    numerator = 1 / (1 + sq_dists)
    np.fill_diagonal(numerator, 0)  # exclude self-similarity
    q = numerator / np.sum(numerator)
    return q, numerator
def apply_tsne(
    data_x: np.ndarray,
    n_components: int = 2,
    learning_rate: float = 200.0,
    n_iter: int = 500,
    random_state: int | None = None,
) -> np.ndarray:
    """
    Apply t-SNE for dimensionality reduction.

    Args:
        data_x: Original dataset (features)
        n_components: Target dimension (2D or 3D)
        learning_rate: Step size for gradient descent
        n_iter: Number of iterations
        random_state: Optional seed for the random initialization; pass an
            int for reproducible embeddings (default None keeps the previous
            non-deterministic behavior)

    Returns:
        np.ndarray: Low-dimensional embedding of the data

    Raises:
        ValueError: If ``n_components`` or ``n_iter`` is less than 1.

    Example:
        >>> x, _ = collect_dataset()
        >>> y_emb = apply_tsne(x, n_components=2, n_iter=50)
        >>> y_emb.shape
        (150, 2)
    """
    if n_components < 1 or n_iter < 1:
        raise ValueError("n_components and n_iter must be >= 1")

    n_samples = data_x.shape[0]
    # Small random initialization of the embedding, as in the original paper.
    rng = np.random.default_rng(random_state)
    y = rng.standard_normal((n_samples, n_components)) * 1e-4

    p = compute_pairwise_affinities(data_x)
    p = np.maximum(p, 1e-12)  # guard against log/0 issues in the KL objective

    y_inc = np.zeros_like(y)  # momentum accumulator
    momentum = 0.5

    for i in range(n_iter):
        q, num = compute_low_dim_affinities(y)
        q = np.maximum(q, 1e-12)

        pq = p - q
        # Gradient of KL(P || Q) w.r.t. the embedding y.
        d_y = 4 * (
            np.dot(pq * num, y)
            - np.sum(pq * num, axis=1)[:, np.newaxis] * y
        )

        # Gradient-descent step with momentum.
        y_inc = momentum * y_inc - learning_rate * d_y
        y += y_inc

        # Increase momentum after the first quarter of the iterations.
        if i == n_iter // 4:
            momentum = 0.8

    return y
154142

155143

156144
def main() -> None:
    """
    Run t-SNE on the Iris dataset and display the first 5 embeddings.
    """
    features, _ = collect_dataset()
    embedding = apply_tsne(features, n_components=2, n_iter=300)

    print("t-SNE embedding (first 5 points):")
    print(embedding[:5])

    # Optional visualization (commented out)
    # import matplotlib.pyplot as plt
    # plt.scatter(embedding[:, 0], embedding[:, 1], c=_labels, cmap="viridis")
    # plt.show()
177158

178159

179160
if __name__ == "__main__":
    # Run the module's doctests first, then the demonstration driver.
    doctest.testmod()
    main()
182-
183-
184-
"""
185-
Explanation of Input and Output
186-
--------------------------------
187-
188-
Input:
189-
- data_x: numpy array of shape (n_samples, n_features)
190-
Example: Iris dataset (150 samples × 4 features)
191-
- n_components: target dimension (usually 2 or 3)
192-
- learning_rate: gradient descent step size
193-
- n_iter: number of iterations for optimization
194-
195-
Output:
196-
- y_emb: numpy array of shape (n_samples, n_components)
197-
Each row is the low-dimensional embedding of the corresponding high-dimensional point.
198-
199-
How it works:
200-
1. Compute high-dimensional similarities (P matrix)
201-
2. Initialize low-dimensional map (y_emb) randomly
202-
3. Compute low-dimensional similarities (Q matrix)
203-
4. Minimize KL divergence between P and Q using gradient descent
204-
5. Update y_emb with momentum and learning rate iteratively
205-
"""

0 commit comments

Comments
 (0)