"""
t-Distributed Stochastic Neighbor Embedding (t-SNE)
---------------------------------------------------

Nonlinear dimensionality reduction for visualizing high-dimensional data
in 2D or 3D. Computes pairwise similarities in high and low-dimensional
spaces and minimizes Kullback-Leibler divergence using gradient descent.

References:
- van der Maaten, L. & Hinton, G. (2008), JMLR.
- https://lvdmaaten.github.io/tsne/
"""
1513
16- import doctest
17-
1814import numpy as np
15+ from numpy import ndarray
1916from sklearn .datasets import load_iris
2017
21-
22- def collect_dataset () -> tuple [np .ndarray , np .ndarray ]:
18+ def _compute_pairwise_affinities (data_x : ndarray , sigma : float = 1.0 ) -> ndarray :
2319 """
24- Load Iris dataset and return features and labels.
25-
26- Returns:
27- Tuple[np.ndarray, np.ndarray]: feature matrix and target labels
28-
29- Example:
30- >>> x, y = collect_dataset()
31- >>> x.shape
32- (150, 4)
33- >>> y.shape
34- (150,)
35- """
36- data = load_iris ()
37- return np .array (data .data ), np .array (data .target )
38-
39-
40- def compute_pairwise_affinities (
41- data_x : np .ndarray , sigma : float = 1.0
42- ) -> np .ndarray :
43- """
44- Compute high-dimensional affinities (P matrix) using Gaussian kernel.
20+ Compute high-dimensional affinities using Gaussian kernel.
4521
4622 Args:
47- data_x: Input data of shape (n_samples, n_features)
48- sigma: Gaussian kernel bandwidth
23+ data_x (ndarray): shape (n_samples, n_features)
24+ sigma (float) : Gaussian kernel bandwidth
4925
5026 Returns:
51- np. ndarray: Symmetrized probability matrix
27+ ndarray: Symmetrized probability matrix
5228
5329 Example:
54- >>> import numpy as np
5530 >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
56- >>> p = compute_pairwise_affinities (x)
31+ >>> p = _compute_pairwise_affinities (x)
5732 >>> float(round(p[0, 1], 3))
5833 0.25
5934 """
@@ -66,97 +41,159 @@ def compute_pairwise_affinities(
6641 return (p + p .T ) / (2 * n_samples )
6742
6843
69- def compute_low_dim_affinities (
70- y : np .ndarray ,
71- ) -> tuple [np .ndarray , np .ndarray ]:
44+ def _compute_low_dim_affinities (low_dim_embedding : ndarray ) -> tuple [ndarray , ndarray ]:
7245 """
73- Compute low-dimensional affinities (Q matrix) using Student-t distribution.
46+ Compute low-dimensional affinities using Student-t distribution.
7447
7548 Args:
76- y: Low-dimensional embeddings of shape (n_samples, n_components)
49+ low_dim_embedding (ndarray): shape (n_samples, n_components)
7750
7851 Returns:
79- Tuple[np.ndarray, np.ndarray]: Q probability matrix and numerator array
52+ tuple[ndarray, ndarray]: Q matrix and numerator
53+
54+ Example:
55+ >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
56+ >>> q, num = _compute_low_dim_affinities(y)
57+ >>> q.shape
58+ (2, 2)
8059 """
81- sum_y = np .sum (np .square (y ), axis = 1 )
82- num = 1 / (1 + np .add (np .add (- 2 * np .dot (y , y .T ), sum_y ).T , sum_y ))
60+ sum_y = np .sum (np .square (low_dim_embedding ), axis = 1 )
61+ num = 1 / (1 + np .add (np .add (- 2 * np .dot (low_dim_embedding , low_dim_embedding .T ), sum_y ).T , sum_y ))
8362 np .fill_diagonal (num , 0 )
8463 q = num / np .sum (num )
8564 return q , num
8665
8766
88- def apply_tsne (
89- data_x : np .ndarray ,
90- n_components : int = 2 ,
91- learning_rate : float = 200.0 ,
92- n_iter : int = 500 ,
93- ) -> np .ndarray :
67+ class TSNE :
9468 """
95- Apply t-SNE for dimensionality reduction.
69+ t-SNE class for dimensionality reduction.
9670
9771 Args:
98- data_x: Original dataset (features)
99- n_components: Target dimension (2D or 3D)
100- learning_rate: Step size for gradient descent
101- n_iter: Number of iterations
102-
103- Returns:
104- np.ndarray: Low-dimensional embedding of the data
72+ n_components (int): target dimension (default: 2)
73+ learning_rate (float): gradient descent step size (default: 200)
74+ n_iter (int): number of iterations (default: 500)
10575
10676 Example:
107- >>> x, _ = collect_dataset()
108- >>> y_emb = apply_tsne(x, n_components=2, n_iter=50)
109- >>> y_emb.shape
77+ >>> x, _ = load_iris(return_X_y=True)
78+ >>> tsne = TSNE(n_components=2, n_iter=50)
79+ >>> tsne.fit(x)
80+ >>> emb = tsne.embedding_
81+ >>> emb.shape
11082 (150, 2)
11183 """
112- if n_components < 1 or n_iter < 1 :
113- raise ValueError ("n_components and n_iter must be >= 1" )
114-
115- n_samples = data_x .shape [0 ]
116- rng = np .random .default_rng ()
117- y = rng .standard_normal ((n_samples , n_components )) * 1e-4
118-
119- p = compute_pairwise_affinities (data_x )
120- p = np .maximum (p , 1e-12 )
121-
122- y_inc = np .zeros_like (y )
123- momentum = 0.5
124-
125- for i in range (n_iter ):
126- q , num = compute_low_dim_affinities (y )
127- q = np .maximum (q , 1e-12 )
12884
129- pq = p - q
130- d_y = 4 * (
131- np .dot ((pq * num ), y )
132- - np .multiply (np .sum (pq * num , axis = 1 )[:, np .newaxis ], y )
133- )
134-
135- y_inc = momentum * y_inc - learning_rate * d_y
136- y += y_inc
85+ def __init__ (self , * , n_components : int = 2 , learning_rate : float = 200.0 , n_iter : int = 500 ) -> None :
86+ if n_components < 1 :
87+ raise ValueError ("n_components must be >= 1" )
88+ if n_iter < 1 :
89+ raise ValueError ("n_iter must be >= 1" )
90+ self .n_components = n_components
91+ self .learning_rate = learning_rate
92+ self .n_iter = n_iter
93+ self .embedding_ : ndarray | None = None
94+
95+ def fit (self , data_x : ndarray ) -> None :
96+ """
97+ Fit t-SNE on data and compute low-dimensional embedding.
98+
99+ Args:
100+ data_x (ndarray): shape (n_samples, n_features)
101+
102+ Example:
103+ >>> x, _ = load_iris(return_X_y=True)
104+ >>> tsne = TSNE(n_iter=10)
105+ >>> tsne.fit(x)
106+ >>> tsne.embedding_.shape
107+ (150, 2)
108+ """
109+ n_samples = data_x .shape [0 ]
110+ rng = np .random .default_rng ()
111+ y = rng .standard_normal ((n_samples , self .n_components )) * 1e-4
112+
113+ p = _compute_pairwise_affinities (data_x )
114+ p = np .maximum (p , 1e-12 )
115+
116+ y_inc = np .zeros_like (y )
117+ momentum = 0.5
118+
119+ for i in range (self .n_iter ):
120+ q , num = _compute_low_dim_affinities (y )
121+ q = np .maximum (q , 1e-12 )
122+ pq = p - q
123+
124+ d_y = 4 * (
125+ np .dot ((pq * num ), y )
126+ - np .multiply (np .sum (pq * num , axis = 1 )[:, np .newaxis ], y )
127+ )
128+
129+ y_inc = momentum * y_inc - self .learning_rate * d_y
130+ y += y_inc
131+
132+ if i == int (self .n_iter / 4 ):
133+ momentum = 0.8
134+
135+ self .embedding_ = y
136+
137+ def transform (self , data_x : ndarray ) -> ndarray :
138+ """
139+ Return the computed embedding after fitting.
140+
141+ Args:
142+ data_x (ndarray): unused, exists for API consistency
143+
144+ Returns:
145+ ndarray: low-dimensional embedding
146+
147+ Example:
148+ >>> x, _ = load_iris(return_X_y=True)
149+ >>> tsne = TSNE(n_iter=10)
150+ >>> tsne.fit(x)
151+ >>> tsne.transform(x).shape
152+ (150, 2)
153+ """
154+ if self .embedding_ is None :
155+ raise ValueError ("Fit the model first using fit()" )
156+ return self .embedding_
157+
158+
def collect_dataset() -> tuple[ndarray, ndarray]:
    """
    Load the Iris dataset as NumPy arrays.

    Returns:
        tuple[ndarray, ndarray]: features and labels

    Example:
        >>> x, y = collect_dataset()
        >>> x.shape
        (150, 4)
        >>> y.shape
        (150,)
    """
    features, labels = load_iris(return_X_y=True)
    return np.array(features), np.array(labels)
142175
143176
def main() -> None:
    """
    Run t-SNE on the Iris dataset and print the first 5 embedded points.

    No doctest example here: main() prints to stdout and runs hundreds of
    gradient-descent iterations, so exercising it under doctest.testmod()
    would both fail (unexpected output) and be slow.
    """
    data_x, _ = collect_dataset()
    tsne = TSNE(n_components=2, n_iter=300)
    tsne.fit(data_x)
    print("t-SNE embedding (first 5 points):")
    print(tsne.embedding_[:5])

    # Optional visualization
    # import matplotlib.pyplot as plt
    # _, labels = collect_dataset()
    # plt.scatter(tsne.embedding_[:, 0], tsne.embedding_[:, 1], c=labels, cmap="viridis")
    # plt.show()
158194
159195
if __name__ == "__main__":
    import doctest

    # Run embedded doctests before the demo.
    doctest.testmod()
    main()
0 commit comments