11import doctest
2-
32import numpy as np
43from numpy import ndarray
54from sklearn .datasets import load_iris
65
76
def collect_dataset() -> tuple[ndarray, ndarray]:
    """
    Load the Iris dataset and return features and labels.

    Returns:
        tuple[ndarray, ndarray]: Feature matrix and target labels.

    Example:
        >>> features, targets = collect_dataset()
        >>> features.shape
        (150, 4)
        >>> targets.shape
        (150,)
    """
    iris_dataset = load_iris()
    # np.array copies by default, so the returned arrays do not alias the
    # arrays held by the loaded dataset object.
    return np.array(iris_dataset.data), np.array(iris_dataset.target)
2223
2324
def compute_pairwise_affinities(data_matrix: ndarray, sigma: float = 1.0) -> ndarray:
    """
    Compute high-dimensional affinities (P matrix) using a Gaussian kernel.

    The kernel values are normalized over the whole matrix, then symmetrized
    and divided by ``2 * n_samples``. As a consequence the returned entries
    sum to ``1 / n_samples`` (not 1), which is what the example below pins.

    Args:
        data_matrix: Input data of shape (n_samples, n_features).
        sigma: Gaussian kernel bandwidth; must be strictly positive.

    Returns:
        ndarray: Symmetric (n_samples, n_samples) affinity matrix with a
        zero diagonal.

    Raises:
        ValueError: If ``sigma`` is not strictly positive.

    Example:
        >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
        >>> probabilities = compute_pairwise_affinities(x)
        >>> float(round(probabilities[0, 1], 3))
        0.25
    """
    if sigma <= 0:
        # sigma == 0 would divide by zero below and silently yield NaN/inf.
        raise ValueError("sigma must be > 0")

    n_samples = data_matrix.shape[0]
    squared_norms = np.sum(np.square(data_matrix), axis=1)

    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b, for every pair at once.
    squared_distance = (
        squared_norms[:, np.newaxis]
        + squared_norms[np.newaxis, :]
        - 2 * np.dot(data_matrix, data_matrix.T)
    )

    affinity_matrix = np.exp(-squared_distance / (2 * sigma**2))
    # A point is not its own neighbour.
    np.fill_diagonal(affinity_matrix, 0)

    affinity_matrix /= np.sum(affinity_matrix)
    return (affinity_matrix + affinity_matrix.T) / (2 * n_samples)
4551
4652
def compute_low_dim_affinities(embedding_matrix: ndarray) -> tuple[ndarray, ndarray]:
    """
    Compute low-dimensional affinities (Q matrix) using a Student-t distribution.

    Args:
        embedding_matrix: Low-dimensional embedding of shape
            (n_samples, n_components).

    Returns:
        tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix).
        The numerator is ``1 / (1 + squared distance)`` with a zeroed
        diagonal; Q is that numerator divided by its total sum.

    Example:
        >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
        >>> q_matrix, numerators = compute_low_dim_affinities(y)
        >>> q_matrix.shape
        (2, 2)
    """
    squared_norms = np.sum(np.square(embedding_matrix), axis=1)

    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b, for every pair at once.
    squared_distance = (
        squared_norms[:, np.newaxis]
        + squared_norms[np.newaxis, :]
        - 2 * np.dot(embedding_matrix, embedding_matrix.T)
    )

    numerator_matrix = 1 / (1 + squared_distance)
    # Self-affinities are excluded from the normalization.
    np.fill_diagonal(numerator_matrix, 0)

    q_matrix = numerator_matrix / np.sum(numerator_matrix)
    return q_matrix, numerator_matrix
7181
7282
def apply_tsne(
    data_matrix: ndarray,
    n_components: int = 2,
    learning_rate: float = 200.0,
    n_iter: int = 500,
) -> ndarray:
    """
    Apply t-SNE for dimensionality reduction.

    The embedding starts from small, unseeded Gaussian noise, so the exact
    coordinates differ between runs; only the output shape is deterministic.

    Args:
        data_matrix: Original dataset (features), shape (n_samples, n_features).
        n_components: Number of embedding dimensions; must be >= 1.
        learning_rate: Step size for gradient descent.
        n_iter: Number of iterations; must be >= 1.

    Returns:
        ndarray: Low-dimensional embedding of shape (n_samples, n_components).

    Raises:
        ValueError: If ``n_components`` or ``n_iter`` is smaller than 1.

    Example:
        >>> features, _ = collect_dataset()
        >>> embedding = apply_tsne(features, n_components=2, n_iter=50)
        >>> embedding.shape
        (150, 2)
    """
    if n_components < 1 or n_iter < 1:
        raise ValueError("n_components and n_iter must be >= 1")

    n_samples = data_matrix.shape[0]
    rng = np.random.default_rng()
    embedding = rng.standard_normal((n_samples, n_components)) * 1e-4

    # Floor the affinities at 1e-12 so no entry is exactly zero.
    high_dim_affinities = np.maximum(compute_pairwise_affinities(data_matrix), 1e-12)

    embedding_increment = np.zeros_like(embedding)
    momentum = 0.5
    # Momentum is raised once, after the iteration at one quarter of the run.
    momentum_switch_iteration = n_iter // 4

    for iteration in range(n_iter):
        low_dim_affinities, numerator_matrix = compute_low_dim_affinities(embedding)
        low_dim_affinities = np.maximum(low_dim_affinities, 1e-12)

        # W_ij = (p_ij - q_ij) / (1 + ||y_i - y_j||^2); computed once per step.
        weights = (high_dim_affinities - low_dim_affinities) * numerator_matrix

        # dC/dy_i = 4 * sum_j W_ij * (y_i - y_j)
        #         = 4 * (rowsum(W)_i * y_i - (W @ Y)_i).
        # The previous code had the two terms swapped, which negated the
        # gradient and turned the update below into gradient *ascent*.
        gradient = 4 * (
            np.sum(weights, axis=1)[:, np.newaxis] * embedding
            - np.dot(weights, embedding)
        )

        embedding_increment = momentum * embedding_increment - learning_rate * gradient
        embedding += embedding_increment

        if iteration == momentum_switch_iteration:
            momentum = 0.8

    return embedding
124138
125139
def main() -> None:
    """
    Run t-SNE on the Iris dataset and display the first 5 embeddings.

    Prints the first five embedded points and then builds a scatter plot of
    the two embedding dimensions coloured by class label.

    Raises:
        TypeError: If the value returned by ``apply_tsne`` is not an ndarray.

    Example:
        >>> main()  # doctest: +ELLIPSIS
        t-SNE embedding (first 5 points):
        [[...
    """
    features, labels = collect_dataset()
    # One name for the embedding throughout; the mixed use of ``y_emb`` and
    # ``embedding`` previously raised NameError at the isinstance check.
    embedding = apply_tsne(features, n_components=2, n_iter=300)

    if not isinstance(embedding, np.ndarray):
        raise TypeError("t-SNE embedding must be an ndarray")

    print("t-SNE embedding (first 5 points):")
    print(embedding[:5])

    # Optional visualization; matplotlib is imported here rather than at
    # module level, so importing this module does not require it.
    import matplotlib.pyplot as plt

    plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis")
    plt.title("t-SNE Visualization of Iris Dataset")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
@@ -153,3 +172,4 @@ def main() -> None:
if __name__ == "__main__":
    # Run the module's doctests first, then the demo.
    doctest.testmod()
    main()
175+
0 commit comments