-
-
Notifications
You must be signed in to change notification settings - Fork 50.2k
Add t-SNE algorithm for dimensionality reduction (#13432) #13436
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
85aca0f
af84d7f
88666f0
6cff5b8
c235a71
25571d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,160 @@ | ||
| """ | ||
| t-Distributed Stochastic Neighbor Embedding (t-SNE) | ||
| --------------------------------------------------- | ||
|
|
||
| t-SNE is a nonlinear dimensionality reduction algorithm for visualizing | ||
| high-dimensional data in a low-dimensional space (2D or 3D). | ||
|
|
||
| It computes pairwise similarities in both spaces and minimizes the | ||
| Kullback-Leibler divergence using gradient descent. | ||
|
|
||
| References: | ||
| - van der Maaten, L. & Hinton, G. (2008), JMLR. | ||
| - https://lvdmaaten.github.io/tsne/ | ||
| """ | ||
|
|
||
| import doctest | ||
|
|
||
| import numpy as np | ||
| from sklearn.datasets import load_iris | ||
|
|
||
|
|
||
def collect_dataset() -> tuple[np.ndarray, np.ndarray]:
    """
    Fetch the Iris dataset and hand back its features and class labels.

    Returns:
        Tuple[np.ndarray, np.ndarray]: feature matrix and target labels

    Example:
        >>> x, y = collect_dataset()
        >>> x.shape
        (150, 4)
        >>> y.shape
        (150,)
    """
    iris = load_iris()
    features = np.array(iris.data)
    labels = np.array(iris.target)
    return features, labels
|
|
||
|
|
||
def compute_pairwise_affinities(data_x: np.ndarray, sigma: float = 1.0) -> np.ndarray:
    """
    Compute high-dimensional affinities (P matrix) using a Gaussian kernel.

    Every row of the kernel matrix is normalised into the conditional
    distribution p(j|i). The joint distribution is then
    p_ij = (p(j|i) + p(i|j)) / (2 * n_samples), which sums to 1 over all pairs.

    Args:
        data_x: Input data of shape (n_samples, n_features)
        sigma: Gaussian kernel bandwidth shared by all points (must be > 0)

    Returns:
        np.ndarray: Symmetric joint probability matrix of shape
            (n_samples, n_samples) with a zero diagonal

    Raises:
        ValueError: If sigma is not strictly positive

    Example:
        >>> import numpy as np
        >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
        >>> p = compute_pairwise_affinities(x)
        >>> float(round(p[0, 1], 3))
        0.5
        >>> float(round(p.sum(), 3))
        1.0
    """
    if sigma <= 0:
        raise ValueError("sigma must be > 0")

    n_samples = data_x.shape[0]

    # Squared Euclidean distances via ||a||^2 - 2 a.b + ||b||^2.
    sum_x = np.sum(np.square(data_x), axis=1)
    sq_dist = sum_x[:, np.newaxis] - 2 * np.dot(data_x, data_x.T) + sum_x[np.newaxis, :]
    # Floating-point round-off can leave tiny negative values; distances are >= 0.
    sq_dist = np.maximum(sq_dist, 0)

    kernel = np.exp(-sq_dist / (2 * sigma**2))
    np.fill_diagonal(kernel, 0)  # a point is never its own neighbour

    # Normalise per row to obtain p(j|i). Normalising by the global sum and then
    # dividing by 2n would leave a matrix that sums to 1/n instead of 1.
    row_sums = np.sum(kernel, axis=1, keepdims=True)
    conditional = np.divide(
        kernel,
        row_sums,
        out=np.zeros_like(kernel),
        where=row_sums > 0,  # rows without neighbours stay all-zero instead of NaN
    )

    return (conditional + conditional.T) / (2 * n_samples)
|
|
||
|
|
||
def compute_low_dim_affinities(
    y: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute low-dimensional affinities (Q matrix) using Student-t distribution.

    The unnormalised similarity of two embedded points is
    1 / (1 + squared distance); Q is that matrix divided by its total sum.

    Args:
        y: Low-dimensional embeddings of shape (n_samples, n_components)

    Returns:
        Tuple[np.ndarray, np.ndarray]: Q probability matrix and numerator array,
            both of shape (n_samples, n_samples) with a zero diagonal

    Example:
        >>> import numpy as np
        >>> emb = np.array([[0.0, 0.0], [1.0, 0.0]])
        >>> q, num = compute_low_dim_affinities(emb)
        >>> float(round(q[0, 1], 3))
        0.5
        >>> float(round(num[0, 1], 3))
        0.5
        >>> float(round(q.sum(), 3))
        1.0
    """
    # Squared Euclidean distances via ||a||^2 - 2 a.b + ||b||^2.
    sum_y = np.sum(np.square(y), axis=1)
    sq_dist = sum_y[:, np.newaxis] - 2 * np.dot(y, y.T) + sum_y[np.newaxis, :]
    # Round-off can make near-zero distances slightly negative; clamp them.
    num = 1 / (1 + np.maximum(sq_dist, 0))
    np.fill_diagonal(num, 0)  # self-similarity is excluded

    # With a single sample there are no pairs, so the normaliser is zero;
    # return an all-zero Q instead of dividing by zero and producing NaN.
    total = np.sum(num)
    q = num / total if total > 0 else np.zeros_like(num)
    return q, num
|
|
||
|
|
||
def apply_tsne(
    data_x: np.ndarray,
    n_components: int = 2,
    learning_rate: float = 200.0,
    n_iter: int = 500,
) -> np.ndarray:
    """
    Apply t-SNE for dimensionality reduction.

    The embedding starts from small Gaussian noise and is refined by gradient
    descent with momentum on the Kullback-Leibler divergence between the
    high-dimensional affinities P and the low-dimensional affinities Q.

    Args:
        data_x: Original dataset (features)
        n_components: Target dimension (2D or 3D)
        learning_rate: Step size for gradient descent
        n_iter: Number of iterations

    Returns:
        np.ndarray: Low-dimensional embedding of the data

    Raises:
        ValueError: If n_components or n_iter is smaller than 1

    Example:
        >>> x, _ = collect_dataset()
        >>> y_emb = apply_tsne(x, n_components=2, n_iter=50)
        >>> y_emb.shape
        (150, 2)
    """
    if n_components < 1 or n_iter < 1:
        raise ValueError("n_components and n_iter must be >= 1")

    n_samples = data_x.shape[0]
    rng = np.random.default_rng()
    # Start from a tiny random cloud so that no direction is favoured.
    y = rng.standard_normal((n_samples, n_components)) * 1e-4

    p = compute_pairwise_affinities(data_x)
    p = np.maximum(p, 1e-12)  # floor away exact zeros

    y_inc = np.zeros_like(y)
    momentum = 0.5
    momentum_switch_iter = n_iter // 4

    for i in range(n_iter):
        q, num = compute_low_dim_affinities(y)
        q = np.maximum(q, 1e-12)

        # KL gradient: dC/dy_i = 4 * sum_j (p_ij - q_ij) * num_ij * (y_i - y_j)
        #            = 4 * (rowsum(W) * y_i - (W @ y)_i),  W = (p - q) * num.
        # The two terms must appear in this order; swapping them negates the
        # gradient and the update below would then climb the loss.
        weights = (p - q) * num
        d_y = 4 * (
            np.multiply(np.sum(weights, axis=1)[:, np.newaxis], y)
            - np.dot(weights, y)
        )

        # Momentum update: step against the gradient.
        y_inc = momentum * y_inc - learning_rate * d_y
        y += y_inc

        # Raise the momentum after the first quarter of the iterations.
        if i == momentum_switch_iter:
            momentum = 0.8

    return y
|
|
||
|
|
||
def main() -> None:
    """
    Embed the Iris features in 2D with t-SNE and print the first five rows.
    """
    features = collect_dataset()[0]
    embedding = apply_tsne(features, n_components=2, n_iter=300)

    print("t-SNE embedding (first 5 points):", embedding[:5], sep="\n")

    # Optional visualization: keep the labels returned by collect_dataset()
    # and use them to colour a matplotlib scatter plot of the two columns.
|
|
||
|
|
||
if __name__ == "__main__":
    # Check the docstring examples first, then run the demo.
    doctest.testmod()
    main()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As there is no test file in this pull request nor any test function or class in the file
machine_learning/tsne.py, please provide doctest for the function compute_low_dim_affinities