Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DIRECTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,7 @@
* [Sequential Minimum Optimization](machine_learning/sequential_minimum_optimization.py)
* [Similarity Search](machine_learning/similarity_search.py)
* [Support Vector Machines](machine_learning/support_vector_machines.py)
* [t-SNE](machine_learning/tsne.py)
* [Word Frequency Functions](machine_learning/word_frequency_functions.py)
* [Xgboost Classifier](machine_learning/xgboost_classifier.py)
* [Xgboost Regressor](machine_learning/xgboost_regressor.py)
Expand Down
184 changes: 184 additions & 0 deletions machine_learning/tsne.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""
t-Distributed Stochastic Neighbor Embedding (t-SNE)
---------------------------------------------------

t-SNE is a nonlinear dimensionality reduction algorithm for visualizing
high-dimensional data in a low-dimensional space (2D or 3D).

It computes pairwise similarities in both spaces and minimizes the
Kullback-Leibler divergence using gradient descent.

References:
- van der Maaten, L. & Hinton, G. (2008), JMLR.
- https://lvdmaaten.github.io/tsne/
"""

import doctest
from typing import Tuple

Check failure on line 17 in machine_learning/tsne.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (UP035)

machine_learning/tsne.py:17:1: UP035 `typing.Tuple` is deprecated, use `tuple` instead

import numpy as np
from numpy import ndarray
from sklearn.datasets import load_iris


def collect_dataset() -> tuple[ndarray, ndarray]:
    """
    Load the Iris dataset and return its features and labels.

    Returns:
        tuple[ndarray, ndarray]: feature matrix of shape (150, 4) and
        target labels of shape (150,)

    Example:
        >>> x, y = collect_dataset()
        >>> x.shape
        (150, 4)
        >>> y.shape
        (150,)
    """
    # Builtin ``tuple`` is used in the annotation instead of the deprecated
    # ``typing.Tuple`` (Ruff UP006/UP035).
    data = load_iris()
    return np.array(data.data), np.array(data.target)


def compute_pairwise_affinities(data_x: ndarray, sigma: float = 1.0) -> ndarray:
"""
Compute high-dimensional affinities (P matrix) using Gaussian kernel.

Args:
data_x: Input data of shape (n_samples, n_features)
sigma: Gaussian kernel bandwidth

Returns:
ndarray: Symmetrized probability matrix

Example:
>>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
>>> p = compute_pairwise_affinities(x)
>>> float(round(p[0, 1], 3))
0.25
"""
n_samples = data_x.shape[0]
sum_x = np.sum(np.square(data_x), axis=1)
dist_sq = np.add(np.add(-2 * np.dot(data_x, data_x.T), sum_x).T, sum_x)
p = np.exp(-dist_sq / (2 * sigma**2))
np.fill_diagonal(p, 0)
p /= np.sum(p)
return (p + p.T) / (2 * n_samples)


def compute_low_dim_affinities(
    low_dim_embedding: ndarray,
) -> tuple[ndarray, ndarray]:
    """
    Compute low-dimensional affinities (Q matrix) using a Student-t kernel.

    Args:
        low_dim_embedding: shape (n_samples, n_components)

    Returns:
        tuple[ndarray, ndarray]: Q probability matrix and the Student-t
        numerator matrix (reused by the gradient computation)

    Example:
        >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
        >>> q, num = compute_low_dim_affinities(y)
        >>> q.shape
        (2, 2)
    """
    # Builtin ``tuple`` is used in the annotation instead of the deprecated
    # ``typing.Tuple`` (Ruff UP006/UP035).
    sum_y = np.sum(np.square(low_dim_embedding), axis=1)
    # Student-t kernel with one degree of freedom: 1 / (1 + ||y_i - y_j||^2),
    # where the squared distance is expanded as ||a||^2 + ||b||^2 - 2 a.b.
    numerator = 1 / (
        1
        + np.add(
            np.add(-2 * np.dot(low_dim_embedding, low_dim_embedding.T), sum_y).T,
            sum_y,
        )
    )
    np.fill_diagonal(numerator, 0)  # exclude self-affinities
    q = numerator / np.sum(numerator)
    return q, numerator


def apply_tsne(
    data_x: ndarray,
    n_components: int = 2,
    learning_rate: float = 200.0,
    n_iter: int = 500,
    random_state: int | None = None,
) -> ndarray:
    """
    Apply t-SNE for dimensionality reduction.

    Args:
        data_x: Original dataset (features), shape (n_samples, n_features)
        n_components: Target dimension (2D or 3D)
        learning_rate: Step size for gradient descent
        n_iter: Number of iterations
        random_state: Optional seed for the random initial embedding,
            for reproducible results

    Returns:
        ndarray: Low-dimensional embedding of shape (n_samples, n_components)

    Raises:
        ValueError: If ``n_components`` or ``n_iter`` is less than 1.

    Example:
        >>> x, _ = collect_dataset()
        >>> y_emb = apply_tsne(x, n_components=2, n_iter=50)
        >>> y_emb.shape
        (150, 2)
    """
    if n_components < 1 or n_iter < 1:
        raise ValueError("n_components and n_iter must be >= 1")

    n_samples = data_x.shape[0]
    rng = np.random.default_rng(random_state)
    # Small random initialization, as in the reference implementation.
    y = rng.standard_normal((n_samples, n_components)) * 1e-4

    p = compute_pairwise_affinities(data_x)
    p = np.maximum(p, 1e-12)  # guard against zero probabilities

    y_inc = np.zeros_like(y)
    momentum = 0.5

    for i in range(n_iter):
        q, num = compute_low_dim_affinities(y)
        q = np.maximum(q, 1e-12)

        pq = p - q
        weights = pq * num
        # KL-divergence gradient: dC/dy_i = 4 * sum_j (p_ij - q_ij) *
        # num_ij * (y_i - y_j), i.e. rowsum(weights) * y - weights @ y.
        # BUGFIX: the previous version had the two terms swapped, which
        # negated the gradient and made the update *ascend* the KL
        # divergence instead of descending it.
        d_y = 4 * (
            np.multiply(np.sum(weights, axis=1)[:, np.newaxis], y)
            - np.dot(weights, y)
        )

        # Gradient descent with momentum.
        y_inc = momentum * y_inc - learning_rate * d_y
        y += y_inc

        # Raise momentum after the initial quarter of iterations, mirroring
        # the two-phase schedule of the reference implementation.
        if i == int(n_iter / 4):
            momentum = 0.8

    return y


def main() -> None:
    """
    Run t-SNE on the Iris dataset and display the first 5 embeddings.

    Example:
        >>> main()  # doctest: +SKIP
    """
    # The example above is skipped: ``main`` prints to stdout, so running it
    # under ``doctest.testmod()`` would fail (no expected output is declared)
    # and would execute a full 300-iteration t-SNE during the test run.
    data_x, _ = collect_dataset()
    y_emb = apply_tsne(data_x, n_components=2, n_iter=300)

    if not isinstance(y_emb, np.ndarray):
        raise TypeError("t-SNE embedding must be an ndarray")

    print("t-SNE embedding (first 5 points):")
    print(y_emb[:5])


if __name__ == "__main__":
    # Run the embedded doctests first, then the Iris demo.
    doctest.testmod()
    main()
Loading