Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DIRECTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,7 @@
* [Sequential Minimum Optimization](machine_learning/sequential_minimum_optimization.py)
* [Similarity Search](machine_learning/similarity_search.py)
* [Support Vector Machines](machine_learning/support_vector_machines.py)
* [t-SNE](machine_learning/tsne.py)
* [Word Frequency Functions](machine_learning/word_frequency_functions.py)
* [Xgboost Classifier](machine_learning/xgboost_classifier.py)
* [Xgboost Regressor](machine_learning/xgboost_regressor.py)
Expand Down
160 changes: 160 additions & 0 deletions machine_learning/tsne.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""
t-Distributed Stochastic Neighbor Embedding (t-SNE)
---------------------------------------------------

t-SNE is a nonlinear dimensionality reduction algorithm for visualizing
high-dimensional data in a low-dimensional space (2D or 3D).

It computes pairwise similarities in both spaces and minimizes the
Kullback-Leibler divergence using gradient descent.

References:
- van der Maaten, L. & Hinton, G. (2008), JMLR.
- https://lvdmaaten.github.io/tsne/
"""

import doctest

import numpy as np
from sklearn.datasets import load_iris


def collect_dataset() -> tuple[np.ndarray, np.ndarray]:
    """
    Fetch the Iris dataset as plain NumPy arrays.

    Returns:
        tuple[np.ndarray, np.ndarray]: the (150, 4) feature matrix and
        the (150,) integer class labels.

    Example:
        >>> x, y = collect_dataset()
        >>> x.shape
        (150, 4)
        >>> y.shape
        (150,)
    """
    iris = load_iris()
    features = np.array(iris.data)
    labels = np.array(iris.target)
    return features, labels


def compute_pairwise_affinities(data_x: np.ndarray, sigma: float = 1.0) -> np.ndarray:
    """
    Build the symmetric high-dimensional affinity matrix (P) from a
    Gaussian kernel over pairwise squared Euclidean distances.

    Args:
        data_x: Input data of shape (n_samples, n_features)
        sigma: Bandwidth of the Gaussian kernel

    Returns:
        np.ndarray: Symmetrized probability matrix of shape
        (n_samples, n_samples) with a zero diagonal

    Example:
        >>> import numpy as np
        >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
        >>> p = compute_pairwise_affinities(x)
        >>> float(round(p[0, 1], 3))
        0.25
    """
    n_samples = data_x.shape[0]
    # Squared distances via the identity ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2.
    squared_norms = np.sum(np.square(data_x), axis=1)
    gram = np.dot(data_x, data_x.T)
    distances = np.add(np.add(-2 * gram, squared_norms).T, squared_norms)

    affinities = np.exp(-distances / (2 * sigma**2))
    np.fill_diagonal(affinities, 0)  # a point is never its own neighbor
    affinities /= np.sum(affinities)
    # Symmetrize and rescale, mirroring (p + p.T) / (2n) from the paper.
    return (affinities + affinities.T) / (2 * n_samples)


def compute_low_dim_affinities(
    y: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute low-dimensional affinities (Q matrix) using Student-t distribution.

    Args:
        y: Low-dimensional embeddings of shape (n_samples, n_components)

    Returns:
        Tuple[np.ndarray, np.ndarray]: Q probability matrix and the
        Student-t numerator array (reused by the gradient computation)

    Example:
        >>> import numpy as np
        >>> emb = np.array([[0.0, 0.0], [1.0, 0.0]])
        >>> q, num = compute_low_dim_affinities(emb)
        >>> float(round(q[0, 1], 3))
        0.5
        >>> float(round(num[0, 1], 3))
        0.5
    """
    # Squared distances via ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2,
    # then the heavy-tailed Student-t kernel 1 / (1 + d^2).
    sum_y = np.sum(np.square(y), axis=1)
    num = 1 / (1 + np.add(np.add(-2 * np.dot(y, y.T), sum_y).T, sum_y))
    np.fill_diagonal(num, 0)  # a point is never its own neighbor
    q = num / np.sum(num)
    return q, num


def apply_tsne(
    data_x: np.ndarray,
    n_components: int = 2,
    learning_rate: float = 200.0,
    n_iter: int = 500,
    random_state: int | None = None,
) -> np.ndarray:
    """
    Apply t-SNE for dimensionality reduction.

    Args:
        data_x: Original dataset (features)
        n_components: Target dimension (2D or 3D)
        learning_rate: Step size for gradient descent
        n_iter: Number of iterations
        random_state: Optional seed for the random initial embedding;
            pass an int for reproducible output (default None keeps the
            previous nondeterministic behavior)

    Returns:
        np.ndarray: Low-dimensional embedding of the data

    Raises:
        ValueError: If n_components or n_iter is less than 1.

    Example:
        >>> x, _ = collect_dataset()
        >>> y_emb = apply_tsne(x, n_components=2, n_iter=50)
        >>> y_emb.shape
        (150, 2)
    """
    if n_components < 1 or n_iter < 1:
        raise ValueError("n_components and n_iter must be >= 1")

    n_samples = data_x.shape[0]
    rng = np.random.default_rng(random_state)
    # Start from a tiny random embedding, as in the original paper.
    y = rng.standard_normal((n_samples, n_components)) * 1e-4

    p = compute_pairwise_affinities(data_x)
    p = np.maximum(p, 1e-12)  # floor probabilities to avoid division blow-ups

    y_inc = np.zeros_like(y)  # momentum accumulator for gradient descent
    momentum = 0.5

    for i in range(n_iter):
        q, num = compute_low_dim_affinities(y)
        q = np.maximum(q, 1e-12)

        # KL-divergence gradient: dC/dy_i = 4 * sum_j (p_ij - q_ij) num_ij (y_i - y_j)
        pq = p - q
        d_y = 4 * (
            np.dot((pq * num), y)
            - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y)
        )

        y_inc = momentum * y_inc - learning_rate * d_y
        y += y_inc

        # Raise momentum after the first quarter of iterations.
        if i == int(n_iter / 4):
            momentum = 0.8

    return y


def main() -> None:
    """
    Run t-SNE on the Iris dataset and display the first 5 embeddings.

    Example:
        >>> main()  # doctest: +ELLIPSIS
        t-SNE embedding (first 5 points):
        ...
    """
    data_x, _ = collect_dataset()
    y_emb = apply_tsne(data_x, n_components=2, n_iter=300)

    print("t-SNE embedding (first 5 points):")
    print(y_emb[:5])


if __name__ == "__main__":
    doctest.testmod()
    main()
Loading