Skip to content

Commit e10941a

Browse files
committed
fix: resolve ruff linting errors in vision_transformer.py
- Replace Optional with X | None syntax (UP045)
- Use np.random.Generator instead of legacy np.random methods (NPY002)
- Fix line length violations (E501)
- Assign f-string literals to variables in exceptions (EM102)
- Remove unused variables and parameters (RUF059, F841)
- Add noqa comment for intentionally unused API parameter
- All doctests still pass successfully
1 parent 960111b commit e10941a

File tree

1 file changed

+35
-24
lines changed

1 file changed

+35
-24
lines changed

computer_vision/vision_transformer.py

Lines changed: 35 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,6 @@
1616
"""
1717

1818
import numpy as np
19-
from typing import Optional
2019

2120

2221
def create_patches(
@@ -50,15 +49,17 @@ def create_patches(
5049
(14, 14)
5150
"""
5251
if len(image.shape) != 3:
53-
raise ValueError(f"Expected 3D image, got shape {image.shape}")
52+
msg = f"Expected 3D image, got shape {image.shape}"
53+
raise ValueError(msg)
5454

5555
height, width, channels = image.shape
5656

5757
if height % patch_size != 0 or width % patch_size != 0:
58-
raise ValueError(
58+
msg = (
5959
f"Image dimensions ({height}x{width}) must be divisible by "
6060
f"patch_size ({patch_size})"
6161
)
62+
raise ValueError(msg)
6263

6364
# Calculate number of patches in each dimension
6465
num_patches_h = height // patch_size
@@ -81,7 +82,8 @@ def patch_embedding(patches: np.ndarray, embedding_dim: int = 768) -> np.ndarray
8182
Linearly project flattened patches to embedding dimension.
8283
8384
Args:
84-
patches: Array of patches with shape (num_patches, patch_size, patch_size, channels)
85+
patches: Array of patches with shape
86+
(num_patches, patch_size, patch_size, channels)
8587
embedding_dim: Dimension of the embedding space (default: 768)
8688
8789
Returns:
@@ -102,18 +104,19 @@ def patch_embedding(patches: np.ndarray, embedding_dim: int = 768) -> np.ndarray
102104
# Flatten each patch
103105
flattened = patches.reshape(num_patches, -1)
104106

105-
# Linear projection (simplified - in practice this would be a learned weight matrix)
107+
# Linear projection (simplified - in practice this is a learned weight matrix)
106108
# For demonstration, we use random projection
107109
patch_dim = flattened.shape[1]
108-
projection_matrix = np.random.randn(patch_dim, embedding_dim) * 0.02
110+
rng = np.random.default_rng()
111+
projection_matrix = rng.standard_normal((patch_dim, embedding_dim)) * 0.02
109112

110113
embedded = flattened @ projection_matrix
111114

112115
return embedded
113116

114117

115118
def add_positional_encoding(
116-
embeddings: np.ndarray, num_positions: Optional[int] = None
119+
embeddings: np.ndarray, num_positions: int | None = None
117120
) -> np.ndarray:
118121
"""
119122
Add learnable positional encodings to patch embeddings.
@@ -142,11 +145,12 @@ def add_positional_encoding(
142145
# Add 1 for the CLS token
143146
num_positions = num_patches + 1
144147

145-
# Create learnable positional encodings (simplified - normally learned parameters)
146-
positional_encodings = np.random.randn(num_positions, embedding_dim) * 0.02
148+
# Create learnable positional encodings (simplified - normally learned)
149+
rng = np.random.default_rng()
150+
positional_encodings = rng.standard_normal((num_positions, embedding_dim)) * 0.02
147151

148152
# Prepend CLS token
149-
cls_token = np.random.randn(1, embedding_dim) * 0.02
153+
cls_token = rng.standard_normal((1, embedding_dim)) * 0.02
150154

151155
# Concatenate CLS token with patch embeddings
152156
embeddings_with_cls = np.vstack([cls_token, embeddings])
@@ -158,7 +162,10 @@ def add_positional_encoding(
158162

159163

160164
def attention_mechanism(
161-
query: np.ndarray, key: np.ndarray, value: np.ndarray, mask: Optional[np.ndarray] = None
165+
query: np.ndarray,
166+
key: np.ndarray,
167+
value: np.ndarray,
168+
mask: np.ndarray | None = None,
162169
) -> tuple[np.ndarray, np.ndarray]:
163170
"""
164171
Compute scaled dot-product attention.
@@ -257,26 +264,28 @@ def feedforward_network(x: np.ndarray, hidden_dim: int = 3072) -> np.ndarray:
257264
>>> output.shape
258265
(197, 512)
259266
"""
260-
seq_len, embedding_dim = x.shape
267+
embedding_dim = x.shape[1]
268+
rng = np.random.default_rng()
261269

262270
# First linear layer
263-
w1 = np.random.randn(embedding_dim, hidden_dim) * 0.02
271+
w1 = rng.standard_normal((embedding_dim, hidden_dim)) * 0.02
264272
b1 = np.zeros(hidden_dim)
265273
hidden = x @ w1 + b1
266274

267275
# GELU activation (approximation)
268-
hidden = 0.5 * hidden * (1 + np.tanh(np.sqrt(2 / np.pi) * (hidden + 0.044715 * hidden**3)))
276+
gelu_factor = np.sqrt(2 / np.pi) * (hidden + 0.044715 * hidden**3)
277+
hidden = 0.5 * hidden * (1 + np.tanh(gelu_factor))
269278

270279
# Second linear layer
271-
w2 = np.random.randn(hidden_dim, embedding_dim) * 0.02
280+
w2 = rng.standard_normal((hidden_dim, embedding_dim)) * 0.02
272281
b2 = np.zeros(embedding_dim)
273282
output = hidden @ w2 + b2
274283

275284
return output
276285

277286

278287
def transformer_encoder_block(
279-
x: np.ndarray, num_heads: int = 12, hidden_dim: int = 3072
288+
x: np.ndarray, num_heads: int = 12, hidden_dim: int = 3072 # noqa: ARG001
280289
) -> np.ndarray:
281290
"""
282291
Apply a single Transformer encoder block.
@@ -287,7 +296,7 @@ def transformer_encoder_block(
287296
288297
Args:
289298
x: Input array of shape (seq_len, embedding_dim)
290-
num_heads: Number of attention heads (default: 12)
299+
num_heads: Number of attention heads (default: 12, kept for API)
291300
hidden_dim: Hidden dimension for FFN (default: 3072)
292301
293302
Returns:
@@ -304,11 +313,9 @@ def transformer_encoder_block(
304313
>>> output.shape
305314
(50, 512)
306315
"""
307-
seq_len, embedding_dim = x.shape
308-
head_dim = embedding_dim // num_heads
309-
310316
# Multi-head self-attention (simplified - using single head for demonstration)
311317
# In practice, this would split into multiple heads
318+
# num_heads parameter is kept for API compatibility
312319
attention_output, _ = attention_mechanism(x, x, x)
313320

314321
# Add residual connection and apply layer norm
@@ -361,12 +368,14 @@ def vision_transformer(
361368
(10,)
362369
363370
>>> img = np.random.rand(32, 32, 3)
364-
>>> logits = vision_transformer(img, patch_size=16, embedding_dim=512, num_layers=6, num_classes=100)
371+
>>> logits = vision_transformer(
372+
... img, patch_size=16, embedding_dim=512, num_layers=6, num_classes=100
373+
... )
365374
>>> logits.shape
366375
(100,)
367376
"""
368377
# Step 1: Create patches
369-
patches, grid_size = create_patches(image, patch_size)
378+
patches, _ = create_patches(image, patch_size)
370379

371380
# Step 2: Embed patches
372381
embeddings = patch_embedding(patches, embedding_dim)
@@ -382,7 +391,8 @@ def vision_transformer(
382391
cls_token = embeddings[0]
383392

384393
# Step 6: Classification head (linear layer)
385-
classifier_weights = np.random.randn(embedding_dim, num_classes) * 0.02
394+
rng = np.random.default_rng()
395+
classifier_weights = rng.standard_normal((embedding_dim, num_classes)) * 0.02
386396
classifier_bias = np.zeros(num_classes)
387397
logits = cls_token @ classifier_weights + classifier_bias
388398

@@ -399,7 +409,8 @@ def vision_transformer(
399409
print("=" * 50)
400410

401411
# Create a sample image (224x224x3 for ImageNet-style input)
402-
sample_image = np.random.rand(224, 224, 3)
412+
rng = np.random.default_rng()
413+
sample_image = rng.random((224, 224, 3))
403414
print(f"Input image shape: {sample_image.shape}")
404415

405416
# Apply Vision Transformer

0 commit comments

Comments
 (0)