1616"""
1717
1818import numpy as np
19- from typing import Optional
2019
2120
2221def create_patches (
@@ -50,15 +49,17 @@ def create_patches(
5049 (14, 14)
5150 """
5251 if len (image .shape ) != 3 :
53- raise ValueError (f"Expected 3D image, got shape { image .shape } " )
52+ msg = f"Expected 3D image, got shape { image .shape } "
53+ raise ValueError (msg )
5454
5555 height , width , channels = image .shape
5656
5757 if height % patch_size != 0 or width % patch_size != 0 :
58- raise ValueError (
58+ msg = (
5959 f"Image dimensions ({ height } x{ width } ) must be divisible by "
6060 f"patch_size ({ patch_size } )"
6161 )
62+ raise ValueError (msg )
6263
6364 # Calculate number of patches in each dimension
6465 num_patches_h = height // patch_size
@@ -81,7 +82,8 @@ def patch_embedding(patches: np.ndarray, embedding_dim: int = 768) -> np.ndarray
8182 Linearly project flattened patches to embedding dimension.
8283
8384 Args:
84- patches: Array of patches with shape (num_patches, patch_size, patch_size, channels)
85+ patches: Array of patches with shape
86+ (num_patches, patch_size, patch_size, channels)
8587 embedding_dim: Dimension of the embedding space (default: 768)
8688
8789 Returns:
@@ -102,18 +104,19 @@ def patch_embedding(patches: np.ndarray, embedding_dim: int = 768) -> np.ndarray
102104 # Flatten each patch
103105 flattened = patches .reshape (num_patches , - 1 )
104106
105- # Linear projection (simplified - in practice this would be a learned weight matrix)
107+ # Linear projection (simplified - in practice this is a learned weight matrix)
106108 # For demonstration, we use random projection
107109 patch_dim = flattened .shape [1 ]
108- projection_matrix = np .random .randn (patch_dim , embedding_dim ) * 0.02
110+ rng = np .random .default_rng ()
111+ projection_matrix = rng .standard_normal ((patch_dim , embedding_dim )) * 0.02
109112
110113 embedded = flattened @ projection_matrix
111114
112115 return embedded
113116
114117
115118def add_positional_encoding (
116- embeddings : np .ndarray , num_positions : Optional [ int ] = None
119+ embeddings : np .ndarray , num_positions : int | None = None
117120) -> np .ndarray :
118121 """
119122 Add learnable positional encodings to patch embeddings.
@@ -142,11 +145,12 @@ def add_positional_encoding(
142145 # Add 1 for the CLS token
143146 num_positions = num_patches + 1
144147
145- # Create learnable positional encodings (simplified - normally learned parameters)
146- positional_encodings = np .random .randn (num_positions , embedding_dim ) * 0.02
148+ # Create learnable positional encodings (simplified - normally learned)
149+ rng = np .random .default_rng ()
150+ positional_encodings = rng .standard_normal ((num_positions , embedding_dim )) * 0.02
147151
148152 # Prepend CLS token
149- cls_token = np . random . randn ( 1 , embedding_dim ) * 0.02
153+ cls_token = rng . standard_normal (( 1 , embedding_dim ) ) * 0.02
150154
151155 # Concatenate CLS token with patch embeddings
152156 embeddings_with_cls = np .vstack ([cls_token , embeddings ])
@@ -158,7 +162,10 @@ def add_positional_encoding(
158162
159163
160164def attention_mechanism (
161- query : np .ndarray , key : np .ndarray , value : np .ndarray , mask : Optional [np .ndarray ] = None
165+ query : np .ndarray ,
166+ key : np .ndarray ,
167+ value : np .ndarray ,
168+ mask : np .ndarray | None = None ,
162169) -> tuple [np .ndarray , np .ndarray ]:
163170 """
164171 Compute scaled dot-product attention.
@@ -257,26 +264,28 @@ def feedforward_network(x: np.ndarray, hidden_dim: int = 3072) -> np.ndarray:
257264 >>> output.shape
258265 (197, 512)
259266 """
260- seq_len , embedding_dim = x .shape
267+ embedding_dim = x .shape [1 ]
268+ rng = np .random .default_rng ()
261269
262270 # First linear layer
263- w1 = np . random . randn ( embedding_dim , hidden_dim ) * 0.02
271+ w1 = rng . standard_normal (( embedding_dim , hidden_dim ) ) * 0.02
264272 b1 = np .zeros (hidden_dim )
265273 hidden = x @ w1 + b1
266274
267275 # GELU activation (approximation)
268- hidden = 0.5 * hidden * (1 + np .tanh (np .sqrt (2 / np .pi ) * (hidden + 0.044715 * hidden ** 3 )))
276+ gelu_factor = np .sqrt (2 / np .pi ) * (hidden + 0.044715 * hidden ** 3 )
277+ hidden = 0.5 * hidden * (1 + np .tanh (gelu_factor ))
269278
270279 # Second linear layer
271- w2 = np . random . randn ( hidden_dim , embedding_dim ) * 0.02
280+ w2 = rng . standard_normal (( hidden_dim , embedding_dim ) ) * 0.02
272281 b2 = np .zeros (embedding_dim )
273282 output = hidden @ w2 + b2
274283
275284 return output
276285
277286
278287def transformer_encoder_block (
279- x : np .ndarray , num_heads : int = 12 , hidden_dim : int = 3072
288+ x : np .ndarray , num_heads : int = 12 , hidden_dim : int = 3072 # noqa: ARG001
280289) -> np .ndarray :
281290 """
282291 Apply a single Transformer encoder block.
@@ -287,7 +296,7 @@ def transformer_encoder_block(
287296
288297 Args:
289298 x: Input array of shape (seq_len, embedding_dim)
290- num_heads: Number of attention heads (default: 12)
299+ num_heads: Number of attention heads (default: 12, kept for API )
291300 hidden_dim: Hidden dimension for FFN (default: 3072)
292301
293302 Returns:
@@ -304,11 +313,9 @@ def transformer_encoder_block(
304313 >>> output.shape
305314 (50, 512)
306315 """
307- seq_len , embedding_dim = x .shape
308- head_dim = embedding_dim // num_heads
309-
310316 # Multi-head self-attention (simplified - using single head for demonstration)
311317 # In practice, this would split into multiple heads
318+ # num_heads parameter is kept for API compatibility
312319 attention_output , _ = attention_mechanism (x , x , x )
313320
314321 # Add residual connection and apply layer norm
@@ -361,12 +368,14 @@ def vision_transformer(
361368 (10,)
362369
363370 >>> img = np.random.rand(32, 32, 3)
364- >>> logits = vision_transformer(img, patch_size=16, embedding_dim=512, num_layers=6, num_classes=100)
371+ >>> logits = vision_transformer(
372+ ... img, patch_size=16, embedding_dim=512, num_layers=6, num_classes=100
373+ ... )
365374 >>> logits.shape
366375 (100,)
367376 """
368377 # Step 1: Create patches
369- patches , grid_size = create_patches (image , patch_size )
378+ patches , _ = create_patches (image , patch_size )
370379
371380 # Step 2: Embed patches
372381 embeddings = patch_embedding (patches , embedding_dim )
@@ -382,7 +391,8 @@ def vision_transformer(
382391 cls_token = embeddings [0 ]
383392
384393 # Step 6: Classification head (linear layer)
385- classifier_weights = np .random .randn (embedding_dim , num_classes ) * 0.02
394+ rng = np .random .default_rng ()
395+ classifier_weights = rng .standard_normal ((embedding_dim , num_classes )) * 0.02
386396 classifier_bias = np .zeros (num_classes )
387397 logits = cls_token @ classifier_weights + classifier_bias
388398
@@ -399,7 +409,8 @@ def vision_transformer(
399409 print ("=" * 50 )
400410
401411 # Create a sample image (224x224x3 for ImageNet-style input)
402- sample_image = np .random .rand (224 , 224 , 3 )
412+ rng = np .random .default_rng ()
413+ sample_image = rng .random ((224 , 224 , 3 ))
403414 print (f"Input image shape: { sample_image .shape } " )
404415
405416 # Apply Vision Transformer
0 commit comments