@@ -214,63 +214,65 @@ def attention_mechanism(
214214 return output , attention_weights
215215
216216
def layer_norm(embeddings: np.ndarray, epsilon: float = 1e-6) -> np.ndarray:
    """
    Normalize each row of *embeddings* to zero mean and (near-)unit spread.

    Normalization happens along the last axis, so every position in the
    sequence is standardized independently.

    Args:
        embeddings: Input array of shape (seq_len, embedding_dim)
        epsilon: Small constant added to the standard deviation for
            numerical stability (default: 1e-6)

    Returns:
        Normalized array of same shape as input

    Examples:
        >>> embeddings = np.random.rand(10, 768)
        >>> layer_norm(embeddings).shape
        (10, 768)
    """
    # Center each row, then rescale by its (stabilized) standard deviation.
    centered = embeddings - embeddings.mean(axis=-1, keepdims=True)
    spread = embeddings.std(axis=-1, keepdims=True) + epsilon
    return centered / spread
241241
242242
243- def feedforward_network (x : np .ndarray , hidden_dim : int = 3072 ) -> np .ndarray :
243+ def feedforward_network (
244+ embeddings : np .ndarray , hidden_dim : int = 3072
245+ ) -> np .ndarray :
244246 """
245247 Apply position-wise feed-forward network.
246248
247249 FFN(x) = max(0, xW1 + b1)W2 + b2
248250
249251 Args:
250- x : Input array of shape (seq_len, embedding_dim)
252+ embeddings : Input array of shape (seq_len, embedding_dim)
251253 hidden_dim: Hidden dimension size (default: 3072, typically 4x embedding_dim)
252254
253255 Returns:
254256 Output array of shape (seq_len, embedding_dim)
255257
256258 Examples:
257- >>> x = np.random.rand(10, 768)
258- >>> output = feedforward_network(x , hidden_dim=3072)
259+ >>> embeddings = np.random.rand(10, 768)
260+ >>> output = feedforward_network(embeddings , hidden_dim=3072)
259261 >>> output.shape
260262 (10, 768)
261263
262- >>> x = np.random.rand(197, 512)
263- >>> output = feedforward_network(x , hidden_dim=2048)
264+ >>> embeddings = np.random.rand(197, 512)
265+ >>> output = feedforward_network(embeddings , hidden_dim=2048)
264266 >>> output.shape
265267 (197, 512)
266268 """
267- embedding_dim = x .shape [1 ]
269+ embedding_dim = embeddings .shape [1 ]
268270 rng = np .random .default_rng ()
269271
270272 # First linear layer
271273 w1 = rng .standard_normal ((embedding_dim , hidden_dim )) * 0.02
272274 b1 = np .zeros (hidden_dim )
273- hidden = x @ w1 + b1
275+ hidden = embeddings @ w1 + b1
274276
275277 # GELU activation (approximation)
276278 gelu_factor = np .sqrt (2 / np .pi ) * (hidden + 0.044715 * hidden ** 3 )
@@ -285,9 +287,9 @@ def feedforward_network(x: np.ndarray, hidden_dim: int = 3072) -> np.ndarray:
285287
286288
def transformer_encoder_block(
    embeddings: np.ndarray,
    num_heads: int = 12,  # noqa: ARG001
    hidden_dim: int = 3072,
) -> np.ndarray:
    """
    Apply a single Transformer encoder block.

    The block runs two sublayers, each followed by a residual connection
    and layer normalization:
      1. (Single-head) self-attention
      2. Position-wise feed-forward network

    Args:
        embeddings: Input array of shape (seq_len, embedding_dim)
        num_heads: Number of attention heads (default: 12). Unused here —
            the attention helper is single-head — and kept only for API
            compatibility.
        hidden_dim: Hidden dimension for the FFN sublayer (default: 3072)

    Returns:
        Output array of shape (seq_len, embedding_dim)

    Examples:
        >>> embeddings = np.random.rand(197, 768)
        >>> transformer_encoder_block(embeddings, num_heads=12, hidden_dim=3072).shape
        (197, 768)

        >>> embeddings = np.random.rand(50, 512)
        >>> transformer_encoder_block(embeddings, num_heads=8, hidden_dim=2048).shape
        (50, 512)
    """
    # Sublayer 1: self-attention (query = key = value = input), then
    # residual add + layer norm.
    attended, _ = attention_mechanism(embeddings, embeddings, embeddings)
    normed = layer_norm(embeddings + attended)

    # Sublayer 2: feed-forward network, then residual add + layer norm.
    projected = feedforward_network(normed, hidden_dim)
    return layer_norm(normed + projected)
333339
334340
335341def vision_transformer (
0 commit comments