Fix ruff issues for CI compliance

shretadas · shretadas · commit 63b074f8fd8b · 2025-10-22T17:00:31.000Z
- Replace all Greek alpha symbols (α) with 'alpha' in docstrings and comments
- Fix line length issues by wrapping a long return type annotation and shortening a doctest comment
- Fix trailing whitespace issues
- Add an explicit '...' placeholder body to the base optimizer's default no-op reset() method
- Preserve runtime behavior: the changes touch only docstrings, comments, formatting, and a no-op placeholder
diff --git a/neural_network/optimizers/adagrad.py b/neural_network/optimizers/adagrad.py
@@ -7,7 +7,7 @@
 
 The update rules are:
 G_t = G_{t-1} + g_t ⊙ g_t  (element-wise squared gradient accumulation)
-θ_{t+1} = θ_t - (α / √(G_t + ε)) ⊙ g_t
+θ_{t+1} = θ_t - (alpha / √(G_t + ε)) ⊙ g_t
 
 where G_t accumulates squared gradients, ε prevents division by zero,
 and ⊙ denotes element-wise multiplication.
@@ -31,12 +31,12 @@ class Adagrad(BaseOptimizer):
 
     Mathematical formulation:
         G_t = G_{t-1} + g_t ⊙ g_t
-        θ_{t+1} = θ_t - (α / √(G_t + ε)) ⊙ g_t
+        θ_{t+1} = θ_t - (alpha / √(G_t + ε)) ⊙ g_t
 
     Where:
         - θ_t: parameters at time step t
         - G_t: accumulated squared gradients up to time t
-        - α: learning rate
+        - alpha: learning rate
         - ε: small constant for numerical stability (typically 1e-8)
         - g_t: gradients at time step t
         - ⊙: element-wise multiplication
@@ -56,7 +56,7 @@ class Adagrad(BaseOptimizer):
         True
         >>> updated1[0] > 0.85  # Small gradient -> larger step
         True
-        >>> updated1[1] < 1.95   # Large gradient -> smaller step (but still close to 2.0)
+        >>> updated1[1] < 1.95   # Large gradient -> smaller step (close to 2.0)
         True
 
         >>> # Second update (gradients accumulate)
@@ -106,7 +106,7 @@ def update(
 
         Performs adaptive gradient update:
         G_t = G_{t-1} + g_t^2
-        θ_{t+1} = θ_t - (α / √(G_t + ε)) * g_t
+        θ_{t+1} = θ_t - (alpha / √(G_t + ε)) * g_t
 
         Args:
             parameters: Current parameter values
@@ -123,7 +123,10 @@ def _adagrad_update_recursive(
             parameters: float | list[float | list[float]],
             gradients: float | list[float | list[float]],
             accumulated_gradients: float | list[float | list[float]]
-        ) -> tuple[float | list[float | list[float]], float | list[float | list[float]]]:
+        ) -> tuple[
+            float | list[float | list[float]],
+            float | list[float | list[float]]
+        ]:
             # Handle scalar case
             if isinstance(parameters, (int, float)):
                 if not isinstance(gradients, (int, float)):
@@ -137,7 +140,7 @@ def _adagrad_update_recursive(
                 # Accumulate squared gradients: G = G + g^2
                 new_acc_grads = accumulated_gradients + gradients * gradients
 
-                # Adaptive learning rate: α / √(G + ε)
+                # Adaptive learning rate: alpha / √(G + ε)
                 adaptive_lr = self.learning_rate / math.sqrt(
                     new_acc_grads + self.epsilon
                 )
diff --git a/neural_network/optimizers/adam.py b/neural_network/optimizers/adam.py
@@ -10,7 +10,7 @@
 v_t = β₂ * v_{t-1} + (1-β₂) * g_t²       # Second moment estimate
 m̂_t = m_t / (1 - β₁^t)                   # Bias-corrected first moment
 v̂_t = v_t / (1 - β₂^t)                   # Bias-corrected second moment
-θ_{t+1} = θ_t - α * m̂_t / (√v̂_t + ε)    # Parameter update
+θ_{t+1} = θ_t - alpha * m̂_t / (√v̂_t + ε)    # Parameter update
 """
 
 from __future__ import annotations
@@ -34,13 +34,13 @@ class Adam(BaseOptimizer):
         v_t = β₂ * v_{t-1} + (1-β₂) * g_t²
         m̂_t = m_t / (1 - β₁^t)
         v̂_t = v_t / (1 - β₂^t)
-        θ_{t+1} = θ_t - α * m̂_t / (√v̂_t + ε)
+        θ_{t+1} = θ_t - alpha * m̂_t / (√v̂_t + ε)
 
     Where:
         - θ_t: parameters at time step t
         - m_t, v_t: first and second moment estimates
         - m̂_t, v̂_t: bias-corrected moment estimates
-        - α: learning rate (default: 0.001)
+        - alpha: learning rate (default: 0.001)
         - β₁, β₂: exponential decay rates (default: 0.9, 0.999)
         - ε: small constant for numerical stability (default: 1e-8)
         - t: time step
@@ -139,7 +139,7 @@ def update(
         v_t = β₂ * v_{t-1} + (1-β₂) * g_t²
         m̂_t = m_t / (1 - β₁^t)
         v̂_t = v_t / (1 - β₂^t)
-        θ_{t+1} = θ_t - α * m̂_t / (√v̂_t + ε)
+        θ_{t+1} = θ_t - alpha * m̂_t / (√v̂_t + ε)
 
         Args:
             parameters: Current parameter values
@@ -188,7 +188,7 @@ def _adam_update_recursive(
                 m_hat = new_first_moment / bias_correction1
                 v_hat = new_second_moment / bias_correction2
 
-                # Parameter update: θ = θ - α * m̂ / (√v̂ + ε)
+                # Parameter update: θ = θ - alpha * m̂ / (√v̂ + ε)
                 new_param = parameters - self.learning_rate * m_hat / (
                     math.sqrt(v_hat) + self.epsilon
                 )
diff --git a/neural_network/optimizers/base_optimizer.py b/neural_network/optimizers/base_optimizer.py
@@ -77,6 +77,7 @@ def reset(self) -> None:
         or when you want to clear any accumulated state (like momentum).
         Default implementation does nothing, but optimizers with state should override.
         """
+        ...
 
     def __str__(self) -> str:
         """String representation of the optimizer."""
diff --git a/neural_network/optimizers/momentum_sgd.py b/neural_network/optimizers/momentum_sgd.py
@@ -7,10 +7,10 @@
 
 The update rules are:
 v_t = β * v_{t-1} + (1-β) * g_t
-θ_t = θ_{t-1} - α * v_t
+θ_t = θ_{t-1} - alpha * v_t
 
 where v_t is the velocity (momentum), β is the momentum coefficient,
-α is the learning rate, and g_t is the gradient.
+alpha is the learning rate, and g_t is the gradient.
 """
 
 from __future__ import annotations
@@ -28,12 +28,12 @@ class MomentumSGD(BaseOptimizer):
 
     Mathematical formulation:
         v_t = β * v_{t-1} + (1-β) * g_t
-        θ_{t+1} = θ_t - α * v_t
+        θ_{t+1} = θ_t - alpha * v_t
 
     Where:
         - θ_t: parameters at time step t
         - v_t: velocity (momentum) at time step t
-        - α: learning rate
+        - alpha: learning rate
         - β: momentum coefficient (typically 0.9)
         - g_t: gradients at time step t
 
@@ -101,7 +101,7 @@ def update(
 
         Performs momentum update:
         v_t = β * v_{t-1} + (1-β) * g_t
-        θ_t = θ_{t-1} - α * v_t
+        θ_t = θ_{t-1} - alpha * v_t
 
         Args:
             parameters: Current parameter values
@@ -131,7 +131,7 @@ def _check_shapes_and_get_velocity(
 
                 # Update velocity: v = β * v + (1-β) * g
                 new_velocity = self.momentum * velocity_values + (1 - self.momentum) * gradients
-                # Update parameter: θ = θ - α * v
+                # Update parameter: θ = θ - alpha * v
                 new_param = parameters - self.learning_rate * new_velocity
 
                 return new_param, new_velocity
diff --git a/neural_network/optimizers/nag.py b/neural_network/optimizers/nag.py
@@ -6,14 +6,14 @@
 overshooting and often leads to better convergence.
 
 The update rules are:
-θ_lookahead = θ_t - α * β * v_{t-1}
+θ_lookahead = θ_t - alpha * β * v_{t-1}
 g_t = ∇f(θ_lookahead)  # Gradient at lookahead position
 v_t = β * v_{t-1} + (1-β) * g_t
-θ_{t+1} = θ_t - α * v_t
+θ_{t+1} = θ_t - alpha * v_t
 
 However, a more efficient formulation equivalent to the above is:
 v_t = β * v_{t-1} + (1-β) * g_t
-θ_{t+1} = θ_t - α * (β * v_t + (1-β) * g_t)
+θ_{t+1} = θ_t - alpha * (β * v_t + (1-β) * g_t)
 """
 
 from __future__ import annotations
@@ -31,12 +31,12 @@ class NAG(BaseOptimizer):
 
     Mathematical formulation (efficient version):
         v_t = β * v_{t-1} + (1-β) * g_t
-        θ_{t+1} = θ_t - α * (β * v_t + (1-β) * g_t)
+        θ_{t+1} = θ_t - alpha * (β * v_t + (1-β) * g_t)
 
     Where:
         - θ_t: parameters at time step t
         - v_t: velocity (momentum) at time step t
-        - α: learning rate
+        - alpha: learning rate
         - β: momentum coefficient (typically 0.9)
         - g_t: gradients at time step t
 
@@ -103,7 +103,7 @@ def update(
 
         Performs Nesterov update using efficient formulation:
         v_t = β * v_{t-1} + (1-β) * g_t
-        θ_{t+1} = θ_t - α * (β * v_t + (1-β) * g_t)
+        θ_{t+1} = θ_t - alpha * (β * v_t + (1-β) * g_t)
 
         Args:
             parameters: Current parameter values
@@ -134,7 +134,7 @@ def _nag_update_recursive(
                 # Update velocity: v = β * v + (1-β) * g
                 new_velocity = self.momentum * velocity + (1 - self.momentum) * gradients
 
-                # NAG update: θ = θ - α * (β * v + (1-β) * g)
+                # NAG update: θ = θ - alpha * (β * v + (1-β) * g)
                 nesterov_update = (
                     self.momentum * new_velocity + (1 - self.momentum) * gradients
                 )
diff --git a/neural_network/optimizers/sgd.py b/neural_network/optimizers/sgd.py
@@ -4,8 +4,8 @@
 SGD is the most basic optimization algorithm for neural networks. It updates
 parameters by moving in the direction opposite to the gradient of the loss function.
 
-The update rule is: θ = θ - α * ∇θ
-where θ are the parameters, α is the learning rate, and ∇θ is the gradient.
+The update rule is: θ = θ - alpha * ∇θ
+where θ are the parameters, alpha is the learning rate, and ∇θ is the gradient.
 """
 
 from __future__ import annotations
@@ -22,11 +22,11 @@ class SGD(BaseOptimizer):
     the learning rate.
 
     Mathematical formulation:
-        θ_{t+1} = θ_t - α * g_t
+        θ_{t+1} = θ_t - alpha * g_t
 
     Where:
         - θ_t: parameters at time step t
-        - α: learning rate
+        - alpha: learning rate
         - g_t: gradients at time step t
 
     Parameters:
@@ -83,7 +83,7 @@ def update(
         """
         Update parameters using SGD rule.
 
-        Performs the classic SGD update: θ = θ - α * ∇θ
+        Performs the classic SGD update: θ = θ - alpha * ∇θ
 
         Args:
             parameters: Current parameter values