77
88The update rules are:
99G_t = G_{t-1} + g_t ⊙ g_t (element-wise squared gradient accumulation)
10- θ_{t+1} = θ_t - (α / √(G_t + ε)) ⊙ g_t
10+ θ_{t+1} = θ_t - (alpha / √(G_t + ε)) ⊙ g_t
1111
1212where G_t accumulates squared gradients, alpha is the learning rate,
1313ε prevents division by zero, and ⊙ denotes element-wise multiplication.
@@ -31,12 +31,12 @@ class Adagrad(BaseOptimizer):
3131
3232 Mathematical formulation:
3333 G_t = G_{t-1} + g_t ⊙ g_t
34- θ_{t+1} = θ_t - (α / √(G_t + ε)) ⊙ g_t
34+ θ_{t+1} = θ_t - (alpha / √(G_t + ε)) ⊙ g_t
3535
3636 Where:
3737 - θ_t: parameters at time step t
3838 - G_t: accumulated squared gradients up to time t
39- - α : learning rate
39+ - alpha: learning rate
4040 - ε: small constant for numerical stability (typically 1e-8)
4141 - g_t: gradients at time step t
4242 - ⊙: element-wise multiplication
@@ -56,7 +56,7 @@ class Adagrad(BaseOptimizer):
5656 True
5757 >>> updated1[0] > 0.85 # Small gradient -> larger step
5858 True
59- >>> updated1[1] < 1.95 # Large gradient -> smaller step (but still close to 2.0)
59+ >>> updated1[1] < 1.95 # Large gradient -> smaller step (close to 2.0)
6060 True
6161
6262 >>> # Second update (gradients accumulate)
@@ -106,7 +106,7 @@ def update(
106106
107107 Performs adaptive gradient update:
108108 G_t = G_{t-1} + g_t^2
109- θ_{t+1} = θ_t - (α / √(G_t + ε)) * g_t
109+ θ_{t+1} = θ_t - (alpha / √(G_t + ε)) * g_t
110110
111111 Args:
112112 parameters: Current parameter values
@@ -123,7 +123,10 @@ def _adagrad_update_recursive(
123123 parameters : float | list [float | list [float ]],
124124 gradients : float | list [float | list [float ]],
125125 accumulated_gradients : float | list [float | list [float ]]
126- ) -> tuple [float | list [float | list [float ]], float | list [float | list [float ]]]:
126+ ) -> tuple [
127+ float | list [float | list [float ]],
128+ float | list [float | list [float ]]
129+ ]:
127130 # Handle scalar case
128131 if isinstance (parameters , (int , float )):
129132 if not isinstance (gradients , (int , float )):
@@ -137,7 +140,7 @@ def _adagrad_update_recursive(
137140 # Accumulate squared gradients: G = G + g^2
138141 new_acc_grads = accumulated_gradients + gradients * gradients
139142
140- # Adaptive learning rate: α / √(G + ε)
143+ # Adaptive learning rate: alpha / √(G + ε)
141144 adaptive_lr = self .learning_rate / math .sqrt (
142145 new_acc_grads + self .epsilon
143146 )
0 commit comments