Skip to content

Commit f61de99

Browse files
auricom and claude committed
fix(raft): distinguish sync wait outcomes with syncResult enum
waitForBlockStoreSync previously returned bool, conflating three distinct failure modes (ctx canceled, timeout, lost leadership). The caller in Run then unconditionally called leadershipTransfer() on any false return, which is wrong when leadership was already lost.

Introduce a syncResult enum (syncResultSynced, syncResultTimeout, syncResultLostLeadership, syncResultCanceled) and update Run to handle each case correctly:

- syncResultCanceled → return ctx.Err()
- syncResultLostLeadership → continue without calling leadershipTransfer()
- syncResultTimeout → leadershipTransfer() + continue as before
- syncResultSynced → refresh raftState/diff and proceed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 03e33c3 commit f61de99

1 file changed

Lines changed: 34 additions & 15 deletions

File tree

pkg/raft/election.go

Lines changed: 34 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -147,7 +147,13 @@ func (d *DynamicLeaderElection) Run(ctx context.Context) error {
147147
Int("store_lag_blocks", -diff).
148148
Uint64("raft_height", raftState.Height).
149149
Msg("became leader but store is significantly behind raft state; waiting for block-store sync")
150-
if !d.waitForBlockStoreSync(ctx, runnable) {
150+
switch d.waitForBlockStoreSync(ctx, runnable) {
151+
case syncResultCanceled:
152+
return ctx.Err()
153+
case syncResultLostLeadership:
154+
d.logger.Info().Msg("lost leadership while waiting for block-store sync; skipping abdication")
155+
continue
156+
case syncResultTimeout:
151157
d.logger.Warn().
152158
Int("store_lag_blocks", -diff).
153159
Uint64("raft_height", raftState.Height).
@@ -157,14 +163,16 @@ func (d *DynamicLeaderElection) Run(ctx context.Context) error {
157163
return fmt.Errorf("leadership transfer failed after store-lag abdication: %w", tErr)
158164
}
159165
continue
160-
}
161-
// Block store caught up — refresh state so the recovery
162-
// check below works with the latest values.
163-
d.logger.Info().Msg("block store caught up after wait; proceeding as leader")
164-
raftState = d.node.GetState()
165-
diff, err = runnable.IsSynced(raftState)
166-
if err != nil {
167-
return err
166+
case syncResultSynced:
167+
// Block store caught up — refresh state so the recovery
168+
// check below works with the latest values.
169+
d.logger.Info().Msg("block store caught up after wait; proceeding as leader")
170+
raftState = d.node.GetState()
171+
var syncErr error
172+
diff, syncErr = runnable.IsSynced(raftState)
173+
if syncErr != nil {
174+
return syncErr
175+
}
168176
}
169177
}
170178
if diff != 0 {
@@ -289,10 +297,18 @@ func (d *DynamicLeaderElection) IsRunning() bool {
289297
return d.running.Load()
290298
}
291299

300+
type syncResult int
301+
302+
const (
303+
syncResultSynced syncResult = iota // block store is within 1 block of raft FSM
304+
syncResultTimeout // deadline elapsed and store still lagging
305+
syncResultLostLeadership // lost leadership while waiting
306+
syncResultCanceled // context was canceled
307+
)
308+
292309
// waitForBlockStoreSync polls IsSynced until the block store is within 1 block
293310
// of the current raft FSM height, leadership is lost, or the context expires.
294-
// Returns true if sync was achieved in time.
295-
func (d *DynamicLeaderElection) waitForBlockStoreSync(ctx context.Context, r Runnable) bool {
311+
func (d *DynamicLeaderElection) waitForBlockStoreSync(ctx context.Context, r Runnable) syncResult {
296312
cfg := d.node.Config()
297313
timeout := cfg.ShutdownTimeout
298314
if timeout <= 0 {
@@ -306,18 +322,21 @@ func (d *DynamicLeaderElection) waitForBlockStoreSync(ctx context.Context, r Run
306322
for {
307323
select {
308324
case <-ctx.Done():
309-
return false
325+
return syncResultCanceled
310326
case <-deadline.C:
311327
// Final check before giving up.
312328
diff, err := r.IsSynced(d.node.GetState())
313-
return err == nil && diff >= -1
329+
if err == nil && diff >= -1 {
330+
return syncResultSynced
331+
}
332+
return syncResultTimeout
314333
case <-ticker.C:
315334
if d.node.leaderID() != d.node.NodeID() {
316-
return false // lost leadership during wait
335+
return syncResultLostLeadership
317336
}
318337
diff, err := r.IsSynced(d.node.GetState())
319338
if err == nil && diff >= -1 {
320-
return true
339+
return syncResultSynced
321340
}
322341
}
323342
}

0 commit comments

Comments
 (0)