From cf0eaee68d4f85edf0b904e642ed0ef23354b370 Mon Sep 17 00:00:00 2001 From: Stefan Date: Fri, 22 May 2026 15:41:34 +0200 Subject: [PATCH] fix(client): persist reth blocks during ZFS snapshot prep so the head isn't lost MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit reth-bal-* stateful suites whose pre-run ends on a heavy block (here the trie-heavy funding block with 15k withdrawals) fail every test: after the per-test ZFS rollback reth comes up one block behind the chain head, so each test's newPayload references a missing parent and reth stays SYNCING forever (retries exhaust; forkchoiceUpdated returns SYNCING, not VALID). Root cause (verified against reth bal-devnet-7 @ d9d028b and the run logs): reth keeps recent canonical blocks in its in-memory engine tree and only flushes to MDBX once they are persistence_threshold (default 2) deep (should_persist, tree/mod.rs). Pre-run reaches canonical head 24407728 but only 24407727 is persisted. The 20s settle does not help — with threshold 2, should_persist never fires while the gap is <=2. The only flush attempt is at graceful shutdown (persist_until_complete), which recomputes the state root and is cut off by reth's 5s graceful-shutdown timeout — too short for the trie-heavy block on slow ZFS (observed 5.7s stop). Light suites flush in time, which is why only funding-block suites break. Fix: add a Spec.SnapshotPrepareArgs() hook and apply reth's --engine.persistence-threshold=0 (+ --engine.memory-block-buffer-target=0) ONLY to the container that builds the ZFS ready snapshot, so reth persists each block as it becomes canonical during pre-run (the heavy trie write lands inside the 20s settle, not the 5s shutdown box). Per-test measurement containers keep reth's default persistence, so newPayload/FCU timings and cross-client comparisons are unaffected. Other client specs return nil. The ready snapshot is rebuilt by the prepare phase each run, so no manual snapshot surgery is needed. --- pkg/client/besu.go | 5 +++++ pkg/client/client.go | 3 +++ pkg/client/erigon.go | 5 +++++ pkg/client/ethrex.go | 5 +++++ pkg/client/geth.go | 5 +++++ pkg/client/nethermind.go | 5 +++++ pkg/client/nimbus.go | 5 +++++ pkg/client/reth.go | 8 ++++++++ pkg/runner/lifecycle.go | 19 ++++++++++++++++++- 9 files changed, 59 insertions(+), 1 deletion(-) diff --git a/pkg/client/besu.go b/pkg/client/besu.go index 38da411..4f8dd2b 100644 --- a/pkg/client/besu.go +++ b/pkg/client/besu.go @@ -103,3 +103,8 @@ func (s *besuSpec) RPCRollbackSpec() *RPCRollbackSpec { func (s *besuSpec) DefaultConfigFiles() map[string]string { return nil } + +// SnapshotPrepareArgs returns nil; Besu needs no snapshot-only args. +func (s *besuSpec) SnapshotPrepareArgs() []string { + return nil +} diff --git a/pkg/client/client.go b/pkg/client/client.go index 849fd1d..aafc505 100644 --- a/pkg/client/client.go +++ b/pkg/client/client.go @@ -88,6 +88,9 @@ type Spec interface { // Keys are target paths inside the container, values are file contents. // Returns nil if no config files are needed. DefaultConfigFiles() map[string]string + + // SnapshotPrepareArgs returns args for the ZFS snapshot-prep container only (not per-test measurement); nil if none. + SnapshotPrepareArgs() []string } // Registry manages client specifications. diff --git a/pkg/client/erigon.go b/pkg/client/erigon.go index bfd649d..1c6742e 100644 --- a/pkg/client/erigon.go +++ b/pkg/client/erigon.go @@ -109,3 +109,8 @@ func (s *erigonSpec) RPCRollbackSpec() *RPCRollbackSpec { func (s *erigonSpec) DefaultConfigFiles() map[string]string { return nil } + +// SnapshotPrepareArgs returns nil; Erigon needs no snapshot-only args. +func (s *erigonSpec) SnapshotPrepareArgs() []string { + return nil +} diff --git a/pkg/client/ethrex.go b/pkg/client/ethrex.go index c965790..71b15e9 100644 --- a/pkg/client/ethrex.go +++ b/pkg/client/ethrex.go @@ -89,3 +89,8 @@ func (s *ethrexSpec) RPCRollbackSpec() *RPCRollbackSpec { func (s *ethrexSpec) DefaultConfigFiles() map[string]string { return nil } + +// SnapshotPrepareArgs returns nil; Ethrex needs no snapshot-only args. +func (s *ethrexSpec) SnapshotPrepareArgs() []string { + return nil +} diff --git a/pkg/client/geth.go b/pkg/client/geth.go index a287456..b0124dc 100644 --- a/pkg/client/geth.go +++ b/pkg/client/geth.go @@ -108,3 +108,8 @@ IdleTimeout = 120000000000 # 120s `, } } + +// SnapshotPrepareArgs returns nil; Geth needs no snapshot-only args. +func (s *gethSpec) SnapshotPrepareArgs() []string { + return nil +} diff --git a/pkg/client/nethermind.go b/pkg/client/nethermind.go index e05d7dc..b9de7b7 100644 --- a/pkg/client/nethermind.go +++ b/pkg/client/nethermind.go @@ -101,3 +101,8 @@ func (s *nethermindSpec) RPCRollbackSpec() *RPCRollbackSpec { func (s *nethermindSpec) DefaultConfigFiles() map[string]string { return nil } + +// SnapshotPrepareArgs returns nil; Nethermind needs no snapshot-only args. +func (s *nethermindSpec) SnapshotPrepareArgs() []string { + return nil +} diff --git a/pkg/client/nimbus.go b/pkg/client/nimbus.go index 89fb472..4278e87 100644 --- a/pkg/client/nimbus.go +++ b/pkg/client/nimbus.go @@ -88,3 +88,8 @@ func (s *nimbusSpec) RPCRollbackSpec() *RPCRollbackSpec { func (s *nimbusSpec) DefaultConfigFiles() map[string]string { return nil } + +// SnapshotPrepareArgs returns nil; Nimbus needs no snapshot-only args. +func (s *nimbusSpec) SnapshotPrepareArgs() []string { + return nil +} diff --git a/pkg/client/reth.go b/pkg/client/reth.go index 786edf2..38b3f7a 100644 --- a/pkg/client/reth.go +++ b/pkg/client/reth.go @@ -90,3 +90,11 @@ func (s *rethSpec) RPCRollbackSpec() *RPCRollbackSpec { func (s *rethSpec) DefaultConfigFiles() map[string]string { return nil } + +// SnapshotPrepareArgs makes reth persist every block during snapshot prep so the pre-run head reaches disk (else the snapshot lags a block and tests hang SYNCING). +func (s *rethSpec) SnapshotPrepareArgs() []string { + return []string{ + "--engine.persistence-threshold=0", + "--engine.memory-block-buffer-target=0", + } +} diff --git a/pkg/runner/lifecycle.go b/pkg/runner/lifecycle.go index 43a2075..af3dbe2 100644 --- a/pkg/runner/lifecycle.go +++ b/pkg/runner/lifecycle.go @@ -699,8 +699,25 @@ func (r *runner) runContainerLifecycle( params.DataDirCfg = datadirCfg params.UseDataDir = useDataDir + // Apply client-specific snapshot-prepare args to the initial (pre-run/snapshot) container only, scoped to ZFS container-recreate; per-test containers clone params.ContainerSpec and keep the base command. + createSpec := containerSpec + + if prepareArgs := spec.SnapshotPrepareArgs(); len(prepareArgs) > 0 && + r.cfg.FullConfig != nil && + r.cfg.FullConfig.GetRollbackStrategy(instance) == + config.RollbackStrategyContainerRecreate && + datadirCfg != nil && datadirCfg.Method == "zfs" { + prepareSpec := *containerSpec + prepareSpec.Command = append(append([]string{}, cmd...), prepareArgs...) + createSpec = &prepareSpec + + log.WithField("args", prepareArgs).Info( + "Applying snapshot-prepare args to initial (pre-run) container", + ) + } + // Create container. - containerID, err := r.containerMgr.CreateContainer(ctx, containerSpec) + containerID, err := r.containerMgr.CreateContainer(ctx, createSpec) if err != nil { return fmt.Errorf("creating container: %w", err) }