From 9eec51917403380596f4fcf933696fd75723b73a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Gr=C3=BCner?= <47506558+MegaRedHand@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:39:22 -0300 Subject: [PATCH 1/2] docs: improve devnet-runner skill --- .claude/skills/devnet-runner/SKILL.md | 274 ++---------------- .../references/checkpoint-sync.md | 122 -------- .../devnet-runner/references/clients.md | 48 +-- .../references/long-lived-devnet.md | 185 ++++++++++++ .../references/validator-config.md | 110 +++++++ .../scripts/run-devnet-with-timeout.sh | 6 + 6 files changed, 332 insertions(+), 413 deletions(-) delete mode 100644 .claude/skills/devnet-runner/references/checkpoint-sync.md create mode 100644 .claude/skills/devnet-runner/references/long-lived-devnet.md create mode 100644 .claude/skills/devnet-runner/references/validator-config.md diff --git a/.claude/skills/devnet-runner/SKILL.md b/.claude/skills/devnet-runner/SKILL.md index 3bc7ebfa..30799096 100644 --- a/.claude/skills/devnet-runner/SKILL.md +++ b/.claude/skills/devnet-runner/SKILL.md @@ -1,6 +1,6 @@ --- name: devnet-runner -description: Manage local development networks for lean consensus testing. Use when users want to (1) Configure a devnet with validator nodes, (2) Start/stop devnet nodes, (3) Regenerate genesis files, (4) Collect and dump node logs to files, (5) Troubleshoot devnet issues, (6) Restart a node with checkpoint sync. +description: Manage local development networks (devnets) for lean consensus multi-client testing. This skill should be used when the user asks to run a devnet, start or stop devnet nodes, spin up a local testnet, configure validator nodes, regenerate genesis files, change Docker image tags, collect or dump node logs, troubleshoot devnet issues, restart a node with checkpoint sync, run a long-lived devnet with detached containers, or perform rolling restarts to upgrade images. 
--- # Devnet Runner @@ -17,19 +17,15 @@ make lean-quickstart ## Default Behavior When starting a devnet, **always**: -1. **Update validator config** - Edit `lean-quickstart/local-devnet/genesis/validator-config.yaml` to include ONLY the nodes that will run. Remove entries for nodes that won't be started (unless the user explicitly asks to keep them). This is critical because validator indices are assigned to ALL nodes in the config - if a node is in the config but not running, its validators will miss their proposer slots. +1. **Update validator config** - Edit `lean-quickstart/local-devnet/genesis/validator-config.yaml` to include ONLY the nodes that will run. Remove entries for nodes that won't be started (unless the user explicitly asks to keep them). Validator indices are assigned to ALL nodes in the config; if a node is in the config but not running, its validators will miss their proposer slots. To control which nodes run, always edit this config file rather than using `--node `, since `--node` does NOT reassign validators and causes missed slots. 2. **Update client image tags** - If the user specifies a tag (e.g., "use devnet1 tag"), edit the relevant `lean-quickstart/client-cmds/{client}-cmd.sh` file to update the `node_docker` image tag. -3. **Use run-devnet-with-timeout.sh** - This script runs all nodes in the config with a timeout, dumps logs, then stops them. Do NOT use `--node ` to select nodes - this does not reassign validators. -4. Run for **20 slots** unless the user specifies otherwise -5. The script automatically dumps all node logs to `.log` files in the repo root and stops the nodes when the timeout expires - -**Important:** Only use `--node ` (e.g., `--node zeam_0,ream_0`) if the user explicitly requests it. This flag starts only the specified nodes but does NOT reassign their validators, causing missed slots. - -This ensures consistent test runs, clean logs without spurious warnings, and captured output for debugging. +3. 
**Use run-devnet-with-timeout.sh** - This script runs all nodes in the config with a timeout, dumps logs, then stops them. +4. Run for **20 slots** unless the user specifies otherwise. +5. The script automatically dumps all node logs to `<node_name>.log` files in the repo root and stops the nodes when the timeout expires. ## Timing Calculation -Total timeout = startup buffer + genesis offset + (slots × 4 seconds) +Total timeout = startup buffer + genesis offset + (slots x 4 seconds) | Component | Local Mode | Ansible Mode | |-----------|------------|--------------| @@ -38,23 +34,20 @@ Total timeout = startup buffer + genesis offset + (slots × 4 seconds) | Per slot | 4s | 4s | **Examples (local mode):** -- 20 slots: 10 + 30 + (20 × 4) = **120s** -- 50 slots: 10 + 30 + (50 × 4) = **240s** -- 100 slots: 10 + 30 + (100 × 4) = **440s** +- 20 slots: 10 + 30 + (20 x 4) = **120s** +- 50 slots: 10 + 30 + (50 x 4) = **240s** +- 100 slots: 10 + 30 + (100 x 4) = **440s** ## Quick Start (Default Workflow) -**Step 1: Configure nodes** - Edit `lean-quickstart/local-devnet/genesis/validator-config.yaml` to keep only the nodes you want to run. Remove all other validator entries. This is critical because validator indices are assigned based on all nodes in the config - if a node is in the config but not running, its validators will miss their slots. +**Step 1: Configure nodes** - Edit `lean-quickstart/local-devnet/genesis/validator-config.yaml` to keep only the nodes you want to run. See `references/validator-config.md` for the full schema and field reference. -**Step 2: Update image tags (if needed)** - Edit `lean-quickstart/client-cmds/{client}-cmd.sh` to change the Docker image tag in `node_docker`. +**Step 2: Update image tags (if needed)** - Edit `lean-quickstart/client-cmds/{client}-cmd.sh` to change the Docker image tag in `node_docker`. See `references/clients.md` for current default tags. 
**Step 3: Run the devnet** ```bash # Start devnet with fresh genesis, capture logs directly (20 slots = 120s) .claude/skills/devnet-runner/scripts/run-devnet-with-timeout.sh 120 - -# Stop any remaining nodes (cleanup) -cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all --stop 2>/dev/null || true ``` ## Manual Commands @@ -72,22 +65,11 @@ cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all --stop cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all --generateGenesis ``` -### Using --node to Select Specific Nodes (Advanced) - -**WARNING:** Only use `--node ` if the user explicitly requests it. This flag does NOT reassign validators - nodes not selected will still have validators assigned to them in the genesis, causing missed slots. - -```bash -# Only use if explicitly requested by user -cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node zeam_0,ream_0 -``` - -For normal operation, always modify `validator-config.yaml` to include only the nodes you want, then use `run-devnet-with-timeout.sh` or `--node all`. - ## Command-Line Flags | Flag | Description | |------|-------------| -| `--node ` | **Required.** Node(s) to start. Use `all` to start all nodes in config. **Note:** Using specific node names (e.g., `--node zeam_0,ream_0`) does NOT reassign validators - use only if explicitly requested | +| `--node ` | **Required.** Node(s) to start. Use `all` to start all nodes in config | | `--generateGenesis` | Regenerate genesis files. 
Implies `--cleanData` | | `--cleanData` | Clean data directories before starting | | `--stop` | Stop running nodes instead of starting them | @@ -108,169 +90,13 @@ node_docker="--security-opt seccomp=unconfined blockblaz/zeam:devnet1 node \ node_docker="--security-opt seccomp=unconfined blockblaz/zeam:local node \ ``` -**Current default tags:** -| Client | Image | Default Tag | -|--------|-------|-------------| -| zeam | blockblaz/zeam | devnet1 | -| ream | ghcr.io/reamlabs/ream | latest | -| ethlambda | ghcr.io/lambdaclass/ethlambda | local | -| qlean | qdrvm/qlean-mini | 3a96a1f | -| lantern | piertwo/lantern | v0.0.1 | -| lighthouse | hopinheimer/lighthouse | latest | -| grandine | sifrai/lean | unstable | - -## Configuration Workflow - -### Validator Config File Structure - -The config file is at `lean-quickstart/local-devnet/genesis/validator-config.yaml`. This is the **single source of truth** for all node configurations. - -**Important:** Only include clients that will actually run in the devnet. If a configured validator is offline from the start, it will miss its proposer slots and affect consensus progress. Only include offline validators if you specifically want to test behavior with missing nodes. - -**Full schema:** -```yaml -shuffle: roundrobin # Proposer selection algorithm (roundrobin = deterministic turns) -deployment_mode: local # 'local' (localhost) or 'ansible' (remote servers) - -config: - activeEpoch: 18 # Log2 of active signing epochs for hash-sig keys (2^18) - keyType: "hash-sig" # Post-quantum signature scheme - -validators: - - name: "zeam_0" # Node identifier: _ - privkey: "bdf953adc..." 
# 64-char hex P2P private key (libp2p identity) - enrFields: - ip: "127.0.0.1" # Node IP (127.0.0.1 for local, real IP for ansible) - quic: 9001 # QUIC/UDP port for P2P communication - metricsPort: 8081 # HTTP port exposed by the node (see note below) - count: 1 # Number of validator indices assigned to this node -``` - -**Field reference:** - -| Field | Required | Description | -|-------|----------|-------------| -| `shuffle` | Yes | Proposer selection algorithm. Use `roundrobin` for deterministic turn-based proposing | -| `deployment_mode` | Yes | `local` or `ansible` - determines genesis time offset and config directory | -| `config.activeEpoch` | Yes | Exponent for hash-sig active epochs (e.g., 18 means 2^18 signatures per period) | -| `config.keyType` | Yes | Always `hash-sig` for post-quantum support | -| `name` | Yes | Format: `_`. Client name determines which `client-cmds/*.sh` script runs | -| `privkey` | Yes | 32-byte hex string (64 chars). Used for P2P identity and ENR generation | -| `enrFields.ip` | Yes | IP address. Use `127.0.0.1` for local, real IPs for ansible | -| `enrFields.quic` | Yes | QUIC port. Must be unique per node in local mode | -| `metricsPort` | Yes | HTTP port exposed by the node. Must be unique per node in local mode. For ethlambda, this maps to `--metrics-port`; the API server uses a separate `--api-port` (default 5052) | -| `count` | Yes | Number of validator indices. Sum of all counts = total validators | - -### Adding a New Validator Node - -1. **Choose a unique node name** following `_` convention: - ``` - zeam_0, zeam_1, ream_0, qlean_0, lantern_0, lighthouse_0, grandine_0, ethlambda_0 - ``` - -2. **Generate a P2P private key** (64-char hex): - ```bash - openssl rand -hex 32 - ``` - -3. **Assign unique ports** (for local mode): - - QUIC: 9001, 9002, 9003... (increment for each node) - - Metrics/API: 8081, 8082, 8083... (increment for each node) - - **ethlambda note:** ethlambda uses separate API and metrics ports. 
The `metricsPort` in the config maps to `--metrics-port`. The API server binds to `--api-port` (default 5052) which must also be unique if running multiple ethlambda nodes. - -4. **Add the entry to `lean-quickstart/local-devnet/genesis/validator-config.yaml`:** - ```yaml - validators: - # ... existing nodes ... - - - name: "newclient_0" - privkey: "" - enrFields: - ip: "127.0.0.1" # Use real IP for ansible - quic: 9008 # Next available port - metricsPort: 8088 # Next available port - count: 1 - ``` - -5. **Regenerate genesis with new keys:** - ```bash - cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all --generateGenesis --forceKeyGen - ``` - -### Removing a Validator Node - -1. **Delete the node entry** from `lean-quickstart/local-devnet/genesis/validator-config.yaml` - -2. **Regenerate genesis** (required because genesis state must reflect new validator set): - ```bash - cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all --generateGenesis - ``` - Note: `--forceKeyGen` is NOT needed when removing - existing keys for remaining indices are reused. - -### Port Allocation Guide (Local Mode) - -When running multiple nodes locally, each needs unique ports: +See `references/clients.md` for current default images, tags, and known compatibility issues. -| Node | QUIC Port | Metrics Port | -|------|-----------|--------------| -| zeam_0 | 9001 | 8081 | -| ream_0 | 9002 | 8082 | -| qlean_0 | 9003 | 8083 | -| lantern_0 | 9004 | 8084 | -| lighthouse_0 | 9005 | 8085 | -| grandine_0 | 9006 | 8086 | -| ethlambda_0 | 9007 | 8087 | +## Validator Configuration -**ethlambda dual-port note:** ethlambda runs separate API and metrics HTTP servers. The `metricsPort` from `validator-config.yaml` maps to `--metrics-port`. The API server (`--api-port`, default 5052) must also be configured with a unique port if running multiple ethlambda nodes. Update `ethlambda-cmd.sh` in `lean-quickstart` to pass both `--api-port` and `--metrics-port` flags. 
+**ethlambda note:** ethlambda uses separate API (`--api-port`, default 5052) and metrics (`--metrics-port`, default 5054) HTTP servers. The `metricsPort` from config maps to `--metrics-port`; the API port must be configured separately in `ethlambda-cmd.sh`. -For **ansible mode**, all nodes can use the same ports (9001, 8081) since they run on different machines. - -### Local vs Ansible Deployment - -| Aspect | Local | Ansible | -|--------|-------|---------| -| Config file | `lean-quickstart/local-devnet/genesis/validator-config.yaml` | `lean-quickstart/ansible-devnet/genesis/validator-config.yaml` | -| `deployment_mode` | `local` | `ansible` | -| IP addresses | `127.0.0.1` for all | Real server IPs | -| Ports | Must be unique per node | Same port, different machines | -| Genesis offset | +30 seconds | +360 seconds | - -## Node Lifecycle Commands - -### Start Nodes - -**Preferred method:** Use `run-devnet-with-timeout.sh` after configuring `validator-config.yaml`: -```bash -# Edit lean-quickstart/local-devnet/genesis/validator-config.yaml to include only nodes you want, then: -.claude/skills/devnet-runner/scripts/run-devnet-with-timeout.sh 120 -``` - -**Alternative (no timeout):** -```bash -# All nodes in config -cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all - -# Fresh start with new genesis -cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all --generateGenesis -``` - -**Advanced (only if explicitly requested):** Start specific nodes without modifying config. Note: validators will NOT be reassigned. 
-```bash -cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node zeam_0,ream_0 -``` - -### Stop Nodes -```bash -# Via script -cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all --stop - -# Or press Ctrl+C in the terminal running spin-node.sh -``` - -### Clean and Restart -```bash -cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all --cleanData -``` +See `references/validator-config.md` for the full schema, field reference, adding/removing nodes, port allocation guide, and local vs ansible deployment differences. ## Log Collection @@ -296,23 +122,12 @@ for node in $(docker ps --format '{{.Names}}' | grep -E '^(zeam|ream|qlean|lante done ``` -**Follow and save simultaneously:** -```bash -docker logs -f zeam_0 2>&1 | tee zeam_0.log -``` - -**With timestamps:** -```bash -docker logs -t zeam_0 > zeam_0.log 2>&1 -``` - ### Data Directory Logs Client-specific data and file-based logs are stored at: ``` lean-quickstart/local-devnet/data// ``` -Example: `lean-quickstart/local-devnet/data/zeam_0/` ## Common Troubleshooting @@ -351,15 +166,6 @@ lsof -i :8081 # Check metrics port lsof -i :5052 # Check ethlambda API port (if using default) ``` -Update ports in `lean-quickstart/local-devnet/genesis/validator-config.yaml` if needed. - -### Docker Permission Issues - -Run with sudo: -```bash -cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all --dockerWithSudo -``` - ### Stale Containers Cause Genesis Mismatch If you see `UnknownSourceBlock` or `OutOfMemory` deserialization errors, a container from a previous run may still be running with old genesis. @@ -371,54 +177,32 @@ docker rm -f zeam_0 ethlambda_0 ream_0 qlean_0 lantern_0 grandine_0 2>/dev/null Or use `run-devnet-with-timeout.sh` which handles cleanup automatically. -### Time-Based Stop - -Use the `run-devnet-with-timeout.sh` script for timed runs. 
Remember to include genesis offset (30s local, 360s ansible) + startup buffer (10s): +### Docker Permission Issues ```bash -# 20 slots: 10 + 30 + 80 = 120s -.claude/skills/devnet-runner/scripts/run-devnet-with-timeout.sh 120 - -# 50 slots: 10 + 30 + 200 = 240s -.claude/skills/devnet-runner/scripts/run-devnet-with-timeout.sh 240 - -# 100 slots: 10 + 30 + 400 = 440s -.claude/skills/devnet-runner/scripts/run-devnet-with-timeout.sh 440 +cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all --dockerWithSudo ``` -**Formula:** duration = 10 + 30 + (slots × 4) seconds (local mode) - ## Scripts | Script | Description | |--------|-------------| | `scripts/run-devnet-with-timeout.sh ` | Run devnet for specified duration, dump logs to repo root, then stop | -## Restarting a Node with Checkpoint Sync - -To restart a single node mid-devnet (e.g., to test a new image or checkpoint sync itself): +## Long-Lived Devnets and Rolling Restarts -**Important:** Restart nodes one at a time, waiting for each to fully sync before restarting the next. If 1/3 or more validators are offline simultaneously, finalization stalls because 3SF-mini requires 2/3+ votes to justify checkpoints. - -1. Choose a node to restart. If restarting the aggregator, finalization and attestation inclusion in blocks will stop until it catches back up to head. -2. Identify a healthy node's API port to use as checkpoint source (ethlambda serves `/lean/v0/states/finalized` on `--api-port`, default 5052) -3. Update the Docker image tag in `client-cmds/-cmd.sh` if needed -4. **Pull the new image before restarting** to minimize node downtime: - ```bash - docker pull : - ``` -5. Restart with checkpoint sync: - ```bash - cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh \ - --restart-client \ - --checkpoint-sync-url http://127.0.0.1:/lean/v0/states/finalized - ``` +For persistent devnets on remote servers (e.g., `ssh admin@ethlambda-1`), use detached containers instead of `spin-node.sh`. 
This allows rolling restarts to upgrade images without losing chain state. -**Important:** ethlambda serves the API (including `/lean/v0/states/finalized`) on `--api-port` (default 5052) and Prometheus metrics on `--metrics-port` (default 5054). Use the API port for checkpoint sync URLs. +**Key points:** +- Start containers with `docker run -d --restart unless-stopped` (not `spin-node.sh`) +- Rolling restart: stop one node, **wait 60 seconds** (gossipsub backoff), start with new image + checkpoint sync +- Restart non-aggregator nodes first, aggregator last +- Checkpoint sync URL uses the API port: `http://127.0.0.1:/lean/v0/states/finalized` -See `references/checkpoint-sync.md` for the full procedure, verification steps, and troubleshooting. +See `references/long-lived-devnet.md` for the full procedure, including starting the devnet, rolling restart steps, verification, and troubleshooting. ## Reference -- `references/clients.md`: Client-specific details (images, ports, configurations) -- `references/checkpoint-sync.md`: Restarting nodes with checkpoint sync +- `references/clients.md`: Client-specific details (images, ports, known issues) +- `references/validator-config.md`: Full config schema, field reference, adding/removing nodes, port allocation +- `references/long-lived-devnet.md`: Persistent devnets with detached containers and rolling restarts diff --git a/.claude/skills/devnet-runner/references/checkpoint-sync.md b/.claude/skills/devnet-runner/references/checkpoint-sync.md deleted file mode 100644 index b3b640f3..00000000 --- a/.claude/skills/devnet-runner/references/checkpoint-sync.md +++ /dev/null @@ -1,122 +0,0 @@ -# Checkpoint Sync in Devnets - -Restarting a node with checkpoint sync instead of replaying from genesis. Useful for testing checkpoint sync itself, upgrading a node's image mid-devnet, or recovering a crashed node. 
- -## When to Use - -- Testing checkpoint sync behavior (interop, verification, catch-up) -- Replacing a node's Docker image mid-run (e.g., testing a new build) -- Recovering a node that fell behind or crashed - -## Prerequisites - -- A running devnet with at least one healthy node to serve the checkpoint state -- The checkpoint source node's API must be reachable (`--api-port`, default 5052) - -## Key Concepts - -**ethlambda runs separate API and metrics servers.** The API (`/lean/v0/...`, including health and states) is served on `--api-port` (default 5052). Prometheus metrics (`/metrics`) and pprof are served on `--metrics-port` (default 5054). Both share the bind address `--http-address` (default `127.0.0.1`). - -**Checkpoint sync URL format (uses the API port):** -``` -http://:/lean/v0/states/finalized -``` - -**The node must have the same genesis config.** Checkpoint sync verifies the downloaded state against the local genesis config (genesis time, validator pubkeys, validator count). The `--custom-network-config-dir` must point to the same genesis used by the rest of the devnet. - -## Restart Procedure - -**Restart nodes one at a time.** Wait for each node to fully sync and rejoin consensus before restarting the next. 3SF-mini requires 2/3+ of validators to vote in order to justify checkpoints and advance finalization. If 1/3 or more validators are offline simultaneously, finalization stalls until enough nodes come back online. - -### Step 1: Choose the node to restart - -Any node can be restarted, but be aware that restarting the aggregator node will stop finalization and attestation inclusion in blocks until it catches back up to head. 
Check which node is the aggregator in `validator-config.yaml`: -```yaml -# In lean-quickstart//genesis/validator-config.yaml -validators: - - name: "ethlambda_0" - isAggregator: false - - name: "ethlambda_2" - isAggregator: true # restarting this stops finalization until it catches up -``` - -### Step 2: Identify a checkpoint source - -Pick any other running node's API port as the checkpoint source. For ethlambda, the API is served on `--api-port` (default 5052). For other clients, the API may share the `metricsPort` from `validator-config.yaml`. - -For local devnets (host networking), the URL is: -``` -http://127.0.0.1:/lean/v0/states/finalized -``` - -Verify the endpoint is reachable: -```bash -curl -s http://127.0.0.1:/lean/v0/health -# Should return: {"status":"healthy","service":"lean-spec-api"} -``` - -### Step 3: Update the Docker image tag (if changing versions) - -Edit `lean-quickstart/client-cmds/-cmd.sh` and change the image tag in `node_docker` before restarting: -```bash -# In lean-quickstart/client-cmds/ethlambda-cmd.sh, change: -node_docker="ghcr.io/lambdaclass/ethlambda:local \ -# To: -node_docker="ghcr.io/lambdaclass/ethlambda:devnet3 \ -``` - -### Step 4: Pull the new Docker image - -**Pull the image before restarting** to minimize how long the node is absent from the network. If you skip this, `spin-node.sh` will pull during restart, adding minutes of downtime where the node misses proposer slots and attestation duties: -```bash -docker pull : -``` - -### Step 5: Restart with checkpoint sync - -```bash -cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh \ - --restart-client \ - --checkpoint-sync-url http://127.0.0.1:/lean/v0/states/finalized -``` - -This automatically: -1. Stops the existing container -2. Clears the data directory -3. Pulls the Docker image (skipped if already present locally) -4. 
Restarts with `--checkpoint-sync-url` passed to the node - -If `--checkpoint-sync-url` is omitted, it defaults to `https://leanpoint.leanroadmap.org/lean/v0/states/finalized` (the public checkpoint provider). - -Multiple nodes can be restarted at once with comma-separated names: -```bash ---restart-client ethlambda_0,ethlambda_3 -``` - -### Step 6: Verify the node synced - -```bash -docker logs --tail 20 -``` - -Look for: -- "Block imported successfully" messages catching up to the current slot -- "Fork Choice Tree" showing finalized/justified/head slots close to the network's current state -- No error messages about verification failures or SSZ decode errors - -## Troubleshooting - -### "genesis time mismatch" or "validator count mismatch" -The checkpoint source is running a different genesis than the restarting node. Ensure both use the same genesis config directory. - -### "HTTP request failed" or connection refused -The checkpoint source node is down or unreachable. Verify with `curl` that the source endpoint returns a healthy response. - -### Node exits immediately after start -Check `docker logs ` for verification errors. Checkpoint sync exits on any failure without modifying the database, so it's safe to retry. - -### Node syncs but doesn't finalize -If the restarted node is the aggregator, attestations won't be aggregated and blocks will be produced with `attestation_count=0` until it catches back up to head. Finalization resumes once the aggregator is fully synced and participating in consensus again. - -### "Fallback pruning (finalization stalled)" after catch-up -Normal during catch-up. The node accumulated blocks faster than finalization can advance. This resolves once the node is fully caught up and participating in consensus. 
diff --git a/.claude/skills/devnet-runner/references/clients.md b/.claude/skills/devnet-runner/references/clients.md index 54dee031..5c4463bc 100644 --- a/.claude/skills/devnet-runner/references/clients.md +++ b/.claude/skills/devnet-runner/references/clients.md @@ -42,53 +42,9 @@ Ports are configured per-node in `validator-config.yaml`. Typical port assignmen **ethlambda dual-port note:** ethlambda runs separate API (`--api-port`, default 5052) and metrics (`--metrics-port`, default 5054) HTTP servers. Both share a bind address (`--http-address`, default `127.0.0.1`). The `metricsPort` from `validator-config.yaml` maps to `--metrics-port`. The API port must be configured separately in `ethlambda-cmd.sh`. -## Client-Specific Configuration Notes +## Client Command Files -### zeam - -- Image: `blockblaz/zeam:devnet1` -- Native Zig implementation -- Command file: `client-cmds/zeam-cmd.sh` - -### ream - -- Image: `ghcr.io/reamlabs/ream:latest` -- Rust implementation by Ream Labs -- Command file: `client-cmds/ream-cmd.sh` - -### qlean - -- Image: `qdrvm/qlean-mini:3a96a1f` -- Uses specific commit hash for stability -- Command file: `client-cmds/qlean-cmd.sh` - -### lantern - -- Image: `piertwo/lantern:v0.0.1` -- PierTwo implementation -- Command file: `client-cmds/lantern-cmd.sh` - -### lighthouse - -- Image: `hopinheimer/lighthouse:latest` -- Fork of the standard Rust Lighthouse client -- Command file: `client-cmds/lighthouse-cmd.sh` - -### grandine - -- Image: `sifrai/lean:unstable` -- High-performance client by Sifrai -- Command file: `client-cmds/grandine-cmd.sh` - -### ethlambda - -- Image: `ghcr.io/lambdaclass/ethlambda:local` -- Rust implementation by LambdaClass -- Command file: `client-cmds/ethlambda-cmd.sh` -- **Dual HTTP servers:** Runs separate API and metrics servers on independent ports - - `--http-address` (default `127.0.0.1`): shared bind address - - `--api-port` (default `5052`): API server (health, states, checkpoints, fork choice) - - 
`--metrics-port` (default `5054`): metrics server (Prometheus, pprof) +Each client's Docker configuration is in `client-cmds/{client}-cmd.sh` (e.g., `zeam-cmd.sh`, `ream-cmd.sh`, `ethlambda-cmd.sh`). Edit the `node_docker` variable to change image/tag. ## Changing Docker Images diff --git a/.claude/skills/devnet-runner/references/long-lived-devnet.md b/.claude/skills/devnet-runner/references/long-lived-devnet.md new file mode 100644 index 00000000..54ce42f4 --- /dev/null +++ b/.claude/skills/devnet-runner/references/long-lived-devnet.md @@ -0,0 +1,185 @@ +# Long-Lived Devnets + +Running a persistent devnet with detached containers that survive SSH disconnects and support rolling restarts to upgrade images without losing chain state. + +## When to Use + +- Running a devnet on a remote server that should persist across SSH sessions +- Upgrading node images mid-devnet without resetting genesis +- Testing checkpoint sync and rolling restart procedures + +## Overview + +`spin-node.sh` runs containers with `docker run --rm` (foreground, auto-remove) and kills all containers on exit. This is fine for short test runs but not for long-lived devnets. + +The alternative: start containers directly with `docker run -d --restart unless-stopped`. Containers are decoupled from any parent process and survive SSH disconnects, script exits, and host reboots. 
+ +## Starting a Long-Lived Devnet + +### Step 1: Generate genesis + +Use `spin-node.sh` to generate genesis config, keys, and ENR records, then immediately stop it: + +```bash +cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all --generateGenesis +# Press Ctrl-C after nodes start (genesis is already generated) +``` + +Or, if genesis files (config, keys, and ENR records) already exist from a previous run and you want to avoid starting containers at all, update only `GENESIS_TIME` in `config.yaml` manually: + +```bash +GENESIS=/path/to/lean-quickstart/local-devnet/genesis +GENESIS_TIME=$(($(date +%s) + 30)) +sed -i "s/^GENESIS_TIME:.*/GENESIS_TIME: $GENESIS_TIME/" $GENESIS/config.yaml +``` + +### Step 2: Start all containers detached + +Start all nodes simultaneously so the gossipsub mesh forms correctly: + +```bash +GENESIS=/path/to/lean-quickstart/local-devnet/genesis +DATA=/path/to/lean-quickstart/local-devnet/data +IMAGE=ghcr.io/lambdaclass/ethlambda:devnet3 + +# Clean data dirs +for d in ethlambda_0 ethlambda_1 ethlambda_2 ethlambda_3; do + rm -rf "$DATA/$d/"* +done + +# Start each node (adjust ports, node-id, aggregator flag per validator-config.yaml) +docker run -d --restart unless-stopped --name ethlambda_0 --network host \ + -v $GENESIS:/config -v $DATA/ethlambda_0:/data \ + $IMAGE \ + --custom-network-config-dir /config \ + --gossipsub-port 9001 --node-id ethlambda_0 \ + --node-key /config/ethlambda_0.key \ + --metrics-address 0.0.0.0 --metrics-port 8081 + +# Repeat for other nodes, adding --is-aggregator to the aggregator node +``` + +Do NOT include `--checkpoint-sync-url` in the initial start. Nodes start from genesis. 
+ +### Step 3: Verify + +Wait ~50 seconds (30s genesis offset + 20s for finalization to start), then check: + +```bash +for n in 0 1 2 3; do + printf "ethlambda_$n: " + docker logs --tail 15 ethlambda_$n 2>&1 | grep "Finalized:" | tail -1 +done +``` + +All nodes should show the same finalized slot advancing. + +## Rolling Restart Procedure + +Use this procedure to upgrade a node's image without losing chain state. 
Restart one node at a time; the network continues finalizing with the remaining nodes. + +### Critical: 60-Second Wait + +After stopping a node, **wait at least 60 seconds** before starting the replacement. This allows the gossipsub backoff timer on other nodes to expire. Without this wait, the restarted node's GRAFT requests are rejected and it never joins the gossip mesh, meaning it won't receive blocks or attestations via gossip. + +### Restart Order + +1. Non-aggregator nodes first +2. Aggregator node last (while it's offline, blocks are produced with `attestation_count=0` and finalization stalls) + +### Per-Node Procedure + +For each node: + +```bash +GENESIS=/path/to/lean-quickstart/local-devnet/genesis +DATA=/path/to/lean-quickstart/local-devnet/data +NEW_IMAGE=ghcr.io/lambdaclass/ethlambda:new-tag + +# 1. Pull the new image first (minimizes downtime) +docker pull $NEW_IMAGE + +# 2. Pick a healthy peer's API port as checkpoint source +# (any running node that is NOT the one being restarted) +# ethlambda serves /lean/v0/states/finalized on --api-port (default 5052) +CHECKPOINT_SOURCE_PORT=5052 # e.g., ethlambda_3's API port + +# 3. Stop and remove the container +docker rm -f ethlambda_0 +rm -rf "$DATA/ethlambda_0/"* + +# 4. Wait 60 seconds for gossipsub backoff to expire +sleep 60 + +# 5. 
Start with new image + checkpoint sync +docker run -d --restart unless-stopped --name ethlambda_0 --network host \ + -v $GENESIS:/config -v $DATA/ethlambda_0:/data \ + $NEW_IMAGE \ + --custom-network-config-dir /config \ + --gossipsub-port 9001 --node-id ethlambda_0 \ + --node-key /config/ethlambda_0.key \ + --metrics-address 0.0.0.0 --metrics-port 8081 \ + --checkpoint-sync-url http://127.0.0.1:$CHECKPOINT_SOURCE_PORT/lean/v0/states/finalized +``` + +### Verification After Each Node + +Wait ~20 seconds, then verify: + +```bash +# Check the restarted node receives blocks via gossip (not just req-resp) +docker logs --tail 20 ethlambda_0 2>&1 | grep "Received block from gossip" + +# Check finalization matches other nodes +for n in 0 1 2 3; do + printf "ethlambda_$n: " + docker logs --tail 15 ethlambda_$n 2>&1 | grep "Finalized:" | tail -1 +done +``` + +**Only proceed to the next node after confirming:** +- The restarted node shows "Received block from gossip" (not just BlocksByRoot) +- No "NoPeersSubscribedToTopic" warnings in recent logs +- Finalized slot matches other nodes + +## Monitoring Stack + +If Prometheus and Grafana were previously started via `spin-node.sh --metrics`, restart them separately since they're managed by docker-compose: + +```bash +cd lean-quickstart/metrics && docker compose -f docker-compose-metrics.yaml up -d +``` + +## Troubleshooting + +### Restarted node shows "NoPeersSubscribedToTopic" persistently + +The 60-second wait was not long enough, or was skipped. Stop the node, wait 60s, and start again. + +### Finalization stalls after restarting the aggregator + +Expected behavior. Finalization resumes once the aggregator catches up to head and starts aggregating attestations again. This typically takes 10-20 seconds after the node starts. 
+ +### Chain doesn't progress after restarting all nodes + +If all nodes were restarted from genesis (no checkpoint sync) with a stale `GENESIS_TIME`, the slot gap from genesis to current time may not satisfy 3SF-mini justifiability rules. Regenerate genesis with a fresh timestamp. + +### "genesis time mismatch" or "validator count mismatch" + +The checkpoint source is running a different genesis than the restarting node. Ensure both use the same genesis config directory (`-v $GENESIS:/config`). + +### "HTTP request failed" or connection refused + +The checkpoint source node is down or unreachable. Verify with `curl`: +```bash +curl -s http://127.0.0.1:<api-port>/lean/v0/health +# Should return: {"status":"healthy","service":"lean-spec-api"} +``` + +### Container name conflict on start + +The old container wasn't fully removed. Use `docker rm -f <container-name>` before `docker run`. + +### "Fallback pruning (finalization stalled)" after catch-up + +Normal during catch-up. The node accumulated blocks faster than finalization can advance. Resolves once fully caught up. diff --git a/.claude/skills/devnet-runner/references/validator-config.md b/.claude/skills/devnet-runner/references/validator-config.md new file mode 100644 index 00000000..1950e68d --- /dev/null +++ b/.claude/skills/devnet-runner/references/validator-config.md @@ -0,0 +1,110 @@ +# Validator Config Reference + +Full schema and configuration guide for `lean-quickstart/local-devnet/genesis/validator-config.yaml`. + +## Full Schema + +```yaml +shuffle: roundrobin # Proposer selection algorithm (roundrobin = deterministic turns) +deployment_mode: local # 'local' (localhost) or 'ansible' (remote servers) + +config: + activeEpoch: 18 # Log2 of active signing epochs for hash-sig keys (2^18) + keyType: "hash-sig" # Post-quantum signature scheme + +validators: + - name: "zeam_0" # Node identifier: <client>_<index> + privkey: "bdf953adc..." 
# 64-char hex P2P private key (libp2p identity) + enrFields: + ip: "127.0.0.1" # Node IP (127.0.0.1 for local, real IP for ansible) + quic: 9001 # QUIC/UDP port for P2P communication + metricsPort: 8081 # HTTP port exposed by the node (see note below) + count: 1 # Number of validator indices assigned to this node +``` + +## Field Reference + +| Field | Required | Description | +|-------|----------|-------------| +| `shuffle` | Yes | Proposer selection algorithm. Use `roundrobin` for deterministic turn-based proposing | +| `deployment_mode` | Yes | `local` or `ansible` - determines genesis time offset and config directory | +| `config.activeEpoch` | Yes | Exponent for hash-sig active epochs (e.g., 18 means 2^18 signatures per period) | +| `config.keyType` | Yes | Always `hash-sig` for post-quantum support | +| `name` | Yes | Format: `<client>_<index>`. Client name determines which `client-cmds/*.sh` script runs | +| `privkey` | Yes | 32-byte hex string (64 chars). Used for P2P identity and ENR generation | +| `enrFields.ip` | Yes | IP address. Use `127.0.0.1` for local, real IPs for ansible | +| `enrFields.quic` | Yes | QUIC port. Must be unique per node in local mode | +| `metricsPort` | Yes | HTTP port exposed by the node. Must be unique per node in local mode. For ethlambda, this maps to `--metrics-port`; the API server uses a separate `--api-port` (default 5052) | +| `count` | Yes | Number of validator indices. Sum of all counts = total validators | + +## Adding a New Validator Node + +1. **Choose a unique node name** following the `<client>_<index>` convention: + ``` + zeam_0, zeam_1, ream_0, qlean_0, lantern_0, lighthouse_0, grandine_0, ethlambda_0 + ``` + +2. **Generate a P2P private key** (64-char hex): + ```bash + openssl rand -hex 32 + ``` + +3. **Assign unique ports** (for local mode): + - QUIC: 9001, 9002, 9003... (increment for each node) + - Metrics/API: 8081, 8082, 8083... 
(increment for each node) + - For ethlambda, see the `metricsPort` field note in the table above regarding dual API/metrics ports. + +4. **Add the entry to `validator-config.yaml`:** + ```yaml + validators: + # ... existing nodes ... + + - name: "newclient_0" + privkey: "<64-char-hex-key-from-step-2>" + enrFields: + ip: "127.0.0.1" # Use real IP for ansible + quic: 9008 # Next available port + metricsPort: 8088 # Next available port + count: 1 + ``` + +5. **Regenerate genesis with new keys:** + ```bash + cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all --generateGenesis --forceKeyGen + ``` + +## Removing a Validator Node + +1. **Delete the node entry** from `validator-config.yaml` + +2. **Regenerate genesis** (required because genesis state must reflect new validator set): + ```bash + cd lean-quickstart && NETWORK_DIR=local-devnet ./spin-node.sh --node all --generateGenesis + ``` + Note: `--forceKeyGen` is NOT needed when removing. Existing keys for remaining indices are reused. + +## Port Allocation Guide (Local Mode) + +When running multiple nodes locally, each needs unique ports: + +| Node | QUIC Port | Metrics Port | +|------|-----------|--------------| +| zeam_0 | 9001 | 8081 | +| ream_0 | 9002 | 8082 | +| qlean_0 | 9003 | 8083 | +| lantern_0 | 9004 | 8084 | +| lighthouse_0 | 9005 | 8085 | +| grandine_0 | 9006 | 8086 | +| ethlambda_0 | 9007 | 8087 | + +For **ansible mode**, all nodes can use the same ports (9001, 8081) since they run on different machines. 
+ +## Local vs Ansible Deployment + +| Aspect | Local | Ansible | +|--------|-------|---------| +| Config file | `lean-quickstart/local-devnet/genesis/validator-config.yaml` | `lean-quickstart/ansible-devnet/genesis/validator-config.yaml` | +| `deployment_mode` | `local` | `ansible` | +| IP addresses | `127.0.0.1` for all | Real server IPs | +| Ports | Must be unique per node | Same port, different machines | +| Genesis offset | +30 seconds | +360 seconds | diff --git a/.claude/skills/devnet-runner/scripts/run-devnet-with-timeout.sh b/.claude/skills/devnet-runner/scripts/run-devnet-with-timeout.sh index b30e8ac5..c4fb3fbe 100755 --- a/.claude/skills/devnet-runner/scripts/run-devnet-with-timeout.sh +++ b/.claude/skills/devnet-runner/scripts/run-devnet-with-timeout.sh @@ -32,3 +32,9 @@ done kill $PID 2>/dev/null wait $PID 2>/dev/null + +# Explicitly stop and remove containers (spin-node.sh may not clean up on kill) +echo "Stopping containers..." +for node in $(docker ps --format '{{.Names}}' | grep -E '^(zeam|ream|qlean|lantern|lighthouse|grandine|ethlambda)_'); do + docker rm -f "$node" 2>/dev/null +done From 556b200460a071aa33a197e810fe555bee2f5af9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Gr=C3=BCner?= <47506558+MegaRedHand@users.noreply.github.com> Date: Fri, 13 Mar 2026 19:14:00 -0300 Subject: [PATCH 2/2] docs: fix wrong flags in skill --- .../references/long-lived-devnet.md | 36 ++++++++++++++++--- .../references/validator-config.md | 24 +++++++------ .../scripts/run-devnet-with-timeout.sh | 2 +- 3 files changed, 45 insertions(+), 17 deletions(-) diff --git a/.claude/skills/devnet-runner/references/long-lived-devnet.md b/.claude/skills/devnet-runner/references/long-lived-devnet.md index 54ce42f4..7bcd3bdb 100644 --- a/.claude/skills/devnet-runner/references/long-lived-devnet.md +++ b/.claude/skills/devnet-runner/references/long-lived-devnet.md @@ -48,15 +48,40 @@ for d in ethlambda_0 ethlambda_1 ethlambda_2 ethlambda_3; do done # Start each node 
(adjust ports, node-id, aggregator flag per validator-config.yaml) +# ethlambda runs two HTTP servers: --api-port (API) and --metrics-port (metrics) +# Both must be unique per node to avoid bind conflicts with --network host docker run -d --restart unless-stopped --name ethlambda_0 --network host \ -v $GENESIS:/config -v $DATA/ethlambda_0:/data \ $IMAGE \ --custom-network-config-dir /config \ --gossipsub-port 9001 --node-id ethlambda_0 \ --node-key /config/ethlambda_0.key \ - --metrics-address 0.0.0.0 --metrics-port 8081 + --http-address 0.0.0.0 --api-port 5052 --metrics-port 8081 -# Repeat for other nodes, adding --is-aggregator to the aggregator node +docker run -d --restart unless-stopped --name ethlambda_1 --network host \ + -v $GENESIS:/config -v $DATA/ethlambda_1:/data \ + $IMAGE \ + --custom-network-config-dir /config \ + --gossipsub-port 9002 --node-id ethlambda_1 \ + --node-key /config/ethlambda_1.key \ + --http-address 0.0.0.0 --api-port 5053 --metrics-port 8082 + +docker run -d --restart unless-stopped --name ethlambda_2 --network host \ + -v $GENESIS:/config -v $DATA/ethlambda_2:/data \ + $IMAGE \ + --custom-network-config-dir /config \ + --gossipsub-port 9003 --node-id ethlambda_2 \ + --node-key /config/ethlambda_2.key \ + --http-address 0.0.0.0 --api-port 5054 --metrics-port 8083 + +docker run -d --restart unless-stopped --name ethlambda_3 --network host \ + -v $GENESIS:/config -v $DATA/ethlambda_3:/data \ + $IMAGE \ + --custom-network-config-dir /config \ + --gossipsub-port 9004 --node-id ethlambda_3 \ + --node-key /config/ethlambda_3.key \ + --is-aggregator \ + --http-address 0.0.0.0 --api-port 5055 --metrics-port 8084 ``` Do NOT include `--checkpoint-sync-url` in the initial start. Nodes start from genesis. @@ -101,8 +126,9 @@ docker pull $NEW_IMAGE # 2. 
Pick a healthy peer's API port as checkpoint source # (any running node that is NOT the one being restarted) -# ethlambda serves /lean/v0/states/finalized on --api-port (default 5052) -CHECKPOINT_SOURCE_PORT=5052 # e.g., ethlambda_3's API port +# ethlambda serves /lean/v0/states/finalized on --api-port +# Port assignments: ethlambda_0=5052, _1=5053, _2=5054, _3=5055 +CHECKPOINT_SOURCE_PORT=5055 # ethlambda_3's API port (not the node being restarted) # 3. Stop and remove the container docker rm -f ethlambda_0 @@ -118,7 +144,7 @@ docker run -d --restart unless-stopped --name ethlambda_0 --network host \ --custom-network-config-dir /config \ --gossipsub-port 9001 --node-id ethlambda_0 \ --node-key /config/ethlambda_0.key \ - --metrics-address 0.0.0.0 --metrics-port 8081 \ + --http-address 0.0.0.0 --api-port 5052 --metrics-port 8081 \ --checkpoint-sync-url http://127.0.0.1:$CHECKPOINT_SOURCE_PORT/lean/v0/states/finalized ``` diff --git a/.claude/skills/devnet-runner/references/validator-config.md b/.claude/skills/devnet-runner/references/validator-config.md index 1950e68d..9b95cc5b 100644 --- a/.claude/skills/devnet-runner/references/validator-config.md +++ b/.claude/skills/devnet-runner/references/validator-config.md @@ -87,17 +87,19 @@ validators: When running multiple nodes locally, each needs unique ports: -| Node | QUIC Port | Metrics Port | -|------|-----------|--------------| -| zeam_0 | 9001 | 8081 | -| ream_0 | 9002 | 8082 | -| qlean_0 | 9003 | 8083 | -| lantern_0 | 9004 | 8084 | -| lighthouse_0 | 9005 | 8085 | -| grandine_0 | 9006 | 8086 | -| ethlambda_0 | 9007 | 8087 | - -For **ansible mode**, all nodes can use the same ports (9001, 8081) since they run on different machines. 
+| Node | QUIC Port | Metrics Port | API Port (ethlambda only) | +|------|-----------|--------------|--------------------------| +| zeam_0 | 9001 | 8081 | n/a | +| ream_0 | 9002 | 8082 | n/a | +| qlean_0 | 9003 | 8083 | n/a | +| lantern_0 | 9004 | 8084 | n/a | +| lighthouse_0 | 9005 | 8085 | n/a | +| grandine_0 | 9006 | 8086 | n/a | +| ethlambda_0 | 9007 | 8087 | 5052 | + +When running **multiple ethlambda nodes** locally, each needs a unique `--api-port` (e.g., 5052, 5053, 5054, 5055) since `validator-config.yaml` has no `apiPort` field. Pass `--api-port` directly in `ethlambda-cmd.sh` or `docker run`. + +For **ansible mode**, all nodes can use the same ports (9001, 8081, 5052) since they run on different machines. ## Local vs Ansible Deployment diff --git a/.claude/skills/devnet-runner/scripts/run-devnet-with-timeout.sh b/.claude/skills/devnet-runner/scripts/run-devnet-with-timeout.sh index c4fb3fbe..b52e3473 100755 --- a/.claude/skills/devnet-runner/scripts/run-devnet-with-timeout.sh +++ b/.claude/skills/devnet-runner/scripts/run-devnet-with-timeout.sh @@ -34,7 +34,7 @@ kill $PID 2>/dev/null wait $PID 2>/dev/null # Explicitly stop and remove containers (spin-node.sh may not clean up on kill) -echo "Stopping containers..." +echo "Stopping and removing containers..." for node in $(docker ps --format '{{.Names}}' | grep -E '^(zeam|ream|qlean|lantern|lighthouse|grandine|ethlambda)_'); do docker rm -f "$node" 2>/dev/null done