From 71269ec1c0abb89d38c2d07d6c8216e90a633d31 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 16:17:30 +0000 Subject: [PATCH 1/2] feat(skills): scaffold cluster-bootstrap skill for LAN/cloud k3s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wraps k3sup over SSH for single-host install, server install, and agent join. Persists topology to $OBOL_CONFIG_DIR/cluster-bootstrap/ and the k3s admin kubeconfig to the standard location so `obol stack up` can take over once multi-node infra concerns are resolved. Multi-node storage placement and cloudflared HA are intentionally deferred — the design tradeoffs are captured in references/multi-node-design.md and the skill exposes flag stubs (--storage-primary, --edge-node) so the CLI surface won't churn once we pick paths. --- .../embed/skills/cluster-bootstrap/SKILL.md | 147 ++++++++++++ .../references/multi-node-design.md | 96 ++++++++ .../cluster-bootstrap/scripts/bootstrap.py | 226 ++++++++++++++++++ 3 files changed, 469 insertions(+) create mode 100644 internal/embed/skills/cluster-bootstrap/SKILL.md create mode 100644 internal/embed/skills/cluster-bootstrap/references/multi-node-design.md create mode 100755 internal/embed/skills/cluster-bootstrap/scripts/bootstrap.py diff --git a/internal/embed/skills/cluster-bootstrap/SKILL.md b/internal/embed/skills/cluster-bootstrap/SKILL.md new file mode 100644 index 00000000..d39a3c21 --- /dev/null +++ b/internal/embed/skills/cluster-bootstrap/SKILL.md @@ -0,0 +1,147 @@ +--- +name: cluster-bootstrap +description: "Bootstrap and join obol-stack across multiple hosts on LAN or cloud. Wraps k3sup over SSH to install k3s on a server node and join agent nodes, then prepares the cluster for `obol stack up`. Single-host, LAN multi-node, and cloud multi-node topologies." +metadata: { "openclaw": { "emoji": "ðŸŠī", "requires": { "bins": ["k3sup", "ssh", "python3"] } } } +--- + +# Cluster Bootstrap + +Bootstrap a k3s cluster across one or more hosts (LAN or cloud) and prepare it +to run obol-stack. Wraps [k3sup](https://github.com/alexellis/k3sup) over SSH. + +This skill is a **scaffold**. The single-host and LAN-join flows are wired up; +multi-node storage and tunnel behavior are still being designed +(see `references/multi-node-design.md`). Don't run this against a production +cluster yet. + +## When to Use + +- Standing up obol-stack on a single Linux host (no Docker / k3d) +- Joining a second/third host on the LAN as an agent node +- Bootstrapping on cloud VMs reachable via SSH + +## When NOT to Use + +- Local Mac dev — keep using `obol stack up` (k3d + Docker) +- Existing managed k8s (EKS/GKE) — point `KUBECONFIG` at it directly +- Single-host where `obolup.sh` already works + +## Topologies + +### single-host + +One Linux box. k3sup installs k3s, writes kubeconfig locally, done. Equivalent +to `curl -sfL https://get.k3s.io | sh` plus kubeconfig export. + +### lan-multi + +One server + N agent nodes on the same L2 network. Server is reachable by IP +from all agents. + +### cloud-multi + +One server + N agents across cloud VMs. Same shape as lan-multi but server +must have a routable IP and SG/firewall must allow 6443/tcp from agents. 
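
The port requirements are identical on LAN and cloud; as a concrete reference,
a minimal sketch with ufw, where 192.168.1.0/24 is a placeholder for the subnet
your agents sit on (cloud security groups need the equivalent two rules):

```bash
# Server node: k3s API, inbound from agents.
sudo ufw allow from 192.168.1.0/24 to any port 6443 proto tcp
# Every node: flannel VXLAN between nodes.
sudo ufw allow from 192.168.1.0/24 to any port 8472 proto udp
```

Beyond that, k3sup only needs SSH (22/tcp) to each node from wherever
`bootstrap.py` runs.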
+ +## Quick Start + +```bash +# Single host (current dev box) +python3 scripts/bootstrap.py single --host 192.168.1.50 --user obol \ + --ssh-key ~/.ssh/id_ed25519 + +# LAN: server + 2 agents +python3 scripts/bootstrap.py server --host 192.168.1.50 --user obol \ + --ssh-key ~/.ssh/id_ed25519 +python3 scripts/bootstrap.py join --server-host 192.168.1.50 \ + --host 192.168.1.51 --user obol --ssh-key ~/.ssh/id_ed25519 +python3 scripts/bootstrap.py join --server-host 192.168.1.50 \ + --host 192.168.1.52 --user obol --ssh-key ~/.ssh/id_ed25519 + +# After bootstrap, point obol at the kubeconfig and run stack up: +export KUBECONFIG=$(python3 scripts/bootstrap.py kubeconfig-path) +obol stack up +``` + +## Subcommands + +``` +single --host --user --ssh-key [--k3s-channel stable] + Install k3s on one host. Equivalent to running `server` with no + expected agents. + +server --host --user --ssh-key [--k3s-channel stable] [--cluster-cidr] + Install k3s server on the target host. Writes kubeconfig to + $OBOL_CONFIG_DIR/kubeconfig.yaml with API rewritten to --host. + +join --server-host --host --user --ssh-key + Install k3s agent on --host and join it to --server-host. Server + node-token is fetched over SSH from the server host. + +kubeconfig-path + Print the absolute path of the kubeconfig this skill writes to. + +label --host --label key=value [--label key=value ...] + Apply node labels (used by storage/tunnel placement — see Design + Notes below). + +status List nodes, their roles, and the labels relevant to obol-stack. +``` + +## Design Notes (in progress) + +These two areas are NOT yet implemented by this skill. They block real +multi-node usage. Captured in `references/multi-node-design.md`; high-level +proposals here so the skill's flags can be designed against them. + +### Storage placement + +Default `local-path` provisioner pins PVCs to one node. Three candidate paths: + +1. **Storage-primary node label (proposed default).** Designate one node with + `obol.org/storage=primary`. Stateful workloads add nodeAffinity to that + label. State is single-node; failure is recoverable from PVC backup. +2. **Longhorn / OpenEBS replicated block storage.** Real PVC migration. Costs + â‰Ĩ3 nodes and ~500MiB RAM/node baseline. +3. **NFS export + dual StorageClass.** Add `obol-shared` on top of an NFS + export from the bootstrap host; keep `local-path` for ephemeral work. + +This skill exposes `--storage-primary ` so the choice can be deferred +without changing the CLI surface. + +### Cloudflared placement + +Cloudflared has native HA (each replica = a tunnel connection, +Cloudflare-side load-balanced). Three candidate paths: + +1. **Multi-replica + PodAntiAffinity (proposed default for `replicas >= 2`).** + Free HA; only viable in `remote` / `local` managed mode (quickTunnel mints + per-replica URLs). +2. **Pin to a single edge-labeled node.** Right when one node has the only + good uplink. `obol.org/edge=true` node selector, `replicas: 1`. +3. **DaemonSet on edge-labeled nodes.** Don't. + +This skill exposes `--edge-node ` to pin (option 2). Default behavior is +to leave the cloudflared chart at replicas=1 until the chart learns to scale +based on `obol.org/topology=multi`. 
+ +## Files Written by the Skill + +| Path | Purpose | +|------|---------| +| `$OBOL_CONFIG_DIR/kubeconfig.yaml` | k3s admin kubeconfig (API rewritten to server host IP) | +| `$OBOL_CONFIG_DIR/cluster-bootstrap/topology.json` | Inventory of bootstrapped nodes (host, role, labels) | +| `$OBOL_CONFIG_DIR/cluster-bootstrap/server-token` | k3s node token (mode 0600) — used to join agents | + +## Caveats + +- **Not for k3d/local Mac.** Use `obol stack up` for that — k3d-on-Docker is + still the canonical local dev path. +- **Firewalls.** Server: 6443/tcp inbound from agents. All nodes: 8472/udp + (flannel VXLAN) between each other. Cloud: configure SGs accordingly. +- **`OBOL_DEVELOPMENT=true` registry caches** are k3d-only today — they don't + run on the k3sup-bootstrapped k3s cluster yet. +- **`obol stack up` on a real k3s cluster** has not been validated end to end + on this branch; the `obol stack` lifecycle today expects the k3d cluster + name written by `obol stack init`. Treat the post-bootstrap `obol stack up` + as the next milestone, not a finished path. diff --git a/internal/embed/skills/cluster-bootstrap/references/multi-node-design.md b/internal/embed/skills/cluster-bootstrap/references/multi-node-design.md new file mode 100644 index 00000000..71f8a1dd --- /dev/null +++ b/internal/embed/skills/cluster-bootstrap/references/multi-node-design.md @@ -0,0 +1,96 @@ +# Multi-node design notes + +Open questions blocking real multi-node usage of obol-stack. The +`cluster-bootstrap` skill ships flag stubs (`--storage-primary`, `--edge-node`) +so the CLI surface doesn't have to change once these are decided. + +## Storage + +Today: `internal/embed/infrastructure/base/templates/local-path.yaml` installs +the rancher local-path provisioner with `volumeBindingMode: WaitForFirstConsumer` +and `pathPattern: "{{ .PVC.Namespace }}/{{ .PVC.Name }}"` under +`{{ .Values.dataDir }}`. PVCs are pinned to whichever node first schedules a +pod that consumes them. If the pod reschedules to a different node, it cannot +re-mount the PVC. + +### Option A — single storage-primary node (proposed default) + +- Apply `obol.org/storage=primary` label to one node (the bootstrap node). +- Add `nodeAffinity` to every Deployment that owns a PVC: LiteLLM, Hermes, + default obol-agent, OpenClaw instances, eRPC if it persists state, monitoring + PVCs. +- Storage failure mode is identical to today's single-host k3d: lose the + storage node, restore from PVC backup. + +Surface: +- `--storage-primary ` on `bootstrap.py server` records the label intent. +- Helm values gain `storage.primaryLabel` so charts can opt-in. + +### Option B — Longhorn or OpenEBS Mayastor + +- Replace `local-path` as the default StorageClass. +- Need â‰Ĩ3 nodes for replication. Each node runs a per-node agent + (~500MiB RAM, plus disk overhead per replica). +- New failure modes: stuck volumes during node loss, replica rebalancing IO + pressure. + +### Option C — single NFS export + dual StorageClass + +- Bootstrap host exports `$OBOL_DATA_DIR` over NFS. +- Install `nfs-subdir-external-provisioner` with StorageClass `obol-shared`. +- Keep `local-path` as the default for ephemeral data. +- Stateful charts opt into `obol-shared` only when migration matters. +- SPOF on NFS host; fsync and lease semantics differ from local disk and may + break SQLite-style state (LiteLLM logs DB, anything using BoltDB). + +### Decision criteria + +- Cluster size 2: Option A is the only sane choice. +- Cluster size 3+, mostly stateless workloads: Option A still wins. 
+- Cluster size 3+, real HA requirement: Option B. +- Mixed where one host has bulk storage: Option C, but audit every workload's + fsync expectations first. + +## Cloudflared + +Today: `internal/embed/infrastructure/cloudflared/templates/deployment.yaml` +runs `replicas: 1` (or 0 when no token/credentials). Modes: `quickTunnel`, +`remoteManaged` (token), `localManaged` (credentials + config). + +### Option A — multi-replica HA (proposed default for `replicas >= 2`) + +- Cloudflared natively supports multiple replicas — each registers as a tunnel + connection and Cloudflare load-balances. +- Set `replicas: 2`, `topologySpreadConstraints` (or hard PodAntiAffinity) by + hostname so replicas land on different nodes. +- Only valid in `remote` and `local` managed modes. `quickTunnel` mints a fresh + trycloudflare URL per replica, so quick mode caps at `replicas: 1`. + +### Option B — pin to single edge node + +- Apply `obol.org/edge=true` label to whichever node has the best uplink. +- `nodeSelector: { obol.org/edge: "true" }`, `replicas: 1`. +- Right call when LAN topology is asymmetric (one wired box, others on Wi-Fi). +- Not HA; tunnel dies with the edge node. + +### Option C — DaemonSet on edge-labeled nodes + +Past ~4 connections Cloudflare gains nothing, and managing tunnel limits +becomes painful. Don't recommend. + +### Decision criteria + +- Symmetric uplinks, â‰Ĩ2 nodes: Option A. Free HA, no extra config required. +- Asymmetric uplinks (one node = the gateway): Option B with `--edge-node`. +- quickTunnel: always `replicas: 1`. + +## Other multi-node concerns (out of scope for this skill, tracked here) + +- **Dev registry cache**: today configured per-cluster in `registries.yaml`, + scoped to a single localhost cache on the dev box. Multi-node needs each + agent to either reach the cache over LAN or have its own cache. +- **Host Ollama auto-detection**: `autoConfigureLLM` detects models on the + host where `obol stack up` ran. In multi-node we need to either disable + this (require `obol model setup custom`) or aggregate across nodes. +- **Traefik / Gateway**: single Service IP works fine multi-node out of the + box; nothing to do unless we want active-active ingress per region. diff --git a/internal/embed/skills/cluster-bootstrap/scripts/bootstrap.py b/internal/embed/skills/cluster-bootstrap/scripts/bootstrap.py new file mode 100755 index 00000000..18e7e73f --- /dev/null +++ b/internal/embed/skills/cluster-bootstrap/scripts/bootstrap.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +"""Cluster bootstrap helper for obol-stack. + +Thin wrapper around k3sup that codifies the topology and writes inventory +to $OBOL_CONFIG_DIR/cluster-bootstrap/. See ../SKILL.md for usage. + +This is a scaffold: the SSH-driven k3sup invocations are wired up, but +multi-node storage placement and cloudflared HA labeling are deferred until +the design notes in ../references/multi-node-design.md are finalized. 
+""" + +from __future__ import annotations + +import argparse +import json +import os +import shutil +import subprocess +import sys +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import Optional + + +def config_dir() -> Path: + if env := os.environ.get("OBOL_CONFIG_DIR"): + return Path(env) + if xdg := os.environ.get("XDG_CONFIG_HOME"): + return Path(xdg) / "obol" + return Path.home() / ".config" / "obol" + + +def state_dir() -> Path: + p = config_dir() / "cluster-bootstrap" + p.mkdir(parents=True, exist_ok=True) + return p + + +def kubeconfig_path() -> Path: + return config_dir() / "kubeconfig.yaml" + + +def topology_path() -> Path: + return state_dir() / "topology.json" + + +def server_token_path() -> Path: + return state_dir() / "server-token" + + +@dataclass +class Node: + host: str + role: str # "server" | "agent" + user: str + labels: dict + + +def load_topology() -> dict: + p = topology_path() + if not p.exists(): + return {"nodes": []} + return json.loads(p.read_text()) + + +def save_topology(topo: dict) -> None: + topology_path().write_text(json.dumps(topo, indent=2, sort_keys=True)) + + +def upsert_node(node: Node) -> None: + topo = load_topology() + nodes = [n for n in topo["nodes"] if n["host"] != node.host] + nodes.append(asdict(node)) + topo["nodes"] = nodes + save_topology(topo) + + +def require_k3sup() -> None: + if shutil.which("k3sup") is None: + sys.exit( + "k3sup not found in PATH. Install from https://github.com/alexellis/k3sup" + ) + + +def run(cmd: list[str]) -> subprocess.CompletedProcess: + print(f"+ {' '.join(cmd)}", file=sys.stderr) + return subprocess.run(cmd, check=True) + + +def cmd_install_server(args: argparse.Namespace) -> int: + require_k3sup() + kc = kubeconfig_path() + kc.parent.mkdir(parents=True, exist_ok=True) + + k3sup_cmd = [ + "k3sup", "install", + "--ip", args.host, + "--user", args.user, + "--ssh-key", args.ssh_key, + "--local-path", str(kc), + "--context", "obol", + "--k3s-channel", args.k3s_channel, + ] + if args.cluster_cidr: + k3sup_cmd += ["--cluster-cidr", args.cluster_cidr] + run(k3sup_cmd) + + # Pull the node token off the server so agents can join later. 
+ token = subprocess.check_output([ + "ssh", "-i", args.ssh_key, + "-o", "StrictHostKeyChecking=accept-new", + f"{args.user}@{args.host}", + "sudo cat /var/lib/rancher/k3s/server/node-token", + ]).decode().strip() + p = server_token_path() + p.write_text(token) + p.chmod(0o600) + + upsert_node(Node(host=args.host, role="server", user=args.user, labels={})) + print(f"server installed; kubeconfig at {kc}") + return 0 + + +def cmd_join(args: argparse.Namespace) -> int: + require_k3sup() + if not server_token_path().exists(): + sys.exit("no server token on disk; run `bootstrap.py server` first") + + run([ + "k3sup", "join", + "--ip", args.host, + "--user", args.user, + "--ssh-key", args.ssh_key, + "--server-ip", args.server_host, + "--server-user", args.user, + ]) + upsert_node(Node(host=args.host, role="agent", user=args.user, labels={})) + print(f"agent {args.host} joined to server {args.server_host}") + return 0 + + +def cmd_kubeconfig_path(_: argparse.Namespace) -> int: + print(kubeconfig_path()) + return 0 + + +def cmd_status(_: argparse.Namespace) -> int: + topo = load_topology() + if not topo["nodes"]: + print("no nodes recorded") + return 0 + for n in topo["nodes"]: + labels = ",".join(f"{k}={v}" for k, v in n["labels"].items()) or "-" + print(f"{n['host']:20} {n['role']:6} {n['user']:12} {labels}") + return 0 + + +def cmd_label(args: argparse.Namespace) -> int: + pairs = {} + for raw in args.label: + if "=" not in raw: + sys.exit(f"label must be key=value (got {raw!r})") + k, v = raw.split("=", 1) + pairs[k] = v + + # Persist intent locally; actual `kubectl label node` happens via the + # caller because we don't want to assume a kubeconfig is loaded yet. + topo = load_topology() + found = False + for n in topo["nodes"]: + if n["host"] == args.host: + n["labels"].update(pairs) + found = True + if not found: + sys.exit(f"host {args.host!r} not in topology") + save_topology(topo) + print(f"recorded labels for {args.host}; apply with:") + for k, v in pairs.items(): + print(f" kubectl label node $(kubectl get node -o name | grep {args.host}) {k}={v} --overwrite") + return 0 + + +def main(argv: Optional[list[str]] = None) -> int: + p = argparse.ArgumentParser(prog="bootstrap.py") + sub = p.add_subparsers(dest="cmd", required=True) + + def add_ssh(parser: argparse.ArgumentParser) -> None: + parser.add_argument("--host", required=True) + parser.add_argument("--user", required=True) + parser.add_argument("--ssh-key", required=True) + + s = sub.add_parser("single", help="install k3s on one host") + add_ssh(s) + s.add_argument("--k3s-channel", default="stable") + s.add_argument("--cluster-cidr", default=None) + s.set_defaults(func=cmd_install_server) + + s = sub.add_parser("server", help="install k3s server") + add_ssh(s) + s.add_argument("--k3s-channel", default="stable") + s.add_argument("--cluster-cidr", default=None) + s.set_defaults(func=cmd_install_server) + + s = sub.add_parser("join", help="join an agent node to the server") + add_ssh(s) + s.add_argument("--server-host", required=True) + s.set_defaults(func=cmd_join) + + s = sub.add_parser("kubeconfig-path", help="print kubeconfig path") + s.set_defaults(func=cmd_kubeconfig_path) + + s = sub.add_parser("status", help="list known nodes") + s.set_defaults(func=cmd_status) + + s = sub.add_parser("label", help="record node labels in topology") + s.add_argument("--host", required=True) + s.add_argument("--label", action="append", default=[], + help="key=value (repeatable)") + s.set_defaults(func=cmd_label) + + args = p.parse_args(argv) + 
return args.func(args) + + +if __name__ == "__main__": + raise SystemExit(main()) From 9dec3ef7203133ba2894031e626e448a590f2a28 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 00:35:19 +0000 Subject: [PATCH 2/2] feat(skills): lock multi-node decisions for cluster-bootstrap Storage: single primary node (Option A). One node carries obol.org/storage=primary; stateful Deployments will gain nodeAffinity to that label. Bootstrap records the label on the server by default; --no-storage-primary opts out. Cloudflared: pools list (Shape 2). The chart will render one Deployment per cloudflared.pools entry, each with its own replicas, nodeSelector, and credentials, plus hostname PodAntiAffinity to keep one replica per node within a pool. Bootstrap records obol.org/cloudflared-pool on each node so the chart-rewrite ticket can read topology.json. Rejected paths and rationale captured in references/multi-node-design.md. The actual chart-side changes (local-path nodeAffinity, cloudflared range over pools) remain a separate ticket. --- .../embed/skills/cluster-bootstrap/SKILL.md | 74 +++--- .../references/multi-node-design.md | 211 +++++++++++------- .../cluster-bootstrap/scripts/bootstrap.py | 42 +++- 3 files changed, 205 insertions(+), 122 deletions(-) diff --git a/internal/embed/skills/cluster-bootstrap/SKILL.md b/internal/embed/skills/cluster-bootstrap/SKILL.md index d39a3c21..500bdbc5 100644 --- a/internal/embed/skills/cluster-bootstrap/SKILL.md +++ b/internal/embed/skills/cluster-bootstrap/SKILL.md @@ -9,10 +9,11 @@ metadata: { "openclaw": { "emoji": "ðŸŠī", "requires": { "bins": ["k3sup", "ssh" Bootstrap a k3s cluster across one or more hosts (LAN or cloud) and prepare it to run obol-stack. Wraps [k3sup](https://github.com/alexellis/k3sup) over SSH. -This skill is a **scaffold**. The single-host and LAN-join flows are wired up; -multi-node storage and tunnel behavior are still being designed -(see `references/multi-node-design.md`). Don't run this against a production -cluster yet. +This skill is a **scaffold**. The single-host and LAN-join flows are wired up +and the multi-node design is decided (storage-primary node label, cloudflared +pools — see `references/multi-node-design.md`), but the chart changes that +consume the bootstrap output are tracked in a follow-up ticket. Don't run this +against a production cluster yet. ## When to Use @@ -67,63 +68,56 @@ obol stack up ``` single --host --user --ssh-key [--k3s-channel stable] - Install k3s on one host. Equivalent to running `server` with no - expected agents. + [--storage-primary] [--cloudflared-pool ] + Install k3s on one host. Equivalent to `server` with no agents. server --host --user --ssh-key [--k3s-channel stable] [--cluster-cidr] + [--storage-primary] [--no-storage-primary] + [--cloudflared-pool ] Install k3s server on the target host. Writes kubeconfig to $OBOL_CONFIG_DIR/kubeconfig.yaml with API rewritten to --host. + Records `obol.org/storage=primary` on the server by default and + `obol.org/cloudflared-pool=` (default `default`) into + topology.json. join --server-host --host --user --ssh-key - Install k3s agent on --host and join it to --server-host. Server - node-token is fetched over SSH from the server host. + [--cloudflared-pool ] + Install k3s agent on --host and join to --server-host. Records + `obol.org/cloudflared-pool=` (default `default`). kubeconfig-path Print the absolute path of the kubeconfig this skill writes to. label --host --label key=value [--label key=value ...] 
- Apply node labels (used by storage/tunnel placement — see Design - Notes below). + Apply ad-hoc node labels (used when storage/tunnel placement + needs more than the bootstrap conveniences cover). status List nodes, their roles, and the labels relevant to obol-stack. ``` -## Design Notes (in progress) +## Design Notes (decided) -These two areas are NOT yet implemented by this skill. They block real -multi-node usage. Captured in `references/multi-node-design.md`; high-level -proposals here so the skill's flags can be designed against them. +Full rationale and rejected alternatives in `references/multi-node-design.md`. -### Storage placement +### Storage — single primary node -Default `local-path` provisioner pins PVCs to one node. Three candidate paths: +One node carries `obol.org/storage=primary` (the bootstrap server by default). +Stateful Deployments — LiteLLM, Hermes, default obol-agent, OpenClaw — add +`nodeAffinity` to that label so PVCs always land on the same node. Lose the +primary, restore from PVC backup. This is `--storage-primary` (default on) +on `bootstrap.py server` / `single`. -1. **Storage-primary node label (proposed default).** Designate one node with - `obol.org/storage=primary`. Stateful workloads add nodeAffinity to that - label. State is single-node; failure is recoverable from PVC backup. -2. **Longhorn / OpenEBS replicated block storage.** Real PVC migration. Costs - â‰Ĩ3 nodes and ~500MiB RAM/node baseline. -3. **NFS export + dual StorageClass.** Add `obol-shared` on top of an NFS - export from the bootstrap host; keep `local-path` for ephemeral work. +### Cloudflared — `pools` list -This skill exposes `--storage-primary ` so the choice can be deferred -without changing the CLI surface. +The cloudflared chart will render one Deployment per entry in +`cloudflared.pools`. Each pool has its own `replicas`, `nodeSelector`, and +Cloudflare credentials, with hostname `PodAntiAffinity` ensuring at most one +replica per node within a pool. Default values ship a single `default` pool +preserving today's behavior; advanced topologies opt in by adding more pools +(e.g. `edge` + `cloud` with separate tunnel tokens). -### Cloudflared placement - -Cloudflared has native HA (each replica = a tunnel connection, -Cloudflare-side load-balanced). Three candidate paths: - -1. **Multi-replica + PodAntiAffinity (proposed default for `replicas >= 2`).** - Free HA; only viable in `remote` / `local` managed mode (quickTunnel mints - per-replica URLs). -2. **Pin to a single edge-labeled node.** Right when one node has the only - good uplink. `obol.org/edge=true` node selector, `replicas: 1`. -3. **DaemonSet on edge-labeled nodes.** Don't. - -This skill exposes `--edge-node ` to pin (option 2). Default behavior is -to leave the cloudflared chart at replicas=1 until the chart learns to scale -based on `obol.org/topology=multi`. +`bootstrap.py server --cloudflared-pool ` and `bootstrap.py join +--cloudflared-pool ` record per-node pool labels into `topology.json`. ## Files Written by the Skill diff --git a/internal/embed/skills/cluster-bootstrap/references/multi-node-design.md b/internal/embed/skills/cluster-bootstrap/references/multi-node-design.md index 71f8a1dd..a5d10523 100644 --- a/internal/embed/skills/cluster-bootstrap/references/multi-node-design.md +++ b/internal/embed/skills/cluster-bootstrap/references/multi-node-design.md @@ -1,88 +1,136 @@ # Multi-node design notes -Open questions blocking real multi-node usage of obol-stack. 
The -`cluster-bootstrap` skill ships flag stubs (`--storage-primary`, `--edge-node`) -so the CLI surface doesn't have to change once these are decided. +Decisions for the multi-node behavior of obol-stack. The `cluster-bootstrap` +skill carries the bootstrap-time flags; the actual chart changes that consume +them live in a separate ticket (see "Implementation status" at the bottom). -## Storage +## Storage — DECIDED: Option A (storage-primary node) Today: `internal/embed/infrastructure/base/templates/local-path.yaml` installs the rancher local-path provisioner with `volumeBindingMode: WaitForFirstConsumer` and `pathPattern: "{{ .PVC.Namespace }}/{{ .PVC.Name }}"` under -`{{ .Values.dataDir }}`. PVCs are pinned to whichever node first schedules a -pod that consumes them. If the pod reschedules to a different node, it cannot -re-mount the PVC. - -### Option A — single storage-primary node (proposed default) - -- Apply `obol.org/storage=primary` label to one node (the bootstrap node). -- Add `nodeAffinity` to every Deployment that owns a PVC: LiteLLM, Hermes, - default obol-agent, OpenClaw instances, eRPC if it persists state, monitoring - PVCs. -- Storage failure mode is identical to today's single-host k3d: lose the - storage node, restore from PVC backup. - -Surface: -- `--storage-primary ` on `bootstrap.py server` records the label intent. -- Helm values gain `storage.primaryLabel` so charts can opt-in. - -### Option B — Longhorn or OpenEBS Mayastor - -- Replace `local-path` as the default StorageClass. -- Need â‰Ĩ3 nodes for replication. Each node runs a per-node agent - (~500MiB RAM, plus disk overhead per replica). -- New failure modes: stuck volumes during node loss, replica rebalancing IO - pressure. - -### Option C — single NFS export + dual StorageClass - -- Bootstrap host exports `$OBOL_DATA_DIR` over NFS. -- Install `nfs-subdir-external-provisioner` with StorageClass `obol-shared`. -- Keep `local-path` as the default for ephemeral data. -- Stateful charts opt into `obol-shared` only when migration matters. -- SPOF on NFS host; fsync and lease semantics differ from local disk and may - break SQLite-style state (LiteLLM logs DB, anything using BoltDB). - -### Decision criteria - -- Cluster size 2: Option A is the only sane choice. -- Cluster size 3+, mostly stateless workloads: Option A still wins. -- Cluster size 3+, real HA requirement: Option B. -- Mixed where one host has bulk storage: Option C, but audit every workload's - fsync expectations first. - -## Cloudflared +`{{ .Values.dataDir }}`. PVCs pin to whichever node first schedules a consumer +pod; reschedule to a different node breaks the mount. + +### Decision: A — single storage-primary node + +- One node carries `obol.org/storage=primary`. By default this is the + bootstrap (server) node. +- Every Deployment that owns a PVC adds a soft `nodeAffinity` preferring the + primary, hard `nodeAffinity` requiring it for true single-writer state + (LiteLLM, Hermes, default obol-agent, OpenClaw instances). +- Failure mode is identical to today's single-host k3d: lose the primary, + restore from PVC backup. Single-node-of-failure for state is acceptable + given our LAN/cloud-small topologies. +- Helm values gain `storage.primaryLabel` (default `obol.org/storage=primary`) + so charts can opt in via a shared values key. + +### Rejected + +- **B — Longhorn / OpenEBS Mayastor.** Real PVC migration but â‰Ĩ3 nodes, + ~500MiB RAM/node baseline, new failure modes (stuck volumes, replica + rebalance IO). 
Reconsider if a deployment actually needs HA state. +- **C — NFS export + dual StorageClass.** SPOF on NFS host; fsync/lease + semantics differ from local disk and would silently break SQLite-style state + (LiteLLM logs DB, BoltDB-backed services). Reconsider if a deployment + centralizes only on bulk read-mostly storage. + +### Bootstrap surface + +- `bootstrap.py server --storage-primary` records `obol.org/storage=primary` + on the server node in topology.json. Apply with `kubectl label node â€Ķ` + printed by `bootstrap.py label`. +- `bootstrap.py server --no-storage-primary` opts out (e.g. when a separate + storage node will be added later). + +## Cloudflared — DECIDED: Shape 2 (pools) Today: `internal/embed/infrastructure/cloudflared/templates/deployment.yaml` -runs `replicas: 1` (or 0 when no token/credentials). Modes: `quickTunnel`, -`remoteManaged` (token), `localManaged` (credentials + config). - -### Option A — multi-replica HA (proposed default for `replicas >= 2`) - -- Cloudflared natively supports multiple replicas — each registers as a tunnel - connection and Cloudflare load-balances. -- Set `replicas: 2`, `topologySpreadConstraints` (or hard PodAntiAffinity) by - hostname so replicas land on different nodes. -- Only valid in `remote` and `local` managed modes. `quickTunnel` mints a fresh - trycloudflare URL per replica, so quick mode caps at `replicas: 1`. - -### Option B — pin to single edge node - -- Apply `obol.org/edge=true` label to whichever node has the best uplink. -- `nodeSelector: { obol.org/edge: "true" }`, `replicas: 1`. -- Right call when LAN topology is asymmetric (one wired box, others on Wi-Fi). -- Not HA; tunnel dies with the edge node. - -### Option C — DaemonSet on edge-labeled nodes - -Past ~4 connections Cloudflare gains nothing, and managing tunnel limits -becomes painful. Don't recommend. - -### Decision criteria - -- Symmetric uplinks, â‰Ĩ2 nodes: Option A. Free HA, no extra config required. -- Asymmetric uplinks (one node = the gateway): Option B with `--edge-node`. -- quickTunnel: always `replicas: 1`. +renders one Deployment with `replicas: 1` (or 0 when no token/credentials). +Modes: `quickTunnel`, `remoteManaged` (token), `localManaged` (credentials + +config). + +### Decision: Shape 2 — `cloudflared.pools` list + +Values gain a `pools` list. Each pool is its own Deployment with hostname +PodAntiAffinity so within a pool there is at most one replica per node, and +each pool gets its own Cloudflare credentials (edge vs cloud usually map to +different zones / accounts). + +```yaml +# Default values.yaml — single pool, backwards compatible with today. 
+pools: + - name: default + replicas: 1 + # nodeSelector omitted -> any schedulable node + mode: auto # auto | local | remote | quick + quickTunnel: + url: "http://traefik.traefik.svc.cluster.local:80" + remoteManaged: + tokenSecretName: cloudflared-tunnel-token + tokenSecretKey: TUNNEL_TOKEN + localManaged: + secretName: cloudflared-local-credentials + configMapName: cloudflared-local-config + tunnelIDKey: tunnel_id +``` + +Per-pool example for an edge+cloud topology: + +```yaml +pools: + - name: edge + replicas: 2 + nodeSelector: + obol.org/cloudflared-pool: edge + mode: remote + remoteManaged: + tokenSecretName: cloudflared-edge-token + tokenSecretKey: TUNNEL_TOKEN + - name: cloud + replicas: 1 + nodeSelector: + obol.org/cloudflared-pool: cloud + mode: local + localManaged: + secretName: cloudflared-cloud-credentials + configMapName: cloudflared-cloud-config + tunnelIDKey: tunnel_id +``` + +Invariants the chart must enforce: +- Per-pool `requiredDuringSchedulingIgnoredDuringExecution` PodAntiAffinity by + `kubernetes.io/hostname` — at most one replica per node within a pool. +- `quickTunnel` mode caps at `replicas: 1` (per-replica trycloudflare URL). +- Resource names get a per-pool suffix: `cloudflared-` for the + Deployment, default suffix omitted only when the single pool is named + `default` and no migration is in flight. +- Validation: each pool must have exactly one of `quickTunnel` (when + `mode=quick`), `remoteManaged` (`mode=remote`), `localManaged` + (`mode=local`), or any of the three when `mode=auto`. + +Footgun documented for users: if `replicas` exceeds the count of nodes +matching `nodeSelector`, the surplus pods stay Pending. The chart NOTES.txt +should print a warning at install time. + +### Rejected + +- **Shape 1 — DaemonSet per labeled pool.** Hard-caps at one tunnel per + labeled node, which means "more tunnels on edge" requires labeling more + nodes. Doesn't compose when one beefy edge box wants two tunnels. +- **Shape 3 — single Deployment, hostname antiaffinity, replicas knob.** No + way to differentiate edge vs cloud tunnels (different Cloudflare + credentials, different zones). Replicas-exceeds-nodes footgun is the same + but with no value to offset it. + +### Bootstrap surface + +- `bootstrap.py server --cloudflared-pool ` records the pool label on + the server. Default is `default`. +- `bootstrap.py join --cloudflared-pool ` records the pool label on + the agent. Repeat with different pool names to build edge/cloud topology. +- The recorded labels are written into `topology.json` so the chart-rewrite + ticket can read them when generating per-pool values. ## Other multi-node concerns (out of scope for this skill, tracked here) @@ -94,3 +142,14 @@ becomes painful. Don't recommend. this (require `obol model setup custom`) or aggregate across nodes. - **Traefik / Gateway**: single Service IP works fine multi-node out of the box; nothing to do unless we want active-active ingress per region. 
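
For the dev-registry-cache item above, the "reach the cache over LAN" variant
would look roughly like the following on each agent; the cache address is a
placeholder and none of this is wired into the skill yet:

```bash
# 192.168.1.50:5000 stands in for a cache the agents can reach over the LAN.
sudo tee /etc/rancher/k3s/registries.yaml >/dev/null <<'EOF'
mirrors:
  docker.io:
    endpoint:
      - "http://192.168.1.50:5000"
EOF
# k3s regenerates its containerd registry config on restart.
sudo systemctl restart k3s-agent
```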
+ +## Implementation status + +| Piece | Status | +|------------------------------------------------------------|--------| +| `bootstrap.py` records storage-primary + cloudflared-pool | done (this PR) | +| `local-path.yaml` chart honors `storage.primaryLabel` | next ticket | +| Stateful Deployments add `nodeAffinity` to primary label | next ticket | +| `cloudflared` chart `range` over `pools` | next ticket | +| `obol stack up` consumes `topology.json` for chart values | next ticket | +| End-to-end multi-node smoke test | follow-up | diff --git a/internal/embed/skills/cluster-bootstrap/scripts/bootstrap.py b/internal/embed/skills/cluster-bootstrap/scripts/bootstrap.py index 18e7e73f..c69a2ad0 100755 --- a/internal/embed/skills/cluster-bootstrap/scripts/bootstrap.py +++ b/internal/embed/skills/cluster-bootstrap/scripts/bootstrap.py @@ -87,6 +87,10 @@ def run(cmd: list[str]) -> subprocess.CompletedProcess: return subprocess.run(cmd, check=True) +STORAGE_PRIMARY_LABEL = "obol.org/storage" +CLOUDFLARED_POOL_LABEL = "obol.org/cloudflared-pool" + + def cmd_install_server(args: argparse.Namespace) -> int: require_k3sup() kc = kubeconfig_path() @@ -116,8 +120,15 @@ def cmd_install_server(args: argparse.Namespace) -> int: p.write_text(token) p.chmod(0o600) - upsert_node(Node(host=args.host, role="server", user=args.user, labels={})) + labels: dict[str, str] = {} + if args.storage_primary: + labels[STORAGE_PRIMARY_LABEL] = "primary" + labels[CLOUDFLARED_POOL_LABEL] = args.cloudflared_pool + upsert_node(Node(host=args.host, role="server", user=args.user, labels=labels)) print(f"server installed; kubeconfig at {kc}") + if labels: + print("recorded labels:", ", ".join(f"{k}={v}" for k, v in labels.items())) + print("apply with: bootstrap.py status # then run the printed kubectl label commands") return 0 @@ -134,8 +145,10 @@ def cmd_join(args: argparse.Namespace) -> int: "--server-ip", args.server_host, "--server-user", args.user, ]) - upsert_node(Node(host=args.host, role="agent", user=args.user, labels={})) + labels = {CLOUDFLARED_POOL_LABEL: args.cloudflared_pool} + upsert_node(Node(host=args.host, role="agent", user=args.user, labels=labels)) print(f"agent {args.host} joined to server {args.server_host}") + print("recorded labels:", ", ".join(f"{k}={v}" for k, v in labels.items())) return 0 @@ -189,21 +202,38 @@ def add_ssh(parser: argparse.ArgumentParser) -> None: parser.add_argument("--user", required=True) parser.add_argument("--ssh-key", required=True) + def add_server_topology(parser: argparse.ArgumentParser) -> None: + parser.add_argument("--k3s-channel", default="stable") + parser.add_argument("--cluster-cidr", default=None) + # storage-primary defaults on; --no-storage-primary opts out. 
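+        # argparse.BooleanOptionalAction (Python 3.9+) derives the paired
+        # --no-storage-primary flag from the single definition below.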
+ parser.add_argument( + "--storage-primary", dest="storage_primary", + action=argparse.BooleanOptionalAction, default=True, + help="record obol.org/storage=primary on this node (default on)", + ) + parser.add_argument( + "--cloudflared-pool", default="default", + help="cloudflared pool name (label obol.org/cloudflared-pool); " + "default 'default'", + ) + s = sub.add_parser("single", help="install k3s on one host") add_ssh(s) - s.add_argument("--k3s-channel", default="stable") - s.add_argument("--cluster-cidr", default=None) + add_server_topology(s) s.set_defaults(func=cmd_install_server) s = sub.add_parser("server", help="install k3s server") add_ssh(s) - s.add_argument("--k3s-channel", default="stable") - s.add_argument("--cluster-cidr", default=None) + add_server_topology(s) s.set_defaults(func=cmd_install_server) s = sub.add_parser("join", help="join an agent node to the server") add_ssh(s) s.add_argument("--server-host", required=True) + s.add_argument( + "--cloudflared-pool", default="default", + help="cloudflared pool name for this agent (default 'default')", + ) s.set_defaults(func=cmd_join) s = sub.add_parser("kubeconfig-path", help="print kubeconfig path")
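
Taken together, a two-pool bootstrap with the flags added in this patch would
look roughly like this (hosts and pool names are illustrative; the labels
recorded in `topology.json` still have to be applied with the `kubectl label`
commands that the `label` subcommand prints):

```bash
python3 scripts/bootstrap.py server --host 192.168.1.50 --user obol \
  --ssh-key ~/.ssh/id_ed25519 --cloudflared-pool edge   # storage-primary on by default
python3 scripts/bootstrap.py join --server-host 192.168.1.50 --host 203.0.113.7 \
  --user obol --ssh-key ~/.ssh/id_ed25519 --cloudflared-pool cloud
python3 scripts/bootstrap.py status
cat "$(dirname "$(python3 scripts/bootstrap.py kubeconfig-path)")/cluster-bootstrap/topology.json"
```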