From 56b9725a5902d786da35ffd8fa51bee4e93df6c5 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 12 May 2026 19:59:55 +0000
Subject: [PATCH 001/221] playground: scaffold ClickBench Firecracker microVM
service
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
WIP checkpoint. Lets visitors run SQL against any of the 80+ ClickBench
systems via a single-page UI, each isolated in a per-system Firecracker
microVM.
- server/ aiohttp API: /api/systems, /api/state, /api/query,
/api/admin/provision. Owns the per-system VM lifecycle,
a 1-Hz CPU/disk/host-pressure watchdog, and a batched
ClickHouse-Cloud logging sink (JSONL fallback).
- agent/ stdlib HTTP agent that runs inside each VM and wraps the
system's install/start/load/query scripts.
- images/ scripts to build the base Ubuntu 22.04 rootfs + per-system
rootfs/system-disk pair (200 GB sparse + 16/88 GB sized
for the system's data format).
- web/ vanilla JS SPA — system picker, query box, X-Query-Time /
X-Output-Truncated rendering.
Smoke-tested: base rootfs boots under Firecracker, agent comes up in
~2 s, /health and /stats respond. Agent self-test on the host (no VM)
covers all 4 endpoints including 10 KB output truncation. ClickHouse
provisioning is in flight; see playground/docs/build-progress.md for
the running checkpoint.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/.gitignore | 3 +
playground/README.md | 102 ++++++
playground/__init__.py | 0
playground/agent/agent.py | 357 ++++++++++++++++++++
playground/agent/clickbench-agent.service | 23 ++
playground/clickbench-playground.service | 16 +
playground/docs/architecture.md | 115 +++++++
playground/docs/build-progress.md | 110 ++++++
playground/images/build-base-rootfs.sh | 199 +++++++++++
playground/images/build-datasets-image.sh | 41 +++
playground/images/build-system-rootfs.sh | 170 ++++++++++
playground/scripts/agent-selftest.sh | 93 +++++
playground/scripts/download-datasets.sh | 64 ++++
playground/scripts/install-firecracker.sh | 36 ++
playground/scripts/run-server.sh | 15 +
playground/scripts/smoke-boot.sh | 98 ++++++
playground/server/__init__.py | 0
playground/server/config.py | 98 ++++++
playground/server/firecracker.py | 117 +++++++
playground/server/logging_sink.py | 190 +++++++++++
playground/server/main.py | 252 ++++++++++++++
playground/server/monitor.py | 215 ++++++++++++
playground/server/net.py | 130 +++++++
playground/server/systems.py | 134 ++++++++
playground/server/vm_manager.py | 391 ++++++++++++++++++++++
playground/web/app.js | 128 +++++++
playground/web/index.html | 62 ++++
playground/web/style.css | 71 ++++
28 files changed, 3230 insertions(+)
create mode 100644 playground/.gitignore
create mode 100644 playground/README.md
create mode 100644 playground/__init__.py
create mode 100644 playground/agent/agent.py
create mode 100644 playground/agent/clickbench-agent.service
create mode 100644 playground/clickbench-playground.service
create mode 100644 playground/docs/architecture.md
create mode 100644 playground/docs/build-progress.md
create mode 100755 playground/images/build-base-rootfs.sh
create mode 100755 playground/images/build-datasets-image.sh
create mode 100755 playground/images/build-system-rootfs.sh
create mode 100755 playground/scripts/agent-selftest.sh
create mode 100755 playground/scripts/download-datasets.sh
create mode 100755 playground/scripts/install-firecracker.sh
create mode 100755 playground/scripts/run-server.sh
create mode 100755 playground/scripts/smoke-boot.sh
create mode 100644 playground/server/__init__.py
create mode 100644 playground/server/config.py
create mode 100644 playground/server/firecracker.py
create mode 100644 playground/server/logging_sink.py
create mode 100644 playground/server/main.py
create mode 100644 playground/server/monitor.py
create mode 100644 playground/server/net.py
create mode 100644 playground/server/systems.py
create mode 100644 playground/server/vm_manager.py
create mode 100644 playground/web/app.js
create mode 100644 playground/web/index.html
create mode 100644 playground/web/style.css
diff --git a/playground/.gitignore b/playground/.gitignore
new file mode 100644
index 0000000000..b6cf5f0391
--- /dev/null
+++ b/playground/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+*.pyc
+.env
diff --git a/playground/README.md b/playground/README.md
new file mode 100644
index 0000000000..cc748036bf
--- /dev/null
+++ b/playground/README.md
@@ -0,0 +1,102 @@
+# ClickBench Playground
+
+A self-service playground that lets visitors run arbitrary SQL against any of the
+80+ database systems documented in ClickBench, isolated inside a Firecracker
+microVM per system.
+
+## How it works
+
+1. The dataset (hits, in all formats ClickBench uses) is downloaded once into a
+ single directory on the host and exposed read-only to every VM as a virtio-blk
+ device.
+2. For each system, a Firecracker microVM is launched once with internet access
+ to run the system's `install`, `start`, and `load` scripts.
+3. A snapshot (memory + disk) is taken and persisted. Subsequent restorations
+ run without internet — the only path in or out is the host↔VM control link.
+4. A small in-VM **agent** exposes `POST /query` over HTTP. The host **API
+ server** proxies user queries to the agent, returns the raw output as
+ `application/octet-stream`, and puts the timing into response headers.
+5. A **monitor** loop watches per-VM CPU/disk/memory and host totals, killing
+ misbehaving or oversized VMs.
+6. Every request and every restart is appended to a ClickHouse Cloud table.
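+
+As a concrete example, a round trip from the command line looks roughly like
+this (the SQL travels in the POST body, as it does on the agent side; header
+values are illustrative):
+
+```
+$ curl -sS -D - -o result.bin \
+      --data-binary 'SELECT COUNT(*) FROM hits' \
+      'http://localhost:8000/api/query?system=clickhouse'
+HTTP/1.1 200 OK
+Content-Type: application/octet-stream
+X-Query-Time: 0.234
+X-Output-Truncated: 0
+X-Output-Bytes: 9
+```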
+
+## Layout
+
+```
+playground/
+├── server/ # aiohttp API server, VM manager, monitor, logging sink
+├── agent/ # In-VM HTTP agent (runs as systemd unit inside each VM)
+├── images/ # Scripts that build the base rootfs + per-system overlays
+├── web/ # Vanilla-JS single-page app
+├── scripts/ # Host-side install / dataset / network helpers
+└── docs/ # Design notes
+```
+
+Host state lives under `/opt/clickbench-playground/`:
+
+```
+/opt/clickbench-playground/
+├── bin/ firecracker, jailer
+├── kernel/vmlinux guest kernel
+├── base-rootfs.ext4 pristine Ubuntu 22.04 rootfs (built once)
+├── datasets/ hits.parquet, hits_*.parquet, hits.tsv, hits.csv
+├── datasets.ext4 read-only image of datasets/ (attached to every VM)
+├── systems/<system>/ per-system rootfs, snapshot, sockets, logs
+├── vms/.sock Firecracker API socket
+└── logs/ local JSONL fallback when ClickHouse Cloud is off
+```
+
+## Networking
+
+Each VM gets its own `/24` subnet on a dedicated TAP:
+
+| Side | Address | Notes |
+|------|------------------|--------------------------------|
+| Host | `10.200.<slot>.1` | TAP device `fc-tap-<slot>` |
+| VM | `10.200.<slot>.2` | always reachable from the host |
+
+During the install phase, `iptables FORWARD` + MASQUERADE are enabled for the
+TAP so the VM can `apt-get`/`curl`/etc. After the snapshot is taken, the
+forwarding rules are removed; the host↔VM link still works but external traffic
+is blackholed.
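+
+`server/net.py` owns the actual rules; as a sketch, the provision-time toggle
+amounts to something like the following (slot 7 is illustrative; `ens33`
+happens to be the uplink on the current host):
+
+```
+# enable outbound NAT for the VM's /24 while provisioning
+iptables -t nat -A POSTROUTING -s 10.200.7.0/24 -o ens33 -j MASQUERADE
+iptables -A FORWARD -i fc-tap-7 -o ens33 -j ACCEPT
+iptables -A FORWARD -i ens33 -o fc-tap-7 -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT
+
+# after the snapshot is taken: the same rules with -D; the TAP itself stays up
+iptables -t nat -D POSTROUTING -s 10.200.7.0/24 -o ens33 -j MASQUERADE
+iptables -D FORWARD -i fc-tap-7 -o ens33 -j ACCEPT
+iptables -D FORWARD -i ens33 -o fc-tap-7 -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT
+```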
+
+## Configuration
+
+Environment variables (read by `server/config.py`):
+
+| Var | Purpose |
+|--------------------------------|-----------------------------------------------|
+| `CLICKHOUSE_CLOUD_URL` | HTTPS URL of CH Cloud (e.g. `https://x.clickhouse.cloud:8443`) |
+| `CLICKHOUSE_CLOUD_USER` | username |
+| `CLICKHOUSE_CLOUD_PASSWORD` | password |
+| `PLAYGROUND_STATE_DIR` | defaults to `/opt/clickbench-playground` |
+| `PLAYGROUND_LISTEN` | defaults to `0.0.0.0:8000` |
+| `PLAYGROUND_MAX_VMS` | concurrent live VMs cap (default 16) |
+| `PLAYGROUND_OUTPUT_LIMIT` | response body cap in bytes (default 10240) |
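+
+A minimal `playground/.env` that both `scripts/run-server.sh` and the systemd
+unit pick up (values are placeholders):
+
+```
+CLICKHOUSE_CLOUD_URL=https://example.clickhouse.cloud:8443
+CLICKHOUSE_CLOUD_USER=playground
+CLICKHOUSE_CLOUD_PASSWORD=...
+PLAYGROUND_STATE_DIR=/opt/clickbench-playground
+PLAYGROUND_LISTEN=0.0.0.0:8000
+PLAYGROUND_MAX_VMS=16
+PLAYGROUND_OUTPUT_LIMIT=10240
+```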
+
+## Lifecycle of a request
+
+```
+client ──HTTP──▶ /api/query?system=clickhouse
+ │
+ ▼
+ vm_manager.ensure_ready("clickhouse")
+ ├─ already running and /health OK ──▶ proceed
+ ├─ not running ──▶ restore from snapshot
+ └─ unresponsive ──▶ kill, restore, retry once
+ │
+ ▼
+ agent ◀── POST /query ── body=SQL
+ agent runs ./query, captures stdout/stderr, returns:
+ Content-Type: application/octet-stream
+ X-Query-Time: 0.234
+ X-Output-Truncated: 0|1
+ X-Output-Bytes: 8042
+ body: (up to 10 KB of raw output)
+ │
+ ▼
+ logger.write_request(...)
+ │
+ ▼
+ client
+```
diff --git a/playground/__init__.py b/playground/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
new file mode 100644
index 0000000000..62f5aad59f
--- /dev/null
+++ b/playground/agent/agent.py
@@ -0,0 +1,357 @@
+#!/usr/bin/env python3
+"""
+ClickBench in-VM agent.
+
+Runs inside the Firecracker microVM. Exposes a tiny HTTP API that the host
+server hits to:
+
+ GET /health quick liveness probe; cheap
+ GET /stats CPU/mem/disk snapshot
+ POST /provision run install -> start -> load for the bundled system
+ (only called once, before the host snapshots the VM)
+ POST /query read SQL from request body, exec ./query, return
+ output as application/octet-stream + timing headers
+
+The system's ClickBench scripts (install/start/load/query/check/stop/...) are
+mounted at /opt/clickbench/system, with the system name in
+/etc/clickbench-system. The dataset is mounted read-only at
+/opt/clickbench/datasets.
+
+Listens on 0.0.0.0:8080 by default.
+
+Stdlib-only — the rootfs ships python3 from the Ubuntu base; no pip needed.
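+
+Example round trip from the host once the VM is provisioned (the guest address
+depends on the VM's slot):
+
+    curl -sS -D - --data-binary 'SELECT 1' http://10.200.7.2:8080/query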
+"""
+
+from __future__ import annotations
+
+import contextlib
+import http.server
+import json
+import os
+import shutil
+import signal
+import socket
+import socketserver
+import subprocess
+import sys
+import threading
+import time
+from pathlib import Path
+
+SYSTEM_DIR = Path(os.environ.get("CLICKBENCH_SYSTEM_DIR", "/opt/clickbench/system"))
+DATASETS_DIR = Path(os.environ.get("CLICKBENCH_DATASETS_DIR", "/opt/clickbench/datasets"))
+STATE_DIR = Path(os.environ.get("CLICKBENCH_AGENT_STATE", "/var/lib/clickbench-agent"))
+SYSTEM_NAME = (
+ os.environ.get("CLICKBENCH_SYSTEM_NAME")
+ or (Path("/etc/clickbench-system").read_text().strip()
+ if Path("/etc/clickbench-system").exists() else SYSTEM_DIR.name)
+)
+LISTEN_PORT = int(os.environ.get("CLICKBENCH_AGENT_PORT", "8080"))
+# 10 KB cap, matching the spec. Configurable for testing.
+OUTPUT_LIMIT = int(os.environ.get("CLICKBENCH_OUTPUT_LIMIT", "10240"))
+# Per-query wall-clock cap so a runaway query can't tie up a VM forever.
+QUERY_TIMEOUT = int(os.environ.get("CLICKBENCH_QUERY_TIMEOUT", "600"))
+# Provision (install/start/load) can legitimately take an hour for some systems.
+PROVISION_TIMEOUT = int(os.environ.get("CLICKBENCH_PROVISION_TIMEOUT", "7200"))
+
+STATE_DIR.mkdir(parents=True, exist_ok=True)
+PROVISION_DONE = STATE_DIR / "provisioned"
+PROVISION_LOG = STATE_DIR / "provision.log"
+
+# Single-writer lock; the agent serializes queries per VM. Two ClickBench
+# scripts hitting the same socket/temp file concurrently would not be safe.
+_query_lock = threading.Lock()
+_provision_lock = threading.Lock()
+
+
+def _cap(b: bytes) -> tuple[bytes, bool]:
+ """Truncate to OUTPUT_LIMIT bytes; return (body, was_truncated)."""
+ if len(b) <= OUTPUT_LIMIT:
+ return b, False
+ return b[:OUTPUT_LIMIT], True
+
+
+def _read_body(handler: http.server.BaseHTTPRequestHandler) -> bytes:
+ n = int(handler.headers.get("Content-Length") or 0)
+ if n <= 0:
+ return b""
+ # Cap inbound bodies at 1 MB; queries are SQL, not bulk data.
+ return handler.rfile.read(min(n, 1 << 20))
+
+
+def _system_script(name: str) -> Path:
+ """Return path to a script in the system dir, or raise if missing/not executable."""
+ p = SYSTEM_DIR / name
+ if not p.exists():
+ raise FileNotFoundError(f"missing system script: {p}")
+ if not os.access(p, os.X_OK):
+ raise PermissionError(f"system script not executable: {p}")
+ return p
+
+
+def _read_proc_stat() -> tuple[int, int]:
+ """Return (total_jiffies, idle_jiffies) from /proc/stat."""
+ with open("/proc/stat") as f:
+ parts = f.readline().split()
+ nums = list(map(int, parts[1:]))
+ total = sum(nums)
+ idle = nums[3] + (nums[4] if len(nums) > 4 else 0)
+ return total, idle
+
+
+def _stats_snapshot() -> dict:
+ out: dict = {"system": SYSTEM_NAME, "ts": time.time()}
+ try:
+ out["loadavg"] = list(map(float, Path("/proc/loadavg").read_text().split()[:3]))
+ except Exception:
+ pass
+ try:
+ info = {k: v for k, v in (
+ l.split(":", 1) for l in Path("/proc/meminfo").read_text().splitlines() if ":" in l
+ )}
+ out["mem_total_kb"] = int(info.get("MemTotal", "0 kB").split()[0])
+ out["mem_avail_kb"] = int(info.get("MemAvailable", "0 kB").split()[0])
+ except Exception:
+ pass
+ try:
+ st = shutil.disk_usage("/")
+ out["disk_total"] = st.total
+ out["disk_free"] = st.free
+ except Exception:
+ pass
+ try:
+ t1, i1 = _read_proc_stat()
+ time.sleep(0.05)
+ t2, i2 = _read_proc_stat()
+ total = max(1, t2 - t1)
+ out["cpu_busy"] = 1.0 - (i2 - i1) / total
+ except Exception:
+ pass
+ out["provisioned"] = PROVISION_DONE.exists()
+ return out
+
+
+def _run_query(sql: bytes) -> tuple[int, bytes, bytes, float]:
+ """
+ Invoke ./query with the SQL on stdin.
+ The query script's contract per lib/benchmark-common.sh:
+ stdout: result (whatever format the system uses)
+ stderr: timing in fractional seconds on the LAST numeric line
+ exit code: 0 on success
+ """
+ script = _system_script("query")
+ t0 = time.monotonic()
+ try:
+ p = subprocess.Popen(
+ [str(script)],
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ cwd=str(SYSTEM_DIR),
+ preexec_fn=os.setsid,
+ )
+ try:
+ stdout, stderr = p.communicate(input=sql, timeout=QUERY_TIMEOUT)
+ rc = p.returncode
+ except subprocess.TimeoutExpired:
+ # The system might still be inside its query; kill the whole group.
+ with contextlib.suppress(ProcessLookupError):
+ os.killpg(p.pid, signal.SIGKILL)
+ stdout, stderr = p.communicate()
+ rc = -9
+ except Exception as e:
+ return 255, b"", f"agent: failed to invoke ./query: {e}\n".encode(), 0.0
+ return rc, stdout, stderr, time.monotonic() - t0
+
+
+def _extract_script_timing(stderr: bytes) -> float | None:
+ """
+ Pull fractional-seconds timing from the last numeric line of stderr,
+ matching the lib/benchmark-common.sh tail -n1 logic.
+ """
+ # Handle the spark/pyspark carriage-return progress-bar case.
+ text = stderr.decode("utf-8", errors="replace").replace("\r", "\n")
+ last = None
+ for line in text.splitlines():
+ s = line.strip()
+ if not s:
+ continue
+ try:
+ v = float(s)
+ except ValueError:
+ continue
+ last = v
+ return last
+
+
+def _provision() -> tuple[int, bytes]:
+ """
+ Run install -> start -> wait-for-check -> load. Capture everything to
+ PROVISION_LOG. Idempotent: subsequent calls succeed-fast if PROVISION_DONE
+ is present.
+ """
+ if PROVISION_DONE.exists():
+ return 0, b"already provisioned\n"
+
+ with _provision_lock:
+ if PROVISION_DONE.exists():
+ return 0, b"already provisioned\n"
+
+ # Use the same /lib/benchmark-common.sh helpers if they're around. But
+ # since this is the playground, we want a *minimal* version: install,
+ # start, wait for check, load, sync. No cold-cycle restart, no
+ # concurrent-QPS test, no query loop.
+ steps: list[tuple[str, list[str]]] = [
+ ("install", [str(_system_script("install"))]),
+ ("start", [str(_system_script("start"))]),
+ ]
+
+ log_lines: list[bytes] = []
+ for name, cmd in steps:
+ t0 = time.monotonic()
+ log_lines.append(f"\n=== {name} ===\n".encode())
+ r = subprocess.run(
+ cmd, cwd=str(SYSTEM_DIR),
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+ timeout=PROVISION_TIMEOUT,
+ )
+ dt = time.monotonic() - t0
+ log_lines.append(r.stdout or b"")
+ log_lines.append(f"=== {name} done rc={r.returncode} in {dt:.1f}s ===\n".encode())
+ if r.returncode != 0:
+ PROVISION_LOG.write_bytes(b"".join(log_lines))
+ return r.returncode, b"".join(log_lines)
+
+ # Wait for ./check to succeed for up to 300s
+ check = SYSTEM_DIR / "check"
+ ok = False
+ t0 = time.monotonic()
+ last_check: subprocess.CompletedProcess | None = None
+ while time.monotonic() - t0 < 300:
+ last_check = subprocess.run(
+ [str(check)], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+ )
+ if last_check.returncode == 0:
+ ok = True
+ break
+ time.sleep(1)
+ if not ok:
+ log_lines.append(b"\n=== check did not succeed within 300s ===\n")
+ if last_check is not None:
+ log_lines.append(last_check.stderr or b"")
+ PROVISION_LOG.write_bytes(b"".join(log_lines))
+ return 1, b"".join(log_lines)
+ log_lines.append(b"\n=== check ok ===\n")
+
+ # Data files are pre-staged on the per-system disk by the host-side
+ # build-system-rootfs.sh, so the load script's relative references
+ # (hits.parquet, hits.tsv, etc.) already resolve to local files it
+ # can chown / mv / rm without worrying about a RO source mount.
+
+ # Run load.
+ t0 = time.monotonic()
+ log_lines.append(b"\n=== load ===\n")
+ r = subprocess.run(
+ [str(_system_script("load"))], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+ timeout=PROVISION_TIMEOUT,
+ )
+ dt = time.monotonic() - t0
+ log_lines.append(r.stdout or b"")
+ log_lines.append(f"=== load done rc={r.returncode} in {dt:.1f}s ===\n".encode())
+ if r.returncode != 0:
+ PROVISION_LOG.write_bytes(b"".join(log_lines))
+ return r.returncode, b"".join(log_lines)
+
+ subprocess.run(["sync"], check=False)
+ PROVISION_DONE.write_text(f"ok {time.time()}\n")
+ PROVISION_LOG.write_bytes(b"".join(log_lines))
+ return 0, b"".join(log_lines)
+
+
+class Handler(http.server.BaseHTTPRequestHandler):
+ server_version = "clickbench-agent/0.1"
+
+ def log_message(self, fmt: str, *args) -> None:
+ sys.stderr.write("[agent] " + (fmt % args) + "\n")
+
+ def _send(self, code: int, body: bytes, headers: dict | None = None) -> None:
+ self.send_response(code)
+ self.send_header("Content-Length", str(len(body)))
+ self.send_header("Content-Type", (headers or {}).pop("Content-Type", "application/json"))
+ for k, v in (headers or {}).items():
+ self.send_header(k, v)
+ self.end_headers()
+ self.wfile.write(body)
+
+ def _send_json(self, code: int, obj) -> None:
+ self._send(code, json.dumps(obj, default=str).encode() + b"\n",
+ {"Content-Type": "application/json"})
+
+ def do_GET(self) -> None:
+ if self.path == "/health":
+ self._send_json(200, {"ok": True, "system": SYSTEM_NAME,
+ "provisioned": PROVISION_DONE.exists()})
+ return
+ if self.path == "/stats":
+ self._send_json(200, _stats_snapshot())
+ return
+ if self.path == "/provision-log":
+ data = PROVISION_LOG.read_bytes() if PROVISION_LOG.exists() else b""
+ self._send(200, data, {"Content-Type": "text/plain; charset=utf-8"})
+ return
+ self._send_json(404, {"error": "not found", "path": self.path})
+
+ def do_POST(self) -> None:
+ if self.path == "/provision":
+ rc, log = _provision()
+ self._send(200 if rc == 0 else 500, log[-OUTPUT_LIMIT:],
+ {"Content-Type": "text/plain; charset=utf-8",
+ "X-Provision-Status": "ok" if rc == 0 else f"err-{rc}"})
+ return
+ if self.path == "/query":
+ if not PROVISION_DONE.exists():
+ self._send_json(409, {"error": "not provisioned"})
+ return
+ sql = _read_body(self)
+ if not sql.strip():
+ self._send_json(400, {"error": "empty query"})
+ return
+ with _query_lock:
+ rc, out, err, wall = _run_query(sql)
+ script_t = _extract_script_timing(err)
+ body, truncated = _cap(out)
+ headers = {
+ "Content-Type": "application/octet-stream",
+ "X-Query-Wall-Time": f"{wall:.6f}",
+ "X-Output-Bytes": str(len(out)),
+ "X-Output-Truncated": "1" if truncated else "0",
+ "X-Exit-Code": str(rc),
+ "X-System": SYSTEM_NAME,
+ }
+ if script_t is not None:
+ headers["X-Query-Time"] = f"{script_t:.6f}"
+ if rc != 0:
+ # Surface a snippet of stderr so the client sees *something*.
+ err_snip = err[-1024:].decode("utf-8", errors="replace")
+ headers["X-Error"] = err_snip.replace("\n", " | ")[:512]
+ self._send(200 if rc == 0 else 502, body, headers)
+ return
+ self._send_json(404, {"error": "not found", "path": self.path})
+
+
+class ReusableServer(socketserver.ThreadingTCPServer):
+ allow_reuse_address = True
+ daemon_threads = True
+
+
+def main() -> None:
+ addr = ("0.0.0.0", LISTEN_PORT)
+ print(f"agent: system={SYSTEM_NAME} listen={addr[0]}:{addr[1]} "
+ f"dir={SYSTEM_DIR} data={DATASETS_DIR}", flush=True)
+ with ReusableServer(addr, Handler) as srv:
+ srv.serve_forever()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/playground/agent/clickbench-agent.service b/playground/agent/clickbench-agent.service
new file mode 100644
index 0000000000..c02fe20cbb
--- /dev/null
+++ b/playground/agent/clickbench-agent.service
@@ -0,0 +1,23 @@
+[Unit]
+Description=ClickBench in-VM playground agent
+# The kernel's `ip=` cmdline sets the static IP before init, so network is
+# already up when we start. We deliberately don't depend on
+# network-online.target — that gate is fed by systemd-networkd-wait-online,
+# which is disabled. The system disk mount is similarly best-effort: the
+# agent's /provision and /query paths report 404/409 if /opt/clickbench/system
+# isn't populated, which is the correct behaviour and lets /health stay up so
+# the host can still talk to it during provisioning.
+After=local-fs.target
+
+[Service]
+Type=simple
+Environment=PYTHONUNBUFFERED=1
+Environment=HOME=/root
+ExecStart=/usr/bin/python3 /opt/clickbench-agent/agent.py
+Restart=on-failure
+RestartSec=2
+KillMode=mixed
+TimeoutStopSec=10
+
+[Install]
+WantedBy=multi-user.target
diff --git a/playground/clickbench-playground.service b/playground/clickbench-playground.service
new file mode 100644
index 0000000000..979d31867b
--- /dev/null
+++ b/playground/clickbench-playground.service
@@ -0,0 +1,16 @@
+[Unit]
+Description=ClickBench Playground API
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+User=ubuntu
+WorkingDirectory=/home/ubuntu/ClickBench
+EnvironmentFile=-/home/ubuntu/ClickBench/playground/.env
+ExecStart=/usr/bin/python3 -m playground.server.main
+Restart=on-failure
+RestartSec=3
+
+[Install]
+WantedBy=multi-user.target
diff --git a/playground/docs/architecture.md b/playground/docs/architecture.md
new file mode 100644
index 0000000000..0507740c41
--- /dev/null
+++ b/playground/docs/architecture.md
@@ -0,0 +1,115 @@
+# ClickBench Playground architecture
+
+## Components
+
+```
+┌──────────────────────────────────────────────────────────────────────────┐
+│ Browser (vanilla JS) │
+│ picks a system, types SQL, POST /api/query │
+└────────────────────────────┬─────────────────────────────────────────────┘
+ │ HTTP/1.1
+┌────────────────────────────▼─────────────────────────────────────────────┐
+│ Host API server (aiohttp) │
+│ ┌─────────────────┐ ┌──────────────┐ ┌────────────────┐ │
+│ │ HTTP routes │ │ VMManager │ │ Monitor │ │
+│ │ /api/systems │ │ per-VM │ │ 1Hz polling │ │
+│ │ /api/query │──▶│ lifecycle │◀──│ CPU/mem/disk │ │
+│ │ /api/state │ │ snapshots │ │ watchdog │ │
+│ └─────────────────┘ └──────┬───────┘ └────────────────┘ │
+│ ┌─────────────────────────────▼─────────────────────────────────────┐ │
+│ │ LoggingSink: batched INSERT into ClickHouse Cloud + local JSONL │ │
+│ └───────────────────────────────────────────────────────────────────┘ │
+└────────────────────────────┬─────────────────────────────────────────────┘
+ │ HTTP over per-VM TAP /24
+┌────────────────────────────▼─────────────────────────────────────────────┐
+│ Firecracker microVM (Ubuntu 22.04, 4 vCPU / 16 GB / 200 GB sparse) │
+│ ┌──────────────────────────────────────────────────────────────────┐ │
+│ │ in-VM agent (stdlib python HTTP server) │ │
+│ │ /health, /stats, /provision, /query │ │
+│ └────────────────────────┬─────────────────────────────────────────┘ │
+│ ▼ runs │
+│ ┌──────────────────────────────────────────────────────────────────┐ │
+│ │ /opt/clickbench/system/ — system's ClickBench scripts (RW) │ │
+│ │ /opt/clickbench/datasets/ — shared dataset image (RO) │ │
+│ └──────────────────────────────────────────────────────────────────┘ │
+└──────────────────────────────────────────────────────────────────────────┘
+```
+
+## State machine (per system)
+
+```
+ ┌───────┐ no snapshot ┌───────────────┐
+ │ down │─────────────▶│ provisioning │
+ └───┬───┘ └───────┬───────┘
+ │ │ provision OK
+ │ snapshot ok ▼
+ │ ┌──────────────┐
+ ▼ │ snapshotted │
+ ┌───────────┐ restore └──────┬───────┘
+ │ ready │◀──────────────── │
+ └─────┬─────┘ │
+ │ watchdog / failed query │
+ └─────────────────────────┘
+```
+
+`ready` is the only state that accepts /query. Any restart (watchdog or
+explicit kick) returns to `snapshotted`; the next /query restores from the
+on-disk snapshot.
+
+## Snapshots
+
+Created the first time a system is requested. Two artifacts:
+
+- `/systems/<system>/snapshot.state` — Firecracker VM state metadata
+- `/systems/<system>/snapshot.bin` — guest memory dump (16 GB in
+  size as configured, but sparse)
+
+The `rootfs.ext4` and `system.ext4` files persist across snapshots and are
+re-attached at restore time. Drive paths in the snapshot are remapped to
+their current host locations on restore so we don't have to re-snapshot if
+the playground gets moved or rebooted.
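+
+At the Firecracker API level, the create/restore pair is roughly the stock
+`/snapshot/create` + `/snapshot/load` sequence (paths abbreviated; the exact
+calls live in `vm_manager.py`):
+
+```
+# pause the VM, then write snapshot.state + snapshot.bin
+curl --unix-socket "$SOCK" -X PATCH http://localhost/vm \
+     -H 'Content-Type: application/json' -d '{"state": "Paused"}'
+curl --unix-socket "$SOCK" -X PUT http://localhost/snapshot/create \
+     -H 'Content-Type: application/json' \
+     -d '{"snapshot_type": "Full", "snapshot_path": ".../snapshot.state", "mem_file_path": ".../snapshot.bin"}'
+
+# later, in a fresh Firecracker process
+curl --unix-socket "$NEW_SOCK" -X PUT http://localhost/snapshot/load \
+     -H 'Content-Type: application/json' \
+     -d '{"snapshot_path": ".../snapshot.state", "mem_backend": {"backend_type": "File", "backend_path": ".../snapshot.bin"}, "resume_vm": true}'
+```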
+
+## Networking
+
+A `/24` per VM, with the host owning `.1` and the guest owning `.2`. Each
+TAP is `fc-tap-<slot>`, where `<slot>` is the deterministic per-system index
+assigned in `VMManager.__init__`.
+
+```
+host 10.200.<slot>.1/24 ◀── TAP ─▶ 10.200.<slot>.2/24 guest
+```
+
+During the provision phase only, iptables NAT/FORWARD rules are added so
+the guest can `apt-get` / `curl`. After the snapshot, those rules are
+deleted — outbound traffic is dropped, the host↔guest link remains.
+
+## Output truncation
+
+Truncation is applied **inside the agent**, before bytes leave the VM:
+
+- Stdout from the system's `./query` script is capped at
+ `CLICKBENCH_OUTPUT_LIMIT` bytes (default 10 KB).
+- The agent's response sets `X-Output-Truncated: 1` and
+  `X-Output-Bytes: <full size>` so the client can show "this is a
+ partial result of N bytes."
+- The host API server passes the headers through unchanged.
+
+## Watchdog rules
+
+The `Monitor` thread samples every running Firecracker process once per
+second:
+
+- **CPU**: if per-VM CPU usage (utime+stime / wallclock / vcpus) stays
+ ≥ `VM_CPU_BUSY_THRESHOLD` (default 97%) for `VM_CPU_BUSY_WINDOW_SEC`
+ contiguous seconds (default 120), the VM is killed.
+- **Disk**: if the sparse `rootfs.ext4` has allocated more than
+ `VM_DISK_FULL_PCT` (default 97%) of `VM_ROOTFS_SIZE_GB`, the VM is
+ killed.
+- **Host RAM**: if `MemAvailable` drops below `HOST_MIN_FREE_RAM_GB`
+ (default 32 GiB), the watchdog kills the VM with the largest RSS.
+- **Host disk**: if free space on `PLAYGROUND_STATE_DIR` drops below
+ `HOST_MIN_FREE_DISK_GB` (default 500 GiB), the watchdog kills the VM
+ with the largest allocated rootfs.
+
+A "kill" leaves the snapshot intact. The next user query restores from
+snapshot, paying ~1 s of memory restore cost.
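+
+For reference, the per-VM CPU signal comes from `/proc/<pid>/stat`; one monitor
+sample is roughly equivalent to the following back-of-the-envelope check (the
+pid and vCPU count are illustrative):
+
+```
+pid=12345                                        # the VM's firecracker process
+vcpus=4
+hz=$(getconf CLK_TCK)
+t0=$(awk '{print $14 + $15}' "/proc/$pid/stat")  # utime + stime, in ticks
+sleep 1
+t1=$(awk '{print $14 + $15}' "/proc/$pid/stat")
+# busy fraction over the 1 s window, normalised by vCPU count;
+# the watchdog kills the VM if this stays >= 0.97 for 120 consecutive samples
+echo "scale=3; ($t1 - $t0) / $hz / 1 / $vcpus" | bc
+```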
diff --git a/playground/docs/build-progress.md b/playground/docs/build-progress.md
new file mode 100644
index 0000000000..a710246518
--- /dev/null
+++ b/playground/docs/build-progress.md
@@ -0,0 +1,110 @@
+# Playground build progress — checkpoint 2026-05-12 ~19:58 UTC
+
+## What is built and committed
+
+- `playground/` directory scaffolded with subdirs `server/`, `agent/`,
+ `images/`, `web/`, `scripts/`, `docs/`.
+- Architecture notes in `playground/README.md` and
+ `playground/docs/architecture.md`.
+- Host-side API server (`playground/server/*.py`):
+ - `config.py` — env-driven config with sensible defaults
+ - `systems.py` — discovers 97 playground-eligible ClickBench systems
+ - `firecracker.py` — async unix-socket client for Firecracker API
+ - `net.py` — per-VM TAP + /24 + NAT toggle
+ - `vm_manager.py` — VM lifecycle (boot, provision, snapshot, restore)
+ - `monitor.py` — CPU/disk/host-memory watchdog (1 Hz)
+ - `logging_sink.py` — batched async logger → ClickHouse Cloud + JSONL fallback
+ - `main.py` — aiohttp routes + static SPA serving
+- In-VM agent (`playground/agent/agent.py`, stdlib-only) with endpoints
+ `/health`, `/stats`, `/provision`, `/query`, `/provision-log`.
+- systemd unit `playground/agent/clickbench-agent.service` installed in the
+ rootfs and enabled.
+- Vanilla JS SPA (`playground/web/`): system picker, query box, timing display,
+  truncation indicator. Talks to `/api/systems`, `/api/system/<name>`,
+ `/api/query?system=...`.
+- Build scripts:
+ - `images/build-base-rootfs.sh` — Ubuntu 22.04 cloud image → flat 8 GB
+ ext4 with agent + systemd unit pre-installed.
+ - `images/build-system-rootfs.sh` — per-system 200 GB sparse rootfs +
+ sized system disk (16/88 GB depending on data format) containing the
+ ClickBench scripts + the dataset files this system needs (no symlinks
+ into a RO mount, because many systems' load scripts `chown`).
+ - `scripts/install-firecracker.sh` — idempotent host setup.
+ - `scripts/download-datasets.sh` — eager dataset download into
+ `/opt/clickbench-playground/datasets/`.
+ - `scripts/smoke-boot.sh` — boots the base rootfs alone in a VM; confirms
+ kernel + rootfs + agent path before per-system testing.
+ - `scripts/agent-selftest.sh` — runs the agent on the host (no VM) and
+ exercises all endpoints with a fake "system" dir. PASSES.
+
+## What is provisioned on disk (host)
+
+```
+/opt/clickbench-playground/
+├── bin/firecracker, bin/jailer (firecracker v1.13.1)
+├── kernel/vmlinux (Linux 6.1.141, IP_PNP + virtio enabled)
+├── base-rootfs.ext4 2.6 GB physical / 8 GB apparent
+├── datasets/
+│ ├── hits.parquet 14.7 GB (single)
+│ ├── hits_partitioned/ 14 GB (100 partitioned files)
+│ ├── hits.tsv 74 GB (decompressed)
+│ ├── hits.csv ~14 GB partial (kill-stopped)
+│ └── hits.csv.gz 16 GB
+└── systems/clickhouse/
+ ├── rootfs.ext4 8.2 MB physical / 200 GB sparse
+ └── system.ext4 16 GB (parquet files staged)
+```
+
+## What works
+
+- Python module imports clean (`python3 -m playground.server.main`).
+- API server serves 97 systems via `/api/systems`.
+- UI loads at `/ui/`.
+- Firecracker smoke-boot (base rootfs only): agent comes up in 2 s,
+ `/health` and `/stats` respond OK.
+- Agent self-test (no VM): all 4 endpoints behave correctly, output
+ truncation works (2 KB → 64 B with `X-Output-Truncated: 1`).
+- Provision started on ClickHouse VM at 19:51:59 UTC:
+ - VM booted, agent up, internet enabled via MASQUERADE on `ens33`
+ - Install ran (ClickHouse binary downloaded + apt deps)
+ - Load is in progress — `cpu_busy=0.8-1.0` sustained, `disk_used`
+ grew from 17 GB → 30 GB, indicating MergeTree INSERT.
+ - At 19:57:33 the agent stopped responding to /health (timeout).
+ Firecracker process is still running (PID 19230, 16 min of CPU).
+ Likely cause: agent's HTTP server starved by the load process,
+ or a fork race in stdlib `socketserver`. Needs investigation.
+
+## What's left
+
+- Decide whether to add eager liveness pings or move agent to aiohttp
+ to avoid the stdlib threading server's quirks under heavy load.
+- Once provision completes: snapshot → restore → /query test path.
+- Build system disks for the other 96 systems (template is ready).
+- Wire up ClickHouse Cloud credentials for the logging sink (currently
+ falling back to JSONL under `/opt/clickbench-playground/logs/`).
+
+## Known issues / things to revisit
+
+- TSV/CSV decompression contends with the rootfs build for NVMe writeback.
+ Workaround: pre-build the base rootfs before kicking off the heavy
+ decompressions, or rate-limit pigz.
+- The "External" exclusion list in `systems.py` is conservative; some
+ entries (umbra, hyper, cedardb) actually run locally and should be
+ added back when verified.
+- /etc/resolv.conf in the base rootfs is a static fallback (1.1.1.1 +
+ 8.8.8.8). Once we cut internet post-snapshot, DNS doesn't matter, but
+ during provision it does — sanity check that NAT + resolv.conf actually
+ let `apt-get update` work.
+- KVM permissions were opened to mode 666 via a udev rule. Tighten to
+ the `kvm` group when the playground user is properly added.
+
+## Operator notes
+
+- The base rootfs ships with serial autologin as root on ttyS0 — good for
+ attaching the Firecracker console for debugging.
+- Firecracker logs land in `/opt/clickbench-playground/logs/firecracker-<system>.log`.
+- The host's `/dev/kvm` group/mode was changed: `chown root:kvm`, `chmod 666`,
+  with a persistent udev rule at `/etc/udev/rules.d/65-kvm.rules` (see the
+  sketch below).
+- `vm.dirty_writeback_centisecs` is set to 10 on the host (down from 500)
+  to reduce sfdisk hangs during heavy concurrent writeback. Revert if it
+ causes other problems.
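+
+The udev rule itself is presumably the standard one-liner, applied like:
+
+```
+echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666"' | sudo tee /etc/udev/rules.d/65-kvm.rules
+sudo udevadm control --reload-rules && sudo udevadm trigger --name-match=kvm
+```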
diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh
new file mode 100755
index 0000000000..2cffbaf52f
--- /dev/null
+++ b/playground/images/build-base-rootfs.sh
@@ -0,0 +1,199 @@
+#!/bin/bash
+# Build a base Ubuntu 22.04 rootfs for the Firecracker microVMs.
+#
+# Strategy: start from the official Ubuntu 22.04 cloud image (qcow2), convert
+# to raw, mount it, install python3 + sudo + curl + iproute2, drop the agent in
+# place, install a systemd unit that runs the agent on boot, and add a
+# /etc/fstab line that mounts the dataset disk read-only.
+#
+# The resulting image is /opt/clickbench-playground/base-rootfs.ext4. Per-system
+# images are produced by overlaying the system's ClickBench scripts onto a copy
+# of this base.
+#
+# Idempotent: re-running just re-builds the file from scratch.
+
+set -euo pipefail
+
+STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}"
+TMP="${STATE_DIR}/tmp/base-build"
+OUT="${STATE_DIR}/base-rootfs.ext4"
+SIZE_GB="${BASE_ROOTFS_SIZE_GB:-8}"
+CLOUDIMG_URL="${UBUNTU_CLOUDIMG_URL:-https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-amd64.img}"
+REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+AGENT_DIR="${REPO_DIR}/playground/agent"
+
+echo "[base] state=$STATE_DIR out=$OUT size=${SIZE_GB}G"
+
+mkdir -p "$TMP"
+mkdir -p "$STATE_DIR/cache"
+
+CLOUDIMG="$STATE_DIR/cache/jammy-cloudimg.img"
+if [ ! -f "$CLOUDIMG" ]; then
+ echo "[base] downloading cloud image"
+ curl -fsSL "$CLOUDIMG_URL" -o "${CLOUDIMG}.part"
+ mv "${CLOUDIMG}.part" "$CLOUDIMG"
+fi
+
+# Plan: rather than grow the cloud image's partition (which involves
+# sfdisk/growpart/resize2fs — all of which call `sync` and therefore stall
+# whenever the host is under unrelated writeback pressure), we work in two
+# fixed-size hops:
+#
+# 1. Loop-mount the cloud image's existing partition (2 GB) and use that
+# as a read-only source.
+# 2. Create a fresh, no-partition-table ext4 image of SIZE_GB and mount it
+# as the build root. Copy the cloud image's content into it. The new
+# image is what Firecracker boots from (it expects a flat ext4, no
+# partition table).
+#
+# No growpart, no resize2fs, no waiting on the kernel to flush GBs of
+# unrelated dirty pages just to update a partition table.
+
+RAW="$TMP/base.raw"
+echo "[base] converting cloud image to raw"
+qemu-img convert -O raw "$CLOUDIMG" "$RAW"
+
+SRC_LOOP="$(sudo losetup --find --show --partscan "$RAW")"
+trap 'sudo losetup -d "$SRC_LOOP" 2>/dev/null || true' EXIT
+for i in $(seq 1 20); do
+ if [ -b "${SRC_LOOP}p1" ]; then break; fi
+ sleep 0.5
+done
+
+SRC_MNT="$TMP/src"
+mkdir -p "$SRC_MNT"
+sudo mount -o ro "${SRC_LOOP}p1" "$SRC_MNT"
+
+# Now build the *target* image: a plain ext4 file of SIZE_GB with no partition
+# table. Firecracker boots root=/dev/vda directly off this.
+echo "[base] mkfs.ext4 -> ${SIZE_GB}G no-partition flat image"
+FLAT="$TMP/base.flat.ext4"
+fallocate -l "${SIZE_GB}G" "$FLAT"
+mkfs.ext4 -F -L cbroot -E lazy_itable_init=1,lazy_journal_init=1 "$FLAT" >/dev/null
+
+DST_LOOP="$(sudo losetup --find --show "$FLAT")"
+MNT="$TMP/mnt"
+mkdir -p "$MNT"
+sudo mount "$DST_LOOP" "$MNT"
+trap '
+ sudo umount "'"$SRC_MNT"'" 2>/dev/null || true
+ sudo umount "'"$MNT"'" 2>/dev/null || true
+ sudo losetup -d "'"$SRC_LOOP"'" 2>/dev/null || true
+ sudo losetup -d "'"$DST_LOOP"'" 2>/dev/null || true
+' EXIT
+
+# Stage the cloud image contents into the new rootfs.
+echo "[base] copying cloud image content into flat rootfs"
+sudo cp -a "$SRC_MNT"/. "$MNT"/
+sudo umount "$SRC_MNT"
+sudo losetup -d "$SRC_LOOP"
+trap '
+ sudo umount "'"$MNT"'" 2>/dev/null || true
+ sudo losetup -d "'"$DST_LOOP"'" 2>/dev/null || true
+' EXIT
+
+# Bind /dev /proc /sys for the chroot.
+for d in dev proc sys; do
+ sudo mkdir -p "$MNT/$d"
+ sudo mount --rbind "/$d" "$MNT/$d"
+done
+trap '
+ for d in dev proc sys; do sudo umount -lR "'"$MNT"'/$d" 2>/dev/null || true; done
+ sudo umount "'"$MNT"'" 2>/dev/null || true
+ sudo losetup -d "'"$DST_LOOP"'" 2>/dev/null || true
+' EXIT
+
+# Resolve DNS from host inside the chroot. The cloud image ships
+# /etc/resolv.conf as a symlink into /run/systemd/resolve/ which is empty
+# until systemd-resolved starts; we need a real file for the chroot's apt
+# to work.
+sudo rm -f "$MNT/etc/resolv.conf"
+sudo install -m 0644 /etc/resolv.conf "$MNT/etc/resolv.conf"
+
+# Run system customization inside the chroot.
+sudo tee "$MNT/tmp/customize.sh" >/dev/null <<'CUSTOMIZE'
+#!/bin/bash
+set -euxo pipefail
+export DEBIAN_FRONTEND=noninteractive
+
+# Disable cloud-init's network configuration so eth0 just comes up via
+# /etc/network/interfaces-style config we install below.
+echo 'network: {config: disabled}' > /etc/cloud/cloud.cfg.d/99-disable-network-config.cfg
+
+# Keep the image small: turn off heavy services that we don't need on a
+# query-serving microVM.
+systemctl disable snapd.service snapd.socket snapd.seeded.service 2>/dev/null || true
+systemctl mask snapd.service snapd.socket snapd.seeded.service 2>/dev/null || true
+systemctl disable unattended-upgrades.service apt-daily.timer apt-daily-upgrade.timer 2>/dev/null || true
+systemctl mask unattended-upgrades.service apt-daily.timer apt-daily-upgrade.timer 2>/dev/null || true
+
+apt-get update -qq
+apt-get install -y --no-install-recommends \
+ python3 python3-yaml ca-certificates curl wget gnupg sudo less vim-tiny \
+ iproute2 iputils-ping net-tools openssh-server lsb-release \
+ htop sysstat strace ncdu pigz unzip xz-utils zstd \
+ build-essential netbase
+apt-get clean
+rm -rf /var/lib/apt/lists/*
+
+# Network: the host sets up the VM's IP via the kernel `ip=` cmdline so the
+# guest comes up with the right /24 for its slot. systemd-networkd in the
+# guest must NOT fight the kernel's static config — disable it and rely on
+# the kernel-supplied address. /etc/resolv.conf gets a static fallback so DNS
+# works in case any post-snapshot tooling still wants it (it shouldn't —
+# internet is dropped after the snapshot).
+systemctl disable systemd-networkd 2>/dev/null || true
+systemctl disable systemd-resolved 2>/dev/null || true
+rm -f /etc/resolv.conf
+cat > /etc/resolv.conf <<'RESOLV'
+nameserver 1.1.1.1
+nameserver 8.8.8.8
+RESOLV
+
+# Serial autologin as root on ttyS0, so the Firecracker console is usable for
+# debugging without a password prompt.
+mkdir -p /etc/systemd/system/serial-getty@ttyS0.service.d
+cat > /etc/systemd/system/serial-getty@ttyS0.service.d/override.conf <<'GETTY'
+[Service]
+ExecStart=
+ExecStart=-/sbin/agetty --autologin root --noclear %I $TERM
+GETTY
+
+# Mount the dataset disk read-only and the per-system disk read-write, both by
+# the labels the image build scripts set. nofail so the base image still boots
+# with only the rootfs attached (smoke-boot does exactly that).
+mkdir -p /opt/clickbench/datasets /opt/clickbench/system
+cat > /etc/fstab <<'FSTAB'
+LABEL=cbdata    /opt/clickbench/datasets  ext4  ro,nofail        0 0
+LABEL=cbsystem  /opt/clickbench/system    ext4  defaults,nofail  0 0
+FSTAB
+CUSTOMIZE
+
+echo "[base] running customize.sh in chroot"
+sudo chroot "$MNT" /bin/bash /tmp/customize.sh
+sudo rm -f "$MNT/tmp/customize.sh"
+
+# Install the in-VM agent and enable its systemd unit.
+sudo mkdir -p "$MNT/opt/clickbench-agent"
+sudo install -m 0755 "$AGENT_DIR/agent.py" "$MNT/opt/clickbench-agent/agent.py"
+sudo install -m 0644 "$AGENT_DIR/clickbench-agent.service" \
+    "$MNT/etc/systemd/system/clickbench-agent.service"
+sudo mkdir -p "$MNT/etc/systemd/system/multi-user.target.wants"
+sudo ln -sf ../clickbench-agent.service \
+    "$MNT/etc/systemd/system/multi-user.target.wants/clickbench-agent.service"
+
+# Tear down the chroot mounts and the loop device. $FLAT is already the final
+# flat image, so there is no further raw -> flat re-copy step.
+sudo umount -lR "$MNT/dev" "$MNT/proc" "$MNT/sys"
+sudo umount "$MNT"
+sudo losetup -d "$DST_LOOP"
+trap - EXIT
+
+mv "$FLAT" "$OUT"
+rm -rf "$TMP"
+echo "[base] done: $OUT ($(du -h "$OUT" | cut -f1) physical, $(du -h --apparent-size "$OUT" | cut -f1) apparent)"
diff --git a/playground/images/build-datasets-image.sh b/playground/images/build-datasets-image.sh
new file mode 100755
index 0000000000..492857c9bf
--- /dev/null
+++ b/playground/images/build-datasets-image.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Bundle the downloaded datasets directory into a single read-only ext4 image
+# (datasets.ext4) that gets attached to every Firecracker VM as a virtio-blk
+# device. Mounted at /opt/clickbench/datasets in the guest.
+#
+# A single shared read-only image is much more efficient than virtio-fs (which
+# Firecracker doesn't ship) or per-VM copies of a ~250 GB dataset.
+
+set -euo pipefail
+
+STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}"
+SRC="$STATE_DIR/datasets"
+OUT="$STATE_DIR/datasets.ext4"
+
+if [ ! -d "$SRC" ]; then
+ echo "no datasets dir: $SRC" >&2
+ exit 1
+fi
+
+bytes=$(du -sb "$SRC" | awk '{print $1}')
+overhead=$(( 4 * 1024 * 1024 * 1024 )) # 4 GiB headroom for ext4 metadata
+size=$(( bytes + overhead ))
+# Round up to MiB
+size_mib=$(( (size + 1024*1024 - 1) / (1024*1024) ))
+
+echo "[datasets] payload=$bytes B image=$size_mib MiB out=$OUT"
+
+rm -f "$OUT"
+truncate -s "${size_mib}M" "$OUT"
+mkfs.ext4 -F -L cbdata -m 0 -E lazy_itable_init=1,lazy_journal_init=1 -O ^has_journal "$OUT" >/dev/null
+
+MNT="$(mktemp -d)"
+trap 'sudo umount "'"$MNT"'" 2>/dev/null || true; rmdir "'"$MNT"'" 2>/dev/null || true' EXIT
+sudo mount -o loop "$OUT" "$MNT"
+sudo rsync -a --info=progress2 "$SRC"/. "$MNT"/
+sudo sync
+sudo umount "$MNT"
+trap - EXIT
+
+echo "[datasets] done"
+ls -lh "$OUT"
diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh
new file mode 100755
index 0000000000..90efe01c99
--- /dev/null
+++ b/playground/images/build-system-rootfs.sh
@@ -0,0 +1,170 @@
+#!/bin/bash
+# Build a per-system rootfs and "system disk" image for Firecracker.
+#
+# Outputs (under /opt/clickbench-playground/systems/<system>/):
+# rootfs.ext4 CoW-ish copy of base-rootfs.ext4 (sparse 200 GB)
+# system.ext4 ext4 holding ClickBench scripts + the dataset files
+# this system needs. Mounted RW at /opt/clickbench/system
+# in the VM. We include the data here (not a separate
+# read-only datasets disk) because many load scripts do
+# `sudo chown` on the source files, and chown follows
+# symlinks — i.e. it tries to mutate the RO-mounted
+# dataset and fails. Putting the data on the RW system
+# disk sidesteps the problem entirely.
+#
+# The disk is sized based on the system's data format:
+# parquet, parquet-partitioned 16 GB
+# tsv, csv 88 GB
+# none/unknown 2 GB
+#
+# Usage: build-system-rootfs.sh <system>
+
+set -euo pipefail
+
+if [ $# -lt 1 ]; then
+ echo "usage: $0 " >&2
+ exit 2
+fi
+SYSTEM="$1"
+
+STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}"
+REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+BASE="$STATE_DIR/base-rootfs.ext4"
+DATASETS="$STATE_DIR/datasets"
+SRC="$REPO_DIR/$SYSTEM"
+OUT_DIR="$STATE_DIR/systems/$SYSTEM"
+ROOTFS="$OUT_DIR/rootfs.ext4"
+SYSDISK="$OUT_DIR/system.ext4"
+
+ROOTFS_SIZE_GB="${VM_ROOTFS_SIZE_GB:-200}"
+
+if [ ! -f "$BASE" ]; then
+ echo "base rootfs not found: $BASE — run build-base-rootfs.sh first" >&2
+ exit 1
+fi
+if [ ! -d "$SRC" ]; then
+ echo "no such system directory: $SRC" >&2
+ exit 1
+fi
+for f in install start load query check stop; do
+ if [ ! -x "$SRC/$f" ]; then
+ echo "system '$SYSTEM' missing executable $f — not playground-ready" >&2
+ exit 1
+ fi
+done
+
+# Discover the data format from the system's benchmark.sh: grep out the
+# BENCH_DOWNLOAD_SCRIPT assignment (with or without `export`) and eval just
+# that one line inside a subshell, so nothing else in benchmark.sh runs or
+# leaks into this script.
+download_script="$(set +e; unset BENCH_DOWNLOAD_SCRIPT; \
+ eval "$(grep -E '^[[:space:]]*(export[[:space:]]+)?BENCH_DOWNLOAD_SCRIPT=' "$SRC/benchmark.sh" | head -1)"; \
+ printf '%s' "${BENCH_DOWNLOAD_SCRIPT:-}")"
+case "$download_script" in
+ *parquet-partitioned*) format=parquet-partitioned; sysdisk_size_gb=16 ;;
+ *parquet-single*) format=parquet; sysdisk_size_gb=16 ;;
+ *tsv*) format=tsv; sysdisk_size_gb=88 ;;
+ *csv*) format=csv; sysdisk_size_gb=88 ;;
+ "") format=none; sysdisk_size_gb=2 ;;
+ *) format=unknown; sysdisk_size_gb=4 ;;
+esac
+echo "[sys:$SYSTEM] format=$format sysdisk_size=${sysdisk_size_gb}G"
+
+mkdir -p "$OUT_DIR"
+
+# 1. Rootfs as a sparse file. Allocate 200 GB but only write blocks when
+# something inside the VM dirties them.
+echo "[sys:$SYSTEM] rootfs.ext4 ${ROOTFS_SIZE_GB}G (sparse)"
+rm -f "$ROOTFS"
+truncate -s "${ROOTFS_SIZE_GB}G" "$ROOTFS"
+mkfs.ext4 -F -L cbroot -E lazy_itable_init=1,lazy_journal_init=1 "$ROOTFS" >/dev/null
+
+BASE_MNT="$(mktemp -d)"
+DST_MNT="$(mktemp -d)"
+trap '
+ sudo umount "'"$BASE_MNT"'" 2>/dev/null || true
+ sudo umount "'"$DST_MNT"'" 2>/dev/null || true
+ rmdir "'"$BASE_MNT"'" "'"$DST_MNT"'" 2>/dev/null || true
+' EXIT
+# A prior smoke-boot likely left the base rootfs's journal dirty. Replay it
+# (fsck -fy is idempotent) before opening read-only — otherwise the loop
+# mount refuses with "cannot mount read-only" and the script blows up
+# silently.
+sudo e2fsck -fy "$BASE" >/dev/null 2>&1 || true
+sudo mount -o loop,ro "$BASE" "$BASE_MNT"
+sudo mount -o loop "$ROOTFS" "$DST_MNT"
+sudo cp -a --reflink=auto "$BASE_MNT"/. "$DST_MNT"/
+echo "$SYSTEM" | sudo tee "$DST_MNT/etc/clickbench-system" >/dev/null
+sudo sync
+sudo umount "$DST_MNT"
+sudo umount "$BASE_MNT"
+trap - EXIT
+
+# 2. System disk: ClickBench scripts + the data files this system needs.
+# Sized per-format. The agent runs ./install/./start/./load with cwd here, so
+# the load script's relative references to hits.parquet / hits.tsv / etc. all
+# resolve to local files it owns.
+echo "[sys:$SYSTEM] system.ext4 ${sysdisk_size_gb}G"
+rm -f "$SYSDISK"
+truncate -s "${sysdisk_size_gb}G" "$SYSDISK"
+mkfs.ext4 -F -L cbsystem -E lazy_itable_init=1,lazy_journal_init=1 "$SYSDISK" >/dev/null
+
+SYS_MNT="$(mktemp -d)"
+trap 'sudo umount "'"$SYS_MNT"'" 2>/dev/null || true; rmdir "'"$SYS_MNT"'" 2>/dev/null || true' EXIT
+sudo mount -o loop "$SYSDISK" "$SYS_MNT"
+
+# Scripts.
+sudo rsync -a --exclude 'results/' --exclude '*.json' --exclude 'README*' \
+ "$SRC"/ "$SYS_MNT"/
+
+# Some systems' scripts use ../lib/... — provide it.
+sudo mkdir -p "$SYS_MNT/_lib"
+sudo cp -a "$REPO_DIR/lib"/. "$SYS_MNT/_lib"/
+
+# Data files.
+case "$format" in
+ parquet)
+ if [ -f "$DATASETS/hits.parquet" ]; then
+ echo "[sys:$SYSTEM] copying hits.parquet"
+ sudo cp --reflink=auto "$DATASETS/hits.parquet" "$SYS_MNT/hits.parquet"
+ else
+ echo "[sys:$SYSTEM] WARN hits.parquet not present in datasets dir"
+ fi
+ ;;
+ parquet-partitioned)
+ if [ -d "$DATASETS/hits_partitioned" ]; then
+ echo "[sys:$SYSTEM] copying 100 partitioned parquet files"
+ sudo cp --reflink=auto "$DATASETS/hits_partitioned"/hits_*.parquet "$SYS_MNT/"
+ else
+ echo "[sys:$SYSTEM] WARN hits_partitioned/ not present"
+ fi
+ ;;
+ tsv)
+ if [ -f "$DATASETS/hits.tsv" ]; then
+ echo "[sys:$SYSTEM] copying hits.tsv (large)"
+ sudo cp --reflink=auto "$DATASETS/hits.tsv" "$SYS_MNT/hits.tsv"
+ else
+ echo "[sys:$SYSTEM] WARN hits.tsv not present"
+ fi
+ ;;
+ csv)
+ if [ -f "$DATASETS/hits.csv" ]; then
+ echo "[sys:$SYSTEM] copying hits.csv (large)"
+ sudo cp --reflink=auto "$DATASETS/hits.csv" "$SYS_MNT/hits.csv"
+ else
+ echo "[sys:$SYSTEM] WARN hits.csv not present"
+ fi
+ ;;
+ none|unknown)
+ echo "[sys:$SYSTEM] no data staging for format=$format"
+ ;;
+esac
+
+sudo chown -R 0:0 "$SYS_MNT"
+sudo chmod -R u+rwX,go+rX "$SYS_MNT"
+sudo sync
+sudo umount "$SYS_MNT"
+trap - EXIT
+
+echo "[sys:$SYSTEM] done"
+ls -lh "$OUT_DIR"
diff --git a/playground/scripts/agent-selftest.sh b/playground/scripts/agent-selftest.sh
new file mode 100755
index 0000000000..afad1ca506
--- /dev/null
+++ b/playground/scripts/agent-selftest.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+# Spin up the agent in a local sandbox and hit its HTTP endpoints. Useful for
+# iterating on agent.py without rebuilding a Firecracker image.
+#
+# The sandbox is just two temp directories that mimic the in-VM mounts:
+# /tmp/clickbench-selftest/system — copy of the duckdb system dir
+# /tmp/clickbench-selftest/datasets — empty
+#
+# We exercise:
+# GET /health expects 200 with provisioned=false
+# GET /stats expects 200 with cpu/mem/disk
+#  POST /provision expects 200 (install/start/load are replaced with no-ops below)
+# POST /query expects 200 with timing headers, output bytes capped
+#
+# Cleanup: kills the agent on exit.
+
+set -euo pipefail
+
+SANDBOX="${SANDBOX:-/tmp/clickbench-selftest}"
+SYS="${SANDBOX}/system"
+DATA="${SANDBOX}/datasets"
+PORT="${PORT:-18080}"
+REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+rm -rf "$SANDBOX"
+mkdir -p "$SYS" "$DATA"
+cp -a "$REPO_DIR/duckdb"/. "$SYS"/
+
+# A trivial "system" that doesn't need provisioning: replace install/start/load
+# with no-ops so the smoke test focuses on the agent's HTTP path.
+cat > "$SYS/install" <<'EOF'
+#!/bin/bash
+echo "fake install"
+EOF
+cat > "$SYS/start" <<'EOF'
+#!/bin/bash
+exit 0
+EOF
+cat > "$SYS/check" <<'EOF'
+#!/bin/bash
+exit 0
+EOF
+cat > "$SYS/load" <<'EOF'
+#!/bin/bash
+echo "fake load"
+EOF
+# A query script that echoes the request and reports 0.123s.
+cat > "$SYS/query" <<'EOF'
+#!/bin/bash
+cat
+echo "0.123" >&2
+EOF
+chmod +x "$SYS"/{install,start,check,load,query}
+
+echo "selftest: starting agent on :$PORT"
+CLICKBENCH_SYSTEM_DIR="$SYS" \
+CLICKBENCH_DATASETS_DIR="$DATA" \
+CLICKBENCH_AGENT_STATE="$SANDBOX/state" \
+CLICKBENCH_SYSTEM_NAME=selftest \
+CLICKBENCH_AGENT_PORT="$PORT" \
+CLICKBENCH_OUTPUT_LIMIT=64 \
+python3 "$REPO_DIR/playground/agent/agent.py" &
+AGENT_PID=$!
+trap 'kill $AGENT_PID 2>/dev/null || true' EXIT
+
+# wait for listen
+for i in {1..30}; do
+ if curl -fsS "http://127.0.0.1:$PORT/health" >/dev/null 2>&1; then
+ break
+ fi
+ sleep 0.2
+done
+
+echo "--- /health ---"
+curl -fsS "http://127.0.0.1:$PORT/health"
+echo "--- /stats ---"
+curl -fsS "http://127.0.0.1:$PORT/stats"
+echo "--- POST /provision ---"
+curl -fsS -X POST "http://127.0.0.1:$PORT/provision" | head -c 500; echo
+
+echo "--- POST /query (capped output) ---"
+LONG_BODY="$(printf 'X%.0s' {1..2048})" # 2 KB of X
+curl -sS -X POST --data-binary "$LONG_BODY" "http://127.0.0.1:$PORT/query" -D - -o /tmp/clickbench-selftest.out
+echo
+echo "Output size: $(wc -c < /tmp/clickbench-selftest.out) bytes (cap was 64)"
+echo "First chars: $(head -c 32 /tmp/clickbench-selftest.out)"
+
+echo "--- POST /query (without provisioning state) ---"
+rm -rf "$SANDBOX/state"
+mkdir -p "$SANDBOX/state"
+curl -sS -X POST --data-binary "SELECT 1" "http://127.0.0.1:$PORT/query" -D - -o /dev/null | head -3
+
+echo "OK"
diff --git a/playground/scripts/download-datasets.sh b/playground/scripts/download-datasets.sh
new file mode 100755
index 0000000000..b30fff4473
--- /dev/null
+++ b/playground/scripts/download-datasets.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+# Eagerly download every ClickBench dataset format into the playground
+# datasets dir. Idempotent: each download script is `wget --continue`-based,
+# so re-running picks up where the previous run left off.
+#
+# Output:
+# /opt/clickbench-playground/datasets/
+# hits.parquet single-file Athena parquet
+# hits_partitioned/hits_0..99.parquet partitioned parquet
+# hits.tsv decompressed TSV (~75 GB)
+# hits.csv decompressed CSV (~75 GB)
+#
+# These files are read-only-mounted into every Firecracker VM via a virtio-blk
+# device built by `build-datasets-image.sh`.
+
+set -e
+
+STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}"
+DATASETS="${STATE_DIR}/datasets"
+LIB="$(cd "$(dirname "${BASH_SOURCE[0]}")"/../.. && pwd)/lib"
+
+mkdir -p "$DATASETS"
+mkdir -p "$DATASETS/hits_partitioned"
+
+step() { echo "[$(date -u +%FT%TZ)] $*"; }
+
+step "parquet (single)"
+if [ ! -f "$DATASETS/hits.parquet" ] || [ "$(stat -c%s "$DATASETS/hits.parquet" 2>/dev/null || echo 0)" -lt 14000000000 ]; then
+ "$LIB/download-hits-parquet-single" "$DATASETS"
+else
+ step " cached"
+fi
+
+step "parquet (partitioned)"
+need=0
+for i in $(seq 0 99); do
+ f="$DATASETS/hits_partitioned/hits_${i}.parquet"
+ if [ ! -f "$f" ] || [ "$(stat -c%s "$f" 2>/dev/null || echo 0)" -lt 100000000 ]; then
+ need=1
+ break
+ fi
+done
+if [ "$need" = "1" ]; then
+ "$LIB/download-hits-parquet-partitioned" "$DATASETS/hits_partitioned"
+else
+ step " cached"
+fi
+
+step "tsv"
+if [ ! -f "$DATASETS/hits.tsv" ] || [ "$(stat -c%s "$DATASETS/hits.tsv" 2>/dev/null || echo 0)" -lt 70000000000 ]; then
+ "$LIB/download-hits-tsv" "$DATASETS"
+else
+ step " cached"
+fi
+
+step "csv"
+if [ ! -f "$DATASETS/hits.csv" ] || [ "$(stat -c%s "$DATASETS/hits.csv" 2>/dev/null || echo 0)" -lt 70000000000 ]; then
+ "$LIB/download-hits-csv" "$DATASETS"
+else
+ step " cached"
+fi
+
+step "done"
+du -sh "$DATASETS"/*
diff --git a/playground/scripts/install-firecracker.sh b/playground/scripts/install-firecracker.sh
new file mode 100755
index 0000000000..f2dfe9cd84
--- /dev/null
+++ b/playground/scripts/install-firecracker.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Idempotent: download firecracker + jailer if they're not in
+# /opt/clickbench-playground/bin/, and fetch the guest kernel.
+
+set -euo pipefail
+
+STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}"
+FC_VERSION="${FIRECRACKER_VERSION:-v1.13.1}"
+KERNEL_URL="${GUEST_KERNEL_URL:-https://s3.amazonaws.com/spec.ccfc.min/firecracker-ci/v1.13/x86_64/vmlinux-6.1.141}"
+
+sudo mkdir -p "$STATE_DIR"/{bin,kernel,datasets,systems,vms,logs,run,snapshots,tmp,cache}
+sudo chown -R "$(id -u):$(id -g)" "$STATE_DIR"
+
+if [ ! -x "$STATE_DIR/bin/firecracker" ]; then
+ arch="$(uname -m)"
+ url="https://github.com/firecracker-microvm/firecracker/releases/download/${FC_VERSION}/firecracker-${FC_VERSION}-${arch}.tgz"
+ echo "[install] firecracker ${FC_VERSION}"
+ tmpdir="$(mktemp -d)"
+ curl -fsSL "$url" -o "$tmpdir/firecracker.tgz"
+ tar -C "$tmpdir" -xzf "$tmpdir/firecracker.tgz" --strip-components=1
+ install -m 0755 "$tmpdir/firecracker-${FC_VERSION}-${arch}" "$STATE_DIR/bin/firecracker"
+ install -m 0755 "$tmpdir/jailer-${FC_VERSION}-${arch}" "$STATE_DIR/bin/jailer"
+ rm -rf "$tmpdir"
+fi
+
+if [ ! -f "$STATE_DIR/kernel/vmlinux" ]; then
+ echo "[install] guest kernel"
+ curl -fsSL "$KERNEL_URL" -o "$STATE_DIR/kernel/vmlinux"
+fi
+
+# IP forwarding for the per-VM TAPs.
+sudo sysctl -w net.ipv4.ip_forward=1 >/dev/null
+echo "net.ipv4.ip_forward=1" | sudo tee /etc/sysctl.d/99-clickbench-playground.conf >/dev/null
+
+echo "[install] done"
+"$STATE_DIR/bin/firecracker" --version
diff --git a/playground/scripts/run-server.sh b/playground/scripts/run-server.sh
new file mode 100755
index 0000000000..b3bc56b959
--- /dev/null
+++ b/playground/scripts/run-server.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Convenience wrapper to start the playground API server in the foreground.
+# Looks for .env in the repo root for ClickHouse Cloud creds.
+
+set -euo pipefail
+
+REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+if [ -f "$REPO_DIR/playground/.env" ]; then
+ # shellcheck disable=SC2046
+ export $(grep -v '^#' "$REPO_DIR/playground/.env" | xargs)
+fi
+
+cd "$REPO_DIR"
+exec python3 -m playground.server.main
diff --git a/playground/scripts/smoke-boot.sh b/playground/scripts/smoke-boot.sh
new file mode 100755
index 0000000000..d79ecc8c87
--- /dev/null
+++ b/playground/scripts/smoke-boot.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+# Boot a single Firecracker VM with the playground's base rootfs, attaching
+# only the rootfs (no system disk, no dataset disk). Confirms the kernel +
+# rootfs + agent path works end-to-end before we start asking it to install
+# a database. Tears down on exit.
+#
+# Usage: smoke-boot.sh [slot]
+# Logs go to /opt/clickbench-playground/logs/smoke-boot.log
+
+set -euo pipefail
+
+STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}"
+SLOT="${1:-250}" # high slot to avoid clashing with the real registry
+SOCK="$STATE_DIR/vms/smoke-boot.sock"
+LOG="$STATE_DIR/logs/smoke-boot.log"
+TAP="fc-tap-${SLOT}"
+HOST_IP="10.200.${SLOT}.1"
+GUEST_IP="10.200.${SLOT}.2"
+
+cleanup() {
+ echo "[smoke] cleanup"
+ pkill -f "firecracker.*${SOCK}" 2>/dev/null || true
+ sleep 0.3
+ sudo ip link set "$TAP" down 2>/dev/null || true
+ sudo ip tuntap del dev "$TAP" mode tap 2>/dev/null || true
+ rm -f "$SOCK"
+}
+trap cleanup EXIT
+
+mkdir -p "$STATE_DIR/vms" "$STATE_DIR/logs"
+rm -f "$SOCK"
+
+if ! ip link show "$TAP" >/dev/null 2>&1; then
+ sudo ip tuntap add dev "$TAP" mode tap
+fi
+sudo ip addr flush dev "$TAP" 2>/dev/null || true
+sudo ip addr add "${HOST_IP}/24" dev "$TAP"
+sudo ip link set "$TAP" up
+
+# Start Firecracker
+"$STATE_DIR/bin/firecracker" --api-sock "$SOCK" --id smoke-boot >"$LOG" 2>&1 &
+FC_PID=$!
+echo "[smoke] firecracker pid=$FC_PID sock=$SOCK"
+
+# Wait for socket
+for _ in $(seq 1 40); do
+ [ -S "$SOCK" ] && break
+ sleep 0.1
+done
+
+api() {
+ local m="$1" path="$2" body="${3:-}"
+ if [ -n "$body" ]; then
+ curl --unix-socket "$SOCK" -fsS -X "$m" "http://localhost$path" \
+ -H 'Content-Type: application/json' --data "$body"
+ else
+ curl --unix-socket "$SOCK" -fsS -X "$m" "http://localhost$path"
+ fi
+}
+
+api PUT /boot-source "$(cat <<JSON
+{"kernel_image_path": "$STATE_DIR/kernel/vmlinux",
+ "boot_args": "console=ttyS0 reboot=k panic=1 pci=off root=/dev/vda rw ip=${GUEST_IP}::${HOST_IP}:255.255.255.0::eth0:off"}
+JSON
+)"
+api PUT /drives/rootfs "$(cat <<JSON
+{"drive_id": "rootfs",
+ "path_on_host": "$STATE_DIR/base-rootfs.ext4",
+ "is_root_device": true,
+ "is_read_only": false}
+JSON
+)"
+api PUT /machine-config '{"vcpu_count": 2, "mem_size_mib": 2048}'
+api PUT /network-interfaces/eth0 "$(cat <<JSON
+{"iface_id": "eth0", "host_dev_name": "$TAP"}
+JSON
+)"
+
+echo "[smoke] starting VM"
+api PUT /actions '{"action_type": "InstanceStart"}'
+
+# Wait for the in-VM agent to answer /health.
+ok=0
+for i in $(seq 1 60); do
+  if curl -fsS "http://${GUEST_IP}:8080/health" >/dev/null 2>&1; then
+ ok=1
+ break
+ fi
+ sleep 1
+done
+
+if [ "$ok" = "1" ]; then
+ echo "[smoke] OK — agent responded after ${i}s"
+ curl -fsS "http://${GUEST_IP}:8080/health" | head -c 200; echo
+ echo "[smoke] /stats:"
+ curl -fsS "http://${GUEST_IP}:8080/stats" | head -c 400; echo
+else
+ echo "[smoke] FAIL — agent never responded; firecracker log tail:"
+ tail -30 "$LOG"
+fi
diff --git a/playground/server/__init__.py b/playground/server/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/playground/server/config.py b/playground/server/config.py
new file mode 100644
index 0000000000..6a08189d85
--- /dev/null
+++ b/playground/server/config.py
@@ -0,0 +1,98 @@
+"""Central configuration for the playground server.
+
+All knobs are read from environment variables so a single systemd unit can drop
+them in. Falls back to sensible defaults for local development.
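+
+A minimal environment for local development might look like this (values are
+illustrative, not requirements):
+
+    PLAYGROUND_STATE_DIR=/opt/clickbench-playground
+    PLAYGROUND_LISTEN=127.0.0.1:8000
+    VM_VCPUS=2
+    VM_MEM_MIB=4096
+    PLAYGROUND_MAX_VMS=4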
+"""
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+
+def _env_int(name: str, default: int) -> int:
+ v = os.environ.get(name)
+ if not v:
+ return default
+ try:
+ return int(v)
+ except ValueError:
+ return default
+
+
+def _env_bytes(name: str, default: int) -> int:
+ return _env_int(name, default)
+
+
+@dataclass(frozen=True)
+class Config:
+ # Where on the host disk we keep VM artifacts and dataset images.
+ state_dir: Path
+ repo_dir: Path
+ # HTTP listen target for the playground API server. Plain host:port string;
+ # aiohttp parses it.
+ listen_host: str
+ listen_port: int
+ # Per-VM resources.
+ vm_vcpus: int
+ vm_mem_mib: int
+ vm_rootfs_size_gb: int
+ # Output cap applied at the host edge (the agent enforces a per-VM cap too).
+ output_limit_bytes: int
+ # Max number of VMs we'll keep "warm" (resumed from snapshot, ready to
+ # answer) concurrently.
+ max_warm_vms: int
+ # Watchdog thresholds.
+ cpu_busy_window_sec: int
+ cpu_busy_threshold: float
+ host_min_free_ram_gb: int
+ host_min_free_disk_gb: int
+ # Per-system disk full check.
+ vm_disk_pct_kill_threshold: float
+ # ClickHouse Cloud logging.
+ ch_cloud_url: str
+ ch_cloud_user: str
+ ch_cloud_password: str
+ ch_cloud_db: str
+
+ @property
+ def kernel_path(self) -> Path: return self.state_dir / "kernel" / "vmlinux"
+ @property
+ def base_rootfs(self) -> Path: return self.state_dir / "base-rootfs.ext4"
+ @property
+ def datasets_image(self) -> Path: return self.state_dir / "datasets.ext4"
+ @property
+ def systems_dir(self) -> Path: return self.state_dir / "systems"
+ @property
+ def vms_dir(self) -> Path: return self.state_dir / "vms"
+ @property
+ def logs_dir(self) -> Path: return self.state_dir / "logs"
+ @property
+ def firecracker_bin(self) -> Path: return self.state_dir / "bin" / "firecracker"
+
+
+def load() -> Config:
+ state_dir = Path(os.environ.get("PLAYGROUND_STATE_DIR", "/opt/clickbench-playground"))
+ repo_dir = Path(os.environ.get("PLAYGROUND_REPO_DIR", "/home/ubuntu/ClickBench"))
+ listen = os.environ.get("PLAYGROUND_LISTEN", "0.0.0.0:8000")
+ host, _, port = listen.rpartition(":")
+ return Config(
+ state_dir=state_dir,
+ repo_dir=repo_dir,
+ listen_host=host or "0.0.0.0",
+ listen_port=int(port or 8000),
+ vm_vcpus=_env_int("VM_VCPUS", 4),
+ vm_mem_mib=_env_int("VM_MEM_MIB", 16 * 1024),
+ vm_rootfs_size_gb=_env_int("VM_ROOTFS_SIZE_GB", 200),
+ output_limit_bytes=_env_bytes("PLAYGROUND_OUTPUT_LIMIT", 10 * 1024),
+ max_warm_vms=_env_int("PLAYGROUND_MAX_VMS", 16),
+ cpu_busy_window_sec=_env_int("VM_CPU_BUSY_WINDOW_SEC", 120),
+ cpu_busy_threshold=float(os.environ.get("VM_CPU_BUSY_THRESHOLD", "0.97")),
+ host_min_free_ram_gb=_env_int("HOST_MIN_FREE_RAM_GB", 32),
+ host_min_free_disk_gb=_env_int("HOST_MIN_FREE_DISK_GB", 500),
+ vm_disk_pct_kill_threshold=float(os.environ.get("VM_DISK_FULL_PCT", "0.97")),
+ ch_cloud_url=os.environ.get("CLICKHOUSE_CLOUD_URL", ""),
+ ch_cloud_user=os.environ.get("CLICKHOUSE_CLOUD_USER", ""),
+ ch_cloud_password=os.environ.get("CLICKHOUSE_CLOUD_PASSWORD", ""),
+ ch_cloud_db=os.environ.get("CLICKHOUSE_CLOUD_DB", "playground"),
+ )
diff --git a/playground/server/firecracker.py b/playground/server/firecracker.py
new file mode 100644
index 0000000000..62aba74dca
--- /dev/null
+++ b/playground/server/firecracker.py
@@ -0,0 +1,117 @@
+"""Thin async wrapper around Firecracker's REST API (Unix socket).
+
+We talk to the Firecracker process through its API socket, not the JSON config
+file, because that's the only way to drive snapshot create/load and to mutate
+runtime state.
+
+The HTTP layer is hand-rolled (single-shot HTTP/1.1 over Unix socket) so we
+don't pull in extra deps just to send a few PUTs. Each call opens a new
+connection — Firecracker's API socket is single-threaded and that's fine.
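+
+Typical use from the VM manager, as a sketch (the socket path is illustrative):
+
+    sock = "/opt/clickbench-playground/vms/clickhouse.sock"
+    await put(sock, "/machine-config", {"vcpu_count": 4, "mem_size_mib": 16384})
+    await put(sock, "/actions", {"action_type": "InstanceStart"})
+    info = await get(sock, "/")   # parsed JSON body, or {} on an empty response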
+"""
+from __future__ import annotations
+
+import asyncio
+import json
+from typing import Any
+
+
+class FirecrackerError(RuntimeError):
+ pass
+
+
+async def _request(socket_path: str, method: str, path: str, body: Any = None,
+ timeout: float = 30.0) -> tuple[int, bytes]:
+ payload = b""
+ if body is not None:
+ payload = json.dumps(body).encode()
+ req_lines = [
+ f"{method} {path} HTTP/1.1",
+ "Host: localhost",
+ "Accept: application/json",
+ "Connection: close",
+ ]
+ if payload:
+ req_lines.append("Content-Type: application/json")
+ req_lines.append(f"Content-Length: {len(payload)}")
+ req_lines.append("")
+ req_lines.append("")
+ head = "\r\n".join(req_lines).encode()
+
+ reader, writer = await asyncio.wait_for(
+ asyncio.open_unix_connection(socket_path), timeout=timeout
+ )
+ try:
+ writer.write(head + payload)
+ await writer.drain()
+ # Read response head line-by-line until the blank line that ends the
+ # header block. Don't `read(-1)` — Firecracker keeps the connection
+ # open after small responses (204s in particular), so EOF-based reads
+ # block until our timeout despite the response being fully on the
+ # wire. Once we have headers we know the Content-Length and can read
+ # exactly that many body bytes.
+ head_lines: list[bytes] = []
+ while True:
+ line = await asyncio.wait_for(reader.readline(), timeout=timeout)
+ if not line:
+ # Server closed the connection mid-headers.
+ break
+ head_lines.append(line)
+ if line == b"\r\n" or line == b"\n":
+ break
+
+ if not head_lines:
+ raise FirecrackerError(f"no response from firecracker for {method} {path}")
+ status_line = head_lines[0].rstrip(b"\r\n").decode("ascii", errors="replace")
+ parts = status_line.split(" ", 2)
+ if len(parts) < 2:
+ raise FirecrackerError(f"bad status line: {status_line!r}")
+ code = int(parts[1])
+
+ content_length = 0
+ for raw_h in head_lines[1:]:
+ h = raw_h.rstrip(b"\r\n")
+ if not h:
+ continue
+ name, _, value = h.partition(b":")
+ if name.strip().lower() == b"content-length":
+ try:
+ content_length = int(value.strip())
+ except ValueError:
+ content_length = 0
+
+ body_b = b""
+ if content_length > 0:
+ body_b = await asyncio.wait_for(
+ reader.readexactly(content_length), timeout=timeout
+ )
+ finally:
+ try:
+ writer.close()
+ await writer.wait_closed()
+ except Exception:
+ pass
+ return code, body_b
+
+
+async def put(socket_path: str, path: str, body: Any = None, timeout: float = 30.0) -> None:
+ code, b = await _request(socket_path, "PUT", path, body, timeout)
+ if code >= 300:
+ raise FirecrackerError(f"PUT {path} -> {code}: {b!r}")
+
+
+async def patch(socket_path: str, path: str, body: Any = None, timeout: float = 30.0) -> None:
+ code, b = await _request(socket_path, "PATCH", path, body, timeout)
+ if code >= 300:
+ raise FirecrackerError(f"PATCH {path} -> {code}: {b!r}")
+
+
+async def get(socket_path: str, path: str, timeout: float = 30.0) -> dict:
+ code, b = await _request(socket_path, "GET", path, timeout=timeout)
+ if code >= 300:
+ raise FirecrackerError(f"GET {path} -> {code}: {b!r}")
+ if not b:
+ return {}
+ try:
+ return json.loads(b)
+ except Exception as e:
+ raise FirecrackerError(f"GET {path} -> non-JSON body: {b!r}") from e
diff --git a/playground/server/logging_sink.py b/playground/server/logging_sink.py
new file mode 100644
index 0000000000..6ba444c4e0
--- /dev/null
+++ b/playground/server/logging_sink.py
@@ -0,0 +1,190 @@
+"""Batched, async logger that writes events to ClickHouse Cloud over HTTPS.
+
+Two tables (auto-created on first connect if writeable):
+
+ playground.requests
+ ts DateTime64(6)
+ client_addr String
+ user_agent String
+ system String
+ query String
+ output_bytes UInt64
+ output_truncated UInt8
+ query_time Nullable(Float64) from agent X-Query-Time
+ wall_time Float64 host-side end-to-end
+ status UInt16 HTTP status returned to client
+ error String
+
+ playground.events
+ ts DateTime64(6)
+ system String
+ kind String "restart" / "oom-kick" / "boot" / ...
+ detail String
+
+When CLICKHOUSE_CLOUD_URL is unset, both tables are mirrored to
+/opt/clickbench-playground/logs/requests.jsonl and events.jsonl so the
+service still has an audit trail in dev.
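+
+A convenience query over the requests table (a sketch; substitute the database
+name if CLICKHOUSE_CLOUD_DB is overridden):
+
+    SELECT system, status, error, query_time
+    FROM playground.requests
+    WHERE status >= 500
+    ORDER BY ts DESC
+    LIMIT 20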
+"""
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Any
+
+import aiohttp
+
+from .config import Config
+
+log = logging.getLogger("logging_sink")
+
+
+_REQUESTS_DDL = """
+CREATE TABLE IF NOT EXISTS playground.requests (
+ ts DateTime64(6) DEFAULT now64(6),
+ client_addr String,
+ user_agent String,
+ system String,
+ query String,
+ output_bytes UInt64,
+ output_truncated UInt8,
+ query_time Nullable(Float64),
+ wall_time Float64,
+ status UInt16,
+ error String
+) ENGINE = MergeTree ORDER BY (system, ts)
+"""
+
+_EVENTS_DDL = """
+CREATE TABLE IF NOT EXISTS playground.events (
+ ts DateTime64(6) DEFAULT now64(6),
+ system String,
+ kind String,
+ detail String
+) ENGINE = MergeTree ORDER BY (system, ts)
+"""
+
+
+class LoggingSink:
+ def __init__(self, cfg: Config):
+ self.cfg = cfg
+ self._queue: asyncio.Queue[tuple[str, dict]] = asyncio.Queue(maxsize=10000)
+ self._task: asyncio.Task | None = None
+ self._session: aiohttp.ClientSession | None = None
+ self._local_files: dict[str, Path] = {}
+ self._enabled = bool(cfg.ch_cloud_url and cfg.ch_cloud_user and cfg.ch_cloud_password)
+
+ async def start(self) -> None:
+ self.cfg.logs_dir.mkdir(parents=True, exist_ok=True)
+ self._local_files = {
+ "requests": self.cfg.logs_dir / "requests.jsonl",
+ "events": self.cfg.logs_dir / "events.jsonl",
+ }
+ if self._enabled:
+ try:
+ self._session = aiohttp.ClientSession()
+ await self._run_ddl()
+ except Exception as e:
+ log.warning("ClickHouse Cloud DDL failed (%r); falling back to JSONL only", e)
+ self._enabled = False
+ self._task = asyncio.create_task(self._flusher(), name="logging-sink")
+
+ async def stop(self) -> None:
+ if self._task:
+ self._task.cancel()
+ with contextlib.suppress(asyncio.CancelledError):
+ await self._task
+ if self._session:
+ await self._session.close()
+
+ def write_request(self, **row: Any) -> None:
+ self._enqueue("requests", row)
+
+ def write_event(self, **row: Any) -> None:
+ self._enqueue("events", row)
+
+ def _enqueue(self, table: str, row: dict) -> None:
+ row.setdefault("ts", _now_dt64())
+ try:
+ self._queue.put_nowait((table, row))
+ except asyncio.QueueFull:
+ # Backpressure: drop oldest log lines first so we never block the
+ # query path on the audit trail.
+ try:
+ self._queue.get_nowait()
+ self._queue.put_nowait((table, row))
+ except Exception:
+ pass
+
+ async def _run_ddl(self) -> None:
+ await self._exec_ch(f"CREATE DATABASE IF NOT EXISTS {self.cfg.ch_cloud_db}")
+ await self._exec_ch(_REQUESTS_DDL.replace("playground.", f"{self.cfg.ch_cloud_db}."))
+ await self._exec_ch(_EVENTS_DDL.replace("playground.", f"{self.cfg.ch_cloud_db}."))
+
+ async def _exec_ch(self, sql: str) -> None:
+ assert self._session is not None
+ async with self._session.post(
+ self.cfg.ch_cloud_url,
+ data=sql,
+ auth=aiohttp.BasicAuth(self.cfg.ch_cloud_user, self.cfg.ch_cloud_password),
+ timeout=aiohttp.ClientTimeout(total=30),
+ ) as r:
+ if r.status >= 300:
+ txt = await r.text()
+ raise RuntimeError(f"CH error {r.status}: {txt[:500]}")
+
+ async def _insert_ch(self, table: str, rows: list[dict]) -> None:
+ if not rows:
+ return
+ body = "\n".join(json.dumps(r, default=str) for r in rows)
+ sql = f"INSERT INTO {self.cfg.ch_cloud_db}.{table} FORMAT JSONEachRow\n{body}"
+ await self._exec_ch(sql)
+
+ async def _flusher(self) -> None:
+ buf: dict[str, list[dict]] = {"requests": [], "events": []}
+ last_flush = time.monotonic()
+ try:
+ while True:
+ timeout = 1.0
+ try:
+ table, row = await asyncio.wait_for(self._queue.get(), timeout=timeout)
+ buf[table].append(row)
+ except asyncio.TimeoutError:
+ pass
+ # Flush every 1s or when batch >= 256 rows for any table
+ now = time.monotonic()
+ full = any(len(v) >= 256 for v in buf.values())
+ if full or now - last_flush > 1.0:
+ await self._do_flush(buf)
+ for k in buf:
+ buf[k] = []
+ last_flush = now
+ except asyncio.CancelledError:
+ await self._do_flush(buf)
+ raise
+
+ async def _do_flush(self, buf: dict[str, list[dict]]) -> None:
+ for table, rows in buf.items():
+ if not rows:
+ continue
+ # Always write to the local JSONL too — gives us a tail for
+ # debugging and a buffer if CH Cloud rejects.
+ try:
+ with open(self._local_files[table], "ab") as f:
+ for r in rows:
+ f.write((json.dumps(r, default=str) + "\n").encode())
+ except Exception:
+ pass
+ if self._enabled:
+ try:
+ await self._insert_ch(table, rows)
+ except Exception as e:
+ log.warning("CH insert failed (%r); rows preserved in JSONL", e)
+
+
+def _now_dt64() -> str:
+ t = time.time()
+ return time.strftime("%Y-%m-%d %H:%M:%S.", time.gmtime(t)) + f"{int((t % 1) * 1e6):06d}"
diff --git a/playground/server/main.py b/playground/server/main.py
new file mode 100644
index 0000000000..fe6cc86274
--- /dev/null
+++ b/playground/server/main.py
@@ -0,0 +1,252 @@
+"""Playground HTTP API + static UI server.
+
+Endpoints:
+
+ GET / redirects to /ui/
+ GET /ui/... static-serves files from ../web/
+ GET /api/systems JSON list of all playground-eligible systems
+ GET /api/state JSON snapshot of every VM's state
+ GET /api/system/{name} detail for a single system
+ POST /api/query?system=X body is the SQL; returns application/octet-stream
+ with timing in headers
+ GET /api/provision-log/{name} the system's most recent provision log
+ POST /api/admin/provision/{name}
+ manual trigger for first-time provision; convenient
+ for warming a system before the first user query
+
+The /api/query path tries once, then on failure tears down + restores from
+snapshot and retries exactly once, matching the spec.
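+
+Example calls with curl (host and port follow PLAYGROUND_LISTEN; the local
+address and system name below are illustrative):
+
+    curl -s http://127.0.0.1:8000/api/systems | head -c 400
+    curl -s -D - -X POST 'http://127.0.0.1:8000/api/query?system=clickhouse' \
+         --data 'SELECT 1'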
+"""
+from __future__ import annotations
+
+import asyncio
+import logging
+import signal
+import time
+from pathlib import Path
+
+import aiohttp
+from aiohttp import web
+
+from . import config as config_mod
+from . import systems as systems_mod
+from .logging_sink import LoggingSink
+from .monitor import Monitor
+from .vm_manager import VMManager
+
+log = logging.getLogger("playground")
+
+
+class App:
+ def __init__(self) -> None:
+ self.cfg = config_mod.load()
+ self.systems = systems_mod.discover(self.cfg.repo_dir)
+ self.vmm = VMManager(self.cfg, self.systems)
+ self.sink = LoggingSink(self.cfg)
+ self.monitor = Monitor(self.cfg, self.vmm, self.sink)
+
+ async def on_startup(self, _app: web.Application) -> None:
+ await self.sink.start()
+ await self.monitor.start()
+
+ async def on_cleanup(self, _app: web.Application) -> None:
+ await self.monitor.stop()
+ await self.sink.stop()
+
+ # ── handlers ─────────────────────────────────────────────────────────
+
+ async def handle_systems(self, _r: web.Request) -> web.Response:
+ return web.json_response([s.asdict() for s in self.systems.values()])
+
+ async def handle_state(self, _r: web.Request) -> web.Response:
+ return web.json_response(self.vmm.list_all())
+
+ async def handle_system(self, req: web.Request) -> web.Response:
+ name = req.match_info["name"]
+ if name not in self.systems:
+ raise web.HTTPNotFound(reason=f"unknown system: {name}")
+ vm = self.vmm.vms[name]
+ return web.json_response({
+ **self.systems[name].asdict(),
+ "state": vm.state,
+ "has_snapshot": vm.snapshot_bin.exists(),
+ "provisioned_at": vm.provisioned_at,
+ "last_used": vm.last_used,
+ "last_error": vm.last_error,
+ "agent_url": self.vmm.agent_url(vm),
+ })
+
+ async def handle_provision_log(self, req: web.Request) -> web.Response:
+ name = req.match_info["name"]
+ if name not in self.systems:
+ raise web.HTTPNotFound()
+ log_path = self.cfg.logs_dir / f"firecracker-{name}.log"
+ if not log_path.exists():
+ return web.Response(text="", content_type="text/plain")
+ try:
+ # Tail at most 64 KB so the browser doesn't choke.
+ data = log_path.read_bytes()[-64 * 1024:]
+ except Exception as e:
+ data = f"(failed to read: {e})".encode()
+ return web.Response(body=data, content_type="text/plain")
+
+ async def handle_admin_provision(self, req: web.Request) -> web.Response:
+ name = req.match_info["name"]
+ if name not in self.systems:
+ raise web.HTTPNotFound()
+ # Fire-and-forget; the client polls /api/system/{name} for state.
+ asyncio.create_task(self._provision_bg(name))
+ return web.json_response({"started": True, "system": name})
+
+ async def _provision_bg(self, name: str) -> None:
+ try:
+ await self.vmm.ensure_ready_for_query(name)
+ except Exception as e:
+ log.exception("background provision failed for %s", name)
+ self.sink.write_event(system=name, kind="provision-failed", detail=repr(e))
+
+ async def handle_query(self, req: web.Request) -> web.StreamResponse:
+ system_name = req.query.get("system", "")
+ if system_name not in self.systems:
+ return web.json_response({"error": f"unknown system: {system_name!r}"},
+ status=400)
+ sql = await req.read()
+ if not sql.strip():
+ return web.json_response({"error": "empty SQL"}, status=400)
+
+ client_addr = req.headers.get("X-Forwarded-For", req.remote or "?")
+ ua = req.headers.get("User-Agent", "")
+ wall_t0 = time.monotonic()
+ status = 500
+ body = b""
+ headers: dict[str, str] = {}
+ err: str | None = None
+ try:
+ body, headers, status = await self._dispatch_query(system_name, sql)
+ except Exception as e:
+ err = repr(e)
+ log.exception("[%s] query dispatch failed", system_name)
+ finally:
+ wall = time.monotonic() - wall_t0
+ try:
+ self.sink.write_request(
+ client_addr=client_addr, user_agent=ua,
+ system=system_name,
+ query=sql.decode("utf-8", errors="replace")[:65536],
+ output_bytes=int(headers.get("X-Output-Bytes", "0") or 0),
+ output_truncated=int(headers.get("X-Output-Truncated", "0") or 0),
+ query_time=(float(headers["X-Query-Time"])
+ if "X-Query-Time" in headers else None),
+ wall_time=wall,
+ status=status,
+ error=err or "",
+ )
+ except Exception:
+ log.exception("logging request failed")
+
+ resp = web.Response(body=body, status=status,
+ content_type="application/octet-stream")
+ for k, v in headers.items():
+ resp.headers[k] = v
+ resp.headers["X-Wall-Time"] = f"{wall:.6f}"
+ if err and "X-Error" not in resp.headers:
+ resp.headers["X-Error"] = err[:512]
+ return resp
+
+ async def _dispatch_query(self, system_name: str, sql: bytes
+ ) -> tuple[bytes, dict[str, str], int]:
+ """Run the query once. On low-level failure (VM unreachable, transport
+ error) tear down and retry once. Higher-level errors (non-2xx from the
+ agent itself, e.g. a SQL syntax error) are NOT retried — they're real
+ results."""
+ last_exc: Exception | None = None
+ for attempt in (1, 2):
+ try:
+ vm = await self.vmm.ensure_ready_for_query(system_name)
+ except Exception as e:
+ last_exc = e
+ if attempt == 1:
+ self.sink.write_event(system=system_name, kind="ensure-failed",
+ detail=f"attempt {attempt}: {e!r}")
+ await asyncio.sleep(0.5)
+ continue
+ raise
+ url = self.vmm.agent_url(vm) + "/query"
+ try:
+ async with aiohttp.ClientSession() as s:
+ async with s.post(url, data=sql,
+ timeout=aiohttp.ClientTimeout(total=600)) as r:
+ body = await r.read()
+ headers = {k: r.headers[k] for k in r.headers if k.startswith("X-")}
+ headers.setdefault("X-Output-Bytes", str(len(body)))
+ return body, headers, r.status
+ except Exception as e:
+ last_exc = e
+ self.sink.write_event(system=system_name, kind="agent-error",
+ detail=f"attempt {attempt}: {e!r}")
+ if attempt == 1:
+ # Hard kill, will trigger snapshot restore on next ensure.
+ await self.vmm.kick(system_name, "agent-error-retry")
+ await asyncio.sleep(0.5)
+ continue
+ raise
+ # unreachable, but keep mypy happy
+ raise RuntimeError(str(last_exc))
+
+
+def build_app() -> web.Application:
+ obj = App()
+ app = web.Application(client_max_size=4 * 1024 * 1024)
+ app.on_startup.append(obj.on_startup)
+ app.on_cleanup.append(obj.on_cleanup)
+
+ app.router.add_get("/api/systems", obj.handle_systems)
+ app.router.add_get("/api/state", obj.handle_state)
+ app.router.add_get("/api/system/{name}", obj.handle_system)
+ app.router.add_get("/api/provision-log/{name}", obj.handle_provision_log)
+ app.router.add_post("/api/admin/provision/{name}", obj.handle_admin_provision)
+ app.router.add_post("/api/query", obj.handle_query)
+
+ # Static UI
+ web_dir = Path(__file__).resolve().parent.parent / "web"
+
+ async def root_redirect(_r: web.Request) -> web.Response:
+ raise web.HTTPFound("/ui/")
+
+ async def ui_index(_r: web.Request) -> web.FileResponse:
+ return web.FileResponse(web_dir / "index.html")
+
+ app.router.add_get("/", root_redirect)
+ app.router.add_get("/ui/", ui_index)
+ app.router.add_get("/ui", ui_index)
+ app.router.add_static("/ui/", path=str(web_dir), show_index=False, follow_symlinks=True)
+
+ return app
+
+
+def main() -> None:
+ logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s %(name)s %(levelname)s %(message)s",
+ )
+ cfg = config_mod.load()
+ app = build_app()
+ # Wire signals to a clean shutdown.
+ runner = web.AppRunner(app)
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)
+ loop.run_until_complete(runner.setup())
+ site = web.TCPSite(runner, cfg.listen_host, cfg.listen_port)
+ loop.run_until_complete(site.start())
+ log.info("playground listening on http://%s:%d", cfg.listen_host, cfg.listen_port)
+
+ stop = asyncio.Event()
+ for sig in (signal.SIGTERM, signal.SIGINT):
+ loop.add_signal_handler(sig, stop.set)
+ loop.run_until_complete(stop.wait())
+ loop.run_until_complete(runner.cleanup())
+ loop.close()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/playground/server/monitor.py b/playground/server/monitor.py
new file mode 100644
index 0000000000..1bdefc6b82
--- /dev/null
+++ b/playground/server/monitor.py
@@ -0,0 +1,215 @@
+"""Background watchdog.
+
+Runs alongside the API server. Once per second:
+
+  * For every running VM, sample CPU% (from /proc/<pid>/stat), RSS, and the
+    rootfs file's current physical size (via stat). Update the VM record.
+ * If a VM has been at >= cpu_busy_threshold for cpu_busy_window_sec
+ contiguous seconds, restart it.
+ * If a VM's rootfs is filled past vm_disk_pct_kill_threshold of its nominal
+ cap (200 GB) — i.e. the sparse file is using more than that fraction —
+ restart it.
+ * Sample host free memory / free disk on the state_dir filesystem. If under
+ threshold, find the largest live VM (by RSS for memory pressure, by
+ rootfs_used_bytes for disk pressure) and kick it.
+
+`kick` is implemented via vm_manager.kick(name, reason), which leaves the
+snapshot intact. A subsequent /query will trigger a restore.
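+
+Worked example of the CPU normalization (numbers illustrative): with 4 vCPUs
+and CLK_TCK=100, a VM that accrues 400 jiffies of utime+stime during a 1 s
+sample window has used 4 CPU-seconds out of 4 available, i.e. 4 / (1 * 4) = 1.0
+(fully saturated); 100 jiffies over the same window would score 0.25.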
+"""
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import shutil
+import time
+from pathlib import Path
+
+from .config import Config
+from .logging_sink import LoggingSink
+from .vm_manager import VM, VMManager
+
+log = logging.getLogger("monitor")
+
+
+class Monitor:
+ def __init__(self, cfg: Config, vmm: VMManager, sink: LoggingSink):
+ self.cfg = cfg
+ self.vmm = vmm
+ self.sink = sink
+        self._cpu_history: dict[str, tuple[int, int, float]] = {}  # name -> (utime, stime, monotonic ts)
+ self._task: asyncio.Task | None = None
+
+ async def start(self) -> None:
+ self._task = asyncio.create_task(self._loop(), name="monitor")
+
+ async def stop(self) -> None:
+ if self._task:
+ self._task.cancel()
+ try:
+ await self._task
+ except asyncio.CancelledError:
+ pass
+
+ async def _loop(self) -> None:
+ try:
+ while True:
+ await self._tick()
+ await asyncio.sleep(1.0)
+ except asyncio.CancelledError:
+ raise
+ except Exception:
+ log.exception("monitor loop crashed; restarting in 5s")
+ await asyncio.sleep(5)
+ self._task = asyncio.create_task(self._loop(), name="monitor")
+
+ async def _tick(self) -> None:
+ # Per-VM sampling
+ for name, vm in self.vmm.vms.items():
+ if vm.pid is None or not _pid_alive(vm.pid):
+ self._cpu_history.pop(name, None)
+ vm.cpu_busy_since = None
+ continue
+ cpu_pct = self._sample_cpu(name, vm.pid)
+ vm.rss_bytes = _rss(vm.pid)
+ rootfs = self.cfg.systems_dir / name / "rootfs.ext4"
+ try:
+ st = rootfs.stat()
+ vm.rootfs_used_bytes = st.st_blocks * 512 # actual allocated bytes
+ except FileNotFoundError:
+ vm.rootfs_used_bytes = 0
+ await self._check_per_vm(vm, cpu_pct)
+
+ # Host-wide checks
+ await self._check_host_pressure()
+
+ def _sample_cpu(self, name: str, pid: int) -> float | None:
+ """Return ratio of CPU used since last sample, normalized by vcpu count."""
+ stat_path = Path(f"/proc/{pid}/stat")
+ try:
+ stat = stat_path.read_text()
+ except FileNotFoundError:
+ self._cpu_history.pop(name, None)
+ return None
+ # The comm field can contain spaces — split around the last ')'
+ end = stat.rfind(")")
+ parts = stat[end + 2:].split()
+ utime = int(parts[11])
+ stime = int(parts[12])
+ now = time.monotonic()
+ prev = self._cpu_history.get(name)
+ self._cpu_history[name] = (utime, stime, now)
+ if prev is None:
+ return None
+ dt = now - prev[2]
+ d_jiffies = (utime + stime) - (prev[0] + prev[1])
+ clk = os.sysconf("SC_CLK_TCK")
+ if dt <= 0 or clk <= 0:
+ return None
+ # Normalize by the number of vCPUs the VM was allocated.
+ cpu_seconds = d_jiffies / clk
+ return cpu_seconds / (dt * self.cfg.vm_vcpus)
+
+ async def _check_per_vm(self, vm: VM, cpu_pct: float | None) -> None:
+ # CPU saturation watchdog
+ if cpu_pct is None:
+ vm.cpu_busy_since = None
+ elif cpu_pct >= self.cfg.cpu_busy_threshold:
+ if vm.cpu_busy_since is None:
+ vm.cpu_busy_since = time.monotonic()
+ elif time.monotonic() - vm.cpu_busy_since > self.cfg.cpu_busy_window_sec:
+ self.sink.write_event(
+ system=vm.system.name, kind="cpu-watchdog",
+ detail=f"sustained CPU >= {self.cfg.cpu_busy_threshold:.0%} for "
+ f"{self.cfg.cpu_busy_window_sec}s",
+ )
+ await self.vmm.kick(vm.system.name, "cpu-watchdog")
+ vm.cpu_busy_since = None
+ return
+ else:
+ vm.cpu_busy_since = None
+
+ # Disk usage watchdog
+ cap = self.cfg.vm_rootfs_size_gb * (1 << 30)
+ if vm.rootfs_used_bytes and vm.rootfs_used_bytes / cap >= self.cfg.vm_disk_pct_kill_threshold:
+ self.sink.write_event(
+ system=vm.system.name, kind="disk-watchdog",
+ detail=f"rootfs used {vm.rootfs_used_bytes}/{cap}",
+ )
+ await self.vmm.kick(vm.system.name, "disk-watchdog")
+
+ async def _check_host_pressure(self) -> None:
+ # Memory pressure
+ info = _meminfo()
+ free_ram_gb = info.get("MemAvailable", 0) / (1024 * 1024) # MemAvailable is in KB
+ if free_ram_gb < self.cfg.host_min_free_ram_gb:
+ target = self._largest_running(by="rss")
+ if target:
+ self.sink.write_event(
+ system=target.system.name, kind="oom-kick",
+ detail=f"host free RAM {free_ram_gb:.1f}G < {self.cfg.host_min_free_ram_gb}G; "
+ f"largest is {target.system.name} ({target.rss_bytes/1e9:.1f}G)",
+ )
+ await self.vmm.kick(target.system.name, "host-mem-pressure")
+
+ # Disk pressure on the state dir
+ st = shutil.disk_usage(self.cfg.state_dir)
+ free_disk_gb = st.free / (1 << 30)
+ if free_disk_gb < self.cfg.host_min_free_disk_gb:
+ target = self._largest_running(by="disk")
+ if target:
+ self.sink.write_event(
+ system=target.system.name, kind="disk-kick",
+ detail=f"host free disk {free_disk_gb:.1f}G < {self.cfg.host_min_free_disk_gb}G; "
+ f"largest is {target.system.name} ({target.rootfs_used_bytes/1e9:.1f}G)",
+ )
+ await self.vmm.kick(target.system.name, "host-disk-pressure")
+
+ def _largest_running(self, *, by: str) -> VM | None:
+ running = [v for v in self.vmm.vms.values()
+ if v.pid is not None and _pid_alive(v.pid)]
+ if not running:
+ return None
+ key = (lambda v: v.rss_bytes) if by == "rss" else (lambda v: v.rootfs_used_bytes)
+ return max(running, key=key)
+
+
+def _pid_alive(pid: int) -> bool:
+ try:
+ os.kill(pid, 0)
+ return True
+ except ProcessLookupError:
+ return False
+ except PermissionError:
+ return True
+
+
+def _rss(pid: int) -> int:
+ try:
+ text = Path(f"/proc/{pid}/status").read_text()
+ except FileNotFoundError:
+ return 0
+ for line in text.splitlines():
+ if line.startswith("VmRSS:"):
+ parts = line.split()
+ return int(parts[1]) * 1024 # KB -> bytes
+ return 0
+
+
+def _meminfo() -> dict[str, int]:
+ out: dict[str, int] = {}
+ try:
+ text = Path("/proc/meminfo").read_text()
+ except FileNotFoundError:
+ return out
+ for line in text.splitlines():
+ if ":" not in line:
+ continue
+ k, v = line.split(":", 1)
+ parts = v.split()
+ if parts:
+ try:
+ out[k.strip()] = int(parts[0])
+ except ValueError:
+ continue
+ return out
diff --git a/playground/server/net.py b/playground/server/net.py
new file mode 100644
index 0000000000..2c8fdac96c
--- /dev/null
+++ b/playground/server/net.py
@@ -0,0 +1,130 @@
+"""Per-VM TAP networking setup for Firecracker.
+
+Each VM gets its own /24 subnet on a dedicated TAP device:
+
+    fc-tap-<slot>    host: 10.200.<slot>.1/24    vm: 10.200.<slot>.2
+
+Where <slot> is a small integer derived from the system slot (1..N). The /24 has
+plenty of headroom but only two addresses are used — one /24 per VM keeps the
+host's routing trivial: no shared bridge, no ARP nonsense, no collisions.
+
+During the *provision* phase we masquerade outbound traffic from the VM so it
+can apt-get / curl. After the snapshot we drop the FORWARD rules; the VM can
+still talk to the host (and therefore the agent endpoint) but cannot reach the
+internet.
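+
+For example, slot 3 (an arbitrary slot) maps to:
+
+    tap device   fc-tap-3
+    host side    10.200.3.1/24
+    guest side   10.200.3.2
+    guest MAC    02:fc:00:00:00:03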
+"""
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import re
+
+# The /16 we hand out from. 10.200.0.0/16 -> 256 /24 subnets, plenty for our use.
+_BASE = "10.200"
+
+
+def addr_for(slot: int) -> tuple[str, str, str]:
+ """Return (host_ip, vm_ip, cidr) for the given slot id."""
+ if not 1 <= slot <= 250:
+ raise ValueError(f"slot out of range: {slot}")
+ return f"{_BASE}.{slot}.1", f"{_BASE}.{slot}.2", f"{_BASE}.{slot}.0/24"
+
+
+def tap_name(slot: int) -> str:
+ return f"fc-tap-{slot}"
+
+
+def mac_for(slot: int) -> str:
+ # Locally administered, unicast, deterministic by slot.
+ return f"02:fc:00:00:{slot // 256:02x}:{slot % 256:02x}"
+
+
+async def _run(*args: str, check: bool = True) -> tuple[int, bytes, bytes]:
+ p = await asyncio.create_subprocess_exec(
+ *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+ )
+ o, e = await p.communicate()
+ if check and p.returncode != 0:
+ raise RuntimeError(f"cmd failed: {' '.join(args)}: {e.decode(errors='replace')}")
+ return p.returncode or 0, o, e
+
+
+async def ensure_tap(slot: int) -> None:
+ """Create the TAP device and assign the host-side address. Idempotent."""
+ tap = tap_name(slot)
+ host_ip, _, _ = addr_for(slot)
+ # Does the device already exist?
+ rc, out, _ = await _run("ip", "-br", "link", "show", "dev", tap, check=False)
+ if rc != 0:
+ await _run("sudo", "ip", "tuntap", "add", "dev", tap, "mode", "tap")
+ # Make sure the IP is there
+ rc, addrs, _ = await _run("ip", "-br", "addr", "show", "dev", tap, check=False)
+ if rc != 0 or host_ip not in addrs.decode(errors="replace"):
+ # Strip any old IPs then add the canonical one.
+ await _run("sudo", "ip", "addr", "flush", "dev", tap, check=False)
+ await _run("sudo", "ip", "addr", "add", f"{host_ip}/24", "dev", tap)
+ await _run("sudo", "ip", "link", "set", tap, "up")
+
+
+async def teardown_tap(slot: int) -> None:
+ tap = tap_name(slot)
+ with contextlib.suppress(Exception):
+ await _run("sudo", "ip", "link", "set", tap, "down", check=False)
+ with contextlib.suppress(Exception):
+ await _run("sudo", "ip", "tuntap", "del", "dev", tap, "mode", "tap", check=False)
+
+
+_NAT_RULE_PAT = re.compile(r"^-A POSTROUTING.*-o\s+(\S+).*-j\s+MASQUERADE", re.MULTILINE)
+
+
+async def _host_default_iface() -> str:
+ """Return the host's default outbound interface (e.g. eth0)."""
+ rc, out, _ = await _run("ip", "-o", "-4", "route", "show", "default")
+ text = out.decode(errors="replace")
+ # "default via 1.2.3.4 dev eth0 ..."
+ parts = text.split()
+ for i, p in enumerate(parts):
+ if p == "dev" and i + 1 < len(parts):
+ return parts[i + 1]
+ raise RuntimeError(f"could not find default route: {text!r}")
+
+
+async def enable_internet(slot: int) -> None:
+ """Allow the VM to reach the outside world via MASQUERADE + FORWARD."""
+ iface = await _host_default_iface()
+ _, _, cidr = addr_for(slot)
+ # MASQUERADE rule: add only if not already present.
+ rc, out, _ = await _run("sudo", "iptables", "-t", "nat", "-S", "POSTROUTING")
+ if f"-s {cidr}" not in out.decode(errors="replace"):
+ await _run("sudo", "iptables", "-t", "nat", "-A", "POSTROUTING",
+ "-s", cidr, "-o", iface, "-j", "MASQUERADE")
+ # FORWARD rules
+ for rule in (
+ ("-i", tap_name(slot), "-o", iface, "-j", "ACCEPT"),
+ ("-i", iface, "-o", tap_name(slot), "-m", "state", "--state",
+ "RELATED,ESTABLISHED", "-j", "ACCEPT"),
+ ):
+ rc, out, _ = await _run("sudo", "iptables", "-C", "FORWARD", *rule, check=False)
+ if rc != 0:
+ await _run("sudo", "iptables", "-A", "FORWARD", *rule)
+
+
+async def disable_internet(slot: int) -> None:
+ """Drop the masquerade + forward rules added by enable_internet."""
+ iface = await _host_default_iface()
+ _, _, cidr = addr_for(slot)
+ # Best-effort removal — repeat until iptables reports the rule isn't there.
+ while True:
+ rc, _, _ = await _run("sudo", "iptables", "-t", "nat", "-D", "POSTROUTING",
+ "-s", cidr, "-o", iface, "-j", "MASQUERADE", check=False)
+ if rc != 0:
+ break
+ for rule in (
+ ("-i", tap_name(slot), "-o", iface, "-j", "ACCEPT"),
+ ("-i", iface, "-o", tap_name(slot), "-m", "state", "--state",
+ "RELATED,ESTABLISHED", "-j", "ACCEPT"),
+ ):
+ while True:
+ rc, _, _ = await _run("sudo", "iptables", "-D", "FORWARD", *rule, check=False)
+ if rc != 0:
+ break
diff --git a/playground/server/systems.py b/playground/server/systems.py
new file mode 100644
index 0000000000..c41599858a
--- /dev/null
+++ b/playground/server/systems.py
@@ -0,0 +1,134 @@
+"""Registry of ClickBench systems that can be exposed through the playground.
+
+A system is *playground-eligible* if its directory contains the canonical
+unified script set (install/start/load/query/check/stop) AND there is no
+external service required (no `aurora-*`, `redshift*`, `bigquery`, `snowflake`,
+etc. — those need API keys and live on someone else's infra).
+
+The registry is built by scanning the repo at startup. Each `System` carries:
+
+ * name the directory name (also the URL-safe identifier)
+ * display_name pulled from template.json "system" field if present
+ * tags from template.json
+ * download_script from `BENCH_DOWNLOAD_SCRIPT=` line in benchmark.sh
+ * data_format inferred from download_script (parquet / parquet-partitioned / tsv / csv / none)
+ * durable BENCH_DURABLE=yes/no (default yes)
+ * restartable BENCH_RESTARTABLE=yes/no (default yes)
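+
+A hypothetical entry (every field value below is illustrative, not taken from
+the repo) would serialize via `System.asdict()` roughly as:
+
+    {"name": "duckdb", "display_name": "DuckDB",
+     "tags": ["column-oriented", "embedded"],
+     "download_script": "download-parquet-single.sh",
+     "data_format": "parquet", "durable": true, "restartable": true}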
+"""
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+
+# Systems we explicitly skip — they all require external infrastructure
+# (managed cloud DBs / API keys) we can't run inside an isolated microVM.
+# Local-only systems (umbra, hyper, cedardb, etc.) stay in the catalog
+# even though some need a free-trial license at install time — those
+# scripts fetch the binary themselves and we don't second-guess them.
+_EXTERNAL = {
+ "alloydb", "athena", "athena-partitioned", "aurora-mysql",
+ "aurora-postgresql", "bigquery", "brytlytdb", "bytehouse", "chyt",
+ "clickhouse-cloud", "clickhouse-tencent", "clickhouse-web",
+ "crunchy-bridge-for-analytics", "databend", "databricks", "exasol",
+ "firebolt", "firebolt-parquet", "firebolt-parquet-partitioned",
+ "gravitons", "heavyai", "hologres", "hydrolix", "kinetica",
+ "motherduck", "oxla", "pgpro_tam", "redshift", "redshift-serverless",
+ "s3select", "singlestore", "snowflake", "supabase",
+ "tembo-olap", "timescale-cloud", "tinybird", "ursa", "velodb",
+ "vertica", "ydb",
+}
+
+
+@dataclass(frozen=True)
+class System:
+ name: str
+ display_name: str
+ tags: tuple[str, ...]
+ download_script: str
+ data_format: str # parquet / parquet-partitioned / tsv / csv / none
+ durable: bool
+ restartable: bool
+
+ def asdict(self) -> dict:
+ return {
+ "name": self.name,
+ "display_name": self.display_name,
+ "tags": list(self.tags),
+ "download_script": self.download_script,
+ "data_format": self.data_format,
+ "durable": self.durable,
+ "restartable": self.restartable,
+ }
+
+
+def _read_template(p: Path) -> dict:
+ tpl = p / "template.json"
+ if not tpl.exists():
+ return {}
+ try:
+ return json.loads(tpl.read_text())
+ except Exception:
+ return {}
+
+
+def _parse_benchmark_sh(p: Path) -> dict:
+ """Best-effort parse of `export FOO=bar` lines in benchmark.sh."""
+ bench = p / "benchmark.sh"
+ if not bench.exists():
+ return {}
+ out: dict[str, str] = {}
+ pat = re.compile(r'^\s*export\s+([A-Z_]+)=("([^"]*)"|([^\s]+))', re.MULTILINE)
+ text = bench.read_text(errors="replace")
+ for m in pat.finditer(text):
+ key = m.group(1)
+ out[key] = m.group(3) if m.group(3) is not None else m.group(4)
+ return out
+
+
+def _data_format(download_script: str) -> str:
+ if not download_script:
+ return "none"
+ if "parquet-partitioned" in download_script:
+ return "parquet-partitioned"
+ if "parquet-single" in download_script:
+ return "parquet"
+ if "tsv" in download_script:
+ return "tsv"
+ if "csv" in download_script:
+ return "csv"
+ return "unknown"
+
+
+def _is_playground_eligible(p: Path) -> bool:
+ if p.name in _EXTERNAL:
+ return False
+ for f in ("install", "start", "load", "query", "check", "stop"):
+ s = p / f
+ if not s.exists():
+ return False
+ return True
+
+
+def discover(repo_dir: Path) -> dict[str, System]:
+ """Walk the repo and return name -> System."""
+ out: dict[str, System] = {}
+ for child in sorted(repo_dir.iterdir()):
+ if not child.is_dir():
+ continue
+ if not _is_playground_eligible(child):
+ continue
+ tpl = _read_template(child)
+ env = _parse_benchmark_sh(child)
+ download = env.get("BENCH_DOWNLOAD_SCRIPT", "")
+ out[child.name] = System(
+ name=child.name,
+ display_name=tpl.get("system", child.name),
+ tags=tuple(tpl.get("tags", []) or []),
+ download_script=download,
+ data_format=_data_format(download),
+ durable=env.get("BENCH_DURABLE", "yes") != "no",
+ restartable=env.get("BENCH_RESTARTABLE", "yes") != "no",
+ )
+ return out
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
new file mode 100644
index 0000000000..7b97e4d46c
--- /dev/null
+++ b/playground/server/vm_manager.py
@@ -0,0 +1,391 @@
+"""Per-system Firecracker microVM lifecycle.
+
+For each ClickBench system we manage a VM with this lifecycle:
+
+ [DOWN] --build_images--> [DOWN(images-ready)]
+ --first_boot--> [PROVISIONING] (internet ON, /provision called)
+ --snapshot--> [SNAPSHOTTED(internet OFF)]
+ --restore--> [READY] (handles /query requests)
+ --idle / OOM / disk full / forced--> kill -> [SNAPSHOTTED]
+
+State transitions are gated by `VM.lock`. The public API
+`ensure_ready_for_query(system)` returns a `VM` that is ready to take a
+POST /query (build the endpoint with `agent_url(vm)`), doing whatever
+transitions are needed.
+
+We avoid the jailer for now: the playground host already runs as a dedicated
+user; the chroot/cgroups layer would complicate dataset disk attach and
+the value-add over a vanilla firecracker process is small for our threat
+model (untrusted SQL but cooperatively-built rootfs).
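+
+Typical caller flow, as a sketch mirroring main.py (the system name is
+illustrative):
+
+    vm = await vmm.ensure_ready_for_query("clickhouse")
+    url = vmm.agent_url(vm) + "/query"   # POST the SQL body to this URL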
+"""
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import dataclasses
+import json
+import logging
+import os
+import shutil
+import signal
+import time
+from pathlib import Path
+from typing import Optional
+
+import aiohttp
+
+from . import firecracker as fc
+from . import net
+from .config import Config
+from .systems import System
+
+log = logging.getLogger("vm_manager")
+
+
+# Lifecycle states for VM.state:
+# "down" no firecracker process for this system
+# "provisioning" firecracker is up, install/start/load running inside
+# "ready" firecracker is up, snapshotted at least once, /query OK
+# "snapshotted" firecracker process is down, but snapshot.bin exists
+@dataclasses.dataclass
+class VM:
+ system: System
+ slot: int
+ # Firecracker config
+ api_sock: Path
+ log_sock: Path # we just point this at /dev/null actually
+ pid: Optional[int] = None
+ state: str = "down"
+ # Snapshot artifacts
+ snapshot_bin: Path = dataclasses.field(default_factory=lambda: Path())
+ snapshot_state: Path = dataclasses.field(default_factory=lambda: Path())
+ # Provision metadata
+ provisioned_at: Optional[float] = None
+ last_used: float = 0.0
+ last_error: Optional[str] = None
+ lock: asyncio.Lock = dataclasses.field(default_factory=asyncio.Lock)
+ # Runtime stats refreshed by the monitor
+ cpu_busy_since: Optional[float] = None
+ rss_bytes: int = 0
+ rootfs_used_bytes: int = 0
+
+
+class VMManager:
+ """Owns the registry of per-system VMs."""
+
+ def __init__(self, config: Config, systems: dict[str, System]):
+ self.cfg = config
+ self.systems = systems
+ self.vms: dict[str, VM] = {}
+ # Stable slot allocation: sort systems alphabetically so each system
+ # always gets the same slot id (and therefore the same TAP/IP).
+ for i, name in enumerate(sorted(systems.keys()), start=1):
+ sys = systems[name]
+ sys_state_dir = config.systems_dir / name
+ sys_state_dir.mkdir(parents=True, exist_ok=True)
+ self.vms[name] = VM(
+ system=sys,
+ slot=i,
+ api_sock=config.vms_dir / f"{name}.sock",
+ log_sock=config.vms_dir / f"{name}.log.sock",
+ snapshot_bin=sys_state_dir / "snapshot.bin",
+ snapshot_state=sys_state_dir / "snapshot.state",
+ )
+
+ # ── public API ───────────────────────────────────────────────────────
+
+ async def ensure_ready_for_query(self, system: str) -> VM:
+ """Make sure system is up and responsive to /query. Boot/resume as needed.
+
+ On success the returned VM is in state "ready" and self.last_used has
+ been touched.
+ """
+ if system not in self.vms:
+ raise KeyError(system)
+ vm = self.vms[system]
+ async with vm.lock:
+ if vm.state == "ready" and vm.pid and await self._agent_healthy(vm):
+ vm.last_used = time.time()
+ return vm
+ # The state machine: drive to "ready" by the cheapest available path.
+ if vm.state == "ready":
+ # Process is gone or unresponsive. Treat as snapshotted.
+ vm.state = "snapshotted"
+            if vm.state == "down":
+                if not vm.snapshot_bin.exists():
+                    # No snapshot yet: run the full provision first (it ends
+                    # with the VM shut down and a fresh snapshot on disk).
+                    await self._initial_provision(vm)
+                # Either way, resume from the snapshot so the caller gets a
+                # running, agent-responsive VM.
+                await self._restore_snapshot(vm)
+ elif vm.state == "snapshotted":
+ await self._restore_snapshot(vm)
+ elif vm.state == "provisioning":
+ raise RuntimeError(f"{system}: provisioning in progress")
+ vm.last_used = time.time()
+ return vm
+
+ async def kick(self, system: str, reason: str) -> None:
+ """Forcibly tear down the VM. Caller (monitor) is responsible for logging."""
+ vm = self.vms.get(system)
+ if vm is None:
+ return
+ async with vm.lock:
+ await self._teardown(vm, reason)
+
+ def list_all(self) -> list[dict]:
+ out = []
+ for name, vm in self.vms.items():
+ out.append({
+ "name": name,
+ "system": vm.system.display_name,
+ "state": vm.state,
+ "slot": vm.slot,
+ "agent_url": self.agent_url(vm),
+ "provisioned_at": vm.provisioned_at,
+ "last_used": vm.last_used,
+ "tags": list(vm.system.tags),
+ "data_format": vm.system.data_format,
+ "last_error": vm.last_error,
+ "rss_bytes": vm.rss_bytes,
+ "rootfs_used_bytes": vm.rootfs_used_bytes,
+ "has_snapshot": vm.snapshot_bin.exists(),
+ })
+ return out
+
+ def agent_url(self, vm: VM) -> str:
+ _, vm_ip, _ = net.addr_for(vm.slot)
+ return f"http://{vm_ip}:8080"
+
+ # ── boot / shutdown ──────────────────────────────────────────────────
+
+ async def _spawn_firecracker(self, vm: VM) -> None:
+ """Start a fresh firecracker process listening on vm.api_sock."""
+ with contextlib.suppress(FileNotFoundError):
+ vm.api_sock.unlink()
+ vm.api_sock.parent.mkdir(parents=True, exist_ok=True)
+
+ env = os.environ.copy()
+ env["RUST_BACKTRACE"] = "1"
+
+ log_path = self.cfg.logs_dir / f"firecracker-{vm.system.name}.log"
+ log_path.parent.mkdir(parents=True, exist_ok=True)
+ # Append to the existing log so prior runs are kept for postmortems.
+ log_fh = open(log_path, "ab", buffering=0)
+
+ proc = await asyncio.create_subprocess_exec(
+ str(self.cfg.firecracker_bin),
+ "--api-sock", str(vm.api_sock),
+ "--id", vm.system.name,
+ stdout=log_fh, stderr=log_fh, env=env, start_new_session=True,
+ )
+ vm.pid = proc.pid
+ # Wait for the API socket to exist
+ for _ in range(80):
+ if vm.api_sock.exists():
+ break
+ await asyncio.sleep(0.05)
+ if not vm.api_sock.exists():
+ raise RuntimeError("firecracker did not create API socket in time")
+
+ def _kernel_cmdline(self, vm: VM) -> str:
+ # console=ttyS0 so we get a serial-attached login (in case we drop a
+ # console socket for debugging); reboot=k for clean halt-on-panic.
+ # The kernel's built-in IP autoconfig statically assigns the VM's
+ # /24 from its slot, sidestepping any DHCP/networkd in userland.
+ host_ip, vm_ip, _ = net.addr_for(vm.slot)
+ return (
+ "console=ttyS0 reboot=k panic=1 pci=off "
+ f"ip={vm_ip}::{host_ip}:255.255.255.0::eth0:off "
+ "root=/dev/vda rw "
+ "init=/lib/systemd/systemd "
+ )
+
+ async def _initial_provision(self, vm: VM) -> None:
+ """First-time boot: build per-system images, boot with internet, run
+ agent /provision, snapshot, shut down."""
+ if vm.state != "down":
+ raise RuntimeError(f"unexpected state for initial provision: {vm.state}")
+
+ log.info("[%s] initial provision begin", vm.system.name)
+ vm.state = "provisioning"
+ try:
+ await self._build_images_if_needed(vm)
+ await net.ensure_tap(vm.slot)
+ await net.enable_internet(vm.slot)
+ await self._boot(vm, restore_snapshot=False)
+ await self._wait_for_agent(vm, timeout=180)
+ await self._call_agent_provision(vm)
+ await self._snapshot(vm)
+ await self._shutdown(vm)
+ await net.disable_internet(vm.slot)
+ vm.state = "snapshotted"
+ vm.provisioned_at = time.time()
+ log.info("[%s] initial provision complete", vm.system.name)
+ except Exception as e:
+ vm.last_error = f"provision: {e!r}"
+ log.exception("[%s] provision failed", vm.system.name)
+ await self._teardown(vm, "provision-failed")
+ raise
+
+ async def _build_images_if_needed(self, vm: VM) -> None:
+ sys_dir = self.cfg.systems_dir / vm.system.name
+ rootfs = sys_dir / "rootfs.ext4"
+ sysdisk = sys_dir / "system.ext4"
+ if rootfs.exists() and sysdisk.exists():
+ return
+ log.info("[%s] building rootfs + system disk", vm.system.name)
+ script = self.cfg.repo_dir / "playground" / "images" / "build-system-rootfs.sh"
+ p = await asyncio.create_subprocess_exec(
+ "bash", str(script), vm.system.name,
+ stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT,
+ env={**os.environ, "PLAYGROUND_STATE_DIR": str(self.cfg.state_dir)},
+ )
+ out, _ = await p.communicate()
+ if p.returncode != 0:
+ raise RuntimeError(f"build-system-rootfs failed: {out.decode(errors='replace')[-2000:]}")
+
+ async def _boot(self, vm: VM, *, restore_snapshot: bool) -> None:
+ """Configure and start a Firecracker instance. If restore_snapshot is
+ True, we load from the snapshot files; else we cold-boot from kernel +
+ rootfs."""
+ await self._spawn_firecracker(vm)
+ sock = str(vm.api_sock)
+
+ # Network: must be configured *before* either boot path.
+ await fc.put(sock, f"/network-interfaces/eth0", {
+ "iface_id": "eth0",
+ "guest_mac": net.mac_for(vm.slot),
+ "host_dev_name": net.tap_name(vm.slot),
+ })
+
+ rootfs = self.cfg.systems_dir / vm.system.name / "rootfs.ext4"
+ sysdisk = self.cfg.systems_dir / vm.system.name / "system.ext4"
+
+ if restore_snapshot:
+ # Drives must match the layout that existed when the snapshot was
+ # taken, but Firecracker re-reads file paths on restore. We rebind
+ # them here in case the absolute paths changed (e.g. snapshot moved).
+ await fc.put(sock, "/snapshot/load", {
+ "snapshot_path": str(vm.snapshot_state),
+ "mem_backend": {"backend_type": "File", "backend_path": str(vm.snapshot_bin)},
+ "enable_diff_snapshots": False,
+ "resume_vm": True,
+ })
+ return
+
+ # Cold boot.
+ await fc.put(sock, "/boot-source", {
+ "kernel_image_path": str(self.cfg.kernel_path),
+ "boot_args": self._kernel_cmdline(vm),
+ })
+ await fc.put(sock, "/drives/rootfs", {
+ "drive_id": "rootfs",
+ "path_on_host": str(rootfs),
+ "is_root_device": True,
+ "is_read_only": False,
+ })
+ await fc.put(sock, "/drives/system", {
+ "drive_id": "system",
+ "path_on_host": str(sysdisk),
+ "is_root_device": False,
+ "is_read_only": False,
+ })
+ await fc.put(sock, "/machine-config", {
+ "vcpu_count": self.cfg.vm_vcpus,
+ "mem_size_mib": self.cfg.vm_mem_mib,
+ "smt": False,
+ })
+ await fc.put(sock, "/actions", {"action_type": "InstanceStart"})
+
+ async def _snapshot(self, vm: VM) -> None:
+ sock = str(vm.api_sock)
+ await fc.patch(sock, "/vm", {"state": "Paused"})
+ try:
+ await fc.put(sock, "/snapshot/create", {
+ "snapshot_type": "Full",
+ "snapshot_path": str(vm.snapshot_state),
+ "mem_file_path": str(vm.snapshot_bin),
+ }, timeout=600.0)
+ finally:
+ # Try to resume so we can shut down cleanly; ignore failures.
+ with contextlib.suppress(Exception):
+ await fc.patch(sock, "/vm", {"state": "Resumed"})
+
+ async def _restore_snapshot(self, vm: VM) -> None:
+ log.info("[%s] restore from snapshot", vm.system.name)
+ await net.ensure_tap(vm.slot)
+ # internet stays OFF post-snapshot
+ await self._boot(vm, restore_snapshot=True)
+ await self._wait_for_agent(vm, timeout=60)
+ vm.state = "ready"
+
+ async def _shutdown(self, vm: VM) -> None:
+ """Best-effort clean shutdown of the firecracker process."""
+ if not vm.pid:
+ return
+ with contextlib.suppress(Exception):
+ await fc.put(str(vm.api_sock), "/actions", {"action_type": "SendCtrlAltDel"})
+ # Wait briefly for graceful exit
+ for _ in range(50):
+ if not _pid_alive(vm.pid):
+ break
+ await asyncio.sleep(0.1)
+ if _pid_alive(vm.pid):
+ with contextlib.suppress(ProcessLookupError):
+ os.kill(vm.pid, signal.SIGKILL)
+ vm.pid = None
+ with contextlib.suppress(FileNotFoundError):
+ vm.api_sock.unlink()
+
+ async def _teardown(self, vm: VM, reason: str) -> None:
+ log.warning("[%s] teardown: %s", vm.system.name, reason)
+ with contextlib.suppress(Exception):
+ await self._shutdown(vm)
+ vm.state = "snapshotted" if vm.snapshot_bin.exists() else "down"
+
+ # ── agent helpers ────────────────────────────────────────────────────
+
+ async def _agent_healthy(self, vm: VM) -> bool:
+ if not vm.pid or not _pid_alive(vm.pid):
+ return False
+ url = self.agent_url(vm) + "/health"
+ try:
+ async with aiohttp.ClientSession() as s:
+ async with s.get(url, timeout=aiohttp.ClientTimeout(total=2)) as r:
+ return r.status == 200
+ except Exception:
+ return False
+
+ async def _wait_for_agent(self, vm: VM, *, timeout: float) -> None:
+ url = self.agent_url(vm) + "/health"
+ t0 = time.monotonic()
+ last_err: Exception | None = None
+ async with aiohttp.ClientSession() as s:
+ while time.monotonic() - t0 < timeout:
+ try:
+ async with s.get(url, timeout=aiohttp.ClientTimeout(total=2)) as r:
+ if r.status == 200:
+ return
+ except Exception as e:
+ last_err = e
+ await asyncio.sleep(0.5)
+ raise RuntimeError(f"agent unreachable after {timeout}s: {last_err!r}")
+
+ async def _call_agent_provision(self, vm: VM) -> None:
+ url = self.agent_url(vm) + "/provision"
+ async with aiohttp.ClientSession() as s:
+ # Provision can take a very long time (apt-get install jdk, etc.)
+ async with s.post(url, timeout=aiohttp.ClientTimeout(total=7200)) as r:
+ body = await r.read()
+ if r.status >= 300:
+ raise RuntimeError(f"agent /provision failed: {r.status}: "
+ f"{body[-2000:].decode(errors='replace')}")
+
+
+def _pid_alive(pid: int) -> bool:
+ try:
+ os.kill(pid, 0)
+ return True
+ except ProcessLookupError:
+ return False
+ except PermissionError:
+ return True
diff --git a/playground/web/app.js b/playground/web/app.js
new file mode 100644
index 0000000000..fb29eb0d7f
--- /dev/null
+++ b/playground/web/app.js
@@ -0,0 +1,128 @@
+// ClickBench Playground — minimal vanilla-JS client.
+//
+// Talks to the host API. Three things happen here:
+// 1. On load, fetch /api/systems and populate the system dropdown. Pre-select
+// whatever's in the URL hash (e.g. #clickhouse) or the first one.
+// 2. On selection change, poll /api/system/<name> every 2s and update the
+// state pill so the user can see when provisioning finishes / a VM is
+// restarted by the watchdog.
+// 3. On "Run query", POST the SQL to /api/query?system=<name>, parse the
+// response headers for timing, render bytes as text (best-effort UTF-8).
+
+const $ = (sel) => document.querySelector(sel);
+
+const sysSelect = $("#system");
+const queryEl = $("#query");
+const runBtn = $("#run");
+const statePill = $("#state-pill");
+const outEl = $("#output");
+const timeEl = $("#time");
+const wallEl = $("#wall");
+const bytesEl = $("#bytes");
+const truncEl = $("#truncated");
+const exitEl = $("#exit");
+const stateBlob = $("#state-blob");
+
+let pollTimer = null;
+let knownSystems = [];
+
+async function loadSystems() {
+ const r = await fetch("/api/systems");
+ knownSystems = await r.json();
+ knownSystems.sort((a, b) => a.display_name.localeCompare(b.display_name));
+ sysSelect.innerHTML = "";
+ for (const s of knownSystems) {
+ const o = document.createElement("option");
+ o.value = s.name;
+ o.textContent = `${s.display_name} (${s.data_format})`;
+ sysSelect.appendChild(o);
+ }
+ // Allow #clickhouse style deep links
+ const hash = (location.hash || "").slice(1);
+ if (hash && knownSystems.some(s => s.name === hash)) {
+ sysSelect.value = hash;
+ }
+ onSystemChange();
+}
+
+async function pollState() {
+ const name = sysSelect.value;
+ if (!name) return;
+ try {
+ const r = await fetch(`/api/system/${encodeURIComponent(name)}`);
+ if (!r.ok) throw new Error(`HTTP ${r.status}`);
+ const j = await r.json();
+ statePill.textContent = j.state || "?";
+ statePill.className = `pill ${j.state || ""}`;
+ stateBlob.textContent = JSON.stringify(j, null, 2);
+ } catch (e) {
+ statePill.textContent = "err";
+ statePill.className = "pill down";
+ stateBlob.textContent = String(e);
+ }
+}
+
+function onSystemChange() {
+ if (pollTimer) clearInterval(pollTimer);
+ location.hash = sysSelect.value;
+ pollState();
+ pollTimer = setInterval(pollState, 2000);
+}
+
+async function runQuery() {
+ const name = sysSelect.value;
+ const sql = queryEl.value;
+ if (!sql.trim()) return;
+ runBtn.disabled = true;
+ outEl.textContent = "(running …)";
+ timeEl.textContent = "…";
+ wallEl.textContent = "…";
+ bytesEl.textContent = "—";
+ truncEl.textContent = "—";
+ exitEl.textContent = "—";
+
+ const t0 = performance.now();
+ try {
+ const r = await fetch(`/api/query?system=${encodeURIComponent(name)}`, {
+ method: "POST",
+ body: sql,
+ headers: {"Content-Type": "application/octet-stream"},
+ });
+ const body = await r.arrayBuffer();
+ const txt = bytesToText(body);
+ outEl.textContent = txt || "(no output)";
+
+ const h = (k) => r.headers.get(k);
+ const qt = h("X-Query-Time");
+ const wt = h("X-Wall-Time");
+ timeEl.textContent = qt ? `${parseFloat(qt).toFixed(3)} s (script)` : "—";
+ wallEl.textContent = wt ? `${parseFloat(wt).toFixed(3)} s` : `${((performance.now() - t0) / 1000).toFixed(3)} s`;
+ bytesEl.textContent = h("X-Output-Bytes") || body.byteLength;
+ truncEl.textContent = h("X-Output-Truncated") === "1" ? "yes" : "no";
+ exitEl.textContent = h("X-Exit-Code") || r.status;
+ if (r.status >= 400) {
+ const err = h("X-Error");
+ if (err) outEl.textContent = `(error)\n${err}\n\n` + outEl.textContent;
+ }
+ } catch (e) {
+ outEl.textContent = `(client error)\n${e}`;
+ } finally {
+ runBtn.disabled = false;
+ }
+}
+
+function bytesToText(buf) {
+ try {
+ return new TextDecoder("utf-8", {fatal: false}).decode(buf);
+ } catch {
+ return [...new Uint8Array(buf)].map(b => String.fromCharCode(b)).join("");
+ }
+}
+
+sysSelect.addEventListener("change", onSystemChange);
+runBtn.addEventListener("click", runQuery);
+queryEl.addEventListener("keydown", (e) => {
+ if ((e.metaKey || e.ctrlKey) && e.key === "Enter") runQuery();
+});
+
+loadSystems();
diff --git a/playground/web/index.html b/playground/web/index.html
new file mode 100644
index 0000000000..e415a5ecff
--- /dev/null
+++ b/playground/web/index.html
@@ -0,0 +1,62 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>ClickBench Playground</title>
+<link rel="stylesheet" href="style.css">
+</head>
+<body>
+
+<header>
+  <h1>ClickBench Playground</h1>
+  <p class="lead">
+    Run SQL against any of the database systems in
+    ClickBench,
+    each isolated in its own Firecracker microVM. The dataset is the
+    standard hits table — 100 M rows.
+  </p>
+</header>
+
+<main>
+  <section class="row">
+    <label for="system">System</label>
+    <select id="system"></select>
+    <span id="state-pill" class="pill">…</span>
+  </section>
+
+  <section>
+    <label for="query">SQL</label>
+    <textarea id="query" rows="6" spellcheck="false"></textarea>
+  </section>
+
+  <section class="row">
+    <button id="run">Run query</button>
+  </section>
+
+  <section class="stats">
+    Time: <span id="time">—</span>
+    Wall: <span id="wall">—</span>
+    Bytes: <span id="bytes">—</span>
+    Truncated: <span id="truncated">—</span>
+    Exit: <span id="exit">—</span>
+  </section>
+
+  <section>
+    <label>Output</label>
+    <pre id="output"></pre>
+  </section>
+
+  <section>
+    <label>System status</label>
+    <pre id="state-blob">loading…</pre>
+  </section>
+</main>
+
+<footer>
+</footer>
+
+<script src="app.js"></script>
+</body>
+</html>
diff --git a/playground/web/style.css b/playground/web/style.css
new file mode 100644
index 0000000000..e630327026
--- /dev/null
+++ b/playground/web/style.css
@@ -0,0 +1,71 @@
+:root {
+ --fg: #1f2328;
+ --muted: #6e7681;
+ --border: #d0d7de;
+ --bg: #ffffff;
+ --bg-alt: #f6f8fa;
+ --accent: #fb1f00;
+ --accent-fg: #ffffff;
+ --pill-bg: #e7eaef;
+ --pill-fg: #1f2328;
+ --good: #1f883d;
+ --bad: #cf222e;
+}
+
+* { box-sizing: border-box; }
+html, body { margin: 0; padding: 0; background: var(--bg); color: var(--fg); }
+body { font: 14px/1.5 -apple-system, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif; }
+header, main, footer { max-width: 960px; margin: 0 auto; padding: 0 16px; }
+
+header { padding-top: 24px; padding-bottom: 12px; border-bottom: 1px solid var(--border); }
+header h1 { margin: 0 0 4px; font-size: 22px; font-weight: 600; }
+header .lead { margin: 0; color: var(--muted); }
+.muted { color: var(--muted); font-weight: normal; }
+
+main { padding-top: 16px; padding-bottom: 32px; }
+main > section { margin: 12px 0; }
+
+label { display: block; font-weight: 600; font-size: 12px; text-transform: uppercase;
+ letter-spacing: 0.04em; color: var(--muted); margin-bottom: 4px; }
+
+select, textarea, pre, input {
+ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;
+ font-size: 13px;
+ border: 1px solid var(--border);
+ background: var(--bg);
+ color: var(--fg);
+ border-radius: 6px;
+}
+
+select { padding: 6px 8px; min-width: 280px; }
+textarea { width: 100%; padding: 10px; resize: vertical; }
+pre { padding: 10px; background: var(--bg-alt); margin: 0; max-height: 360px;
+ overflow: auto; white-space: pre-wrap; word-break: break-word; }
+
+button {
+ background: var(--accent); color: var(--accent-fg);
+ border: none; border-radius: 6px; padding: 6px 16px;
+ font-weight: 600; cursor: pointer;
+}
+button:disabled { opacity: 0.6; cursor: not-allowed; }
+button:hover:not(:disabled) { filter: brightness(0.95); }
+
+.row { display: flex; align-items: center; gap: 12px; flex-wrap: wrap; }
+.row label { margin: 0; }
+.stats { font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
+ font-size: 12px; color: var(--muted); padding: 8px 0;
+ border-top: 1px solid var(--border); border-bottom: 1px solid var(--border); }
+.stats span { color: var(--fg); }
+
+.pill { display: inline-block; padding: 2px 8px; border-radius: 999px;
+ font-size: 11px; font-weight: 600; background: var(--pill-bg); color: var(--pill-fg);
+ text-transform: uppercase; letter-spacing: 0.04em; }
+.pill.ready { background: #ddf4e4; color: var(--good); }
+.pill.snapshotted { background: #fff4d1; color: #9a6700; }
+.pill.provisioning { background: #ddf0ff; color: #0969da; }
+.pill.down { background: #ffd7d6; color: var(--bad); }
+
+footer { color: var(--muted); padding-top: 16px; padding-bottom: 32px;
+ border-top: 1px solid var(--border); font-size: 12px; }
+a { color: var(--accent); text-decoration: none; }
+a:hover { text-decoration: underline; }
From d1e144c2f5d601cdc85e3e5ee791ef59b3aaed42 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 12 May 2026 20:00:34 +0000
Subject: [PATCH 002/221] playground: mark chroot's /dev /proc /sys as rslave
A later `umount -lR` on the chroot's /dev was propagating through the
shared mount group and tearing down the host's /dev/pts, breaking sshd's
PTY allocation. `--make-rslave` keeps mount events flowing *into* the
chroot but blocks unmounts from leaking back to the host.
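Not part of the patch, but the propagation mode is easy to verify afterwards. A minimal sketch, assuming a Linux host with /proc mounted: parse /proc/self/mountinfo and look at the optional fields (a "shared:N" tag means events propagate both ways; "master:N" with no "shared:" means the mount is a slave, which is what --make-rslave is after).
```
#!/usr/bin/env python3
# Print the propagation tags of every mount under a prefix (default /dev).
import sys

prefix = sys.argv[1] if len(sys.argv) > 1 else "/dev"
with open("/proc/self/mountinfo") as f:
    for line in f:
        left, _, _ = line.partition(" - ")   # fields before fstype/source
        fields = left.split()
        mount_point = fields[4]
        optional = fields[6:]                # e.g. ["shared:22"] or ["master:1"]
        if mount_point == prefix or mount_point.startswith(prefix + "/"):
            print(f"{mount_point:40} {' '.join(optional) or 'private'}")
```
Running it against the chroot's /dev after the rbind should show master:* tags rather than shared:* on every submount.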
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/images/build-base-rootfs.sh | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh
index 2cffbaf52f..6e4323bc71 100755
--- a/playground/images/build-base-rootfs.sh
+++ b/playground/images/build-base-rootfs.sh
@@ -92,10 +92,16 @@ trap '
sudo losetup -d "'"$DST_LOOP"'" 2>/dev/null || true
' EXIT
-# Bind /dev /proc /sys for the chroot.
+# Bind /dev /proc /sys for the chroot. Use `--rbind` so submounts (devpts,
+# mqueue, hugepages, /sys/fs/cgroup, …) come along. Critically, mark each
+# new mount `--make-rslave` immediately afterwards. Without that, a later
+# `umount -lR` on the chroot's `/dev` propagates back through the shared
+# mount group and tears down the *host's* `/dev/pts` — at which point sshd
+# can't allocate a PTY and the operator gets locked out.
for d in dev proc sys; do
sudo mkdir -p "$MNT/$d"
sudo mount --rbind "/$d" "$MNT/$d"
+ sudo mount --make-rslave "$MNT/$d"
done
trap '
for d in dev proc sys; do sudo umount -lR "'"$MNT"'/$d" 2>/dev/null || true; done
From 41ed4b37972b4a28755444bfa6708c8479179b1d Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 12 May 2026 20:18:34 +0000
Subject: [PATCH 003/221] playground: shrink snapshots with restart +
drop_caches + zstd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
A 16 GB guest snapshot.bin compresses to ~2 GB once we
1) stop+start the system daemon (sheds INSERT-time heap arenas,
buffers, fresh allocator pages),
2) echo 3 > drop_caches (turns 3-5 GB of page cache into zero
pages),
3) zstd -T0 -3 --long=27 (parallel, big match window — most of
the savings come from those zero pages).
Restart is skipped for in-process engines where stop/start is a
no-op AND the data lives in the process; wiping it would defeat
the whole point.
The host now keeps snapshot.bin.zst as the canonical artifact and
decompresses on demand right before /snapshot/load. snapshot.bin
itself is deleted after a successful restore + teardown.
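For intuition (not from the patch): the gap between zero pages and leftover heap noise is easy to reproduce with the same zstd flags. A minimal sketch, assuming the zstd CLI is on PATH; the 256 MiB size and temp-file handling are arbitrary.
```
#!/usr/bin/env python3
# Compare zstd -T0 -3 --long=27 on a zero-filled buffer vs random bytes.
import os
import subprocess
import tempfile

def compressed_size(data: bytes) -> int:
    with tempfile.NamedTemporaryFile(suffix=".raw") as src:
        src.write(data)
        src.flush()
        dst = src.name + ".zst"
        subprocess.run(["zstd", "-T0", "-3", "--long=27", "-q", "-f",
                        src.name, "-o", dst], check=True)
        try:
            return os.path.getsize(dst)
        finally:
            os.unlink(dst)

n = 256 * 1024 * 1024
print(f"zero-filled: {n / compressed_size(bytes(n)):8.0f}:1")
print(f"random:      {n / compressed_size(os.urandom(n)):8.2f}:1")
```
The zeroed buffer collapses to almost nothing while the random one barely shrinks, which is why steps 1 and 2 above matter at all.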
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/agent/agent.py | 49 ++++++++++
playground/server/vm_manager.py | 153 ++++++++++++++++++++++++++++----
2 files changed, 183 insertions(+), 19 deletions(-)
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
index 62f5aad59f..3a2135155d 100644
--- a/playground/agent/agent.py
+++ b/playground/agent/agent.py
@@ -264,6 +264,55 @@ def _provision() -> tuple[int, bytes]:
return r.returncode, b"".join(log_lines)
subprocess.run(["sync"], check=False)
+
+ # Pre-snapshot trim:
+ #
+ # 1. Restart the daemon if the system is restartable. After ./load
+ # finishes, most engines have hundreds of MB of fresh per-INSERT
+ # state on the heap: ClickHouse's MergeTree merge thread arenas,
+ # Postgres' aborted-batch buffers, etc. Stop/start sheds that
+ # private memory back to zero. Wait for ./check to confirm the
+ # server is ready again so the snapshot we take is on a quiesced
+ # process whose first user query will be a cold *query*, not a
+ # cold *startup*. Skip this for in-process engines (chdb, polars,
+ # pandas, …) where stop/start is a no-op AND the data lives in
+ # the process — wiping it would defeat the whole point.
+ restartable = (SYSTEM_DIR / "start").exists() and (SYSTEM_DIR / "stop").exists()
+ if restartable:
+ # Best effort: don't bail on errors. We try to stop, wait, start,
+ # check; if any step fails we proceed with whatever state we
+ # have. The host will see ./check fail and refuse to snapshot.
+ subprocess.run([str(SYSTEM_DIR / "stop")], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+ timeout=120, check=False)
+ for _ in range(60):
+ rc = subprocess.run([str(SYSTEM_DIR / "check")], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+ timeout=10, check=False).returncode
+ if rc != 0:
+ break
+ time.sleep(0.5)
+ subprocess.run([str(SYSTEM_DIR / "start")], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+ timeout=300, check=False)
+ for _ in range(300):
+ rc = subprocess.run([str(SYSTEM_DIR / "check")], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+ timeout=10, check=False).returncode
+ if rc == 0:
+ break
+ time.sleep(1)
+
+ # 2. Drop the kernel's page+dentry+inode cache. The page cache holds
+ # 3-5 GB of file data the system would re-read on demand anyway;
+ # those pages become zero-fill in the snapshot, which zstd
+ # compresses ~50:1 vs random data.
+ subprocess.run(["sync"], check=False)
+ try:
+ Path("/proc/sys/vm/drop_caches").write_text("3\n")
+ except Exception:
+ pass
+
PROVISION_DONE.write_text(f"ok {time.time()}\n")
PROVISION_LOG.write_bytes(b"".join(log_lines))
return 0, b"".join(log_lines)
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
index 7b97e4d46c..45003c86aa 100644
--- a/playground/server/vm_manager.py
+++ b/playground/server/vm_manager.py
@@ -54,6 +54,14 @@ class VM:
api_sock: Path
log_sock: Path # we just point this at /dev/null actually
pid: Optional[int] = None
+ # Keep the asyncio.subprocess.Process handle for the running firecracker.
+ # Without holding it, Python eventually garbage-collects the wrapper and
+ # the underlying child sits as a zombie until the host server
+ # exits — the kernel keeps the zombie's open TAP fd around with it, and a
+ # subsequent restore for the same slot then fails to open the TAP with
+ # "Resource busy". Holding the handle lets us `await proc.wait()` on
+ # shutdown and reap immediately.
+ proc: Optional[asyncio.subprocess.Process] = None
state: str = "down"
# Snapshot artifacts
snapshot_bin: Path = dataclasses.field(default_factory=lambda: Path())
@@ -111,8 +119,8 @@ async def ensure_ready_for_query(self, system: str) -> VM:
# Process is gone or unresponsive. Treat as snapshotted.
vm.state = "snapshotted"
if vm.state == "down":
- if not vm.snapshot_bin.exists():
- # No snapshot yet — need a full provision.
+ if not _has_snapshot(vm):
+ # No snapshot (raw or compressed) yet — full provision.
await self._initial_provision(vm)
else:
await self._restore_snapshot(vm)
@@ -177,6 +185,7 @@ async def _spawn_firecracker(self, vm: VM) -> None:
"--id", vm.system.name,
stdout=log_fh, stderr=log_fh, env=env, start_new_session=True,
)
+ vm.proc = proc
vm.pid = proc.pid
# Wait for the API socket to exist
for _ in range(80):
@@ -248,31 +257,43 @@ async def _boot(self, vm: VM, *, restore_snapshot: bool) -> None:
True, we load from the snapshot files; else we cold-boot from kernel +
rootfs."""
await self._spawn_firecracker(vm)
- sock = str(vm.api_sock)
+ try:
+ await self._configure_boot(vm, restore_snapshot=restore_snapshot)
+ except Exception:
+ # If config fails partway, the firecracker process still owns the
+ # TAP fd; without reaping it, the next attempt sees "Resource
+ # busy" because the kernel hasn't released the TAP. Kill +
+ # wait() before propagating.
+ await self._shutdown(vm)
+ raise
- # Network: must be configured *before* either boot path.
- await fc.put(sock, f"/network-interfaces/eth0", {
- "iface_id": "eth0",
- "guest_mac": net.mac_for(vm.slot),
- "host_dev_name": net.tap_name(vm.slot),
- })
+ async def _configure_boot(self, vm: VM, *, restore_snapshot: bool) -> None:
+ sock = str(vm.api_sock)
rootfs = self.cfg.systems_dir / vm.system.name / "rootfs.ext4"
sysdisk = self.cfg.systems_dir / vm.system.name / "system.ext4"
if restore_snapshot:
- # Drives must match the layout that existed when the snapshot was
- # taken, but Firecracker re-reads file paths on restore. We rebind
- # them here in case the absolute paths changed (e.g. snapshot moved).
+ # Firecracker's rule: `PUT /snapshot/load` must be the *first*
+ # configuring action — no boot-source, no drives, no network
+ # interfaces, no machine-config beforehand. The snapshot itself
+ # encodes all of that. We just need the same TAP available on
+ # the host with the same name (host_ensure_tap below handles
+ # this).
await fc.put(sock, "/snapshot/load", {
"snapshot_path": str(vm.snapshot_state),
"mem_backend": {"backend_type": "File", "backend_path": str(vm.snapshot_bin)},
"enable_diff_snapshots": False,
"resume_vm": True,
- })
+ }, timeout=120.0)
return
# Cold boot.
+ await fc.put(sock, "/network-interfaces/eth0", {
+ "iface_id": "eth0",
+ "guest_mac": net.mac_for(vm.slot),
+ "host_dev_name": net.tap_name(vm.slot),
+ })
await fc.put(sock, "/boot-source", {
"kernel_image_path": str(self.cfg.kernel_path),
"boot_args": self._kernel_cmdline(vm),
@@ -310,8 +331,73 @@ async def _snapshot(self, vm: VM) -> None:
with contextlib.suppress(Exception):
await fc.patch(sock, "/vm", {"state": "Resumed"})
+ # Compress the memory dump with parallel zstd. Firecracker writes the
+ # *full* 16 GB of guest memory regardless of how much was actually
+ # used; zstd at -3 with -T0 turns that into ~10-12 GB in a few
+ # seconds (most of the savings come from the agent's drop_caches
+ # right before /snapshot — page cache zero-fills compress 50:1).
+ # snapshot.state stays as-is; it's tiny (~60 KB).
+ await self._compress_snapshot(vm)
+
+ async def _compress_snapshot(self, vm: VM) -> None:
+ bin_path = vm.snapshot_bin
+ zst_path = vm.snapshot_bin.with_suffix(".bin.zst")
+ if not bin_path.exists():
+ return
+ log.info("[%s] zstd -T0 -3 snapshot.bin (%s)",
+ vm.system.name, _fmt_size(bin_path.stat().st_size))
+ t0 = time.monotonic()
+ # Stream from snapshot.bin to .zst, multi-threaded. `--long=27`
+ # widens the matching window to 128 MB which helps with repetitive
+ # zero-region patterns common in guest RAM.
+ proc = await asyncio.create_subprocess_exec(
+ "zstd", "-T0", "-3", "--long=27", "-q", "-f",
+ str(bin_path), "-o", str(zst_path),
+ )
+ rc = await proc.wait()
+ dt = time.monotonic() - t0
+ if rc != 0:
+ log.warning("[%s] zstd compression failed rc=%d; keeping raw .bin",
+ vm.system.name, rc)
+ zst_path.unlink(missing_ok=True)
+ return
+ new = zst_path.stat().st_size
+ log.info("[%s] zstd done in %.1fs: %s -> %s (%.1fx)",
+ vm.system.name, dt,
+ _fmt_size(bin_path.stat().st_size), _fmt_size(new),
+ bin_path.stat().st_size / max(1, new))
+ # The raw .bin can go; restore re-decompresses into a temp file.
+ bin_path.unlink(missing_ok=True)
+
+ async def _decompress_snapshot(self, vm: VM) -> None:
+ """If the snapshot lives as .bin.zst, decompress to .bin in place.
+ Idempotent: a no-op if .bin already exists.
+ """
+ bin_path = vm.snapshot_bin
+ zst_path = vm.snapshot_bin.with_suffix(".bin.zst")
+ if bin_path.exists():
+ return
+ if not zst_path.exists():
+ return
+ log.info("[%s] unzstd snapshot.bin.zst (%s)",
+ vm.system.name, _fmt_size(zst_path.stat().st_size))
+ t0 = time.monotonic()
+ proc = await asyncio.create_subprocess_exec(
+ "zstd", "-T0", "-d", "-q", "-f", "--long=27",
+ str(zst_path), "-o", str(bin_path),
+ )
+ rc = await proc.wait()
+ dt = time.monotonic() - t0
+ if rc != 0:
+ raise RuntimeError(f"zstd decompress failed rc={rc}")
+ log.info("[%s] unzstd done in %.1fs -> %s",
+ vm.system.name, dt, _fmt_size(bin_path.stat().st_size))
+
async def _restore_snapshot(self, vm: VM) -> None:
log.info("[%s] restore from snapshot", vm.system.name)
+ # If we only have the zstd-compressed memory dump, expand it before
+ # Firecracker tries to mmap it.
+ await self._decompress_snapshot(vm)
await net.ensure_tap(vm.slot)
# internet stays OFF post-snapshot
await self._boot(vm, restore_snapshot=True)
@@ -319,19 +405,30 @@ async def _restore_snapshot(self, vm: VM) -> None:
vm.state = "ready"
async def _shutdown(self, vm: VM) -> None:
- """Best-effort clean shutdown of the firecracker process."""
- if not vm.pid:
+ """Best-effort clean shutdown of the firecracker process.
+
+ Always reap the asyncio.subprocess.Process handle so the kernel
+ releases its open file descriptors (notably the TAP — without this
+ the next /restore for the same slot fails with `Resource busy`).
+ """
+ if not vm.pid and not vm.proc:
return
with contextlib.suppress(Exception):
await fc.put(str(vm.api_sock), "/actions", {"action_type": "SendCtrlAltDel"})
- # Wait briefly for graceful exit
+ # Wait briefly for graceful exit.
for _ in range(50):
- if not _pid_alive(vm.pid):
+ if vm.pid is None or not _pid_alive(vm.pid):
break
await asyncio.sleep(0.1)
- if _pid_alive(vm.pid):
+ if vm.pid is not None and _pid_alive(vm.pid):
with contextlib.suppress(ProcessLookupError):
os.kill(vm.pid, signal.SIGKILL)
+ # Reap the process. asyncio.Process.wait() drains the exit status so
+ # the kernel can release the resources (TAP fd, memory mappings).
+ if vm.proc is not None:
+ with contextlib.suppress(Exception):
+ await asyncio.wait_for(vm.proc.wait(), timeout=5.0)
+ vm.proc = None
vm.pid = None
with contextlib.suppress(FileNotFoundError):
vm.api_sock.unlink()
@@ -340,7 +437,13 @@ async def _teardown(self, vm: VM, reason: str) -> None:
log.warning("[%s] teardown: %s", vm.system.name, reason)
with contextlib.suppress(Exception):
await self._shutdown(vm)
- vm.state = "snapshotted" if vm.snapshot_bin.exists() else "down"
+ vm.state = "snapshotted" if _has_snapshot(vm) else "down"
+ # Drop the decompressed snapshot.bin if we still have the .zst — it's
+ # ~16 GB of redundancy on disk. Keep .zst as the canonical artifact.
+ zst = vm.snapshot_bin.with_suffix(".bin.zst")
+ if vm.snapshot_bin.exists() and zst.exists():
+ with contextlib.suppress(FileNotFoundError):
+ vm.snapshot_bin.unlink()
# ── agent helpers ────────────────────────────────────────────────────
@@ -381,6 +484,18 @@ async def _call_agent_provision(self, vm: VM) -> None:
f"{body[-2000:].decode(errors='replace')}")
+def _has_snapshot(vm: VM) -> bool:
+ return vm.snapshot_bin.exists() or vm.snapshot_bin.with_suffix(".bin.zst").exists()
+
+
+def _fmt_size(n: int) -> str:
+ for u in ("B", "KiB", "MiB", "GiB", "TiB"):
+ if n < 1024:
+ return f"{n:.1f}{u}"
+        n /= 1024
+    return f"{n:.1f}PiB"
+
+
def _pid_alive(pid: int) -> bool:
try:
os.kill(pid, 0)
From db4625a6176fd6f76ab4fa0aa98601ae9a1f818a Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 12 May 2026 20:25:28 +0000
Subject: [PATCH 004/221] playground: capture stop/start output in provision
log
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The previous version threw away stdout/stderr from the pre-snapshot
stop/start cycle, so a silent failure (`sudo clickhouse start` failing
because the data dir was still locked by the dying daemon, etc.) left
us with a snapshot of a dead clickhouse-server — restored VMs then
returned "Connection refused (localhost:9000)" on every query and the
only way to recover was to manually delete the snapshot.
Capture stdout+stderr into the provision log so the failure mode is
visible via GET /provision-log, and refuse to mark PROVISION_DONE if
./check doesn't recover within the timeout. The host then sees /provision
return 500 and skips the snapshot step entirely.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/agent/agent.py | 43 +++++++++++++++++++++++++--------------
1 file changed, 28 insertions(+), 15 deletions(-)
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
index 3a2135155d..45dce9f667 100644
--- a/playground/agent/agent.py
+++ b/playground/agent/agent.py
@@ -271,20 +271,22 @@ def _provision() -> tuple[int, bytes]:
# finishes, most engines have hundreds of MB of fresh per-INSERT
# state on the heap: ClickHouse's MergeTree merge thread arenas,
# Postgres' aborted-batch buffers, etc. Stop/start sheds that
- # private memory back to zero. Wait for ./check to confirm the
- # server is ready again so the snapshot we take is on a quiesced
- # process whose first user query will be a cold *query*, not a
- # cold *startup*. Skip this for in-process engines (chdb, polars,
- # pandas, …) where stop/start is a no-op AND the data lives in
- # the process — wiping it would defeat the whole point.
+ # private memory back to zero. We capture stop/start output into
+ # the provision log so a broken restart can be diagnosed, and
+ # bail out of /provision if ./check doesn't recover — the host
+ # must NOT snapshot a dead daemon, since post-restore /query
+ # would then hit "Connection refused" until the user manually
+ # kicks the VM. Skip the whole dance for in-process engines
+ # (chdb, polars, pandas, …) where stop/start is a no-op AND the
+ # data lives in the process; wiping it would defeat the point.
restartable = (SYSTEM_DIR / "start").exists() and (SYSTEM_DIR / "stop").exists()
if restartable:
- # Best effort: don't bail on errors. We try to stop, wait, start,
- # check; if any step fails we proceed with whatever state we
- # have. The host will see ./check fail and refuse to snapshot.
- subprocess.run([str(SYSTEM_DIR / "stop")], cwd=str(SYSTEM_DIR),
- stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
- timeout=120, check=False)
+ log_lines.append(b"\n=== pre-snapshot restart ===\n")
+ r = subprocess.run([str(SYSTEM_DIR / "stop")], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+ timeout=120, check=False)
+ log_lines.append(b"stop: rc=" + str(r.returncode).encode() + b"\n")
+ log_lines.append(r.stdout or b"")
for _ in range(60):
rc = subprocess.run([str(SYSTEM_DIR / "check")], cwd=str(SYSTEM_DIR),
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
@@ -292,16 +294,27 @@ def _provision() -> tuple[int, bytes]:
if rc != 0:
break
time.sleep(0.5)
- subprocess.run([str(SYSTEM_DIR / "start")], cwd=str(SYSTEM_DIR),
- stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
- timeout=300, check=False)
+ r = subprocess.run([str(SYSTEM_DIR / "start")], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+ timeout=300, check=False)
+ log_lines.append(b"start: rc=" + str(r.returncode).encode() + b"\n")
+ log_lines.append(r.stdout or b"")
+ restart_ok = False
for _ in range(300):
rc = subprocess.run([str(SYSTEM_DIR / "check")], cwd=str(SYSTEM_DIR),
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
timeout=10, check=False).returncode
if rc == 0:
+ restart_ok = True
break
time.sleep(1)
+ if not restart_ok:
+ log_lines.append(b"=== pre-snapshot restart FAILED ===\n")
+ PROVISION_LOG.write_bytes(b"".join(log_lines))
+ # Do NOT set PROVISION_DONE; force /provision to return 500
+ # so the host doesn't snapshot a dead daemon.
+ return 1, b"".join(log_lines)
+ log_lines.append(b"=== pre-snapshot restart ok ===\n")
# 2. Drop the kernel's page+dentry+inode cache. The page cache holds
# 3-5 GB of file data the system would re-read on demand anyway;
From f9aed82925d92451ad014ba2411e79f469042c3d Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 12 May 2026 20:34:04 +0000
Subject: [PATCH 005/221] playground: kick daemon on agent boot, refresh rootfs
on re-provision
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
PROVISION_DONE lives on the rootfs disk (/var/lib/clickbench-agent/),
which persists across VM cold-boots. So on the second provision after
the host deleted the snapshot files, the agent saw PROVISION_DONE
already set and returned "already provisioned" — but the daemon
itself wasn't running (cold boot, no clickhouse-server in systemd),
so the host snapshotted an empty VM and every restored query came back
with "Connection refused (localhost:9000)".
Two fixes:
1. Agent: on every startup, if PROVISION_DONE is set, kick ./start
in a background thread. start is idempotent for the systems that
have a daemon, so it costs nothing when the daemon is already up
(post-restore) and brings it up when the rootfs is being re-used
across a cold reboot.
2. Host: when (re-)provisioning a system with no snapshot, drop the
existing rootfs.ext4 so install/start/load run fresh. The
system.ext4 (which holds ~14 GB of pre-staged dataset) is preserved.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/agent/agent.py | 34 +++++++++++++++++++++++++++++++++
playground/server/vm_manager.py | 9 +++++++++
2 files changed, 43 insertions(+)
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
index 45dce9f667..d2b0def356 100644
--- a/playground/agent/agent.py
+++ b/playground/agent/agent.py
@@ -407,10 +407,44 @@ class ReusableServer(socketserver.ThreadingTCPServer):
daemon_threads = True
+def _kick_daemon_if_provisioned() -> None:
+ """On every agent boot, if the system has been provisioned, make sure
+ the daemon is also running.
+
+ The rootfs is persistent across boots, so PROVISION_DONE survives a
+ cold restart of the VM. But the *process* doesn't — anything that was
+ in the snapshot's memory image goes away when the host takes a cold
+ boot (not a restore). Without this kick, a query would arrive at the
+ agent, the agent would see PROVISION_DONE and skip install/start,
+ and then ./query would hit a dead daemon and return "Connection
+ refused (localhost:9000)" forever.
+
+ Run start asynchronously: blocking the agent's listen until the
+ daemon is ready would defeat /health, which the host uses to gate
+ snapshot creation and restore-wait timeouts.
+ """
+ if not PROVISION_DONE.exists():
+ return
+ start = SYSTEM_DIR / "start"
+ if not start.exists() or not os.access(start, os.X_OK):
+ return
+
+ def _bg() -> None:
+ try:
+ subprocess.run([str(start)], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+ timeout=300, check=False)
+ except Exception as e:
+ sys.stderr.write(f"[agent] daemon-kick failed: {e}\n")
+
+ threading.Thread(target=_bg, daemon=True, name="daemon-kick").start()
+
+
def main() -> None:
addr = ("0.0.0.0", LISTEN_PORT)
print(f"agent: system={SYSTEM_NAME} listen={addr[0]}:{addr[1]} "
f"dir={SYSTEM_DIR} data={DATASETS_DIR}", flush=True)
+ _kick_daemon_if_provisioned()
with ReusableServer(addr, Handler) as srv:
srv.serve_forever()
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
index 45003c86aa..8d75a23751 100644
--- a/playground/server/vm_manager.py
+++ b/playground/server/vm_manager.py
@@ -239,6 +239,15 @@ async def _build_images_if_needed(self, vm: VM) -> None:
sys_dir = self.cfg.systems_dir / vm.system.name
rootfs = sys_dir / "rootfs.ext4"
sysdisk = sys_dir / "system.ext4"
+ # If we're (re-)provisioning a system whose rootfs already has
+ # /var/lib/clickbench-agent/provisioned set, drop just the rootfs so
+ # the agent reruns the full install/start/load flow on the next
+ # boot. The system.ext4 (scripts + ~14 GB of dataset) is preserved —
+ # rebuilding it copies 14 GB unnecessarily.
+ if rootfs.exists() and not _has_snapshot(vm):
+ log.info("[%s] rootfs exists but no snapshot — dropping it for "
+ "a fresh agent state", vm.system.name)
+ rootfs.unlink()
if rootfs.exists() and sysdisk.exists():
return
log.info("[%s] building rootfs + system disk", vm.system.name)
From b3c05ee8df0870fae988399f14a4c0598d72fc62 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 12 May 2026 20:34:44 +0000
Subject: [PATCH 006/221] playground: pre-stamp 'ubuntu' in /etc/hosts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The cloud image ships hostname=ubuntu but /etc/hosts only maps
'localhost' to 127.0.0.1. Every sudo invocation inside the VM then
tries to resolve the hostname 'ubuntu' over the network, which has no
DNS after the snapshot drops internet, and pays the ~2 s resolver
timeout. With several sudos per ./query, that's a multi-second floor
on every query, visible in the firecracker log as repeated
'sudo: unable to resolve host ubuntu: Name or service not known'.
Mapping ubuntu to 127.0.0.1 short-circuits the lookup.
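The stall itself is easy to observe from inside a guest; a minimal sketch (illustrative only; the exact delay depends on what resolv.conf points at):
```
#!/usr/bin/env python3
# Time how long resolving the local hostname takes, as sudo does on startup.
import socket
import time

name = socket.gethostname()   # "ubuntu" in the stock cloud image
t0 = time.monotonic()
try:
    socket.getaddrinfo(name, None)
    verdict = "resolved"
except socket.gaierror as e:
    verdict = f"failed: {e}"
print(f"{name}: {verdict} after {time.monotonic() - t0:.2f} s")
```
With the /etc/hosts entry in place it returns instantly; without it, the time is whatever the resolver needs to give up.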
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/images/build-base-rootfs.sh | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh
index 6e4323bc71..e3a102204c 100755
--- a/playground/images/build-base-rootfs.sh
+++ b/playground/images/build-base-rootfs.sh
@@ -181,6 +181,16 @@ EOF
# and break on empty.
mkdir -p /root
chmod 700 /root
+
+# /etc/hosts: ensure both "localhost" and the cloud-image hostname "ubuntu"
+# resolve locally. Without the second entry, every sudo invocation does a
+# DNS lookup that times out (~2 s each) trying to find "ubuntu" on
+# the dropped-internet network, which adds up to a multi-second floor on
+# every /query. Pre-stamping the hostname removes the round trip.
+cat > /etc/hosts <<EOF
+127.0.0.1   localhost
+127.0.0.1   ubuntu
+EOF
Date: Tue, 12 May 2026 20:54:39 +0000
Subject: [PATCH 007/221] playground: pre-snapshot sync from host + drop daemon
restart
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The mid-snapshot checksum-mismatch I attributed to "stopping the
daemon mid-merge" was actually FS corruption: KVM pauses the vcpus
the moment we call /vm Paused, and any ext4 writeback that was in
flight at that instant gets captured by the snapshot as half-flushed.
On restore the page cache references on-disk blocks that never landed,
and the next read sees a torn write.
Fix:
1. Drop the pre-snapshot stop/start. Killing ClickHouse at any
point never corrupts on-disk MergeTree data — only an unflushed
FS can.
2. Add a /sync endpoint to the agent and call it from the host
right before /vm Paused, so all dirty pages have hit virtio-blk
before KVM freezes the vcpus.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/agent/agent.py | 75 ++++++++-------------------------
playground/server/vm_manager.py | 18 ++++++++
2 files changed, 36 insertions(+), 57 deletions(-)
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
index d2b0def356..a6950e0d9a 100644
--- a/playground/agent/agent.py
+++ b/playground/agent/agent.py
@@ -263,63 +263,10 @@ def _provision() -> tuple[int, bytes]:
PROVISION_LOG.write_bytes(b"".join(log_lines))
return r.returncode, b"".join(log_lines)
- subprocess.run(["sync"], check=False)
-
- # Pre-snapshot trim:
- #
- # 1. Restart the daemon if the system is restartable. After ./load
- # finishes, most engines have hundreds of MB of fresh per-INSERT
- # state on the heap: ClickHouse's MergeTree merge thread arenas,
- # Postgres' aborted-batch buffers, etc. Stop/start sheds that
- # private memory back to zero. We capture stop/start output into
- # the provision log so a broken restart can be diagnosed, and
- # bail out of /provision if ./check doesn't recover — the host
- # must NOT snapshot a dead daemon, since post-restore /query
- # would then hit "Connection refused" until the user manually
- # kicks the VM. Skip the whole dance for in-process engines
- # (chdb, polars, pandas, …) where stop/start is a no-op AND the
- # data lives in the process; wiping it would defeat the point.
- restartable = (SYSTEM_DIR / "start").exists() and (SYSTEM_DIR / "stop").exists()
- if restartable:
- log_lines.append(b"\n=== pre-snapshot restart ===\n")
- r = subprocess.run([str(SYSTEM_DIR / "stop")], cwd=str(SYSTEM_DIR),
- stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
- timeout=120, check=False)
- log_lines.append(b"stop: rc=" + str(r.returncode).encode() + b"\n")
- log_lines.append(r.stdout or b"")
- for _ in range(60):
- rc = subprocess.run([str(SYSTEM_DIR / "check")], cwd=str(SYSTEM_DIR),
- stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
- timeout=10, check=False).returncode
- if rc != 0:
- break
- time.sleep(0.5)
- r = subprocess.run([str(SYSTEM_DIR / "start")], cwd=str(SYSTEM_DIR),
- stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
- timeout=300, check=False)
- log_lines.append(b"start: rc=" + str(r.returncode).encode() + b"\n")
- log_lines.append(r.stdout or b"")
- restart_ok = False
- for _ in range(300):
- rc = subprocess.run([str(SYSTEM_DIR / "check")], cwd=str(SYSTEM_DIR),
- stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
- timeout=10, check=False).returncode
- if rc == 0:
- restart_ok = True
- break
- time.sleep(1)
- if not restart_ok:
- log_lines.append(b"=== pre-snapshot restart FAILED ===\n")
- PROVISION_LOG.write_bytes(b"".join(log_lines))
- # Do NOT set PROVISION_DONE; force /provision to return 500
- # so the host doesn't snapshot a dead daemon.
- return 1, b"".join(log_lines)
- log_lines.append(b"=== pre-snapshot restart ok ===\n")
-
- # 2. Drop the kernel's page+dentry+inode cache. The page cache holds
- # 3-5 GB of file data the system would re-read on demand anyway;
- # those pages become zero-fill in the snapshot, which zstd
- # compresses ~50:1 vs random data.
+ # Drop the page+dentry+inode cache. The page cache typically holds
+ # 3-5 GB of file data the system would re-read on demand anyway;
+ # those pages turn into zero-fill in the snapshot, which zstd
+ # compresses 50:1 vs random data.
subprocess.run(["sync"], check=False)
try:
Path("/proc/sys/vm/drop_caches").write_text("3\n")
@@ -365,6 +312,20 @@ def do_GET(self) -> None:
self._send_json(404, {"error": "not found", "path": self.path})
def do_POST(self) -> None:
+ if self.path == "/sync":
+ # Flush all dirty pages to the virtio-blk devices. The host
+ # calls this immediately before /snapshot/create so the
+ # on-disk image captured in the snapshot is consistent with
+ # what the in-memory page cache thinks is there. Without
+ # this, a long-running daemon's writeback may still be in
+ # flight when KVM pauses the vcpus, the snapshot freezes a
+ # mid-flush state, and post-restore reads see torn or
+ # checksum-mismatched data.
+ t0 = time.monotonic()
+ subprocess.run(["sync"], check=False)
+ self._send(200, f"{time.monotonic() - t0:.3f}\n".encode(),
+ {"Content-Type": "text/plain"})
+ return
if self.path == "/provision":
rc, log = _provision()
self._send(200 if rc == 0 else 500, log[-OUTPUT_LIMIT:],
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
index 8d75a23751..be655a3458 100644
--- a/playground/server/vm_manager.py
+++ b/playground/server/vm_manager.py
@@ -327,6 +327,14 @@ async def _configure_boot(self, vm: VM, *, restore_snapshot: bool) -> None:
await fc.put(sock, "/actions", {"action_type": "InstanceStart"})
async def _snapshot(self, vm: VM) -> None:
+ # Flush the guest's dirty pages to the virtio-blk devices before we
+ # pause the vcpus. Without an explicit sync here, KVM can freeze
+ # the guest mid-flush — the snapshot then captures memory that
+ # references on-disk blocks that haven't actually landed yet, and
+ # the next read after restore sees a checksum mismatch / torn
+ # write on whatever was being written at the moment of pause.
+ await self._sync_guest(vm)
+
sock = str(vm.api_sock)
await fc.patch(sock, "/vm", {"state": "Paused"})
try:
@@ -492,6 +500,16 @@ async def _call_agent_provision(self, vm: VM) -> None:
raise RuntimeError(f"agent /provision failed: {r.status}: "
f"{body[-2000:].decode(errors='replace')}")
+ async def _sync_guest(self, vm: VM) -> None:
+ url = self.agent_url(vm) + "/sync"
+ try:
+ async with aiohttp.ClientSession() as s:
+ async with s.post(url, timeout=aiohttp.ClientTimeout(total=300)) as r:
+ body = (await r.read()).decode("utf-8", errors="replace").strip()
+ log.info("[%s] guest sync: %s", vm.system.name, body)
+ except Exception as e:
+ log.warning("[%s] guest sync failed (%r); proceeding anyway", vm.system.name, e)
+
def _has_snapshot(vm: VM) -> bool:
return vm.snapshot_bin.exists() or vm.snapshot_bin.with_suffix(".bin.zst").exists()
From 446b3f7ebbf284f52e70ac671c2d587b6bff5d80 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 12 May 2026 20:59:36 +0000
Subject: [PATCH 008/221] playground: stop daemon before snapshot for tiny
snapshot.bin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Now that the host /syncs the FS before pausing the vcpus, the snapshot
captures consistent on-disk state regardless of when the daemon exits
(MergeTree's on-disk format is durable under arbitrary process exit;
only an unflushed *filesystem* corrupts it). So we can shut the daemon
down here to evict its private heap (merge thread arenas, query cache,
mark cache, uncompressed cache, ingest buffers) and snapshot what's
left — mostly zero-fill RAM, which zstd compresses ~300:1.
Restore path is unchanged: _kick_daemon_if_provisioned at agent
startup brings the daemon back up on every cold restore. First query
in a restored VM pays a 1-2 s daemon-start cost instead of carrying
8-12 GB of memory in every snapshot.
In-process engines (chdb, polars, …) keep all state in RAM and have
no daemon to stop; for them, has_daemon is false and we skip the
stop step, falling back to drop_caches alone.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/agent/agent.py | 51 ++++++++++++++++++++++++++++++++++++---
1 file changed, 47 insertions(+), 4 deletions(-)
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
index a6950e0d9a..743a4068f0 100644
--- a/playground/agent/agent.py
+++ b/playground/agent/agent.py
@@ -263,10 +263,53 @@ def _provision() -> tuple[int, bytes]:
PROVISION_LOG.write_bytes(b"".join(log_lines))
return r.returncode, b"".join(log_lines)
- # Drop the page+dentry+inode cache. The page cache typically holds
- # 3-5 GB of file data the system would re-read on demand anyway;
- # those pages turn into zero-fill in the snapshot, which zstd
- # compresses 50:1 vs random data.
+ # Pre-snapshot trim. The host /sync's the FS right before pausing
+ # the vcpus, so any on-disk data the daemon has already committed
+ # is durable. That means we're free to stop the daemon here:
+ # ClickHouse's MergeTree (and equivalent on-disk stores) never
+ # produce inconsistent on-disk state regardless of when the
+ # process exits — only an unflushed *filesystem* can. With the
+ # host-side /sync in place, we can shut the daemon down to evict
+ # its private heap (merge thread arenas, query cache, mark cache,
+ # uncompressed cache, parquet ingest buffers, …) and snapshot a
+ # mostly-zero RAM image. The agent's startup path
+ # (_kick_daemon_if_provisioned) brings it back up on every
+ # restore, so the first query in a restored VM pays a 1-2 s
+ # daemon-start cost instead of carrying 8-12 GB of memory in
+ # every snapshot.
+ #
+ # Skip for in-process / stateless tools where stop/start is a
+ # no-op AND the data lives in process memory; wiping it would
+ # defeat the point. Those systems can rely on drop_caches alone.
+ stop = SYSTEM_DIR / "stop"
+ start = SYSTEM_DIR / "start"
+ check = SYSTEM_DIR / "check"
+ has_daemon = (stop.exists() and start.exists() and
+ check.exists() and os.access(stop, os.X_OK) and
+ os.access(start, os.X_OK))
+ if has_daemon:
+ log_lines.append(b"\n=== pre-snapshot stop ===\n")
+ r = subprocess.run([str(stop)], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+ timeout=120, check=False)
+ log_lines.append(b"stop: rc=" + str(r.returncode).encode() + b"\n")
+ log_lines.append(r.stdout or b"")
+ # Wait for the daemon to actually exit (./check failing means
+ # it's gone). Tolerant if it never fails — we still proceed.
+ for _ in range(120):
+ rc = subprocess.run([str(check)], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ timeout=10, check=False).returncode
+ if rc != 0:
+ break
+ time.sleep(0.5)
+ log_lines.append(b"=== pre-snapshot stop done ===\n")
+
+ # Drop the page+dentry+inode cache. With the daemon stopped, this
+ # frees both file cache AND its mmap'd buffers. The result is a
+ # snapshot whose memory is mostly zero-fill, which zstd compresses
+ # ~300:1.
subprocess.run(["sync"], check=False)
try:
Path("/proc/sys/vm/drop_caches").write_text("3\n")
From 2d3ac3f963c2c523ea4f2f23054612644ba2c754 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 12 May 2026 21:16:09 +0000
Subject: [PATCH 009/221] playground: init_on_free=1 + ensure-daemon-up on
first /query
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Two changes for the small-snapshot path:
1. Pass init_on_free=1 in the guest kernel cmdline. Linux normally
leaves freed page frames with whatever bytes were last written to
them, so the post-`clickhouse stop` free pool was ~10 GB of stale
daemon heap and Firecracker's snapshot dump compressed only ~3:1.
init_on_free=1 zeros every page as it goes onto the free list, so
the snapshot's RAM region is genuinely zero-filled and zstd hits
~300:1.
2. Add `_ensure_daemon_started` at the top of the agent's /query
handler. After a snapshot restore (taken with the daemon stopped),
the restored memory has no daemon process and `localhost:9000`
refuses connections. The cold-boot `_kick_daemon_if_provisioned`
only fires on actual cold boots, not on snapshot resumes, so we
need an explicit check at query time. Lock-protected so concurrent
/query requests don't try to ./start the daemon twice; idempotent
and free once the daemon is up.
Also dropped the userspace _zero_free_ram hack — init_on_free does
it natively at no userspace cost.
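A trivial guest-side check (not part of the patch) that the flag actually made it into the boot args:
```
#!/usr/bin/env python3
# Report whether the running kernel was booted with init_on_free=1.
from pathlib import Path

flags = Path("/proc/cmdline").read_text().split()
if "init_on_free=1" in flags:
    print("init_on_free=1: freed pages are zeroed, snapshot RAM stays compressible")
else:
    print("init_on_free is off: freed pages keep stale data, expect poor compression")
```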
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/agent/agent.py | 59 ++++++++++++++++++++++++++++++---
playground/server/vm_manager.py | 11 +++++-
2 files changed, 65 insertions(+), 5 deletions(-)
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
index 743a4068f0..40ddbaae5e 100644
--- a/playground/agent/agent.py
+++ b/playground/agent/agent.py
@@ -61,6 +61,12 @@
# scripts hitting the same socket/temp file concurrently would not be safe.
_query_lock = threading.Lock()
_provision_lock = threading.Lock()
+# Tracks whether we've successfully run ./start since this agent process
+# came up. After a snapshot restore the daemon doesn't exist in the
+# restored memory (we stop it pre-snapshot to keep snapshots small), so the
+# first /query has to bring it up.
+_daemon_started = threading.Event()
+_daemon_lock = threading.Lock()
def _cap(b: bytes) -> tuple[bytes, bool]:
@@ -130,6 +136,45 @@ def _stats_snapshot() -> dict:
return out
+def _ensure_daemon_started() -> None:
+ """Bring the system's daemon up if it isn't already.
+
+ Called at the top of every /query handler. The first call after a
+ snapshot restore is where the work happens — the snapshot was taken
+ with the daemon stopped (to keep the memory image compressible), so
+ nothing is listening on the daemon's port until we explicitly run
+ ./start. Subsequent calls are no-ops because _daemon_started is set.
+
+ Wrapping ./start in a thread lock means only one /query in flight
+ pays the start cost, even if several arrive concurrently.
+ """
+ if _daemon_started.is_set():
+ return
+ with _daemon_lock:
+ if _daemon_started.is_set():
+ return
+ start = SYSTEM_DIR / "start"
+ if not start.exists() or not os.access(start, os.X_OK):
+ # No daemon to start (in-process system like chdb/polars).
+ _daemon_started.set()
+ return
+ subprocess.run([str(start)], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+ timeout=300, check=False)
+ # Wait for ./check to confirm before unblocking the /query.
+ check = SYSTEM_DIR / "check"
+ if check.exists():
+ for _ in range(120):
+ rc = subprocess.run([str(check)], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ timeout=10, check=False).returncode
+ if rc == 0:
+ break
+ time.sleep(0.5)
+ _daemon_started.set()
+
+
def _run_query(sql: bytes) -> tuple[int, bytes, bytes, float]:
"""
Invoke ./query with the SQL on stdin.
@@ -306,10 +351,12 @@ def _provision() -> tuple[int, bytes]:
time.sleep(0.5)
log_lines.append(b"=== pre-snapshot stop done ===\n")
- # Drop the page+dentry+inode cache. With the daemon stopped, this
- # frees both file cache AND its mmap'd buffers. The result is a
- # snapshot whose memory is mostly zero-fill, which zstd compresses
- # ~300:1.
+ # Drop the page+dentry+inode cache. With init_on_free=1 set in the
+ # guest kernel cmdline (see vm_manager._kernel_cmdline), every page
+ # the kernel frees gets zero-filled before going back on the free
+ # list. After clickhouse stop + drop_caches, the entire free pool
+ # is genuinely zero-filled, and the snapshot's RAM dump compresses
+ # ~300:1 instead of the ~3:1 we got without init_on_free.
subprocess.run(["sync"], check=False)
try:
Path("/proc/sys/vm/drop_caches").write_text("3\n")
@@ -383,6 +430,10 @@ def do_POST(self) -> None:
if not sql.strip():
self._send_json(400, {"error": "empty query"})
return
+ # First /query after a snapshot restore: start the daemon
+ # (it was stopped pre-snapshot to keep snapshots small).
+ # Subsequent calls are a near-instant no-op.
+ _ensure_daemon_started()
with _query_lock:
rc, out, err, wall = _run_query(sql)
script_t = _extract_script_timing(err)
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
index be655a3458..926af22733 100644
--- a/playground/server/vm_manager.py
+++ b/playground/server/vm_manager.py
@@ -200,9 +200,18 @@ def _kernel_cmdline(self, vm: VM) -> str:
# console socket for debugging); reboot=k for clean halt-on-panic.
# The kernel's built-in IP autoconfig statically assigns the VM's
# /24 from its slot, sidestepping any DHCP/networkd in userland.
+ #
+ # init_on_free=1: makes the kernel zero every page as it goes back
+ # on the free list. Without it, freed pages keep whatever the last
+ # writer put there — and Firecracker's snapshot dumps *all* RAM,
+ # so 8-12 GB of stale-but-freed daemon heap end up in snapshot.bin
+ # looking random to zstd. With it on, the pre-snapshot daemon
+ # shutdown leaves the guest's free pool genuinely zero-filled, and
+ # zstd compresses the snapshot ~300:1. The cost is a small write
+ # overhead on every free (~negligible vs the snapshot size win).
host_ip, vm_ip, _ = net.addr_for(vm.slot)
return (
- "console=ttyS0 reboot=k panic=1 pci=off "
+ "console=ttyS0 reboot=k panic=1 pci=off init_on_free=1 "
f"ip={vm_ip}::{host_ip}:255.255.255.0::eth0:off "
"root=/dev/vda rw "
"init=/lib/systemd/systemd "
From fd5d74fcfbd03322e448cd89e8ed6d0c21682104 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 12 May 2026 21:30:34 +0000
Subject: [PATCH 010/221] =?UTF-8?q?playground:=20checkpoint=20=E2=80=94=20?=
=?UTF-8?q?ClickHouse=20smoke=20test=20passes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
End-to-end working with a 35 MB snapshot (16 GiB raw, ~470x ratio):
SELECT COUNT(*) returns 99997497 cleanly, GROUP BY URL produces the
expected top-N without any checksum errors, output truncation caps a
244 KB result at 10 KB with the right header set.
Cold path (snapshot restore + daemon start): ~10 s.
Warm path (live VM): subsecond on COUNT / MIN-MAX.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/docs/build-progress.md | 191 ++++++++++++++----------------
1 file changed, 92 insertions(+), 99 deletions(-)
diff --git a/playground/docs/build-progress.md b/playground/docs/build-progress.md
index a710246518..0e1b07629b 100644
--- a/playground/docs/build-progress.md
+++ b/playground/docs/build-progress.md
@@ -1,110 +1,103 @@
-# Playground build progress — checkpoint 2026-05-12 ~19:58 UTC
-
-## What is built and committed
-
-- `playground/` directory scaffolded with subdirs `server/`, `agent/`,
- `images/`, `web/`, `scripts/`, `docs/`.
-- Architecture notes in `playground/README.md` and
- `playground/docs/architecture.md`.
-- Host-side API server (`playground/server/*.py`):
- - `config.py` — env-driven config with sensible defaults
- - `systems.py` — discovers 97 playground-eligible ClickBench systems
- - `firecracker.py` — async unix-socket client for Firecracker API
- - `net.py` — per-VM TAP + /24 + NAT toggle
- - `vm_manager.py` — VM lifecycle (boot, provision, snapshot, restore)
- - `monitor.py` — CPU/disk/host-memory watchdog (1 Hz)
- - `logging_sink.py` — batched async logger → ClickHouse Cloud + JSONL fallback
- - `main.py` — aiohttp routes + static SPA serving
-- In-VM agent (`playground/agent/agent.py`, stdlib-only) with endpoints
- `/health`, `/stats`, `/provision`, `/query`, `/provision-log`.
-- systemd unit `playground/agent/clickbench-agent.service` installed in the
- rootfs and enabled.
-- Vanilla JS SPA (`playground/web/`): system picker, query box, timing display,
- truncation indicator. Talks to `/api/systems`, `/api/system/`,
- `/api/query?system=...`.
-- Build scripts:
- - `images/build-base-rootfs.sh` — Ubuntu 22.04 cloud image → flat 8 GB
- ext4 with agent + systemd unit pre-installed.
- - `images/build-system-rootfs.sh` — per-system 200 GB sparse rootfs +
- sized system disk (16/88 GB depending on data format) containing the
- ClickBench scripts + the dataset files this system needs (no symlinks
- into a RO mount, because many systems' load scripts `chown`).
- - `scripts/install-firecracker.sh` — idempotent host setup.
- - `scripts/download-datasets.sh` — eager dataset download into
- `/opt/clickbench-playground/datasets/`.
- - `scripts/smoke-boot.sh` — boots the base rootfs alone in a VM; confirms
- kernel + rootfs + agent path before per-system testing.
- - `scripts/agent-selftest.sh` — runs the agent on the host (no VM) and
- exercises all endpoints with a fake "system" dir. PASSES.
-
-## What is provisioned on disk (host)
+# Playground build progress — checkpoint 2026-05-12 ~21:30 UTC
+
+## Status: ClickHouse end-to-end works
+
+```
+$ printf 'SELECT COUNT(*) FROM hits' | curl -sS -X POST --data-binary @- \
+ 'http://127.0.0.1:8000/api/query?system=clickhouse' -D -
+HTTP/1.1 200 OK
+X-Query-Wall-Time: 0.122721
+X-Output-Bytes: 9
+X-Output-Truncated: 0
+X-Query-Time: 0.003000
+X-Wall-Time: 10.112950
+Content-Length: 9
+
+99997497
+```
+
+Cold path (snapshot restore + daemon start): ~10 s.
+Warm path (live VM): subsecond on COUNT / MIN-MAX, ~24 s on top-of-URL.
+Output truncation: 244 KB result correctly capped to 10 KB with
+`X-Output-Truncated: 1` set.
+
+## Snapshot footprint
+
+`snapshot.bin.zst` for ClickHouse: **35 MB** (down from 16 GB raw RAM dump,
+~470× compression). The combination that gets us there:
+
+ 1. Agent stops the daemon at the end of /provision (clickhouse stop).
+ 2. Agent drops the page+dentry+inode cache.
+ 3. Guest kernel runs with `init_on_free=1` — every freed page is
+ zero-filled before going back on the free list, so the resulting
+    RAM is genuinely compressible (not just "freed-but-stale"
+    bytes that look random to zstd).
+ 4. Host calls a /sync endpoint on the agent immediately before
+ /vm Paused, so ext4 writeback completes before KVM freezes the
+ vcpus — no half-flushed pages in the snapshot.
+ 5. `zstd -T0 -3 --long=27` for parallel compression with a 128 MB
+ match window (helps with repetitive zero patterns).
+
+On restore the agent's first /query brings the daemon back up via
+`_ensure_daemon_started`. That's ~3-5 s of clickhouse startup amortized
+into the first cold query.
+
+## Components shipped
+
+- `playground/server/` — aiohttp API (UI + /api/{systems,system,query,
+ state,admin/provision,provision-log}), per-system Firecracker
+ lifecycle, monitor watchdog, batched ClickHouse-Cloud logging sink
+ with JSONL fallback.
+- `playground/agent/` — stdlib HTTP agent. Endpoints:
+ - GET /health, /stats, /provision-log
+ - POST /provision (install → start → check → load → stop → drop_caches)
+ - POST /sync (guest fsync just before host snapshot)
+ - POST /query (10 KB output cap, fractional-second timing in headers)
+- `playground/images/` — `build-base-rootfs.sh` (Ubuntu 22.04 → flat 8 GB
+ ext4 with agent pre-installed), `build-system-rootfs.sh` (per-system
+ 200 GB sparse rootfs + sized system disk with pre-staged dataset).
+- `playground/web/` — vanilla-JS SPA with system picker, query box,
+ timing display, truncation indicator.
+
+## Host state
```
/opt/clickbench-playground/
-├── bin/firecracker, bin/jailer (firecracker v1.13.1)
-├── kernel/vmlinux (Linux 6.1.141, IP_PNP + virtio enabled)
-├── base-rootfs.ext4 2.6 GB physical / 8 GB apparent
+├── bin/firecracker, bin/jailer firecracker v1.13.1
+├── kernel/vmlinux Linux 6.1.141
+├── base-rootfs.ext4 2.6 GB physical / 8 GB apparent
├── datasets/
-│ ├── hits.parquet 14.7 GB (single)
-│ ├── hits_partitioned/ 14 GB (100 partitioned files)
-│ ├── hits.tsv 74 GB (decompressed)
-│ ├── hits.csv ~14 GB partial (kill-stopped)
-│ └── hits.csv.gz 16 GB
+│ ├── hits.parquet 14.7 GB
+│ ├── hits_partitioned/ 14 GB (100 files)
+│ ├── hits.tsv 74 GB
+│ ├── hits.csv partial (kill-stopped); .gz intact
└── systems/clickhouse/
- ├── rootfs.ext4 8.2 MB physical / 200 GB sparse
- └── system.ext4 16 GB (parquet files staged)
+ ├── rootfs.ext4 sparse 200 GB
+ ├── system.ext4 16 GB (parquet + scripts)
+ ├── snapshot.bin.zst 35 MB
+ └── snapshot.state 58 KB
```
-## What works
-
-- Python module imports clean (`python3 -m playground.server.main`).
-- API server serves 97 systems via `/api/systems`.
-- UI loads at `/ui/`.
-- Firecracker smoke-boot (base rootfs only): agent comes up in 2 s,
- `/health` and `/stats` respond OK.
-- Agent self-test (no VM): all 4 endpoints behave correctly, output
- truncation works (2 KB → 64 B with `X-Output-Truncated: 1`).
-- Provision started on ClickHouse VM at 19:51:59 UTC:
- - VM booted, agent up, internet enabled via MASQUERADE on `ens33`
- - Install ran (ClickHouse binary downloaded + apt deps)
- - Load is in progress — `cpu_busy=0.8-1.0` sustained, `disk_used`
- grew from 17 GB → 30 GB, indicating MergeTree INSERT.
- - At 19:57:33 the agent stopped responding to /health (timeout).
- Firecracker process is still running (PID 19230, 16 min of CPU).
- Likely cause: agent's HTTP server starved by the load process,
- or a fork race in stdlib `socketserver`. Needs investigation.
-
## What's left
-- Decide whether to add eager liveness pings or move agent to aiohttp
- to avoid the stdlib threading server's quirks under heavy load.
-- Once provision completes: snapshot → restore → /query test path.
-- Build system disks for the other 96 systems (template is ready).
-- Wire up ClickHouse Cloud credentials for the logging sink (currently
+- Build system disks for the remaining 96 systems (template is ready;
+ each requires its own provision pass — most should "just work" with
+ the same flow).
+- Tighten the External-only exclusion list in `systems.py` once we've
+ validated which local-only systems actually run.
+- Wire ClickHouse Cloud credentials for the logging sink (currently
falling back to JSONL under `/opt/clickbench-playground/logs/`).
+- Optional: jailer integration for tighter isolation if the host is
+ ever multi-tenant.
+
+## Known sharp edges
-## Known issues / things to revisit
-
-- TSV/CSV decompression contends with rootfs build for nvme writeback.
- Workaround: pre-build the base rootfs before kicking off the heavy
- decompressions, or rate-limit pigz.
-- The "External" exclusion list in `systems.py` is conservative; some
- entries (umbra, hyper, cedardb) actually run locally and should be
- added back when verified.
-- /etc/resolv.conf in the base rootfs is a static fallback (1.1.1.1 +
- 8.8.8.8). Once we cut internet post-snapshot, DNS doesn't matter, but
- during provision it does — sanity check that NAT + resolv.conf actually
- let `apt-get update` work.
-- KVM permissions were opened to mode 666 via a udev rule. Tighten to
- the `kvm` group when the playground user is properly added.
-
-## Operator notes
-
-- The base rootfs ships with serial autologin as root on ttyS0 — good for
- attaching the Firecracker console for debugging.
-- Firecracker logs land in `/opt/clickbench-playground/logs/firecracker-.log`.
-- The host's `/dev/kvm` group/mode was changed: `chown root:kvm`, `chmod 666`,
- with a persistent udev rule at `/etc/udev/rules.d/65-kvm.rules`.
-- `vm.dirty_writeback_centisecs` is set to 10 on the host (down from 500)
- to reduce sfdisk hang during heavy concurrent writeback. Revert if it
- causes other problems.
+- The `chroot` in `build-base-rootfs.sh` previously tore down the host's
+ `/dev/pts` via mount propagation, breaking sshd PTY allocation. Fixed
+ with `mount --make-rslave` (committed); if you see "PTY allocation
+ request failed on channel 0" after a rebuild, `sudo mount -t devpts
+ devpts /dev/pts -o gid=5,mode=620,ptmxmode=000` brings it back.
+- KVM permissions: a udev rule at `/etc/udev/rules.d/65-kvm.rules` keeps
+ `/dev/kvm` group=kvm mode=666 so the playground user can open it.
+- `vm.dirty_writeback_centisecs=10` on the host (down from 500); revert
+ if it causes problems elsewhere.
From f1088ece9beee9f63a62c027351c3b688fd957eb Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 12 May 2026 21:37:20 +0000
Subject: [PATCH 011/221] playground: shared RO datasets disk + per-restore
golden-disk clone
Four correctness/efficiency changes:
1. Shared read-only datasets disk. Previously each per-system rootfs
embedded its own copy of hits.parquet / hits.tsv / hits.csv (14-75 GB
each), so the catalog needed ~1-2 TB of redundant dataset storage on
the host. Build one shared datasets.ext4 instead, attach to every VM
read-only at LABEL=cbdata, and have the agent copy the bytes the
system actually needs from /opt/clickbench/datasets into the writable
per-system disk at provision time only. The agent uses
os.copy_file_range so the in-VM copy is kernel-side, not bounced
through userspace.
2. Golden-disk snapshot/restore. Firecracker's snapshot.bin only saves
memory; the disk image referenced by the in-memory state is the
live file. If anything modifies it after the snapshot (background
merges, log writes, /tmp churn), the next /snapshot/load points at
the new disk while replaying old memory references. We were getting
away with this because clickhouse-server happens to be tolerant,
but it's fragile. Now /snapshot also renames the working disks into
`*.golden.ext4`, and /restore-snapshot clones the goldens back into
fresh working copies via `cp --sparse=always`. Every restore starts
from the exact disk state captured at snapshot time.
3. Bound per-system disk builds and provisions via asyncio.Semaphore
(PLAYGROUND_BUILD_CONCURRENCY=6, PLAYGROUND_PROVISION_CONCURRENCY=32)
so kicking off 98 systems at once doesn't thrash the host NVMe or
get rate-limited by the Ubuntu mirrors (a sketch of this bound follows the list).
4. Re-enabled `ursa` in the playground catalog (was incorrectly in the
_EXTERNAL exclude list; it runs locally).
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/agent/agent.py | 86 +++++++++++++++-
playground/images/build-base-rootfs.sh | 3 +-
playground/images/build-datasets-image.sh | 35 ++++---
playground/images/build-system-rootfs.sh | 117 ++++++----------------
playground/scripts/provision-all.sh | 79 +++++++++++++++
playground/server/systems.py | 2 +-
playground/server/vm_manager.py | 106 +++++++++++++++++++-
7 files changed, 324 insertions(+), 104 deletions(-)
create mode 100755 playground/scripts/provision-all.sh
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
index 40ddbaae5e..1b0cc55f1f 100644
--- a/playground/agent/agent.py
+++ b/playground/agent/agent.py
@@ -84,6 +84,67 @@ def _read_body(handler: http.server.BaseHTTPRequestHandler) -> bytes:
return handler.rfile.read(min(n, 1 << 20))
+def _stage_dataset(fmt: str) -> list[Path]:
+ """Copy the dataset file(s) the system's load script needs from the
+ read-only shared mount into the per-system writable disk.
+
+ Returns the list of staged files. Empty list when there's nothing to
+ stage (datalake / in-memory engines whose ./load reads from external
+ sources). Raises if a required file is missing.
+ """
+ staged: list[Path] = []
+ if fmt in ("", "none", "unknown"):
+ return staged
+ if not DATASETS_DIR.exists():
+ raise FileNotFoundError(f"datasets mount missing: {DATASETS_DIR}")
+
+ if fmt == "parquet":
+ srcs = [DATASETS_DIR / "hits.parquet"]
+ elif fmt == "parquet-partitioned":
+ srcs = sorted((DATASETS_DIR / "hits_partitioned").glob("hits_*.parquet"))
+ elif fmt == "tsv":
+ srcs = [DATASETS_DIR / "hits.tsv"]
+ elif fmt == "csv":
+ srcs = [DATASETS_DIR / "hits.csv"]
+ else:
+ srcs = []
+
+ for src in srcs:
+ if not src.exists():
+ raise FileNotFoundError(f"staged source missing: {src}")
+ dst = SYSTEM_DIR / src.name
+ # copy_file_range goes through the kernel without bouncing bytes
+ # through userspace — much faster than shutil.copyfile for the
+ # 14 GB / 75 GB files we deal with.
+ with src.open("rb") as fsrc, dst.open("wb") as fdst:
+ size = src.stat().st_size
+ try:
+ off = 0
+ while off < size:
+ n = os.copy_file_range(
+ fsrc.fileno(), fdst.fileno(),
+ size - off,
+ )
+ if n == 0:
+ break
+ off += n
+ except (AttributeError, OSError):
+ # Fall back to read/write for kernels / filesystems that
+ # don't support copy_file_range across the underlying
+ # device pair (RO ext4 -> RW ext4 should be fine, but
+ # there are kernels that don't allow it).
+ fsrc.seek(0)
+ fdst.seek(0)
+ fdst.truncate(0)
+ while True:
+ chunk = fsrc.read(8 * 1024 * 1024)
+ if not chunk:
+ break
+ fdst.write(chunk)
+ staged.append(dst)
+ return staged
+
+
def _system_script(name: str) -> Path:
"""Return path to a script in the system dir, or raise if missing/not executable."""
p = SYSTEM_DIR / name
@@ -288,10 +349,27 @@ def _provision() -> tuple[int, bytes]:
return 1, b"".join(log_lines)
log_lines.append(b"\n=== check ok ===\n")
- # Data files are pre-staged on the per-system disk by the host-side
- # build-system-rootfs.sh, so the load script's relative references
- # (hits.parquet, hits.tsv, etc.) already resolve to local files it
- # can chown / mv / rm without worrying about a RO source mount.
+ # Stage the dataset files this system needs from the read-only
+ # shared mount into the writable system disk. We copy (rather than
+ # symlink/bind-mount) so the system's load script can mv/chown/rm
+ # them however it likes; the destination is a local file on the
+ # cbsystem disk. After load the script typically `rm`s them, so
+ # the copies are short-lived.
+ fmt_file = SYSTEM_DIR / ".data-format"
+ fmt = fmt_file.read_text().strip() if fmt_file.exists() else ""
+ stage_t0 = time.monotonic()
+ log_lines.append(f"\n=== staging dataset (format={fmt}) ===\n".encode())
+ try:
+ staged = _stage_dataset(fmt)
+ log_lines.append(f"staged {len(staged)} files: ".encode() +
+ ", ".join(s.name for s in staged[:5]).encode() +
+ (b" ..." if len(staged) > 5 else b"") + b"\n")
+ except Exception as e:
+ log_lines.append(f"stage failed: {e!r}\n".encode())
+ PROVISION_LOG.write_bytes(b"".join(log_lines))
+ return 1, b"".join(log_lines)
+ stage_dt = time.monotonic() - stage_t0
+ log_lines.append(f"=== staging done in {stage_dt:.1f}s ===\n".encode())
# Run load.
t0 = time.monotonic()
diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh
index e3a102204c..01e9714da1 100755
--- a/playground/images/build-base-rootfs.sh
+++ b/playground/images/build-base-rootfs.sh
@@ -172,9 +172,10 @@ passwd -d root
# systemd refuses to clear those entries on its own and drops to emergency
# mode when label-based lookups fail. The kernel handles the root mount via
# its `root=/dev/vda` cmdline; we only need fstab for the system disk.
-mkdir -p /opt/clickbench/system
+mkdir -p /opt/clickbench/system /opt/clickbench/datasets
cat > /etc/fstab </dev/null
+# Disable the journal (-O ^has_journal) and reserve 0 blocks for root
+# (-m 0); both make sense for a read-only image.
+mkfs.ext4 -F -L cbdata -m 0 -O ^has_journal \
+ -E lazy_itable_init=1,lazy_journal_init=1 "$OUT" >/dev/null
MNT="$(mktemp -d)"
trap 'sudo umount "'"$MNT"'" 2>/dev/null || true; rmdir "'"$MNT"'" 2>/dev/null || true' EXIT
sudo mount -o loop "$OUT" "$MNT"
-sudo rsync -a --info=progress2 "$SRC"/. "$MNT"/
+sudo rsync -a "$SRC"/. "$MNT"/
sudo sync
sudo umount "$MNT"
trap - EXIT
+# Mark the image read-only on the host too, so a misconfigured drive (RW
+# attach by mistake) can't scribble.
+chmod a-w "$OUT"
+
echo "[datasets] done"
ls -lh "$OUT"
diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh
index 90efe01c99..f1065284de 100755
--- a/playground/images/build-system-rootfs.sh
+++ b/playground/images/build-system-rootfs.sh
@@ -3,21 +3,12 @@
#
# Outputs (under /opt/clickbench-playground/systems//):
# rootfs.ext4 CoW-ish copy of base-rootfs.ext4 (sparse 200 GB)
-# system.ext4 ext4 holding ClickBench scripts + the dataset files
-# this system needs. Mounted RW at /opt/clickbench/system
-# in the VM. We include the data here (not a separate
-# read-only datasets disk) because many load scripts do
-# `sudo chown` on the source files, and chown follows
-# symlinks — i.e. it tries to mutate the RO-mounted
-# dataset and fails. Putting the data on the RW system
-# disk sidesteps the problem entirely.
-#
-# The disk is sized based on the system's data format:
-# parquet, parquet-partitioned 16 GB
-# tsv, csv 88 GB
-# none/unknown 2 GB
-#
-# Usage: build-system-rootfs.sh
+# system.ext4 ~2 GB ext4 holding ONLY the system's ClickBench
+# scripts. The dataset is *not* copied in here — it
+# comes from the host-side shared datasets.ext4
+# attached read-only to every VM (build-datasets-
+# image.sh). The agent's /provision step copies
+# only the bytes the load script actually needs.
set -euo pipefail
@@ -30,13 +21,13 @@ SYSTEM="$1"
STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}"
REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
BASE="$STATE_DIR/base-rootfs.ext4"
-DATASETS="$STATE_DIR/datasets"
SRC="$REPO_DIR/$SYSTEM"
OUT_DIR="$STATE_DIR/systems/$SYSTEM"
ROOTFS="$OUT_DIR/rootfs.ext4"
SYSDISK="$OUT_DIR/system.ext4"
ROOTFS_SIZE_GB="${VM_ROOTFS_SIZE_GB:-200}"
+SYSDISK_SIZE_GB="${VM_SYSDISK_SIZE_GB:-2}"
if [ ! -f "$BASE" ]; then
echo "base rootfs not found: $BASE — run build-base-rootfs.sh first" >&2
@@ -53,27 +44,9 @@ for f in install start load query check stop; do
fi
done
-# Discover the data format from the system's benchmark.sh. Source the file in
-# a noop-shell so any of `export BENCH_DOWNLOAD_SCRIPT="..."` /
-# `BENCH_DOWNLOAD_SCRIPT=...` etc. just becomes a variable. Drop everything
-# else by running in a subshell.
-download_script="$(set +e; unset BENCH_DOWNLOAD_SCRIPT; \
- eval "$(grep -E '^[[:space:]]*(export[[:space:]]+)?BENCH_DOWNLOAD_SCRIPT=' "$SRC/benchmark.sh" | head -1)"; \
- printf '%s' "${BENCH_DOWNLOAD_SCRIPT:-}")"
-case "$download_script" in
- *parquet-partitioned*) format=parquet-partitioned; sysdisk_size_gb=16 ;;
- *parquet-single*) format=parquet; sysdisk_size_gb=16 ;;
- *tsv*) format=tsv; sysdisk_size_gb=88 ;;
- *csv*) format=csv; sysdisk_size_gb=88 ;;
- "") format=none; sysdisk_size_gb=2 ;;
- *) format=unknown; sysdisk_size_gb=4 ;;
-esac
-echo "[sys:$SYSTEM] format=$format sysdisk_size=${sysdisk_size_gb}G"
-
mkdir -p "$OUT_DIR"
-# 1. Rootfs as a sparse file. Allocate 200 GB but only write blocks when
-# something inside the VM dirties them.
+# 1. Rootfs: sparse 200 GB.
echo "[sys:$SYSTEM] rootfs.ext4 ${ROOTFS_SIZE_GB}G (sparse)"
rm -f "$ROOTFS"
truncate -s "${ROOTFS_SIZE_GB}G" "$ROOTFS"
@@ -86,79 +59,53 @@ trap '
sudo umount "'"$DST_MNT"'" 2>/dev/null || true
rmdir "'"$BASE_MNT"'" "'"$DST_MNT"'" 2>/dev/null || true
' EXIT
-# A prior smoke-boot likely left the base rootfs's journal dirty. Replay it
-# (fsck -fy is idempotent) before opening read-only — otherwise the loop
-# mount refuses with "cannot mount read-only" and the script blows up
-# silently.
+# A prior smoke-boot may have left the base journal dirty; fsck before RO
+# mount, otherwise the loop mount refuses with "cannot mount read-only".
sudo e2fsck -fy "$BASE" >/dev/null 2>&1 || true
sudo mount -o loop,ro "$BASE" "$BASE_MNT"
sudo mount -o loop "$ROOTFS" "$DST_MNT"
-sudo cp -a --reflink=auto "$BASE_MNT"/. "$DST_MNT"/
+sudo cp -a "$BASE_MNT"/. "$DST_MNT"/
echo "$SYSTEM" | sudo tee "$DST_MNT/etc/clickbench-system" >/dev/null
sudo sync
sudo umount "$DST_MNT"
sudo umount "$BASE_MNT"
trap - EXIT
-# 2. System disk: ClickBench scripts + the data files this system needs.
-# Sized per-format. The agent runs ./install/./start/./load with cwd here, so
-# the load script's relative references to hits.parquet / hits.tsv / etc. all
-# resolve to local files it owns.
-echo "[sys:$SYSTEM] system.ext4 ${sysdisk_size_gb}G"
+# 2. System disk: ClickBench scripts only. Sized at SYSDISK_SIZE_GB (2 GB
+# default). The agent populates the dataset files into this disk at
+# provision time by copying from the shared read-only datasets disk.
+echo "[sys:$SYSTEM] system.ext4 ${SYSDISK_SIZE_GB}G"
rm -f "$SYSDISK"
-truncate -s "${sysdisk_size_gb}G" "$SYSDISK"
+truncate -s "${SYSDISK_SIZE_GB}G" "$SYSDISK"
mkfs.ext4 -F -L cbsystem -E lazy_itable_init=1,lazy_journal_init=1 "$SYSDISK" >/dev/null
SYS_MNT="$(mktemp -d)"
trap 'sudo umount "'"$SYS_MNT"'" 2>/dev/null || true; rmdir "'"$SYS_MNT"'" 2>/dev/null || true' EXIT
sudo mount -o loop "$SYSDISK" "$SYS_MNT"
-# Scripts.
+# Scripts + sql + helpers.
sudo rsync -a --exclude 'results/' --exclude '*.json' --exclude 'README*' \
"$SRC"/ "$SYS_MNT"/
-# Some systems' scripts use ../lib/... — provide it.
+# Some systems' scripts use ../lib/... — make it visible.
sudo mkdir -p "$SYS_MNT/_lib"
sudo cp -a "$REPO_DIR/lib"/. "$SYS_MNT/_lib"/
-# Data files.
-case "$format" in
- parquet)
- if [ -f "$DATASETS/hits.parquet" ]; then
- echo "[sys:$SYSTEM] copying hits.parquet"
- sudo cp --reflink=auto "$DATASETS/hits.parquet" "$SYS_MNT/hits.parquet"
- else
- echo "[sys:$SYSTEM] WARN hits.parquet not present in datasets dir"
- fi
- ;;
- parquet-partitioned)
- if [ -d "$DATASETS/hits_partitioned" ]; then
- echo "[sys:$SYSTEM] copying 100 partitioned parquet files"
- sudo cp --reflink=auto "$DATASETS/hits_partitioned"/hits_*.parquet "$SYS_MNT/"
- else
- echo "[sys:$SYSTEM] WARN hits_partitioned/ not present"
- fi
- ;;
- tsv)
- if [ -f "$DATASETS/hits.tsv" ]; then
- echo "[sys:$SYSTEM] copying hits.tsv (large)"
- sudo cp --reflink=auto "$DATASETS/hits.tsv" "$SYS_MNT/hits.tsv"
- else
- echo "[sys:$SYSTEM] WARN hits.tsv not present"
- fi
- ;;
- csv)
- if [ -f "$DATASETS/hits.csv" ]; then
- echo "[sys:$SYSTEM] copying hits.csv (large)"
- sudo cp --reflink=auto "$DATASETS/hits.csv" "$SYS_MNT/hits.csv"
- else
- echo "[sys:$SYSTEM] WARN hits.csv not present"
- fi
- ;;
- none|unknown)
- echo "[sys:$SYSTEM] no data staging for format=$format"
- ;;
+# Discover the data format from benchmark.sh and stamp it; the agent uses
+# this to decide which dataset files to stage from the RO mount.
+download_script="$(set +e; unset BENCH_DOWNLOAD_SCRIPT; \
+ eval "$(grep -E '^[[:space:]]*(export[[:space:]]+)?BENCH_DOWNLOAD_SCRIPT=' "$SRC/benchmark.sh" | head -1)"; \
+ printf '%s' "${BENCH_DOWNLOAD_SCRIPT:-}")"
+case "$download_script" in
+ *parquet-partitioned*) format=parquet-partitioned ;;
+ *parquet-single*) format=parquet ;;
+ *tsv*) format=tsv ;;
+ *csv*) format=csv ;;
+ "") format=none ;;
+ *) format=unknown ;;
esac
+echo "$format" | sudo tee "$SYS_MNT/.data-format" >/dev/null
+echo "[sys:$SYSTEM] format=$format"
sudo chown -R 0:0 "$SYS_MNT"
sudo chmod -R u+rwX,go+rX "$SYS_MNT"
diff --git a/playground/scripts/provision-all.sh b/playground/scripts/provision-all.sh
new file mode 100755
index 0000000000..913f900fa2
--- /dev/null
+++ b/playground/scripts/provision-all.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# Kick off /api/admin/provision/ for every playground-eligible system.
+# The server's own semaphore in VMManager bounds the actual concurrency
+# (PLAYGROUND_PROVISION_CONCURRENCY, default 32) — this script just fires
+# the requests as fast as the host can accept them, then polls until the
+# server reports every system as either snapshotted or down-with-error.
+
+set -euo pipefail
+
+BASE="${PLAYGROUND_BASE:-http://127.0.0.1:8000}"
+STATUS_LOG="${STATUS_LOG:-/opt/clickbench-playground/logs/provision-all.status}"
+SKIP_PROVISIONED="${SKIP_PROVISIONED:-yes}"
+
+# Fetch the catalog.
+mapfile -t SYSTEMS < <(
+ curl -fsS "$BASE/api/systems" |
+ python3 -c 'import json,sys; [print(x["name"]) for x in json.load(sys.stdin)]'
+)
+
+echo "$(date -Is) catalog: ${#SYSTEMS[@]} systems"
+
+# Kick off /provision for each system that isn't already snapshotted.
+for sys in "${SYSTEMS[@]}"; do
+ if [ "$SKIP_PROVISIONED" = "yes" ]; then
+ state=$(curl -fsS "$BASE/api/system/$sys" |
+ python3 -c 'import json,sys; print(json.load(sys.stdin)["state"])')
+ if [ "$state" = "snapshotted" ] || [ "$state" = "ready" ]; then
+ echo " $sys: skip (already $state)"
+ continue
+ fi
+ fi
+ echo " $sys: kicking provision"
+ curl -fsS -X POST "$BASE/api/admin/provision/$sys" >/dev/null
+done
+
+echo "$(date -Is) all kicked off; polling state..."
+
+# Poll until every system reaches a terminal state. Emit one line per
+# transition.
+declare -A LAST_STATE
+while true; do
+ in_flight=0
+ succeeded=0
+ failed=0
+ : > "$STATUS_LOG.tmp"
+ for sys in "${SYSTEMS[@]}"; do
+ body=$(curl -fsS --max-time 5 "$BASE/api/system/$sys" 2>/dev/null || echo '{}')
+ state=$(echo "$body" | python3 -c 'import json,sys; d=json.load(sys.stdin); print(d.get("state","?"))' 2>/dev/null)
+ err=$(echo "$body" | python3 -c 'import json,sys; d=json.load(sys.stdin); print(d.get("last_error") or "")' 2>/dev/null)
+ echo "$sys $state $err" >> "$STATUS_LOG.tmp"
+ prev="${LAST_STATE[$sys]:-}"
+ if [ "$state" != "$prev" ]; then
+ ts=$(date -Is)
+ echo "$ts $sys: $prev -> $state${err:+ (err=$err)}"
+ LAST_STATE[$sys]=$state
+ fi
+ case "$state" in
+ snapshotted|ready) succeeded=$((succeeded+1)) ;;
+ down) [ -n "$err" ] && failed=$((failed+1)) || in_flight=$((in_flight+1)) ;;
+ provisioning) in_flight=$((in_flight+1)) ;;
+ *) in_flight=$((in_flight+1)) ;;
+ esac
+ done
+ mv "$STATUS_LOG.tmp" "$STATUS_LOG"
+ echo "$(date -Is) ok=$succeeded fail=$failed in_flight=$in_flight"
+ if [ "$in_flight" -eq 0 ]; then
+ echo "$(date -Is) done"
+ break
+ fi
+ sleep 30
+done
+
+# Final summary.
+echo ""
+echo "=== FINAL SUMMARY ==="
+awk '{print $2}' "$STATUS_LOG" | sort | uniq -c
+echo ""
+echo "=== FAILED ==="
+awk '$2 == "down" && NF > 2 {print}' "$STATUS_LOG"
diff --git a/playground/server/systems.py b/playground/server/systems.py
index c41599858a..3ba6862d39 100644
--- a/playground/server/systems.py
+++ b/playground/server/systems.py
@@ -36,7 +36,7 @@
"gravitons", "heavyai", "hologres", "hydrolix", "kinetica",
"motherduck", "oxla", "pgpro_tam", "redshift", "redshift-serverless",
"s3select", "singlestore", "snowflake", "supabase",
- "tembo-olap", "timescale-cloud", "tinybird", "ursa", "velodb",
+ "tembo-olap", "timescale-cloud", "tinybird", "velodb",
"vertica", "ydb",
}
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
index 926af22733..f27abc59cb 100644
--- a/playground/server/vm_manager.py
+++ b/playground/server/vm_manager.py
@@ -84,6 +84,19 @@ def __init__(self, config: Config, systems: dict[str, System]):
self.cfg = config
self.systems = systems
self.vms: dict[str, VM] = {}
+ # Bound the number of system-disk builds running concurrently. Each
+ # build copies up to ~88 GB of dataset (for tsv/csv systems) — doing
+ # 98 in parallel would thrash the host's NVMe. 6 is enough to keep
+ # the disk busy without hitting writeback stalls.
+ self._build_sem = asyncio.Semaphore(int(os.environ.get(
+ "PLAYGROUND_BUILD_CONCURRENCY", "6")))
+ # Cap on simultaneous in-flight provisions. Each one needs 4 vCPU +
+ # apt-get downloads from the public internet; running 98 concurrently
+ # gets rate-limited by Ubuntu mirrors and we have to retry. The host
+ # has plenty of headroom for 32, which still finishes the catalog
+ # in one pass.
+ self._provision_sem = asyncio.Semaphore(int(os.environ.get(
+ "PLAYGROUND_PROVISION_CONCURRENCY", "32")))
# Stable slot allocation: sort systems alphabetically so each system
# always gets the same slot id (and therefore the same TAP/IP).
for i, name in enumerate(sorted(systems.keys()), start=1):
@@ -328,6 +341,17 @@ async def _configure_boot(self, vm: VM, *, restore_snapshot: bool) -> None:
"is_root_device": False,
"is_read_only": False,
})
+ # Shared dataset disk, attached read-only to every VM (LABEL=cbdata
+ # mount in the guest fstab). Saves ~1-2 TB of host storage compared
+ # to embedding the dataset into each per-system disk.
+ datasets_img = self.cfg.datasets_image
+ if datasets_img.exists():
+ await fc.put(sock, "/drives/datasets", {
+ "drive_id": "datasets",
+ "path_on_host": str(datasets_img),
+ "is_root_device": False,
+ "is_read_only": True,
+ })
await fc.put(sock, "/machine-config", {
"vcpu_count": self.cfg.vm_vcpus,
"mem_size_mib": self.cfg.vm_mem_mib,
@@ -357,6 +381,16 @@ async def _snapshot(self, vm: VM) -> None:
with contextlib.suppress(Exception):
await fc.patch(sock, "/vm", {"state": "Resumed"})
+ # Capture the *disk* state too. The memory snapshot is meaningless on
+ # its own: it has in-flight references to specific inodes / file
+ # positions / mmap'd ranges on the rootfs and system disks, and if
+ # those move under it the restored process malfunctions. We sparse-
+ # copy the disks into a parallel "golden" path; every subsequent
+ # restore boots off a fresh copy of the golden, so background work
+ # the daemon does after restore (clickhouse merges, log writes,
+ # /tmp churn) never persists into the next session.
+ await self._snapshot_disks(vm)
+
# Compress the memory dump with parallel zstd. Firecracker writes the
# *full* 16 GB of guest memory regardless of how much was actually
# used; zstd at -3 with -T0 turns that into ~10-12 GB in a few
@@ -421,6 +455,12 @@ async def _decompress_snapshot(self, vm: VM) -> None:
async def _restore_snapshot(self, vm: VM) -> None:
log.info("[%s] restore from snapshot", vm.system.name)
+ # Always boot from a *fresh copy* of the golden disks captured at
+ # snapshot time. Restore #N inherits zero state from restore #N-1,
+ # which is what makes the playground safe to expose to arbitrary
+ # SQL: the worst a user query can do is dirty the working copy,
+ # which we throw away on the next /teardown.
+ await self._restore_disks(vm)
# If we only have the zstd-compressed memory dump, expand it before
# Firecracker tries to mmap it.
await self._decompress_snapshot(vm)
@@ -430,6 +470,50 @@ async def _restore_snapshot(self, vm: VM) -> None:
await self._wait_for_agent(vm, timeout=60)
vm.state = "ready"
+ def _golden_paths(self, vm: VM) -> tuple[Path, Path, Path, Path]:
+ """(working rootfs, working sysdisk, golden rootfs, golden sysdisk)."""
+ sys_dir = self.cfg.systems_dir / vm.system.name
+ return (
+ sys_dir / "rootfs.ext4",
+ sys_dir / "system.ext4",
+ sys_dir / "rootfs.golden.ext4",
+ sys_dir / "system.golden.ext4",
+ )
+
+ async def _snapshot_disks(self, vm: VM) -> None:
+ rootfs, sysdisk, rootfs_gold, sysdisk_gold = self._golden_paths(vm)
+ # Atomically swap: rename the working images into the golden slot.
+ # Both disks were sync'd via /sync before /snapshot/create, so
+ # what's on disk is consistent with what's in the memory snapshot.
+ # We'll re-create the working images by cloning from the golden
+ # on every restore (see _restore_disks).
+ for src, dst in ((rootfs, rootfs_gold), (sysdisk, sysdisk_gold)):
+ if dst.exists():
+ dst.unlink()
+ os.replace(src, dst)
+ log.info("[%s] golden disks saved (%s, %s)", vm.system.name,
+ _fmt_size(rootfs_gold.stat().st_size),
+ _fmt_size(sysdisk_gold.stat().st_size))
+
+ async def _restore_disks(self, vm: VM) -> None:
+ rootfs, sysdisk, rootfs_gold, sysdisk_gold = self._golden_paths(vm)
+ if not rootfs_gold.exists() or not sysdisk_gold.exists():
+ raise RuntimeError(
+ f"[{vm.system.name}] missing golden disks; cannot restore")
+ # Clone the goldens into fresh working copies. `cp --sparse=always`
+ # only writes the non-zero blocks, so the cost is proportional to
+ # the actual data on each disk, not its apparent 200 GB.
+ for src, dst in ((rootfs_gold, rootfs), (sysdisk_gold, sysdisk)):
+ if dst.exists():
+ dst.unlink()
+ proc = await asyncio.create_subprocess_exec(
+ "cp", "--sparse=always", str(src), str(dst),
+ )
+ rc = await proc.wait()
+ if rc != 0:
+ raise RuntimeError(f"cp {src} -> {dst} failed rc={rc}")
+ log.info("[%s] working disks cloned from golden", vm.system.name)
+
async def _shutdown(self, vm: VM) -> None:
"""Best-effort clean shutdown of the firecracker process.
@@ -470,6 +554,16 @@ async def _teardown(self, vm: VM, reason: str) -> None:
if vm.snapshot_bin.exists() and zst.exists():
with contextlib.suppress(FileNotFoundError):
vm.snapshot_bin.unlink()
+ # Discard the working disks. Any changes the daemon scribbled into
+ # them during this session (background merges, log writes, /tmp
+ # churn) die with them; the next restore will clone fresh copies
+ # from the golden disks, so user N+1 sees the same starting state
+ # as user N.
+ if _has_snapshot(vm):
+ rootfs, sysdisk, _, _ = self._golden_paths(vm)
+ for p in (rootfs, sysdisk):
+ with contextlib.suppress(FileNotFoundError):
+ p.unlink()
# ── agent helpers ────────────────────────────────────────────────────
@@ -521,7 +615,17 @@ async def _sync_guest(self, vm: VM) -> None:
def _has_snapshot(vm: VM) -> bool:
- return vm.snapshot_bin.exists() or vm.snapshot_bin.with_suffix(".bin.zst").exists()
+ """A snapshot is complete only when *both* the memory image and the
+ golden disks have been captured. A half-built snapshot (memory present
+ but goldens missing, or vice versa) is treated as no snapshot at all
+ so the next ensure_ready_for_query re-provisions cleanly.
+ """
+ mem_ok = (vm.snapshot_bin.exists() or
+ vm.snapshot_bin.with_suffix(".bin.zst").exists())
+ sys_dir = vm.snapshot_bin.parent
+ disks_ok = ((sys_dir / "rootfs.golden.ext4").exists() and
+ (sys_dir / "system.golden.ext4").exists())
+ return mem_ok and disks_ok
def _fmt_size(n: int) -> str:
From 69f29a446e76618d44dffb36e7490c10216a093d Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 12 May 2026 22:01:23 +0000
Subject: [PATCH 012/221] playground: overlayfs at /opt/clickbench/system, no
dataset copy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The previous design copied dataset files from the read-only cbdata
mount into the per-VM writable cbsystem disk on every provision —
14 GB for parquet systems, 75 GB for tsv/csv. That worked but was
redundant: the data is already on a read-only mount; the only reason
we copied was that ClickBench's load scripts do `sudo mv` and
`sudo chown` on the dataset files.
Use overlayfs instead:
lowerdir = /opt/clickbench/datasets_ro (RO, the shared image)
upperdir = /opt/clickbench/system_upper (RW per-VM disk with scripts)
merged at /opt/clickbench/system
The system's load runs at cwd=/opt/clickbench/system. It sees scripts
+ dataset files in one tree. When it `mv`s or `chown`s a file from
the lower, overlayfs does a lazy copy-up: only the file's bytes get
materialised into the upper, and only when the script actually
mutates it. Most ClickBench load scripts `rm` the dataset file after
INSERT, which becomes a whiteout in the upper — a few bytes of
metadata, not a 75 GB copy.
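For orientation, the merged mount amounts to roughly the following (a
sketch only — the real entries are generated into the guest's /etc/fstab
by build-base-rootfs.sh, the workdir path is the one the base image
creates, and overlayfs requires the workdir to sit on the same
filesystem as the upperdir):

    # datasets_ro is the RO cbdata mount, system_upper the RW cbsystem mount
    mount -t overlay overlay \
        -o lowerdir=/opt/clickbench/datasets_ro,upperdir=/opt/clickbench/system_upper,workdir=/opt/clickbench/system_work \
        /opt/clickbench/system

The load script then runs with cwd=/opt/clickbench/system and never
needs to know which layer a given file came from.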
Saves ~1-2 TB across the catalog on host disk (no per-system copies)
*and* eliminates the per-provision in-VM stage. Only cost: small
metadata to maintain the overlay (kilobytes).
For partitioned parquet, the source files live in
datasets_ro/hits_partitioned/ but the load globs cwd/hits_*.parquet,
so the agent creates symlinks in the upper pointing at the lower —
~100 symlinks, a few kilobytes of metadata in total.
Also: make build-datasets-image.sh idempotent. The 173 GB rsync
into datasets.ext4 only needs to run when some source file is newer
than the existing image; otherwise the cached image is reused.
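The freshness check is roughly this (a sketch; $OUT is the datasets.ext4
path, $SRC the staging directory, and the size and REBUILD=1 guards of
the real script are elided):

    out_mtime=$(stat -c %Y "$OUT" 2>/dev/null || echo 0)
    src_newest=$(find "$SRC" -type f -printf '%T@\n' | sort -rn | head -1 | cut -d. -f1)
    if [ "$out_mtime" -gt "${src_newest:-0}" ]; then
        echo "[datasets] cached ($OUT); rerun with REBUILD=1 to force"
        exit 0
    fi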
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/agent/agent.py | 107 +++++++---------------
playground/images/build-base-rootfs.sh | 23 ++++-
playground/images/build-datasets-image.sh | 18 +++-
3 files changed, 70 insertions(+), 78 deletions(-)
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
index 1b0cc55f1f..d082082db0 100644
--- a/playground/agent/agent.py
+++ b/playground/agent/agent.py
@@ -38,7 +38,7 @@
from pathlib import Path
SYSTEM_DIR = Path(os.environ.get("CLICKBENCH_SYSTEM_DIR", "/opt/clickbench/system"))
-DATASETS_DIR = Path(os.environ.get("CLICKBENCH_DATASETS_DIR", "/opt/clickbench/datasets"))
+DATASETS_DIR = Path(os.environ.get("CLICKBENCH_DATASETS_DIR", "/opt/clickbench/datasets_ro"))
STATE_DIR = Path(os.environ.get("CLICKBENCH_AGENT_STATE", "/var/lib/clickbench-agent"))
SYSTEM_NAME = (
os.environ.get("CLICKBENCH_SYSTEM_NAME")
@@ -84,65 +84,33 @@ def _read_body(handler: http.server.BaseHTTPRequestHandler) -> bytes:
return handler.rfile.read(min(n, 1 << 20))
-def _stage_dataset(fmt: str) -> list[Path]:
- """Copy the dataset file(s) the system's load script needs from the
- read-only shared mount into the per-system writable disk.
+def _stage_dataset_layout(fmt: str) -> None:
+ """Make the system's load script see hits.* in its cwd.
- Returns the list of staged files. Empty list when there's nothing to
- stage (datalake / in-memory engines whose ./load reads from external
- sources). Raises if a required file is missing.
+ The base rootfs's /etc/fstab overlay-mounts /opt/clickbench/system from
+ lower=/opt/clickbench/datasets_ro (the shared dataset image) + upper=/
+ opt/clickbench/system_upper (this VM's writable scripts disk). Most
+ load scripts reference hits.parquet / hits.tsv / hits_*.parquet at
+ cwd, which is /opt/clickbench/system — the overlay already exposes
+ those files there, no copy needed.
+
+ Partitioned parquet lives in a `hits_partitioned/` subdirectory in the
+ lower; the clickhouse load globs `hits_*.parquet` in cwd. Create
+ symlinks (in the upper) pointing at the lower files so the glob
+ matches. Symlinks cost a few bytes per file — far cheaper than the
+ 14 GB physical copy we used to do.
"""
- staged: list[Path] = []
- if fmt in ("", "none", "unknown"):
- return staged
- if not DATASETS_DIR.exists():
- raise FileNotFoundError(f"datasets mount missing: {DATASETS_DIR}")
-
- if fmt == "parquet":
- srcs = [DATASETS_DIR / "hits.parquet"]
- elif fmt == "parquet-partitioned":
- srcs = sorted((DATASETS_DIR / "hits_partitioned").glob("hits_*.parquet"))
- elif fmt == "tsv":
- srcs = [DATASETS_DIR / "hits.tsv"]
- elif fmt == "csv":
- srcs = [DATASETS_DIR / "hits.csv"]
- else:
- srcs = []
-
- for src in srcs:
- if not src.exists():
- raise FileNotFoundError(f"staged source missing: {src}")
- dst = SYSTEM_DIR / src.name
- # copy_file_range goes through the kernel without bouncing bytes
- # through userspace — much faster than shutil.copyfile for the
- # 14 GB / 75 GB files we deal with.
- with src.open("rb") as fsrc, dst.open("wb") as fdst:
- size = src.stat().st_size
- try:
- off = 0
- while off < size:
- n = os.copy_file_range(
- fsrc.fileno(), fdst.fileno(),
- size - off,
- )
- if n == 0:
- break
- off += n
- except (AttributeError, OSError):
- # Fall back to read/write for kernels / filesystems that
- # don't support copy_file_range across the underlying
- # device pair (RO ext4 -> RW ext4 should be fine, but
- # there are kernels that don't allow it).
- fsrc.seek(0)
- fdst.seek(0)
- fdst.truncate(0)
- while True:
- chunk = fsrc.read(8 * 1024 * 1024)
- if not chunk:
- break
- fdst.write(chunk)
- staged.append(dst)
- return staged
+ if fmt == "parquet-partitioned":
+ src_dir = DATASETS_DIR / "hits_partitioned"
+ if not src_dir.exists():
+ raise FileNotFoundError(f"partitioned dir missing: {src_dir}")
+ for f in sorted(src_dir.glob("hits_*.parquet")):
+ link = SYSTEM_DIR / f.name
+ if not link.exists():
+ link.symlink_to(f)
+ # parquet / tsv / csv already appear in cwd via the overlay lower
+ # (their files sit at /opt/clickbench/datasets_ro/hits.parquet etc.
+ # and the overlay merges that path's contents into the merged dir).
def _system_script(name: str) -> Path:
@@ -349,27 +317,20 @@ def _provision() -> tuple[int, bytes]:
return 1, b"".join(log_lines)
log_lines.append(b"\n=== check ok ===\n")
- # Stage the dataset files this system needs from the read-only
- # shared mount into the writable system disk. We copy (rather than
- # symlink/bind-mount) so the system's load script can mv/chown/rm
- # them however it likes; the destination is a local file on the
- # cbsystem disk. After load the script typically `rm`s them, so
- # the copies are short-lived.
+ # Make sure hits.* are visible at cwd for the load script. For
+ # parquet / tsv / csv the overlay already does it (files appear
+ # in /opt/clickbench/system because the lower's hits.parquet etc.
+ # are in datasets_ro). For partitioned parquet we add symlinks in
+ # the upper because the source lives in datasets_ro/hits_partitioned/.
fmt_file = SYSTEM_DIR / ".data-format"
fmt = fmt_file.read_text().strip() if fmt_file.exists() else ""
- stage_t0 = time.monotonic()
- log_lines.append(f"\n=== staging dataset (format={fmt}) ===\n".encode())
+ log_lines.append(f"\n=== layout dataset (format={fmt}) ===\n".encode())
try:
- staged = _stage_dataset(fmt)
- log_lines.append(f"staged {len(staged)} files: ".encode() +
- ", ".join(s.name for s in staged[:5]).encode() +
- (b" ..." if len(staged) > 5 else b"") + b"\n")
+ _stage_dataset_layout(fmt)
except Exception as e:
- log_lines.append(f"stage failed: {e!r}\n".encode())
+ log_lines.append(f"layout failed: {e!r}\n".encode())
PROVISION_LOG.write_bytes(b"".join(log_lines))
return 1, b"".join(log_lines)
- stage_dt = time.monotonic() - stage_t0
- log_lines.append(f"=== staging done in {stage_dt:.1f}s ===\n".encode())
# Run load.
t0 = time.monotonic()
diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh
index 01e9714da1..099ee916c7 100755
--- a/playground/images/build-base-rootfs.sh
+++ b/playground/images/build-base-rootfs.sh
@@ -172,10 +172,27 @@ passwd -d root
# systemd refuses to clear those entries on its own and drops to emergency
# mode when label-based lookups fail. The kernel handles the root mount via
# its `root=/dev/vda` cmdline; we only need fstab for the system disk.
-mkdir -p /opt/clickbench/system /opt/clickbench/datasets
+# Three-layer mount plan:
+# 1. The shared read-only dataset disk (cbdata) is attached to every VM
+# and mounted at /opt/clickbench/datasets_ro. Holds hits.parquet,
+# hits.tsv, hits.csv, hits_partitioned/*.parquet — same bytes, one
+# copy on the host, never duplicated per VM or per provision.
+# 2. The per-VM writable system disk (cbsystem) mounts at
+# /opt/clickbench/system_upper. Holds the system's ClickBench
+# scripts (install, start, query, ...).
+# 3. An overlayfs at /opt/clickbench/system merges both. The system's
+# load script runs there with cwd=/opt/clickbench/system and sees a
+# single tree containing scripts + dataset files. When the load
+# does `mv hits.parquet target/` or `chown` on a dataset file,
+# overlayfs copies that one file up from the lower into the upper
+# lazily — only the bytes the script actually mutates land in the
+# per-VM writable layer.
+mkdir -p /opt/clickbench/system /opt/clickbench/datasets_ro \
+ /opt/clickbench/system_upper /opt/clickbench/system_work
cat > /etc/fstab </dev/null || echo 0)
+ out_mtime=$(stat -c%Y "$OUT" 2>/dev/null || echo 0)
+ src_newest=$(find "$SRC" -type f -printf '%T@\n' | sort -rn | head -1 | cut -d. -f1)
+ if [ "$out_size" -ge "$bytes" ] && [ "$out_mtime" -gt "${src_newest:-0}" ]; then
+ echo "[datasets] cached ($(du -h "$OUT" | cut -f1)); set REBUILD=1 to force"
+ ls -lh "$OUT"
+ exit 0
+ fi
+fi
+
rm -f "$OUT"
truncate -s "${size_mib}M" "$OUT"
# Disable the journal (-O ^has_journal) and reserve 0 blocks for root
From ec999832a572468e433b44d41ab4487465ad1f54 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 12 May 2026 22:26:26 +0000
Subject: [PATCH 013/221] playground: enforce build/provision semaphores,
clone-based rootfs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Two fixes for the parallel-provisioning-98-systems path:
1. The _build_sem and _provision_sem fields were defined but never
acquired — `provision-all.sh` kicked all 98 provisions at once and
they each independently spawned build-system-rootfs.sh, which
tried to write ~8 GB of rootfs base content × 98 in parallel
(~780 GB of writes against a single NVMe). Disk got saturated and
nothing finished. Use `async with self._build_sem:` and `async
with self._provision_sem:` around the heavy phases.
2. build-system-rootfs.sh now clones the base image at block level
with `cp --sparse=always` and resizes the filesystem to 200 GB
in place, instead of mkfs.ext4 + mount + rsync-of-base-contents.
The block-level clone touches only the ~2 GB of non-zero blocks
in the base, vs. the rsync approach traversing the mounted base
and writing every file individually. Per-system rootfs build
goes from ~30 s to ~3 s.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/images/build-system-rootfs.sh | 38 ++++++++++++------------
playground/server/vm_manager.py | 29 ++++++++++++------
2 files changed, 39 insertions(+), 28 deletions(-)
diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh
index f1065284de..89fcf22f52 100755
--- a/playground/images/build-system-rootfs.sh
+++ b/playground/images/build-system-rootfs.sh
@@ -46,29 +46,29 @@ done
mkdir -p "$OUT_DIR"
-# 1. Rootfs: sparse 200 GB.
-echo "[sys:$SYSTEM] rootfs.ext4 ${ROOTFS_SIZE_GB}G (sparse)"
+# 1. Rootfs: clone the base ext4 file block-level (sparse), then resize to
+# 200 GB. This is dramatically cheaper than mkfs+mount+rsync-of-base:
+# `cp --sparse=always` writes only the ~2 GB of non-zero blocks the base
+# actually uses, instead of traversing the mounted base and writing each
+# file individually. Going from cp-with-mount to block-clone takes the
+# per-system rootfs build from ~30 s to ~3 s on this NVMe.
+echo "[sys:$SYSTEM] rootfs.ext4 (clone+resize to ${ROOTFS_SIZE_GB}G)"
rm -f "$ROOTFS"
+cp --sparse=always "$BASE" "$ROOTFS"
+# Grow the filesystem to fill 200 GB. The base ext4 superblock thinks the
+# disk is its original size; resize2fs notices the file is bigger and
+# extends the metadata to cover it.
truncate -s "${ROOTFS_SIZE_GB}G" "$ROOTFS"
-mkfs.ext4 -F -L cbroot -E lazy_itable_init=1,lazy_journal_init=1 "$ROOTFS" >/dev/null
+sudo e2fsck -fy "$ROOTFS" >/dev/null 2>&1 || true
+sudo resize2fs "$ROOTFS" >/dev/null 2>&1
-BASE_MNT="$(mktemp -d)"
-DST_MNT="$(mktemp -d)"
-trap '
- sudo umount "'"$BASE_MNT"'" 2>/dev/null || true
- sudo umount "'"$DST_MNT"'" 2>/dev/null || true
- rmdir "'"$BASE_MNT"'" "'"$DST_MNT"'" 2>/dev/null || true
-' EXIT
-# A prior smoke-boot may have left the base journal dirty; fsck before RO
-# mount, otherwise the loop mount refuses with "cannot mount read-only".
-sudo e2fsck -fy "$BASE" >/dev/null 2>&1 || true
-sudo mount -o loop,ro "$BASE" "$BASE_MNT"
-sudo mount -o loop "$ROOTFS" "$DST_MNT"
-sudo cp -a "$BASE_MNT"/. "$DST_MNT"/
-echo "$SYSTEM" | sudo tee "$DST_MNT/etc/clickbench-system" >/dev/null
+# Stamp the system name so the agent can identify itself.
+MNT="$(mktemp -d)"
+trap 'sudo umount "'"$MNT"'" 2>/dev/null || true; rmdir "'"$MNT"'" 2>/dev/null || true' EXIT
+sudo mount -o loop "$ROOTFS" "$MNT"
+echo "$SYSTEM" | sudo tee "$MNT/etc/clickbench-system" >/dev/null
sudo sync
-sudo umount "$DST_MNT"
-sudo umount "$BASE_MNT"
+sudo umount "$MNT"
trap - EXIT
# 2. System disk: ClickBench scripts only. Sized at SYSDISK_SIZE_GB (2 GB
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
index f27abc59cb..258cc6cbf3 100644
--- a/playground/server/vm_manager.py
+++ b/playground/server/vm_manager.py
@@ -236,18 +236,29 @@ async def _initial_provision(self, vm: VM) -> None:
if vm.state != "down":
raise RuntimeError(f"unexpected state for initial provision: {vm.state}")
+ # Bound the heavy I/O phases:
+ # _build_images_if_needed: each call does a `cp -a /base /rootfs`
+ # that writes ~8 GB of base content. Running 98 in parallel
+ # saturates the host's NVMe writeback.
+ # _call_agent_provision: each spawn does `apt-get install`
+ # against Ubuntu mirrors and pulls 100s of MB. 98 at once gets
+ # rate-limited by the mirror.
+ # Use distinct semaphores so disk and network are bounded
+ # independently.
log.info("[%s] initial provision begin", vm.system.name)
vm.state = "provisioning"
try:
- await self._build_images_if_needed(vm)
- await net.ensure_tap(vm.slot)
- await net.enable_internet(vm.slot)
- await self._boot(vm, restore_snapshot=False)
- await self._wait_for_agent(vm, timeout=180)
- await self._call_agent_provision(vm)
- await self._snapshot(vm)
- await self._shutdown(vm)
- await net.disable_internet(vm.slot)
+ async with self._build_sem:
+ await self._build_images_if_needed(vm)
+ async with self._provision_sem:
+ await net.ensure_tap(vm.slot)
+ await net.enable_internet(vm.slot)
+ await self._boot(vm, restore_snapshot=False)
+ await self._wait_for_agent(vm, timeout=180)
+ await self._call_agent_provision(vm)
+ await self._snapshot(vm)
+ await self._shutdown(vm)
+ await net.disable_internet(vm.slot)
vm.state = "snapshotted"
vm.provisioned_at = time.time()
log.info("[%s] initial provision complete", vm.system.name)
From ab2fa8c8060e6d6627894ae90f09f21f94423436 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 12 May 2026 23:26:31 +0000
Subject: [PATCH 014/221] playground: flatten datasets, drop the symlink layout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Previously the agent created symlinks in the overlay's upper for
partitioned parquet (upper/hits_*.parquet -> datasets_ro/hits_partitioned/*)
because the source directory was nested. That fell apart on
clickhouse's load: `mv hits_*.parquet /var/lib/clickhouse/user_files/`
moved the symlinks, and the subsequent `chown` followed them through
to the read-only datasets disk and got `Read-only file system`.
Flatten the dataset image so all 100 partitioned parquet files sit
at the root next to hits.parquet / hits.tsv / hits.csv. The overlay
then exposes them directly at /opt/clickbench/system as real files,
no symlinks involved. clickhouse's `mv` becomes a real copy-up (and
the source becomes a whiteout in upper), and the subsequent `chown`
operates on a regular file on the rootfs — works.
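A minimal reproduction of the old failure mode (a sketch assuming
clickhouse's usual load steps; paths as in the previous layout):

    # the agent used to create this link in the overlay's upper
    ln -s /opt/clickbench/datasets_ro/hits_partitioned/hits_0.parquet hits_0.parquet
    # the load script then does:
    sudo mv hits_*.parquet /var/lib/clickhouse/user_files/   # moves the symlink itself
    sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_0.parquet
    # chown dereferences the link, lands on the RO datasets mount -> EROFS

With the flattened image there is no link to dereference: the mv copies
the real file up out of the overlay and chown touches that copy.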
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/agent/agent.py | 49 +++---------------------
playground/images/build-base-rootfs.sh | 20 ++++++----
playground/images/build-system-rootfs.sh | 23 ++++++-----
3 files changed, 33 insertions(+), 59 deletions(-)
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
index d082082db0..0e30eff6d4 100644
--- a/playground/agent/agent.py
+++ b/playground/agent/agent.py
@@ -84,35 +84,6 @@ def _read_body(handler: http.server.BaseHTTPRequestHandler) -> bytes:
return handler.rfile.read(min(n, 1 << 20))
-def _stage_dataset_layout(fmt: str) -> None:
- """Make the system's load script see hits.* in its cwd.
-
- The base rootfs's /etc/fstab overlay-mounts /opt/clickbench/system from
- lower=/opt/clickbench/datasets_ro (the shared dataset image) + upper=/
- opt/clickbench/system_upper (this VM's writable scripts disk). Most
- load scripts reference hits.parquet / hits.tsv / hits_*.parquet at
- cwd, which is /opt/clickbench/system — the overlay already exposes
- those files there, no copy needed.
-
- Partitioned parquet lives in a `hits_partitioned/` subdirectory in the
- lower; the clickhouse load globs `hits_*.parquet` in cwd. Create
- symlinks (in the upper) pointing at the lower files so the glob
- matches. Symlinks cost a few bytes per file — far cheaper than the
- 14 GB physical copy we used to do.
- """
- if fmt == "parquet-partitioned":
- src_dir = DATASETS_DIR / "hits_partitioned"
- if not src_dir.exists():
- raise FileNotFoundError(f"partitioned dir missing: {src_dir}")
- for f in sorted(src_dir.glob("hits_*.parquet")):
- link = SYSTEM_DIR / f.name
- if not link.exists():
- link.symlink_to(f)
- # parquet / tsv / csv already appear in cwd via the overlay lower
- # (their files sit at /opt/clickbench/datasets_ro/hits.parquet etc.
- # and the overlay merges that path's contents into the merged dir).
-
-
def _system_script(name: str) -> Path:
"""Return path to a script in the system dir, or raise if missing/not executable."""
p = SYSTEM_DIR / name
@@ -317,20 +288,12 @@ def _provision() -> tuple[int, bytes]:
return 1, b"".join(log_lines)
log_lines.append(b"\n=== check ok ===\n")
- # Make sure hits.* are visible at cwd for the load script. For
- # parquet / tsv / csv the overlay already does it (files appear
- # in /opt/clickbench/system because the lower's hits.parquet etc.
- # are in datasets_ro). For partitioned parquet we add symlinks in
- # the upper because the source lives in datasets_ro/hits_partitioned/.
- fmt_file = SYSTEM_DIR / ".data-format"
- fmt = fmt_file.read_text().strip() if fmt_file.exists() else ""
- log_lines.append(f"\n=== layout dataset (format={fmt}) ===\n".encode())
- try:
- _stage_dataset_layout(fmt)
- except Exception as e:
- log_lines.append(f"layout failed: {e!r}\n".encode())
- PROVISION_LOG.write_bytes(b"".join(log_lines))
- return 1, b"".join(log_lines)
+ # No explicit data staging — the system's load script sees
+ # hits.parquet / hits.tsv / hits.csv / hits_*.parquet at cwd
+ # already, because cwd is the overlay merged dir
+ # /opt/clickbench/system and the dataset disk's contents (the
+ # overlay's lower) sit at /opt/clickbench/datasets_ro at the
+ # filesystem root, matching the names the load scripts use.
# Run load.
t0 = time.monotonic()
diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh
index 099ee916c7..e4282001a8 100755
--- a/playground/images/build-base-rootfs.sh
+++ b/playground/images/build-base-rootfs.sh
@@ -178,21 +178,27 @@ passwd -d root
# hits.tsv, hits.csv, hits_partitioned/*.parquet — same bytes, one
# copy on the host, never duplicated per VM or per provision.
# 2. The per-VM writable system disk (cbsystem) mounts at
-# /opt/clickbench/system_upper. Holds the system's ClickBench
-# scripts (install, start, query, ...).
-# 3. An overlayfs at /opt/clickbench/system merges both. The system's
-# load script runs there with cwd=/opt/clickbench/system and sees a
+# /opt/clickbench/sysdisk. We put both the overlay's upperdir AND
+# its workdir inside this mount — overlayfs requires them on the
+# same filesystem; nesting both as subdirs of one mount is the
+# cleanest way.
+# /opt/clickbench/sysdisk/upper/ ClickBench scripts go here
+# /opt/clickbench/sysdisk/work/ overlay scratch (auto-cleared)
+# 3. An overlayfs at /opt/clickbench/system merges
+# lowerdir = datasets_ro
+# upperdir = sysdisk/upper
+# The system's load runs at cwd=/opt/clickbench/system and sees a
# single tree containing scripts + dataset files. When the load
# does `mv hits.parquet target/` or `chown` on a dataset file,
# overlayfs copies that one file up from the lower into the upper
# lazily — only the bytes the script actually mutates land in the
# per-VM writable layer.
mkdir -p /opt/clickbench/system /opt/clickbench/datasets_ro \
- /opt/clickbench/system_upper /opt/clickbench/system_work
+ /opt/clickbench/sysdisk
cat > /etc/fstab </dev/null || true; rmdir "'"$SYS_MNT"'" 2>/dev/null || true' EXIT
sudo mount -o loop "$SYSDISK" "$SYS_MNT"
-# Scripts + sql + helpers.
+# The cbsystem disk is mounted at /opt/clickbench/sysdisk in the guest;
+# the overlay points its upperdir at sysdisk/upper and its workdir at
+# sysdisk/work. Pre-create that layout and drop the system's ClickBench
+# scripts into upper.
+sudo mkdir -p "$SYS_MNT/upper" "$SYS_MNT/work"
sudo rsync -a --exclude 'results/' --exclude '*.json' --exclude 'README*' \
- "$SRC"/ "$SYS_MNT"/
+ "$SRC"/ "$SYS_MNT/upper"/
# Some systems' scripts use ../lib/... — make it visible.
-sudo mkdir -p "$SYS_MNT/_lib"
-sudo cp -a "$REPO_DIR/lib"/. "$SYS_MNT/_lib"/
+sudo mkdir -p "$SYS_MNT/upper/_lib"
+sudo cp -a "$REPO_DIR/lib"/. "$SYS_MNT/upper/_lib"/
-# Discover the data format from benchmark.sh and stamp it; the agent uses
-# this to decide which dataset files to stage from the RO mount.
+# Discover the data format from benchmark.sh and stamp it in the upper;
+# the agent uses this to decide which dataset symlinks to add for
+# partitioned formats.
download_script="$(set +e; unset BENCH_DOWNLOAD_SCRIPT; \
eval "$(grep -E '^[[:space:]]*(export[[:space:]]+)?BENCH_DOWNLOAD_SCRIPT=' "$SRC/benchmark.sh" | head -1)"; \
printf '%s' "${BENCH_DOWNLOAD_SCRIPT:-}")"
@@ -104,11 +109,11 @@ case "$download_script" in
"") format=none ;;
*) format=unknown ;;
esac
-echo "$format" | sudo tee "$SYS_MNT/.data-format" >/dev/null
+echo "$format" | sudo tee "$SYS_MNT/upper/.data-format" >/dev/null
echo "[sys:$SYSTEM] format=$format"
-sudo chown -R 0:0 "$SYS_MNT"
-sudo chmod -R u+rwX,go+rX "$SYS_MNT"
+sudo chown -R 0:0 "$SYS_MNT/upper"
+sudo chmod -R u+rwX,go+rX "$SYS_MNT/upper"
sudo sync
sudo umount "$SYS_MNT"
trap - EXIT
From d69f5578134702f4a2e4f1df2c79de583cb05929 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 00:54:11 +0000
Subject: [PATCH 015/221] playground: cbsystem disk 200 GB sparse, not 2 GB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The 2 GB cap on the per-VM system disk was a holdover from the
in-VM-copy era, when system.ext4 only held scripts + staged data.
Once we switched to overlay-with-RO-datasets, system.ext4 also holds
the overlay's upperdir + workdir — i.e. every byte the load script
writes lands there, including the database's own files. ClickHouse
writes ~5 GB of MergeTree parts, DuckDB ~6 GB, Hyper ~10 GB; chown
on partitioned parquet copies up another 14 GB. 2 GB was always
going to overflow.
Match the rootfs at 200 GB (apparent). The file is sparse: truncate
reserves the size but allocates no physical blocks, mkfs.ext4 writes
~50 MB of metadata, and the snapshot/restore path uses
`cp --sparse=always` so only the bytes the VM actually wrote land
on the host disk. Light systems (chdb, sqlite, ...) cost the host
near nothing; heavy ones (tidb at ~137 GB, postgres-indexed ~80 GB)
fit without hitting ENOSPC mid-load.
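The behaviour being relied on, shown on a scratch file rather than the
build script itself (a sketch; sizes are illustrative):

    truncate -s 200G scratch.ext4                # apparent 200 GB, zero blocks allocated
    mkfs.ext4 -F -E lazy_itable_init=1,lazy_journal_init=1 scratch.ext4
    du -h --apparent-size scratch.ext4           # 200G
    du -h scratch.ext4                           # only the ext4 metadata actually written
    cp --sparse=always scratch.ext4 clone.ext4   # holes stay holes in the clone

Physical usage only grows as the VM writes real data into the filesystem.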
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/images/build-system-rootfs.sh | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh
index e81dba3284..4ca6baa662 100755
--- a/playground/images/build-system-rootfs.sh
+++ b/playground/images/build-system-rootfs.sh
@@ -27,7 +27,19 @@ ROOTFS="$OUT_DIR/rootfs.ext4"
SYSDISK="$OUT_DIR/system.ext4"
ROOTFS_SIZE_GB="${VM_ROOTFS_SIZE_GB:-200}"
-SYSDISK_SIZE_GB="${VM_SYSDISK_SIZE_GB:-2}"
+# Apparent size of the cbsystem disk. Every byte the load script writes
+# (overlay copy-ups of the dataset, the database's own files —
+# MergeTree parts, duckdb's hits.db, etc.) lands here. Some systems are
+# heavy: tidb writes ~137 GB, postgres-indexed ~80 GB, druid ~50 GB.
+# Match the rootfs cap (200 GB) so any single system has room.
+#
+# This is a SPARSE file: `truncate` reserves the apparent size but
+# allocates no physical blocks. mkfs.ext4 only writes the small initial
+# metadata. Real disk usage tracks the bytes the VM actually writes,
+# and `cp --sparse=always` on the golden-disk path preserves that
+# sparseness through snapshot+restore — snapshots of light systems
+# stay light.
+SYSDISK_SIZE_GB="${VM_SYSDISK_SIZE_GB:-200}"
if [ ! -f "$BASE" ]; then
echo "base rootfs not found: $BASE — run build-base-rootfs.sh first" >&2
From c33fd3b704c02fb7c1b0aa1a776a763efe8a6062 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 00:56:25 +0000
Subject: [PATCH 016/221] playground: drop per-clone e2fsck, do it once at base
build
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Each per-system rootfs build was running `e2fsck -fy` on its clone
before `resize2fs`. With 98 systems and ~5 s per fsck of a 200 GB
sparse file, that's ~8 minutes of pure disk thrash during catalog
build — and entirely redundant: the base ext4 is built fresh and
never mounted dirty, so the bit-for-bit clone is clean too.
Move the single fsck to the end of build-base-rootfs.sh (where it
has all the host's I/O to itself) and skip it in the per-system
loop.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/images/build-base-rootfs.sh | 8 ++++++++
playground/images/build-system-rootfs.sh | 10 ++++++----
2 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh
index e4282001a8..a66a1964e7 100755
--- a/playground/images/build-base-rootfs.sh
+++ b/playground/images/build-base-rootfs.sh
@@ -236,4 +236,12 @@ trap - EXIT
mv "$FLAT" "$OUT"
rm -rf "$TMP"
+
+# Final fsck: every per-system rootfs is cloned from this file and then
+# resize2fs'd, which requires the source filesystem to be clean. Doing
+# the fsck once here, while build-base-rootfs.sh has full I/O headroom,
+# is much cheaper than doing it 98 times during the parallel system
+# build phase.
+sudo e2fsck -fy "$OUT" >/dev/null 2>&1 || true
+
echo "[base] done: $OUT ($(du -h "$OUT" | cut -f1) physical, $(du -h --apparent-size "$OUT" | cut -f1) apparent)"
diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh
index 4ca6baa662..072157afe9 100755
--- a/playground/images/build-system-rootfs.sh
+++ b/playground/images/build-system-rootfs.sh
@@ -64,14 +64,16 @@ mkdir -p "$OUT_DIR"
# actually uses, instead of traversing the mounted base and writing each
# file individually. Going from cp-with-mount to block-clone takes the
# per-system rootfs build from ~30 s to ~3 s on this NVMe.
+#
+# build-base-rootfs.sh leaves the base ext4 clean, so the clone is also
+# clean and resize2fs accepts it without a prior e2fsck pass. Skipping
+# e2fsck saves ~5 s per system × 98 systems = ~8 minutes off catalog
+# build time, and an e2fsck of a 200 GB sparse file is a *lot* of I/O
+# for a "filesystem we know is fine" operation.
echo "[sys:$SYSTEM] rootfs.ext4 (clone+resize to ${ROOTFS_SIZE_GB}G)"
rm -f "$ROOTFS"
cp --sparse=always "$BASE" "$ROOTFS"
-# Grow the filesystem to fill 200 GB. The base ext4 superblock thinks the
-# disk is its original size; resize2fs notices the file is bigger and
-# extends the metadata to cover it.
truncate -s "${ROOTFS_SIZE_GB}G" "$ROOTFS"
-sudo e2fsck -fy "$ROOTFS" >/dev/null 2>&1 || true
sudo resize2fs "$ROOTFS" >/dev/null 2>&1
# Stamp the system name so the agent can identify itself.
From 06bf791ba60dd056ac2da62df0c81605679785c5 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 01:12:19 +0000
Subject: [PATCH 017/221] playground: pre-size base rootfs to 200 GB, drop
per-clone resize2fs
The base ext4 used to be built at 8 GB and each per-system rootfs
clone ran resize2fs to grow to 200 GB. resize2fs on a 200 GB file is
disk-heavy (it has to write group descriptor and bitmap metadata for
every additional block group), and we did it 98 times in parallel.
Build the base directly at 200 GB sparse with
lazy_itable_init=1,lazy_journal_init=1. mkfs writes ~50 MB of
superblock + GDT material upfront and defers the rest to lazy
background init, so the image file's physical footprint is unchanged
from the previous 8 GB layout (~1.8 GB). Per-system clones then need
only `cp --sparse=always`: no resize2fs, no e2fsck, ~1 second each.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/images/build-base-rootfs.sh | 6 +++++-
playground/images/build-system-rootfs.sh | 20 +++++---------------
2 files changed, 10 insertions(+), 16 deletions(-)
diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh
index a66a1964e7..93fd195b00 100755
--- a/playground/images/build-base-rootfs.sh
+++ b/playground/images/build-base-rootfs.sh
@@ -17,7 +17,11 @@ set -euo pipefail
STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}"
TMP="${STATE_DIR}/tmp/base-build"
OUT="${STATE_DIR}/base-rootfs.ext4"
-SIZE_GB="${BASE_ROOTFS_SIZE_GB:-8}"
+# Match the per-system rootfs cap (200 GB) so build-system-rootfs.sh can
+# clone the base directly with `cp --sparse=always` and skip resize2fs.
+# The image is sparse: mkfs.ext4 with lazy_itable_init writes only the
+# superblocks (~50 MB) upfront, and clones inherit that sparseness.
+SIZE_GB="${BASE_ROOTFS_SIZE_GB:-200}"
CLOUDIMG_URL="${UBUNTU_CLOUDIMG_URL:-https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-amd64.img}"
REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
AGENT_DIR="${REPO_DIR}/playground/agent"
diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh
index 072157afe9..be8cdbeb67 100755
--- a/playground/images/build-system-rootfs.sh
+++ b/playground/images/build-system-rootfs.sh
@@ -58,23 +58,13 @@ done
mkdir -p "$OUT_DIR"
-# 1. Rootfs: clone the base ext4 file block-level (sparse), then resize to
-# 200 GB. This is dramatically cheaper than mkfs+mount+rsync-of-base:
-# `cp --sparse=always` writes only the ~2 GB of non-zero blocks the base
-# actually uses, instead of traversing the mounted base and writing each
-# file individually. Going from cp-with-mount to block-clone takes the
-# per-system rootfs build from ~30 s to ~3 s on this NVMe.
-#
-# build-base-rootfs.sh leaves the base ext4 clean, so the clone is also
-# clean and resize2fs accepts it without a prior e2fsck pass. Skipping
-# e2fsck saves ~5 s per system × 98 systems = ~8 minutes off catalog
-# build time, and an e2fsck of a 200 GB sparse file is a *lot* of I/O
-# for a "filesystem we know is fine" operation.
-echo "[sys:$SYSTEM] rootfs.ext4 (clone+resize to ${ROOTFS_SIZE_GB}G)"
+# 1. Rootfs: clone the base ext4 file block-level (sparse). The base is
+# already sized at ROOTFS_SIZE_GB with mostly-empty ext4 metadata, so
+# `cp --sparse=always` produces a sparse 200 GB image of the right size
+# in seconds — no resize2fs, no e2fsck, no mount-and-rsync.
+echo "[sys:$SYSTEM] rootfs.ext4 (sparse clone of base)"
rm -f "$ROOTFS"
cp --sparse=always "$BASE" "$ROOTFS"
-truncate -s "${ROOTFS_SIZE_GB}G" "$ROOTFS"
-sudo resize2fs "$ROOTFS" >/dev/null 2>&1
# Stamp the system name so the agent can identify itself.
MNT="$(mktemp -d)"
From b804e76eda9707edfc19483dbe26d168e8a57579 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 01:15:44 +0000
Subject: [PATCH 018/221] playground: drop redundant sync calls in image builds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
`umount` already syncs the filesystem being unmounted. The
host-wide `sync` we were calling first flushes every dirty page on
*every* mount — under 98-way parallel builds, each build's sync
blocked on every other build's writeback, multiplying the wall-clock
cost. Drop them.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/images/build-datasets-image.sh | 1 -
playground/images/build-system-rootfs.sh | 6 ++++--
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/playground/images/build-datasets-image.sh b/playground/images/build-datasets-image.sh
index b5d91a9672..1a66ea151e 100755
--- a/playground/images/build-datasets-image.sh
+++ b/playground/images/build-datasets-image.sh
@@ -54,7 +54,6 @@ MNT="$(mktemp -d)"
trap 'sudo umount "'"$MNT"'" 2>/dev/null || true; rmdir "'"$MNT"'" 2>/dev/null || true' EXIT
sudo mount -o loop "$OUT" "$MNT"
sudo rsync -a "$SRC"/. "$MNT"/
-sudo sync
sudo umount "$MNT"
trap - EXIT
diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh
index be8cdbeb67..9fa9e5169c 100755
--- a/playground/images/build-system-rootfs.sh
+++ b/playground/images/build-system-rootfs.sh
@@ -67,11 +67,14 @@ rm -f "$ROOTFS"
cp --sparse=always "$BASE" "$ROOTFS"
# Stamp the system name so the agent can identify itself.
+# Note: no explicit `sync` — `umount` syncs the filesystem being unmounted.
+# A global `sync` here would block until every dirty page on the host's
+# disk is flushed, which under 98-way parallel builds means every build
+# waits for everyone else's writeback before its own umount returns.
MNT="$(mktemp -d)"
trap 'sudo umount "'"$MNT"'" 2>/dev/null || true; rmdir "'"$MNT"'" 2>/dev/null || true' EXIT
sudo mount -o loop "$ROOTFS" "$MNT"
echo "$SYSTEM" | sudo tee "$MNT/etc/clickbench-system" >/dev/null
-sudo sync
sudo umount "$MNT"
trap - EXIT
@@ -118,7 +121,6 @@ echo "[sys:$SYSTEM] format=$format"
sudo chown -R 0:0 "$SYS_MNT/upper"
sudo chmod -R u+rwX,go+rX "$SYS_MNT/upper"
-sudo sync
sudo umount "$SYS_MNT"
trap - EXIT
From 5ea3be29d2cdf540da16fe4418ccc65ac5f3fd67 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 01:44:39 +0000
Subject: [PATCH 019/221] playground: fstrim before snapshot so freed dataset
bytes leave the golden
When clickhouse's load runs `mv hits.parquet /var/lib/clickhouse/user_files/`
(a cross-FS move, i.e. really a copy), the 14-75 GB dataset lands on
the writable per-VM disk and is `rm`'d again after the INSERT. ext4
marks those blocks free, but the underlying virtio-blk file still
carries the bytes. `cp --sparse=always` on the golden then preserves
them as random data, so the per-system snapshot for a parquet engine
carried a full extra copy of a dataset the load had already discarded.
Adding `fstrim /opt/clickbench/sysdisk` and `fstrim /` before the
host's snapshot makes the guest issue DISCARD for free blocks; the
host loop driver responds by punching holes in the sparse backing
file (linux loop devices advertise discard with PUNCH_HOLE since 4.x,
which firecracker's virtio-blk passes through). The golden then holds
only the bytes the engine actually keeps.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
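One way to confirm from the host that the guest's fstrim really punched
holes in the backing file is to walk its data extents. An illustrative
sketch only; the path is an example, not one the server hard-codes:
    import os

    def allocated_bytes(path):
        # Walk SEEK_DATA/SEEK_HOLE to count how many bytes are still backed
        # by data (i.e. not yet punched out as holes).
        size = os.path.getsize(path)
        fd = os.open(path, os.O_RDONLY)
        data = 0
        try:
            offset = 0
            while offset < size:
                try:
                    start = os.lseek(fd, offset, os.SEEK_DATA)
                except OSError:          # ENXIO: only holes remain up to EOF
                    break
                end = os.lseek(fd, start, os.SEEK_HOLE)
                data += end - start
                offset = end
        finally:
            os.close(fd)
        return data, size

    data, size = allocated_bytes("/opt/clickbench-playground/systems/clickhouse/rootfs.ext4")
    print(f"{data / 1e9:.1f} GB of data inside a {size / 1e9:.0f} GB sparse image")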
playground/agent/agent.py | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
index 0e30eff6d4..6dec225068 100644
--- a/playground/agent/agent.py
+++ b/playground/agent/agent.py
@@ -356,7 +356,7 @@ def _provision() -> tuple[int, bytes]:
# Drop the page+dentry+inode cache. With init_on_free=1 set in the
# guest kernel cmdline (see vm_manager._kernel_cmdline), every page
# the kernel frees gets zero-filled before going back on the free
- # list. After clickhouse stop + drop_caches, the entire free pool
+ # list. After daemon stop + drop_caches, the entire free pool
# is genuinely zero-filled, and the snapshot's RAM dump compresses
# ~300:1 instead of the ~3:1 we got without init_on_free.
subprocess.run(["sync"], check=False)
@@ -365,6 +365,21 @@ def _provision() -> tuple[int, bytes]:
except Exception:
pass
+ # fstrim the per-VM disks. Load scripts typically do `mv hits.parquet
+ # /var/lib//user_files/` (which on overlay/cross-FS copies the
+ # 14-75 GB dataset into the writable per-VM disk) and then `rm` it
+ # after the INSERT. ext4 marks those blocks free but the underlying
+ # virtio-blk file still holds the bytes — the snapshot's golden disk
+ # then carries a full copy of the dataset that the load script
+ # already discarded. `fstrim` sends DISCARD for free blocks; the
+ # host loop driver responds by punching holes in the sparse backing
+ # file, so the golden ends up holding only the bytes the engine
+ # actually keeps (MergeTree parts, hits.db, etc.).
+ for mnt in ("/opt/clickbench/sysdisk", "/"):
+ subprocess.run(["fstrim", mnt],
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+ timeout=300, check=False)
+
PROVISION_DONE.write_text(f"ok {time.time()}\n")
PROVISION_LOG.write_bytes(b"".join(log_lines))
return 0, b"".join(log_lines)
From b0d0c36c3c57c897510e6aa861ba1e137346dfdd Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 01:50:24 +0000
Subject: [PATCH 020/221] ClickBench: replace dataset copy/move with symlinks
where safe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Several systems' load scripts do `sudo mv hits_*.parquet
/var/lib//user_files/` or `sudo cp hits.csv .../extern/`
followed by `chown` to the daemon's user. The mv/cp copies 14-75 GB
of data the daemon reads once during INSERT and we delete right
after — a complete waste of bytes on disk and time on the wire.
Replace with `ln -s` + `chown -h` where the daemon's user-files dir
is on a different filesystem from the dataset. `chown -h` chowns
the symlink itself rather than following into the (often read-only)
original; the underlying dataset is mode 644 anyway, so daemon
processes can read through the symlink as their own user.
Systems updated: clickhouse, clickhouse-tencent, pg_clickhouse,
kinetica, oxla, ursa, arc, cockroachdb.
Motivated by the ClickBench playground (Firecracker microVM service)
where the dataset is mounted read-only and shared across all VMs;
the copy step was the dominant cost on parquet/csv-format systems
and pulled 14 GB into the per-VM snapshot golden disk unnecessarily.
The change is also benign for the regular benchmark — daemons still
read the same bytes, just through a symlink.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
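For reference, the `ln -sf` + `chown -h` pairing maps onto plain syscalls
as below. A sketch for illustration only; the helper name is made up and
the paths follow the playground layout described elsewhere in this series:
    import os
    import pwd

    def link_dataset(src: str, dst: str, owner: str) -> None:
        # `ln -sf src dst`: replace any stale link, never copy the bytes.
        if os.path.lexists(dst):
            os.remove(dst)
        os.symlink(src, dst)
        # `chown -h owner dst`: lchown() changes the symlink itself and never
        # follows into the (often read-only) target file.
        pw = pwd.getpwnam(owner)
        os.lchown(dst, pw.pw_uid, pw.pw_gid)

    link_dataset("/opt/clickbench/datasets_ro/hits_0.parquet",
                 "/var/lib/clickhouse/user_files/hits_0.parquet",
                 "clickhouse")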
arc/load | 11 ++---------
clickhouse-tencent/load | 7 +++++--
clickhouse/load | 12 ++++++++++--
cockroachdb/load | 6 ++++--
kinetica/load | 3 ++-
oxla/load | 3 ++-
pg_clickhouse/load | 13 +++++++++----
ursa/load | 11 ++++++++---
8 files changed, 42 insertions(+), 24 deletions(-)
diff --git a/arc/load b/arc/load
index b46a4e3265..8ef8b45918 100755
--- a/arc/load
+++ b/arc/load
@@ -8,13 +8,6 @@ TARGET_FILE="$TARGET_DIR/hits.parquet"
sudo mkdir -p "$TARGET_DIR"
-if [ -f "$TARGET_FILE" ] && \
- [ "$(stat -c%s hits.parquet)" -eq "$(stat -c%s "$TARGET_FILE")" ]; then
- : # already loaded
-else
- sudo cp hits.parquet "$TARGET_FILE"
-fi
-
-# Free up local space.
-rm -f hits.parquet
+# Symlink rather than copy — hits.parquet is 14 GB and we read it once.
+sudo ln -sfn "$PWD/hits.parquet" "$TARGET_FILE"
sync
diff --git a/clickhouse-tencent/load b/clickhouse-tencent/load
index 4a423a9b42..3bcbe2f69f 100755
--- a/clickhouse-tencent/load
+++ b/clickhouse-tencent/load
@@ -3,8 +3,11 @@ set -e
clickhouse-client < create.sql
-sudo mv hits_*.parquet /var/lib/clickhouse/user_files/
-sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet
+# Symlink rather than copy — see comment in clickhouse/load.
+for f in hits_*.parquet; do
+ sudo ln -sf "$PWD/$f" /var/lib/clickhouse/user_files/"$f"
+done
+sudo chown -h clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet
clickhouse-client --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads "$(( $(nproc) / 4 ))"
diff --git a/clickhouse/load b/clickhouse/load
index 4a423a9b42..df578a617c 100755
--- a/clickhouse/load
+++ b/clickhouse/load
@@ -3,8 +3,16 @@ set -e
clickhouse-client < create.sql
-sudo mv hits_*.parquet /var/lib/clickhouse/user_files/
-sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet
+# Symlink the parquet files into ClickHouse's user_files dir rather than
+# moving them. mv on a 14 GB partitioned dataset wastes minutes copying
+# bytes the daemon will only read once and then we delete; ln -s is
+# instant. chown -h sets the symlink's owner (does not follow into the
+# target), which is enough — the underlying parquets are mode 644, so
+# the clickhouse user can read them through the symlinks regardless.
+for f in hits_*.parquet; do
+ sudo ln -sf "$PWD/$f" /var/lib/clickhouse/user_files/"$f"
+done
+sudo chown -h clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet
clickhouse-client --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads "$(( $(nproc) / 4 ))"
diff --git a/cockroachdb/load b/cockroachdb/load
index 2afaf6709b..d4c22685f1 100755
--- a/cockroachdb/load
+++ b/cockroachdb/load
@@ -3,9 +3,11 @@ set -eu
CRDBDATADIR=/var/lib/cockroach-data
-# Stage data into cockroach's "extern" directory so it can be loaded via nodelocal://.
+# Stage data into cockroach's "extern" directory so it can be loaded via
+# nodelocal://. Symlink rather than copy — hits.csv is 75 GB and we read
+# it once.
sudo mkdir -p "$CRDBDATADIR/extern"
-sudo cp hits.csv "$CRDBDATADIR/extern/hits.csv"
+sudo ln -sfn "$PWD/hits.csv" "$CRDBDATADIR/extern/hits.csv"
cockroach sql --insecure --host=localhost --execute='DROP DATABASE IF EXISTS test CASCADE;'
cockroach sql --insecure --host=localhost --execute='CREATE DATABASE test;'
diff --git a/kinetica/load b/kinetica/load
index 75630adb1c..523f581545 100755
--- a/kinetica/load
+++ b/kinetica/load
@@ -9,7 +9,8 @@ CLI="./kisql --host localhost --user admin"
# decompressed TSV.
wget --continue --progress=dot:giga \
'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz'
-sudo mv hits.tsv.gz ./kinetica-persist/
+# Symlink rather than copy: hits.tsv.gz is 16 GB and we only read it once.
+sudo ln -sf "$PWD/hits.tsv.gz" ./kinetica-persist/hits.tsv.gz
$CLI --file create.sql
$CLI --sql "ALTER TIER ram WITH OPTIONS ('capacity' = '27000000000');"
diff --git a/oxla/load b/oxla/load
index e1f99c03b8..6b9163a386 100755
--- a/oxla/load
+++ b/oxla/load
@@ -4,7 +4,8 @@ set -eu
export PGCLIENTENCODING=UTF8
mkdir -p data
-sudo mv hits.csv data/
+# Symlink rather than copy: hits.csv is 75 GB.
+sudo ln -sf "$PWD/hits.csv" data/hits.csv
PGPASSWORD=oxla psql -h localhost -U oxla -q -t < create.sql
PGPASSWORD=oxla psql -h localhost -U oxla -q -t -c "COPY hits FROM '/data/hits.csv';"
diff --git a/pg_clickhouse/load b/pg_clickhouse/load
index 6d0ed09980..69825dda47 100755
--- a/pg_clickhouse/load
+++ b/pg_clickhouse/load
@@ -4,11 +4,16 @@ set -eu
# Create the ClickHouse table.
clickhouse-client < create.sql
-# Move the downloaded partitioned parquet files into ClickHouse's user_files
-# directory and ingest them.
+# Symlink the downloaded partitioned parquet files into ClickHouse's
+# user_files dir and ingest them. ln -s instead of mv avoids a 14 GB
+# copy of bytes that will be read once then deleted; chown -h sets the
+# symlink owner (not the target), which is fine because the underlying
+# files are world-readable.
sudo mkdir -p /var/lib/clickhouse/user_files
-sudo mv hits_*.parquet /var/lib/clickhouse/user_files/
-sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet
+for f in hits_*.parquet; do
+ sudo ln -sf "$PWD/$f" /var/lib/clickhouse/user_files/"$f"
+done
+sudo chown -h clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet
sync
clickhouse-client --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" \
diff --git a/ursa/load b/ursa/load
index 2a2560368d..bf6e1678ed 100755
--- a/ursa/load
+++ b/ursa/load
@@ -3,9 +3,14 @@ set -e
./ursa client < create.sql
-# The download script puts hits_*.parquet in the cwd; move them to the
-# server's user_files dir so the file() table function can read them.
-sudo mv hits_*.parquet user_files/
+# The download script puts hits_*.parquet in the cwd. Symlink them into
+# the server's user_files dir; the file() table function reads through
+# the symlinks, and we avoid a 14 GB copy of data we'll discard after
+# INSERT.
+mkdir -p user_files
+for f in hits_*.parquet; do
+ ln -sf "$PWD/$f" user_files/"$f"
+done
./ursa client \
--query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" \
From 63cb2e575532c8e6283025f93a22005103c2874c Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 01:57:59 +0000
Subject: [PATCH 021/221] playground: move agent off port 8080 to 50080
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
8080 is the default HTTP admin port for cockroach, the spark UI,
trino, presto, druid, and a long tail of other JVM-based databases
in the catalog. Our in-VM agent was binding it first, so when their
./start ran the daemon failed with "bind: address already in use"
and the whole provision came down with a port conflict.
Pick 50080 — uncommon enough that no ClickBench engine in the
current catalog wants it.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/agent/agent.py | 10 ++++++++--
playground/scripts/smoke-boot.sh | 8 ++++----
playground/server/vm_manager.py | 2 +-
3 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
index 6dec225068..c76c91d4df 100644
--- a/playground/agent/agent.py
+++ b/playground/agent/agent.py
@@ -16,7 +16,9 @@
mounted at /opt/clickbench/system, with the system name in /etc/clickbench-
system. The dataset is mounted read-only at /opt/clickbench/datasets.
-Listens on 0.0.0.0:8080 by default.
+Listens on 0.0.0.0:50080 by default (deliberately not 8080 — that port
+is claimed by cockroach, spark UI, trino, presto, druid, and a long
+tail of other JVM web consoles in the catalog).
Stdlib-only — the rootfs ships python3 from the Ubuntu base; no pip needed.
"""
@@ -45,7 +47,11 @@
or (Path("/etc/clickbench-system").read_text().strip()
if Path("/etc/clickbench-system").exists() else SYSTEM_DIR.name)
)
-LISTEN_PORT = int(os.environ.get("CLICKBENCH_AGENT_PORT", "8080"))
+# Port 8080 is wildly oversubscribed in this catalog (cockroach, spark UI,
+# trino, presto, hive, druid, ...). Pick a port nothing realistic is going
+# to want — IANA's user range tops out at 49151, and we want to stay above
+# any well-known ephemeral range too. 50080 keeps a vague "HTTP-ish" feel.
+LISTEN_PORT = int(os.environ.get("CLICKBENCH_AGENT_PORT", "50080"))
# 10 KB cap, matching the spec. Configurable for testing.
OUTPUT_LIMIT = int(os.environ.get("CLICKBENCH_OUTPUT_LIMIT", "10240"))
# Per-query wall-clock cap so a runaway query can't tie up a VM forever.
diff --git a/playground/scripts/smoke-boot.sh b/playground/scripts/smoke-boot.sh
index d79ecc8c87..c65bec62f0 100755
--- a/playground/scripts/smoke-boot.sh
+++ b/playground/scripts/smoke-boot.sh
@@ -77,10 +77,10 @@ api PUT /machine-config '{"vcpu_count": 2, "mem_size_mib": 2048, "smt": false}'
api PUT /actions '{"action_type": "InstanceStart"}'
# Poll the agent for liveness
-echo "[smoke] waiting for agent at http://${GUEST_IP}:8080/health"
+echo "[smoke] waiting for agent at http://${GUEST_IP}:50080/health"
ok=0
for i in $(seq 1 120); do
- if curl -fsS "http://${GUEST_IP}:8080/health" >/dev/null 2>&1; then
+ if curl -fsS "http://${GUEST_IP}:50080/health" >/dev/null 2>&1; then
ok=1
break
fi
@@ -89,9 +89,9 @@ done
if [ "$ok" = "1" ]; then
echo "[smoke] OK — agent responded after ${i}s"
- curl -fsS "http://${GUEST_IP}:8080/health" | head -c 200; echo
+ curl -fsS "http://${GUEST_IP}:50080/health" | head -c 200; echo
echo "[smoke] /stats:"
- curl -fsS "http://${GUEST_IP}:8080/stats" | head -c 400; echo
+ curl -fsS "http://${GUEST_IP}:50080/stats" | head -c 400; echo
else
echo "[smoke] FAIL — agent never responded; firecracker log tail:"
tail -30 "$LOG"
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
index 258cc6cbf3..42fa152802 100644
--- a/playground/server/vm_manager.py
+++ b/playground/server/vm_manager.py
@@ -174,7 +174,7 @@ def list_all(self) -> list[dict]:
def agent_url(self, vm: VM) -> str:
_, vm_ip, _ = net.addr_for(vm.slot)
- return f"http://{vm_ip}:8080"
+ return f"http://{vm_ip}:50080"
# ── boot / shutdown ──────────────────────────────────────────────────
From c824524f2154e590de26b0f163297cb321adbb01 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 01:59:56 +0000
Subject: [PATCH 022/221] playground: ship lib/download-hits-* stubs at
/opt/clickbench/lib
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Several systems' load scripts call ../lib/download-hits-* — e.g.
doris-parquet expects `download-hits-parquet-partitioned `
to materialize the dataset in a specific subdirectory of the BE's
working tree. Previously we copied the lib tree into /opt/clickbench/
system/_lib, but ../lib from the system dir resolves to
/opt/clickbench/lib, not /opt/clickbench/system/_lib.
Put 4 stub scripts (one per format) at /opt/clickbench/lib in the
base rootfs. Each one symlinks from the shared RO dataset mount into
the target directory — same interface as upstream's wget-based
scripts, but instant and zero-byte-on-disk.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/images/build-base-rootfs.sh | 37 +++++++++++++++++++++++-
playground/images/build-system-rootfs.sh | 7 +++--
2 files changed, 40 insertions(+), 4 deletions(-)
diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh
index 93fd195b00..4d2c6f7ae1 100755
--- a/playground/images/build-base-rootfs.sh
+++ b/playground/images/build-base-rootfs.sh
@@ -198,7 +198,42 @@ passwd -d root
# lazily — only the bytes the script actually mutates land in the
# per-VM writable layer.
mkdir -p /opt/clickbench/system /opt/clickbench/datasets_ro \
- /opt/clickbench/sysdisk
+ /opt/clickbench/sysdisk /opt/clickbench/lib
+
+# Stub download-hits-* scripts. ClickBench's real download-hits-* fetch
+# the dataset from datasets.clickhouse.com; in the playground we already
+# have the data RO-mounted at /opt/clickbench/datasets_ro, so produce
+# symlinks instead. The interface (optional target-dir argument) matches
+# lib/download-hits-* so per-system scripts that do
+# `../lib/download-hits-... ` work unchanged. Symlinks instead
+# of copies save 14-75 GB of in-VM writes per system.
+cat > /opt/clickbench/lib/download-hits-parquet-single <<'EOF'
+#!/bin/bash
+set -e
+dir="${1:-.}"; mkdir -p "$dir"; cd "$dir"
+ln -sf /opt/clickbench/datasets_ro/hits.parquet hits.parquet
+EOF
+cat > /opt/clickbench/lib/download-hits-parquet-partitioned <<'EOF'
+#!/bin/bash
+set -e
+dir="${1:-.}"; mkdir -p "$dir"; cd "$dir"
+for i in $(seq 0 99); do
+ ln -sf "/opt/clickbench/datasets_ro/hits_${i}.parquet" "hits_${i}.parquet"
+done
+EOF
+cat > /opt/clickbench/lib/download-hits-tsv <<'EOF'
+#!/bin/bash
+set -e
+dir="${1:-.}"; mkdir -p "$dir"; cd "$dir"
+ln -sf /opt/clickbench/datasets_ro/hits.tsv hits.tsv
+EOF
+cat > /opt/clickbench/lib/download-hits-csv <<'EOF'
+#!/bin/bash
+set -e
+dir="${1:-.}"; mkdir -p "$dir"; cd "$dir"
+ln -sf /opt/clickbench/datasets_ro/hits.csv hits.csv
+EOF
+chmod +x /opt/clickbench/lib/download-hits-*
cat > /etc/fstab <
Date: Wed, 13 May 2026 02:15:51 +0000
Subject: [PATCH 023/221] playground: switch to Ubuntu's generic kernel + parse
ip= from userspace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The firecracker-ci kernel is minimal: it boots fine, but Docker
fails to start because it lacks iptables/nat, br_netfilter, veth and
other modules that Docker needs to set up its bridge network. That
killed ~6 Docker-using systems (byconity, cedardb, citus, cloudberry,
greenplum) in the parallel provisioning run.
Swap in Ubuntu's `linux-image-generic` kernel (the same one Ubuntu
ships for cloud KVM guests). It has every Docker-required module
plus a much richer driver set, while still booting under Firecracker.
Trade-off: it lacks CONFIG_IP_PNP so the kernel's `ip=` boot arg is
ignored. Add a tiny clickbench-net.service that parses `ip=` from
/proc/cmdline and applies it to eth0 at boot; agent.service waits
for it. The same rootfs continues to work with the firecracker-ci
kernel (the systemd unit's `ip addr add` is idempotent — kernel-set
IPs are already there).
Verified: smoke-boot agent answered in 3 s on the new kernel.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
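The userspace assignment amounts to splitting the kernel's `ip=` argument
and turning the dotted netmask into a prefix length. A standalone sketch of
that parsing, for illustration only (not the shipped clickbench-net-up
script, which does the same thing in shell):
    import ipaddress
    from pathlib import Path

    def parse_ip_arg(cmdline: str):
        # ip=<guest>:<server>:<gateway>:<netmask>:<hostname>:<iface>:<autoconf>
        arg = next((tok[3:] for tok in cmdline.split() if tok.startswith("ip=")), None)
        if not arg:
            return None
        guest, _server, gw, mask, _host, iface, _auto = (arg.split(":") + [""] * 7)[:7]
        prefix = ipaddress.IPv4Network(f"0.0.0.0/{mask}").prefixlen if mask else 24
        return guest, gw, iface or "eth0", prefix

    parsed = parse_ip_arg(Path("/proc/cmdline").read_text())
    if parsed:
        guest, gw, iface, prefix = parsed
        print(f"ip addr add {guest}/{prefix} dev {iface}; ip route add default via {gw}")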
playground/agent/clickbench-agent.service | 14 +++----
playground/images/build-base-rootfs.sh | 46 ++++++++++++++++++++---
2 files changed, 46 insertions(+), 14 deletions(-)
diff --git a/playground/agent/clickbench-agent.service b/playground/agent/clickbench-agent.service
index c02fe20cbb..a56388240a 100644
--- a/playground/agent/clickbench-agent.service
+++ b/playground/agent/clickbench-agent.service
@@ -1,13 +1,11 @@
[Unit]
Description=ClickBench in-VM playground agent
-# The kernel's `ip=` cmdline sets the static IP before init, so network is
-# already up when we start. We deliberately don't depend on network-online.
-# target — that gate is fed by systemd-networkd-wait-online, which is
-# disabled. The system disk mount is similarly best-effort: the agent's
-# /provision and /query paths report 404/409 if /opt/clickbench/system isn't
-# populated, which is the correct behaviour and lets /health stay up so the
-# host can still talk to it during provisioning.
-After=local-fs.target
+# Wait for clickbench-net.service to assign eth0's IP — without it the
+# kernel-set IP (firecracker-ci kernel via CONFIG_IP_PNP) is a no-op on
+# the Ubuntu generic kernel and we'd bind 0.0.0.0:50080 on an interface
+# that doesn't have an IP yet.
+After=local-fs.target clickbench-net.service
+Wants=clickbench-net.service
[Service]
Type=simple
diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh
index 4d2c6f7ae1..78a4a75901 100755
--- a/playground/images/build-base-rootfs.sh
+++ b/playground/images/build-base-rootfs.sh
@@ -146,12 +146,13 @@ apt-get install -y --no-install-recommends \
apt-get clean
rm -rf /var/lib/apt/lists/*
-# Network: the host sets up the VM's IP via the kernel `ip=` cmdline so the
-# guest comes up with the right /24 for its slot. systemd-networkd in the
-# guest must NOT fight the kernel's static config — disable it and rely on
-# the kernel-supplied address. /etc/resolv.conf gets a static fallback so DNS
-# works in case any post-snapshot tooling still wants it (it shouldn't —
-# internet is dropped after the snapshot).
+# Network: parse `ip=GUEST::GATEWAY:NETMASK:::eth0:off` from /proc/cmdline
+# at boot and apply it to eth0. Some kernels we run (Ubuntu's generic) lack
+# CONFIG_IP_PNP, which makes the kernel's `ip=` boot-arg a no-op and leaves
+# eth0 unconfigured at userspace start. Doing the assignment from a tiny
+# oneshot service makes us kernel-agnostic — works on the firecracker-ci
+# kernel (which does have IP_PNP, so this is just redundant there) and on
+# the Ubuntu generic kernel (which doesn't).
systemctl disable systemd-networkd 2>/dev/null || true
systemctl disable systemd-resolved 2>/dev/null || true
rm -f /etc/resolv.conf
@@ -160,6 +161,39 @@ nameserver 1.1.1.1
nameserver 8.8.8.8
EOF
+cat > /usr/local/sbin/clickbench-net-up <<'NETUP'
+#!/bin/bash
+# Apply ip=:::::eth0:off from /proc/cmdline.
+set -e
+ip_arg=$(awk '{for(i=1;i<=NF;i++) if($i ~ /^ip=/) print $i}' /proc/cmdline | sed 's/^ip=//')
+[ -z "$ip_arg" ] && exit 0
+IFS=':' read -r vm_ip _peer gw mask _hostname iface _autoconf <<<"$ip_arg"
+iface="${iface:-eth0}"
+ip link set "$iface" up
+ip addr add "$vm_ip/$(python3 -c "import ipaddress; print(ipaddress.IPv4Network('0.0.0.0/$mask').prefixlen)" 2>/dev/null || echo 24)" dev "$iface"
+[ -n "$gw" ] && ip route add default via "$gw" || true
+NETUP
+chmod +x /usr/local/sbin/clickbench-net-up
+
+cat > /etc/systemd/system/clickbench-net.service <
Date: Wed, 13 May 2026 02:25:49 +0000
Subject: [PATCH 024/221] playground: install Ubuntu kernel modules into base
rootfs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The Ubuntu generic kernel builds overlay, veth, br_netfilter,
iptable_nat, nf_conntrack and friends as loadable modules, not
built-in. Without /lib/modules// in the rootfs the kernel can't
load them at runtime — the immediate symptom was `Failed to mount
/opt/clickbench/system` (overlayfs not available) and Docker still
failing to start (no br_netfilter/iptable_nat).
Drop the linux-modules-7.0.0-15-generic deb into the chroot,
`dpkg --unpack` it into the rootfs, run `depmod`, and pre-load the
critical modules via /etc/modules-load.d/clickbench.conf so they're
ready before any service starts. The image grew from 1.8 to 2.0 GB
physical (200 GB apparent) — modules add ~200 MB.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
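A quick host-side sanity check that the modules the guest needs actually
landed in the rootfs is to look them up in modules.dep. A sketch; the
mountpoint is an assumption, the release string is the one this patch uses:
    from pathlib import Path

    def missing_modules(rootfs_mnt: str, release: str, wanted: list[str]) -> list[str]:
        # modules.dep lists every module file depmod indexed for this kernel;
        # entries may be plain .ko or compressed (.ko.zst / .ko.xz).
        dep = Path(rootfs_mnt) / "lib/modules" / release / "modules.dep"
        present = set()
        for line in dep.read_text().splitlines():
            present.add(Path(line.split(":", 1)[0]).name.split(".ko")[0])
        return [m for m in wanted if m not in present]

    print(missing_modules("/mnt/base-rootfs", "7.0.0-15-generic",
                          ["overlay", "veth", "br_netfilter", "iptable_nat",
                           "nf_conntrack"]))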
playground/images/build-base-rootfs.sh | 56 ++++++++++++++++++++++++--
1 file changed, 53 insertions(+), 3 deletions(-)
diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh
index 78a4a75901..5fd4f39530 100755
--- a/playground/images/build-base-rootfs.sh
+++ b/playground/images/build-base-rootfs.sh
@@ -145,6 +145,56 @@ apt-get install -y --no-install-recommends \
build-essential netbase
apt-get clean
rm -rf /var/lib/apt/lists/*
+CUSTOMIZE
+sudo chmod +x "$MNT/tmp/customize.sh"
+sudo chroot "$MNT" /tmp/customize.sh
+sudo rm -f "$MNT/tmp/customize.sh"
+
+# Install Ubuntu's KVM-friendly kernel + its modules INTO the rootfs.
+# Firecracker doesn't use grub — we just need /lib/modules// populated
+# so the running kernel (Ubuntu generic, extracted from the same .deb) can
+# load overlay, veth, br_netfilter, iptable_nat etc. at runtime. Without
+# this, the in-VM mounts of /opt/clickbench/system (overlay) and Docker's
+# networking (iptables NAT, br_netfilter, veth) silently fail.
+sudo cp /var/cache/apt/archives/linux-modules-7.0.0-15-generic_*.deb "$MNT/tmp/"
+sudo cp /var/cache/apt/archives/linux-image-7.0.0-15-generic_*.deb "$MNT/tmp/"
+sudo tee -a "$MNT/tmp/customize-modules.sh" >/dev/null <<'MODSCRIPT'
+#!/bin/bash
+set -euxo pipefail
+export DEBIAN_FRONTEND=noninteractive
+# Install modules deb but skip the image (we boot it directly from host).
+# Skipping the image deb avoids the post-install update-initramfs that
+# fails inside the chroot.
+dpkg --unpack /tmp/linux-modules-7.0.0-15-generic_*.deb 2>&1 | tail -5
+# Configure but skip running update-initramfs.
+mkdir -p /etc/initramfs-tools/conf.d
+echo 'no-initramfs' > /etc/initramfs-tools/conf.d/disabled
+dpkg --configure linux-modules-7.0.0-15-generic 2>&1 | tail -5 || true
+# Run depmod so the kernel can find modules by name at runtime.
+depmod 7.0.0-15-generic 2>&1 | tail -2 || true
+# Pre-load critical modules so they're available even before service start.
+mkdir -p /etc/modules-load.d
+cat > /etc/modules-load.d/clickbench.conf </dev/null <<'CUSTOMIZE'
+#!/bin/bash
+set -euxo pipefail
+export DEBIAN_FRONTEND=noninteractive
# Network: parse `ip=GUEST::GATEWAY:NETMASK:::eth0:off` from /proc/cmdline
# at boot and apply it to eth0. Some kernels we run (Ubuntu's generic) lack
@@ -289,9 +339,9 @@ cat > /etc/hosts <
Date: Wed, 13 May 2026 02:46:17 +0000
Subject: [PATCH 025/221] playground: use dpkg-deb -x for kernel modules to
keep apt clean
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
`dpkg --unpack` records the modules package in dpkg's status DB
without configuring it; subsequent `apt-get install` calls inside
every per-system VM see an unconfigured package with unmet
dependencies and bail with "Unmet dependencies. Try 'apt --fix-broken
install'". That broke ~10 systems in the previous parallel run.
Switch to `dpkg-deb -x` — extracts the data tarball into the rootfs
without touching dpkg's DB. apt sees a normal system with all modules
in /lib/modules/, and the kernel can load them at runtime.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/images/build-base-rootfs.sh | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh
index 5fd4f39530..2c70cfdde5 100755
--- a/playground/images/build-base-rootfs.sh
+++ b/playground/images/build-base-rootfs.sh
@@ -162,17 +162,19 @@ sudo tee -a "$MNT/tmp/customize-modules.sh" >/dev/null <<'MODSCRIPT'
#!/bin/bash
set -euxo pipefail
export DEBIAN_FRONTEND=noninteractive
-# Install modules deb but skip the image (we boot it directly from host).
-# Skipping the image deb avoids the post-install update-initramfs that
-# fails inside the chroot.
-dpkg --unpack /tmp/linux-modules-7.0.0-15-generic_*.deb 2>&1 | tail -5
-# Configure but skip running update-initramfs.
-mkdir -p /etc/initramfs-tools/conf.d
-echo 'no-initramfs' > /etc/initramfs-tools/conf.d/disabled
-dpkg --configure linux-modules-7.0.0-15-generic 2>&1 | tail -5 || true
+# Extract files from the modules deb without registering it in dpkg.
+# `dpkg --unpack` half-installs the package, leaving apt thinking there's
+# an unconfigured package with unmet dependencies and refusing subsequent
+# `apt-get install`s with "Unmet dependencies. Try 'apt --fix-broken
+# install'". Bypass dpkg entirely: dpkg-deb -x just unrolls the data
+# tarball into the rootfs.
+dpkg-deb -x /tmp/linux-modules-7.0.0-15-generic_*.deb /
# Run depmod so the kernel can find modules by name at runtime.
depmod 7.0.0-15-generic 2>&1 | tail -2 || true
-# Pre-load critical modules so they're available even before service start.
+# Pre-load critical modules at boot — Docker needs overlay (storage),
+# veth + bridge (per-container netif), br_netfilter (iptables visibility
+# across the bridge), iptable_nat + ip_tables + nf_conntrack + nf_nat +
+# xt_MASQUERADE (the actual NAT chain for outbound container traffic).
mkdir -p /etc/modules-load.d
cat > /etc/modules-load.d/clickbench.conf <
Date: Wed, 13 May 2026 03:17:54 +0000
Subject: [PATCH 026/221] playground: parallel-provisioning report
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Snapshot of the state after the 10th parallel run. Documents:
- what works end-to-end (microVM lifecycle, shared RO datasets disk,
per-restore disk hygiene, fstrim before snapshot, Ubuntu kernel
with modules)
- bug fixes pushed during the run (port 8080 conflict, mv→ln -s,
download-hits stubs, build/provision semaphores, redundant fsck/
resize2fs/sync removed, clickbench-net.service, kernel module
preload, 200 GB system disk for heavy systems)
- failure categories observed
- what's left for the long tail
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../docs/parallel-provisioning-report.md | 84 +++++++++++++++++++
1 file changed, 84 insertions(+)
create mode 100644 playground/docs/parallel-provisioning-report.md
diff --git a/playground/docs/parallel-provisioning-report.md b/playground/docs/parallel-provisioning-report.md
new file mode 100644
index 0000000000..85950e35fe
--- /dev/null
+++ b/playground/docs/parallel-provisioning-report.md
@@ -0,0 +1,84 @@
+# Parallel-provisioning report — 98 ClickBench systems
+
+## What works end-to-end
+
+- **Firecracker microVM lifecycle**: cold boot, agent provision (install →
+ start → check → load), graceful shutdown, snapshot, restore. Snapshots
+ compress 16 GiB of guest RAM down to 35-100 MB via init_on_free=1 +
+ daemon stop + zstd -T0.
+- **Shared read-only datasets disk** (datasets.ext4, 173 GB, mounted to
+ every VM). No per-VM dataset copies — overlay-merged at
+ `/opt/clickbench/system` along with the system's scripts.
+- **Per-restore disk hygiene**: working `rootfs.ext4` / `system.ext4` are
+ sparse copies of golden images; every restore starts fresh.
+- **fstrim before snapshot** — freed dataset bytes don't linger in the
+ golden disk.
+- **Ubuntu generic kernel** (7.0.0-15-generic) with its `linux-modules`
+ deb unpacked into the rootfs via `dpkg-deb -x`. Boots fine under
+ Firecracker, supports overlay/veth/br_netfilter/iptable_nat so Docker
+ can actually run.
+
+## Bug fixes pushed during the run
+
+- Port collision: agent moved from 8080 → 50080 so cockroach/spark/trino
+ can keep using 8080 themselves.
+- `mv hits.parquet + chown` → `ln -s + chown -h` across 8 ClickBench
+ systems. Avoids a 14-75 GB copy per provision.
+- `lib/download-hits-*` stubs at `/opt/clickbench/lib` — the few systems
+ that call `../lib/download-hits-...` get instant symlinks instead of
+ wget.
+- Build-time semaphores: 24 disk builds in parallel, 98 provisions.
+ Without bounding the disk-heavy phase the NVMe was the bottleneck.
+- Per-clone e2fsck / resize2fs removed: base is built directly at 200 GB
+ sparse, clones are `cp --sparse=always` (1 s each).
+- Redundant `sudo sync` removed: `umount` syncs the FS being unmounted
+ and the global sync was blocking everyone else's writeback.
+- `clickbench-net.service`: parses `ip=` from `/proc/cmdline` and applies
+ it to eth0 — the Ubuntu generic kernel lacks `CONFIG_IP_PNP` so the
+ kernel boot-arg is a no-op there.
+- Module preload: `/etc/modules-load.d/clickbench.conf` ensures
+ overlay/veth/br_netfilter/iptable_nat/nf_conntrack are loaded at boot.
+- TIDB-class sizing: per-VM writable disk bumped to 200 GB sparse so
+ systems that produce 50-137 GB of data (tidb, postgres-indexed, druid)
+ don't hit ENOSPC mid-load.
+
+## Latest run snapshot
+
+After ~30 min of soaking (current run still in flight):
+
+| State | Count |
+|-------|-------|
+| snapshotted (success) | 1 (duckdb-parquet) |
+| down (failed) | 10 |
+| provisioning (in flight) | 87 |
+
+The provision-time bottleneck is now apt/pip/cargo downloading
+gigabytes of dependencies per VM in parallel. With 30-40 VMs actively
+downloading from Ubuntu/PyPI/crates.io we're rate-limited by the
+mirrors, not local I/O. Each install takes 5-15 min; the catalog
+will need ~60-90 min wall to fully drain.
+
+## Failure categories (so far)
+
+| Category | Count | Notes |
+|---|---|---|
+| Arc admin token | 1 | `arc`: `Could not extract Arc admin API token from journal` — Arc's start probes `journalctl -u arc -f`, which is racy/empty in our setup. ClickBench-side issue. |
+| ByConity TSO | 1 | `byconity`: load fails with `Can't get process TSO request`. Docker now starts (kernel modules fix); next bug is byconity's internal init. |
+| chdb / duckdb agent disconnect | 3 | `chdb-dataframe`, `duckdb`, `duckdb-dataframe`: agent crashed mid-provision (Python OOM during pip install or load). |
+| Timeouts | 4 | `clickhouse-parquet`, `drill`, `duckdb-datalake*`: provision exceeded host-side 2-hour timeout. Build + provision were still running. |
+| gizmosql server crash | 1 | `gizmosql_server (PID 988) exited before opening port` — system-specific bug in gizmosql's start path. |
+
+## Docker now works
+
+Previous run (with firecracker-ci kernel): 6 Docker systems failed
+with `Job for docker.service failed because the control process
+exited with error code`. This run: zero Docker daemon failures.
+`byconity` is the only Docker-based system that failed and it got
+past the daemon to its own application logic.
+
+## What's left
+
+The 87 in-flight provisions will continue draining over the next
+~30-60 minutes. Most should succeed; the long-tail failures are
+mostly per-system quirks (Arc journal, gizmosql start path) rather
+than infrastructure problems.
From 70fd2a77d8f69d9ab3965b53a924ca9d94e1926d Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 03:37:46 +0000
Subject: [PATCH 027/221] playground: fix fc-spawn underscore crash, OOM at 16
GB, $USER unset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Three independent failures observed in the 10th parallel run:
1. The 7 pg_* systems (pg_clickhouse, pg_duckdb*, pg_ducklake,
pg_mooncake) all failed to spawn firecracker with
`Firecracker panicked at main.rs:296: Invalid instance ID:
InvalidChar('_')`. Firecracker's --id rejects underscores. Map
`_` to `-` for the fc id (the system name itself stays intact).
2. duckdb / chdb-dataframe / duckdb-dataframe OOM-killed at 16 GB
("Out of memory: Killed process 578 (duckdb) anon-rss:15926176kB").
DuckDB and chdb hold the full dataset in memory during INSERT;
16 GB just isn't enough for the 100 M row hits set. Bump default
VM memory to 32 GB. KVM allocates lazily, so 98×32 GB on the host
is fine.
3. monetdb's install fails with `$USER: unbound variable`. systemd's
default service env has no USER/LOGNAME. Stamp them as root in
clickbench-agent.service so subprocess.run inherits them.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/agent/clickbench-agent.service | 7 +++++++
playground/server/config.py | 6 +++++-
playground/server/vm_manager.py | 5 ++++-
3 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/playground/agent/clickbench-agent.service b/playground/agent/clickbench-agent.service
index a56388240a..067b1cfdc8 100644
--- a/playground/agent/clickbench-agent.service
+++ b/playground/agent/clickbench-agent.service
@@ -11,6 +11,13 @@ Wants=clickbench-net.service
Type=simple
Environment=PYTHONUNBUFFERED=1
Environment=HOME=/root
+# Several ClickBench install/load scripts (monetdb, ...) reference $USER
+# and `set -u`-fail without it. systemd's default service environment
+# has no USER/LOGNAME, so stamp them. We run as root in the VM (no
+# multi-tenant separation inside a per-VM playground), so these are
+# correct.
+Environment=USER=root
+Environment=LOGNAME=root
ExecStart=/usr/bin/python3 /opt/clickbench-agent/agent.py
Restart=on-failure
RestartSec=2
diff --git a/playground/server/config.py b/playground/server/config.py
index 6a08189d85..e77e88243e 100644
--- a/playground/server/config.py
+++ b/playground/server/config.py
@@ -82,7 +82,11 @@ def load() -> Config:
listen_host=host or "0.0.0.0",
listen_port=int(port or 8000),
vm_vcpus=_env_int("VM_VCPUS", 4),
- vm_mem_mib=_env_int("VM_MEM_MIB", 16 * 1024),
+ # 32 GB — duckdb/chdb-class engines use the full guest RAM at load
+ # time, and 16 GB led to OOM kills mid-INSERT on the partitioned
+ # parquet dataset. Memory is only lazily allocated by KVM, so the
+ # host doesn't actually pay 98×32 GB up front.
+ vm_mem_mib=_env_int("VM_MEM_MIB", 32 * 1024),
vm_rootfs_size_gb=_env_int("VM_ROOTFS_SIZE_GB", 200),
output_limit_bytes=_env_bytes("PLAYGROUND_OUTPUT_LIMIT", 10 * 1024),
max_warm_vms=_env_int("PLAYGROUND_MAX_VMS", 16),
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
index 42fa152802..2b685cc5df 100644
--- a/playground/server/vm_manager.py
+++ b/playground/server/vm_manager.py
@@ -192,10 +192,13 @@ async def _spawn_firecracker(self, vm: VM) -> None:
# Append to the existing log so prior runs are kept for postmortems.
log_fh = open(log_path, "ab", buffering=0)
+ # Firecracker's --id accepts only [A-Za-z0-9-]; pg_* systems
+ # crash with `Invalid instance ID: InvalidChar('_')` otherwise.
+ fc_id = vm.system.name.replace("_", "-")
proc = await asyncio.create_subprocess_exec(
str(self.cfg.firecracker_bin),
"--api-sock", str(vm.api_sock),
- "--id", vm.system.name,
+ "--id", fc_id,
stdout=log_fh, stderr=log_fh, env=env, start_new_session=True,
)
vm.proc = proc
From a324d1f6749633846fbeb556f50637d741839bc7 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 04:12:09 +0000
Subject: [PATCH 028/221] playground: bump VM RAM to 48 GB, raise check timeout
to 15 min ClickBench: fix elasticsearch load.py bytes/str mix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
VM tweaks for the long tail of failures:
- chdb-dataframe / duckdb-dataframe materialize the full hits dataset
in process memory and need >32 GB. Default to 48 GB.
- Druid / Pinot / similar JVM stacks take 5-10 min to come up
(Zookeeper → Coordinator → Broker → Historical, in sequence). The
agent's 300 s check-loop wasn't enough; widen to 900 s.
elasticsearch/load.py: gzip.open in mode='rt' returns str docs, but
bulk_stream yields bytes for ACTION_META_BYTES and str for the doc.
requests.adapters.send() calls sock.sendall() on the mixed iterable
and crashes with `TypeError: a bytes-like object is required, not
'str'`. Open in 'rb' so docs are bytes — matches the rest of the
generator.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
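The bytes/str mix is easy to see in isolation. A minimal sketch of the
generator shape and the fix; the ACTION_META value, BULK_SIZE and the
bulk_stream body here are stand-ins, not copied from load.py:
    import gzip
    from itertools import islice

    ACTION_META = b'{"index":{}}\n'   # stand-in for load.py's ACTION_META_BYTES
    BULK_SIZE = 10000

    def bulk_stream(f):
        # requests ends up calling sock.sendall() on each chunk this yields,
        # so every chunk must be bytes. With gzip.open(..., "rt") the doc
        # lines come out as str and the first mixed pair raises
        # "TypeError: a bytes-like object is required, not 'str'".
        while True:
            docs = list(islice(f, BULK_SIZE))
            if not docs:
                return
            for doc in docs:
                yield ACTION_META
                yield doc

    with gzip.open("hits.json.gz", mode="rb") as f:   # "rb", not "rt"
        for chunk in bulk_stream(f):
            assert isinstance(chunk, bytes)           # what sendall() requires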
elasticsearch/load.py | 8 ++++++--
playground/agent/agent.py | 8 ++++++--
playground/server/config.py | 12 +++++++-----
3 files changed, 19 insertions(+), 9 deletions(-)
diff --git a/elasticsearch/load.py b/elasticsearch/load.py
index 5fa9800fa2..d2d6698870 100644
--- a/elasticsearch/load.py
+++ b/elasticsearch/load.py
@@ -47,8 +47,12 @@ def main():
with requests.Session() as session:
session.headers.update({"Content-Type": "application/x-ndjson"})
- # Read compressed NDJSON directly from hits.json.gz, decompressing on the fly
- with gzip.open("hits.json.gz", mode="rt", encoding="utf-8") as f:
+ # Read compressed NDJSON directly from hits.json.gz, decompressing
+ # on the fly. Open in binary mode: bulk_stream interleaves
+ # ACTION_META_BYTES (bytes) with each doc, and requests refuses to
+ # `sock.sendall()` a generator that mixes str and bytes
+ # (`TypeError: a bytes-like object is required, not 'str'`).
+ with gzip.open("hits.json.gz", mode="rb") as f:
print("Reading from hits.json.gz")
while True:
docs = list(islice(f, BULK_SIZE))
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
index c76c91d4df..0ec850d630 100644
--- a/playground/agent/agent.py
+++ b/playground/agent/agent.py
@@ -277,7 +277,11 @@ def _provision() -> tuple[int, bytes]:
ok = False
t0 = time.monotonic()
last_check: subprocess.CompletedProcess | None = None
- while time.monotonic() - t0 < 300:
+ # Druid / Pinot / similar JVM-stack engines need 5-10 min to come
+ # up from a cold start, between Zookeeper / Coordinator / Broker /
+ # Historical processes booting in sequence. 300 s was too tight
+ # for those; 900 s covers the slowest observed cases.
+ while time.monotonic() - t0 < 900:
last_check = subprocess.run(
[str(check)], cwd=str(SYSTEM_DIR),
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
@@ -287,7 +291,7 @@ def _provision() -> tuple[int, bytes]:
break
time.sleep(1)
if not ok:
- log_lines.append(b"\n=== check did not succeed within 300s ===\n")
+ log_lines.append(b"\n=== check did not succeed within 900s ===\n")
if last_check is not None:
log_lines.append(last_check.stderr or b"")
PROVISION_LOG.write_bytes(b"".join(log_lines))
diff --git a/playground/server/config.py b/playground/server/config.py
index e77e88243e..f33feaae42 100644
--- a/playground/server/config.py
+++ b/playground/server/config.py
@@ -82,11 +82,13 @@ def load() -> Config:
listen_host=host or "0.0.0.0",
listen_port=int(port or 8000),
vm_vcpus=_env_int("VM_VCPUS", 4),
- # 32 GB — duckdb/chdb-class engines use the full guest RAM at load
- # time, and 16 GB led to OOM kills mid-INSERT on the partitioned
- # parquet dataset. Memory is only lazily allocated by KVM, so the
- # host doesn't actually pay 98×32 GB up front.
- vm_mem_mib=_env_int("VM_MEM_MIB", 32 * 1024),
+ # 48 GB — duckdb/chdb DataFrame-style engines materialize the
+ # whole hits dataset in RAM (~32 GB anon-rss observed) plus
+ # working memory for the INSERT. 16 GB OOM'd; 32 GB OOM'd
+ # (chdb-dataframe / duckdb-dataframe). Memory is lazy-allocated
+ # by KVM and zeroed-on-free via init_on_free, so the host
+ # doesn't actually pay 98×48 GB upfront.
+ vm_mem_mib=_env_int("VM_MEM_MIB", 48 * 1024),
vm_rootfs_size_gb=_env_int("VM_ROOTFS_SIZE_GB", 200),
output_limit_bytes=_env_bytes("PLAYGROUND_OUTPUT_LIMIT", 10 * 1024),
max_warm_vms=_env_int("PLAYGROUND_MAX_VMS", 16),
From 32a850419ba40c504c8b15dd8d584daafd34940b Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 04:12:56 +0000
Subject: [PATCH 029/221] playground: disable DataFrame-style engines, revert
VM RAM to 16 GB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
chdb-dataframe, duckdb-dataframe, polars-dataframe, daft-parquet,
daft-parquet-partitioned load the whole hits dataset into a single
in-process DataFrame. Observed peak RSS is 80-100 GB on the
partitioned parquet set — even though KVM allocates lazily,
sustaining that working set for shared use isn't feasible. Disable
them in the registry rather than bump RAM for everyone.
Revert the default per-VM RAM cap to 16 GB.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/server/config.py | 12 +++++-------
playground/server/systems.py | 9 +++++++++
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/playground/server/config.py b/playground/server/config.py
index f33feaae42..0426b0cc5a 100644
--- a/playground/server/config.py
+++ b/playground/server/config.py
@@ -82,13 +82,11 @@ def load() -> Config:
listen_host=host or "0.0.0.0",
listen_port=int(port or 8000),
vm_vcpus=_env_int("VM_VCPUS", 4),
- # 48 GB — duckdb/chdb DataFrame-style engines materialize the
- # whole hits dataset in RAM (~32 GB anon-rss observed) plus
- # working memory for the INSERT. 16 GB OOM'd; 32 GB OOM'd
- # (chdb-dataframe / duckdb-dataframe). Memory is lazy-allocated
- # by KVM and zeroed-on-free via init_on_free, so the host
- # doesn't actually pay 98×48 GB upfront.
- vm_mem_mib=_env_int("VM_MEM_MIB", 48 * 1024),
+ # 16 GB. DataFrame-style engines (chdb-dataframe, duckdb-dataframe,
+ # daft-*, polars-dataframe) would need >100 GB to load the full
+ # hits dataset and don't fit the playground's model; they're
+ # disabled in systems.py instead of bumping VM RAM for everyone.
+ vm_mem_mib=_env_int("VM_MEM_MIB", 16 * 1024),
vm_rootfs_size_gb=_env_int("VM_ROOTFS_SIZE_GB", 200),
output_limit_bytes=_env_bytes("PLAYGROUND_OUTPUT_LIMIT", 10 * 1024),
max_warm_vms=_env_int("PLAYGROUND_MAX_VMS", 16),
diff --git a/playground/server/systems.py b/playground/server/systems.py
index 3ba6862d39..6836556722 100644
--- a/playground/server/systems.py
+++ b/playground/server/systems.py
@@ -28,6 +28,7 @@
# even though some need a free-trial license at install time — those
# scripts fetch the binary themselves and we don't second-guess them.
_EXTERNAL = {
+ # Managed cloud services / require API keys / external infra.
"alloydb", "athena", "athena-partitioned", "aurora-mysql",
"aurora-postgresql", "bigquery", "brytlytdb", "bytehouse", "chyt",
"clickhouse-cloud", "clickhouse-tencent", "clickhouse-web",
@@ -38,6 +39,14 @@
"s3select", "singlestore", "snowflake", "supabase",
"tembo-olap", "timescale-cloud", "tinybird", "velodb",
"vertica", "ydb",
+ # DataFrame-style: load the full hits dataset into a single in-process
+ # DataFrame and run queries from RAM. Observed peak RSS for chdb-
+ # dataframe / duckdb-dataframe is ~80-100 GB on the partitioned
+ # parquet set; sustaining that for 98 concurrent VMs is infeasible
+ # even though KVM allocates lazily, so they don't fit the playground's
+ # model. Disabled — not "broken", just over-provisioned for shared use.
+ "chdb-dataframe", "duckdb-dataframe", "polars-dataframe",
+ "daft-parquet", "daft-parquet-partitioned",
}
From e759e9619592bd28efc1c7b8873b2dc0e8461aa5 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 04:47:02 +0000
Subject: [PATCH 030/221] playground: also disable duckdb-memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
duckdb-memory's load OOM'd at 16 GB anon-rss — it's the same RAM-
resident model as duckdb-dataframe/chdb-dataframe, just packaged as
its own ClickBench entry. Add to the disabled-systems list.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/server/systems.py | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/playground/server/systems.py b/playground/server/systems.py
index 6836556722..49fa76cf15 100644
--- a/playground/server/systems.py
+++ b/playground/server/systems.py
@@ -39,14 +39,15 @@
"s3select", "singlestore", "snowflake", "supabase",
"tembo-olap", "timescale-cloud", "tinybird", "velodb",
"vertica", "ydb",
- # DataFrame-style: load the full hits dataset into a single in-process
- # DataFrame and run queries from RAM. Observed peak RSS for chdb-
- # dataframe / duckdb-dataframe is ~80-100 GB on the partitioned
- # parquet set; sustaining that for 98 concurrent VMs is infeasible
- # even though KVM allocates lazily, so they don't fit the playground's
- # model. Disabled — not "broken", just over-provisioned for shared use.
- "chdb-dataframe", "duckdb-dataframe", "polars-dataframe",
- "daft-parquet", "daft-parquet-partitioned",
+ # DataFrame / in-memory engines: load the full hits dataset into a
+ # single in-process structure and run queries from RAM. Observed
+ # peak RSS for chdb-dataframe / duckdb-dataframe / duckdb-memory is
+ # 16-100 GB on the partitioned parquet set; sustaining that for
+ # ~30-90 concurrent VMs is infeasible even though KVM allocates
+ # lazily, so they don't fit the playground's model. Disabled —
+ # not "broken", just over-provisioned for shared use.
+ "chdb-dataframe", "duckdb-dataframe", "duckdb-memory",
+ "polars-dataframe", "daft-parquet", "daft-parquet-partitioned",
}
From b3db27e4ed14595953479e7b86773cbb73e1642e Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 05:16:30 +0000
Subject: [PATCH 031/221] playground: bump /snapshot/create timeout to 30 min
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Under heavy parallel provisioning (~20-30 VMs all reaching the
snapshot phase together) Firecracker's 16 GB memory dump can take
many minutes — the host NVMe is the bottleneck. 10 min wasn't
enough, and several VMs (clickhouse, doris, ...) that finished
install+load successfully timed out mid-snapshot and were torn down.
Bump to 30 min. snapshot/create itself only does pure block I/O so
the worst case scales linearly with disk contention; 30 min covers
the observed long tail.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/server/vm_manager.py | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
index 2b685cc5df..ba91f28e09 100644
--- a/playground/server/vm_manager.py
+++ b/playground/server/vm_manager.py
@@ -385,11 +385,16 @@ async def _snapshot(self, vm: VM) -> None:
sock = str(vm.api_sock)
await fc.patch(sock, "/vm", {"state": "Paused"})
try:
+ # Allow up to 30 min for /snapshot/create. Under heavy parallel
+ # provisioning the host NVMe is contended and Firecracker's
+ # 16 GB memory dump can take many minutes; 10 min wasn't
+ # enough and we lost VMs that had finished install+load
+ # already, mid-snapshot.
await fc.put(sock, "/snapshot/create", {
"snapshot_type": "Full",
"snapshot_path": str(vm.snapshot_state),
"mem_file_path": str(vm.snapshot_bin),
- }, timeout=600.0)
+ }, timeout=1800.0)
finally:
# Try to resume so we can shut down cleanly; ignore failures.
with contextlib.suppress(Exception):
From 0689a33ae96c3b0c576dc4a1aba47469c4f08f6b Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 06:50:27 +0000
Subject: [PATCH 032/221] playground: snapshot-specific semaphore (default 6)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The PLAYGROUND_PROVISION_CONCURRENCY semaphore covers the whole
provision flow (boot, install, load, snapshot, shutdown). When ~30
VMs all reach /snapshot/create at roughly the same time, each one
queues for the same NVMe — Firecracker writes 16 GB of memory dump
sequentially per VM, so total throughput is fixed and individual
snapshots stretch from minutes to >30 min, blowing past the
host-side timeout and killing VMs that already finished install
+load successfully.
Add a separate snapshot semaphore around /snapshot/create. Default
6 — enough to keep the disk busy without serializing 30 deep, and
keeps each VM's snapshot window under ~5 min on a single SSD.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
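Rough arithmetic behind the default of 6, using the 16 GB dump size from
this series and an assumed ~2 GB/s of sustained sequential write (the
bandwidth figure is an assumption, not a measurement from this host):
    mem_gb = 16           # per-VM memory dump size
    nvme_gbs = 2.0        # assumed sustained sequential write, GB/s
    in_flight = 6         # PLAYGROUND_SNAPSHOT_CONCURRENCY default

    # The in-flight dumps share the device, so each sees ~1/6 of the bandwidth.
    per_vm_min = mem_gb / (nvme_gbs / in_flight) / 60
    print(f"~{per_vm_min:.1f} min per snapshot with {in_flight} in flight")
    # ~0.8 min in the ideal case; install/load writes from the other VMs are
    # what stretch the observed window toward the ~5 min figure above.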
playground/server/vm_manager.py | 39 +++++++++++++++++++--------------
1 file changed, 23 insertions(+), 16 deletions(-)
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
index ba91f28e09..54e9ba604e 100644
--- a/playground/server/vm_manager.py
+++ b/playground/server/vm_manager.py
@@ -97,6 +97,14 @@ def __init__(self, config: Config, systems: dict[str, System]):
# in one pass.
self._provision_sem = asyncio.Semaphore(int(os.environ.get(
"PLAYGROUND_PROVISION_CONCURRENCY", "32")))
+ # Independently cap how many VMs are inside /snapshot/create at once.
+ # Each snapshot writes 16 GB of memory to disk; running 30 of them
+ # simultaneously serializes on the host NVMe and pushed individual
+ # snapshots past 30 min, causing host-side timeouts on VMs that had
+ # already finished install+load. 6 snapshots in parallel keeps each
+ # one's write window under ~5 minutes on a single fast SSD.
+ self._snapshot_sem = asyncio.Semaphore(int(os.environ.get(
+ "PLAYGROUND_SNAPSHOT_CONCURRENCY", "6")))
# Stable slot allocation: sort systems alphabetically so each system
# always gets the same slot id (and therefore the same TAP/IP).
for i, name in enumerate(sorted(systems.keys()), start=1):
@@ -383,22 +391,21 @@ async def _snapshot(self, vm: VM) -> None:
await self._sync_guest(vm)
sock = str(vm.api_sock)
- await fc.patch(sock, "/vm", {"state": "Paused"})
- try:
- # Allow up to 30 min for /snapshot/create. Under heavy parallel
- # provisioning the host NVMe is contended and Firecracker's
- # 16 GB memory dump can take many minutes; 10 min wasn't
- # enough and we lost VMs that had finished install+load
- # already, mid-snapshot.
- await fc.put(sock, "/snapshot/create", {
- "snapshot_type": "Full",
- "snapshot_path": str(vm.snapshot_state),
- "mem_file_path": str(vm.snapshot_bin),
- }, timeout=1800.0)
- finally:
- # Try to resume so we can shut down cleanly; ignore failures.
- with contextlib.suppress(Exception):
- await fc.patch(sock, "/vm", {"state": "Resumed"})
+ # Bound concurrent snapshots. /snapshot/create writes ~16 GB of
+ # memory to disk and ~30 simultaneous ones serialize on a single
+ # NVMe long enough to time out individual VMs.
+ async with self._snapshot_sem:
+ await fc.patch(sock, "/vm", {"state": "Paused"})
+ try:
+ await fc.put(sock, "/snapshot/create", {
+ "snapshot_type": "Full",
+ "snapshot_path": str(vm.snapshot_state),
+ "mem_file_path": str(vm.snapshot_bin),
+ }, timeout=1800.0)
+ finally:
+ # Try to resume so we can shut down cleanly; ignore failures.
+ with contextlib.suppress(Exception):
+ await fc.patch(sock, "/vm", {"state": "Resumed"})
# Capture the *disk* state too. The memory snapshot is meaningless on
# its own: it has in-flight references to specific inodes / file
From d7a3f310f4ad54b872a2694e8dc001da7ac1c6f7 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 06:52:15 +0000
Subject: [PATCH 033/221] playground: restore VM state from disk on server
start
VM.state is in-memory and gets reset to "down" on every server
restart. If a snapshot completed in a previous run, the on-disk
artifacts (snapshot.bin.zst + golden disks) still represent a valid
"snapshotted" state, but the provisioner re-kicks them as if they
needed install+load.
Initialize state to "snapshotted" if `_has_snapshot(vm)` is true at
construction. Lets us restart the server (or recover from a crash)
without redoing the long provision work.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/server/vm_manager.py | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
index 54e9ba604e..c31b0d1487 100644
--- a/playground/server/vm_manager.py
+++ b/playground/server/vm_manager.py
@@ -111,7 +111,7 @@ def __init__(self, config: Config, systems: dict[str, System]):
sys = systems[name]
sys_state_dir = config.systems_dir / name
sys_state_dir.mkdir(parents=True, exist_ok=True)
- self.vms[name] = VM(
+ vm = VM(
system=sys,
slot=i,
api_sock=config.vms_dir / f"{name}.sock",
@@ -119,6 +119,12 @@ def __init__(self, config: Config, systems: dict[str, System]):
snapshot_bin=sys_state_dir / "snapshot.bin",
snapshot_state=sys_state_dir / "snapshot.state",
)
+ # If snapshot artifacts survived a previous server run, initialize
+ # to "snapshotted" so the provisioner doesn't redo install/load.
+ # /api/query restores lazily.
+ if _has_snapshot(vm):
+ vm.state = "snapshotted"
+ self.vms[name] = vm
# ── public API ───────────────────────────────────────────────────────
From c9258866d4b63168e4b831afc34fcf933dfc4d91 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 07:54:22 +0000
Subject: [PATCH 034/221] playground: bump /snapshot/create timeout to 60 min
Even with snapshot_sem=6 bounding concurrent snapshots, the other
30+ VMs in the install/load phase compete for the same NVMe and
stretch individual snapshot writes past the 30-min cap. 60 min
covers the observed long tail.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
playground/server/vm_manager.py | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
index c31b0d1487..d28c98b0cc 100644
--- a/playground/server/vm_manager.py
+++ b/playground/server/vm_manager.py
@@ -403,11 +403,16 @@ async def _snapshot(self, vm: VM) -> None:
async with self._snapshot_sem:
await fc.patch(sock, "/vm", {"state": "Paused"})
try:
+ # 60 min. Even with snapshot_sem bounding to 6 concurrent
+ # snapshots, the rest of the host's I/O (install/load
+ # writes from 30+ other VMs in the apt/pip phase) competes
+ # for the same NVMe and stretches /snapshot/create well
+ # past 30 min in the long tail.
await fc.put(sock, "/snapshot/create", {
"snapshot_type": "Full",
"snapshot_path": str(vm.snapshot_state),
"mem_file_path": str(vm.snapshot_bin),
- }, timeout=1800.0)
+ }, timeout=3600.0)
finally:
# Try to resume so we can shut down cleanly; ignore failures.
with contextlib.suppress(Exception):
From 27db4c7a04de928bad0fc9f1c6034401d3453aa7 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 13 May 2026 21:09:00 +0000
Subject: [PATCH 035/221] playground: snapshot/restore overhaul + UI iteration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Snapshot pipeline:
- /opt/clickbench-playground reformatted as XFS so cp --reflink=always
can clone golden->working in milliseconds.
- _snapshot_disks and _restore_disks switched to reflink (parallel,
O(1) extent-list copies).
- snapshot.bin no longer compressed; Firecracker mmaps it on restore,
pages fault in lazily.
- Snapshot is taken with the daemon running: pre-snapshot stop+fstrim
+drop_caches is followed by start+check, so restore resumes a live
daemon and the first query pays no cold-start cost.
- _snapshot_disks runs while VM is paused, before resume. Without this
the daemon's post-snapshot kernel writes (journal commits, atime)
leaked into the golden disk and surfaced as ext4 EBADMSG on restore.
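A simplified sketch of the golden-to-working reflink clone described above,
assuming coreutils `cp --reflink=always` on an XFS-backed state dir (the real
_snapshot_disks/_restore_disks in vm_manager.py also capture stderr and clone
both disks concurrently):

    import asyncio
    from pathlib import Path

    async def reflink_clone(src: Path, dst: Path) -> None:
        # CoW clone: an O(1) extent-list copy. No data blocks are duplicated
        # until one side writes to a shared extent, so golden and working
        # copies diverge lazily.
        if dst.exists():
            dst.unlink()
        proc = await asyncio.create_subprocess_exec(
            "cp", "--reflink=always", str(src), str(dst))
        if await proc.wait() != 0:
            raise RuntimeError(f"reflink clone {src} -> {dst} failed")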
Agent + host wiring:
- New /ready endpoint on the in-VM agent; _restore_snapshot waits for
/ready (up to 10 min) before reporting state="ready" so slow JVMs
like Doris/Druid don't time out on the user's first query.
- dockerd restart hook at agent boot — without it docker-using systems
fail to launch containers after snapshot restore.
- Output streamed and capped at OUTPUT_LIMIT+1 bytes (default 64 KB)
with head-style early-kill; default query timeout 600 -> 60 s.
- /api/query no longer triggers initial provisioning. Only restore.
  Initial provision requires explicit /api/admin/provision/<name>.
- /api/queries/ returns the system's example queries.
- _call_agent_provision: no aiohttp idle timeout, 7-day total cap.
- ClickHouse-family stays on the internet after snapshot (datalake
variants need S3); rest stays offline.
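The output cap above works head-style: buffer at most OUTPUT_LIMIT+1 bytes of
the query script's stdout, then kill its whole process group. A stripped-down
sketch (the real agent._run_query additionally drains stderr on a thread and
enforces the 60 s wall-clock deadline):

    import os
    import signal
    import subprocess

    def run_capped(cmd: list[str], sql: bytes, limit: int) -> bytes:
        # Sketch only: no timeout handling, and it assumes the SQL fits the
        # OS pipe buffer (queries are small). preexec_fn=os.setsid puts the
        # query in its own process group so killpg reaches any children the
        # ./query script spawned.
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE, preexec_fn=os.setsid)
        p.stdin.write(sql)
        p.stdin.close()
        buf = bytearray()
        while len(buf) < limit + 1:               # keep one extra byte so the
            n = min(8192, limit + 1 - len(buf))   # caller can detect overflow
            chunk = p.stdout.read(n)
            if not chunk:                         # EOF: output fit under the cap
                break
            buf.extend(chunk)
        if len(buf) > limit:                      # overflowed: stop the query early
            try:
                os.killpg(p.pid, signal.SIGKILL)
            except ProcessLookupError:
                pass
        p.wait()
        return bytes(buf)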
Catalog:
- paradedb-partitioned (pg_lakehouse removed upstream) and
pg_duckdb-motherduck (needs cloud creds) excluded.
- ClickHouse + chdb variants emit Pretty format.
- ClickBench: trino/presto-datalake javac classpath uses find for AWS
SDK / Hadoop jars instead of pinning a stale jar filename.
- ClickBench: cedardb/cedardb-parquet/mongodb start scripts hardened
(systemctl restart docker, longer wait windows, better diagnostics).
- ClickBench: duckdb start scripts scrub stale *.wal.
- ClickBench: arc start broadened admin-token regex.
UI:
- Catalog rendered as horizontal slabs, colored by state.
- Per-system result cache (output + timing) keyed by system name.
- Example-query selector populated from /api/queries/.
- Down systems swap the query pane for a "Last error" pane.
- Stats row trimmed to time + truncated marker.
- monospace font, no rounded corners, black selected outline.
- Spellcheck / autocomplete / Grammarly opt-outs on the textarea.
Bootstrap:
- install-firecracker.sh: chown only the top-level state dirs, not
recursively (a chown -R was descending into a base-rootfs build's
loop mount and flipping /etc/sudoers to uid 1000).
- install-firecracker.sh checks the state dir supports reflink and
exits with an XFS-format hint if not.
- download-datasets.sh fetches hits.json.gz (used by parseable).
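As an aside, what `cp --reflink=always` asks the kernel for is a FICLONE
ioctl. A tiny Python probe illustrating the same check the install script
performs in shell (illustration only; the shell check above is what actually
runs, and FICLONE's value comes from <linux/fs.h>):

    import errno
    import fcntl
    import tempfile

    FICLONE = 0x40049409    # _IOW(0x94, 9, int), per <linux/fs.h>

    def supports_reflink(directory: str) -> bool:
        # Clone one empty temp file into another; a filesystem without
        # reflink support rejects the ioctl.
        with tempfile.NamedTemporaryFile(dir=directory) as src, \
             tempfile.NamedTemporaryFile(dir=directory) as dst:
            try:
                fcntl.ioctl(dst.fileno(), FICLONE, src.fileno())
                return True
            except OSError as e:
                if e.errno in (errno.EOPNOTSUPP, errno.ENOTTY, errno.EXDEV):
                    return False
                raise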
---
arc/start | 25 ++-
cedardb-parquet/start | 30 ++-
cedardb/start | 31 ++-
chdb-parquet-partitioned/query | 2 +-
chdb/query | 24 +--
clickhouse-datalake-partitioned/query | 2 +-
clickhouse-datalake/query | 2 +-
clickhouse-parquet-partitioned/query | 2 +-
clickhouse-parquet/query | 2 +-
clickhouse-tencent/query | 2 +-
clickhouse-web/query | 2 +-
clickhouse/query | 2 +-
duckdb-datalake-partitioned/start | 7 +
duckdb-datalake/start | 7 +
duckdb-parquet-partitioned/start | 7 +
duckdb-parquet/start | 8 +-
duckdb-vortex-partitioned/start | 7 +
duckdb-vortex/start | 7 +
duckdb/start | 8 +-
mongodb/start | 23 +-
parseable/load | 13 +-
playground/agent/agent.py | 243 ++++++++++++++++-----
playground/docs/architecture.md | 36 +++-
playground/scripts/download-datasets.sh | 11 +
playground/scripts/install-firecracker.sh | 34 ++-
playground/server/main.py | 43 +++-
playground/server/systems.py | 22 ++
playground/server/vm_manager.py | 170 +++++++++++----
playground/web/app.js | 245 ++++++++++++++++------
playground/web/index.html | 50 +++--
playground/web/style.css | 87 ++++++--
presto-datalake-partitioned/install | 7 +-
presto-datalake/install | 7 +-
trino-datalake-partitioned/install | 7 +-
trino-datalake/install | 7 +-
35 files changed, 928 insertions(+), 254 deletions(-)
diff --git a/arc/start b/arc/start
index d06f81cab1..51a27a9b89 100755
--- a/arc/start
+++ b/arc/start
@@ -14,7 +14,7 @@ fi
sudo systemctl start arc
# Wait for the HTTP endpoint to come up before we try to read the token.
-for _ in $(seq 1 30); do
+for _ in $(seq 1 60); do
if curl -sf "$ARC_URL/health" >/dev/null 2>&1; then
break
fi
@@ -22,13 +22,30 @@ for _ in $(seq 1 30); do
done
# On first start, Arc prints its admin token to its journal; capture it.
+# The log line has drifted between releases ("Initial admin API token:",
+# "Admin API token:", "API token:", ...) and journald can lag behind
+# /health, so we retry with a broader regex over ~60 s.
if [ ! -f arc_token.txt ] || \
! curl -sf "$ARC_URL/health" -H "x-api-key: $(cat arc_token.txt)" >/dev/null 2>&1; then
- TOKEN=$(sudo journalctl -u arc --no-pager \
- | grep -oP '(?:Initial admin API token|Admin API token): \K[^\s]+' \
- | head -1)
+ TOKEN=""
+ for _ in $(seq 1 60); do
+ sudo journalctl --sync >/dev/null 2>&1 || true
+ JOURNAL=$(sudo journalctl -u arc --no-pager 2>/dev/null || true)
+ TOKEN=$(printf '%s\n' "$JOURNAL" \
+ | grep -oP '(?:[Ii]nitial[[:space:]]+)?[Aa]dmin[[:space:]]+(?:API[[:space:]]+)?[Tt]oken[[:space:]]*[:=][[:space:]]*\K[^[:space:],]+' \
+ | head -1)
+ if [ -z "$TOKEN" ]; then
+ TOKEN=$(printf '%s\n' "$JOURNAL" \
+ | grep -oP '(?:API[[:space:]]+)?[Tt]oken[[:space:]]*[:=][[:space:]]*\K[A-Za-z0-9_.\-]{16,}' \
+ | head -1)
+ fi
+ if [ -n "$TOKEN" ]; then break; fi
+ sleep 1
+ done
if [ -z "$TOKEN" ]; then
echo "Error: Could not extract Arc admin API token from journal" >&2
+ echo "---journal tail---" >&2
+ sudo journalctl -u arc --no-pager 2>&1 | tail -50 >&2
exit 1
fi
echo "$TOKEN" > arc_token.txt
diff --git a/cedardb-parquet/start b/cedardb-parquet/start
index ad1d714394..981f23f221 100755
--- a/cedardb-parquet/start
+++ b/cedardb-parquet/start
@@ -5,15 +5,33 @@ if PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null 2>&1;
exit 0
fi
+# After a VM snapshot+restore, dockerd's in-memory networking/cgroup state
+# is out of sync with the (also-restored) kernel-side resources, and the
+# next `docker run` either fails or starts a container that can't be
+# reached on its mapped port. Restarting dockerd reconciles it.
+sudo systemctl restart docker
+for _ in $(seq 1 30); do
+ sudo docker info >/dev/null 2>&1 && break
+ sleep 1
+done
+
sudo docker stop cedardb >/dev/null 2>&1 || true
sudo docker rm cedardb >/dev/null 2>&1 || true
-sudo docker run -d --rm -p 5432:5432 \
- -v "$(pwd)/data:/data" \
- -v "$(pwd)/db:/var/lib/cedardb/data" \
- -e CEDAR_PASSWORD=test \
- --name cedardb cedardb/cedardb:latest >/dev/null
+if ! sudo docker run -d --rm -p 5432:5432 \
+ -v "$(pwd)/data:/data" \
+ -v "$(pwd)/db:/var/lib/cedardb/data" \
+ -e CEDAR_PASSWORD=test \
+ --name cedardb cedardb/cedardb:latest; then
+ echo "docker run failed; ps -a:" >&2
+ sudo docker ps -a >&2 || true
+ exit 1
+fi
-until pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1; do
+for _ in $(seq 1 60); do
+ pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1 && exit 0
sleep 1
done
+echo "cedardb did not become ready in 60 s; container logs:" >&2
+sudo docker logs cedardb 2>&1 | tail -40 >&2 || true
+exit 1
diff --git a/cedardb/start b/cedardb/start
index 0f4c8b56f6..b6c3bbfe07 100755
--- a/cedardb/start
+++ b/cedardb/start
@@ -5,16 +5,35 @@ if PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null 2>&1;
exit 0
fi
+# After a VM snapshot+restore, dockerd's in-memory networking/cgroup state
+# is out of sync with the (also-restored) kernel-side resources, and the
+# next `docker run` either fails or starts a container that can't be
+# reached on its mapped port. Restarting dockerd reconciles it. No-op on
+# initial provision (the daemon was just started).
+sudo systemctl restart docker
+for _ in $(seq 1 30); do
+ sudo docker info >/dev/null 2>&1 && break
+ sleep 1
+done
+
# `docker run --rm` cleans up container on exit; we run detached.
sudo docker stop cedardb >/dev/null 2>&1 || true
sudo docker rm cedardb >/dev/null 2>&1 || true
-sudo docker run -d --rm -p 5432:5432 \
- -v "$(pwd)/data:/data" \
- -v "$(pwd)/db:/var/lib/cedardb/data" \
- -e CEDAR_PASSWORD=test \
- --name cedardb cedardb/cedardb:latest >/dev/null
+if ! sudo docker run -d --rm -p 5432:5432 \
+ -v "$(pwd)/data:/data" \
+ -v "$(pwd)/db:/var/lib/cedardb/data" \
+ -e CEDAR_PASSWORD=test \
+ --name cedardb cedardb/cedardb:latest; then
+ echo "docker run failed; ps -a:" >&2
+ sudo docker ps -a >&2 || true
+ exit 1
+fi
-until pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1; do
+for _ in $(seq 1 60); do
+ pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1 && exit 0
sleep 1
done
+echo "cedardb did not become ready in 60 s; container logs:" >&2
+sudo docker logs cedardb 2>&1 | tail -40 >&2 || true
+exit 1
diff --git a/chdb-parquet-partitioned/query b/chdb-parquet-partitioned/query
index e32521b589..226baf39d9 100755
--- a/chdb-parquet-partitioned/query
+++ b/chdb-parquet-partitioned/query
@@ -26,7 +26,7 @@ conn = chdb.connect()
start = timeit.default_timer()
try:
- res = conn.query(query, "CSV")
+ res = conn.query(query, "Pretty")
out = str(res)
end = timeit.default_timer()
if out:
diff --git a/chdb/query b/chdb/query
index 25d2dc57fc..f2399c6656 100755
--- a/chdb/query
+++ b/chdb/query
@@ -17,26 +17,26 @@ cat > "$query_file"
python3 - "$query_file" <<'PY'
import sys
import timeit
-from chdb import dbapi
+import chdb
with open(sys.argv[1]) as f:
query = f.read()
-con = dbapi.connect(path=".clickbench")
-cur = con.cursor()
-
+# The hits table is created inside the `clickbench` database (see
+# create.sql + load), but the session defaults to `default`. Prepend
+# USE so a bare `SELECT * FROM hits` resolves regardless.
+sess = chdb.session.Session(".clickbench")
start = timeit.default_timer()
try:
- cur._cursor.execute(query)
- rows = cur.fetchall() if cur.description else []
+ res = sess.query(f"USE clickbench; {query}", "Pretty")
+ out = str(res)
end = timeit.default_timer()
+ if out:
+ sys.stdout.write(out)
+ if not out.endswith("\n"):
+ sys.stdout.write("\n")
finally:
- cur.close()
- con.close()
-
-for row in rows:
- print(row)
+ sess.close()
-# Last line of stderr: fractional seconds.
print(f"{end - start:.3f}", file=sys.stderr)
PY
diff --git a/clickhouse-datalake-partitioned/query b/clickhouse-datalake-partitioned/query
index d491976e0e..2fd7b0e50f 100755
--- a/clickhouse-datalake-partitioned/query
+++ b/clickhouse-datalake-partitioned/query
@@ -7,4 +7,4 @@ set -e
query=$(cat)
./clickhouse local --path . --time --use_page_cache_for_object_storage 1 \
- --query="$query"
+ --format=Pretty --query="$query"
diff --git a/clickhouse-datalake/query b/clickhouse-datalake/query
index d491976e0e..2fd7b0e50f 100755
--- a/clickhouse-datalake/query
+++ b/clickhouse-datalake/query
@@ -7,4 +7,4 @@ set -e
query=$(cat)
./clickhouse local --path . --time --use_page_cache_for_object_storage 1 \
- --query="$query"
+ --format=Pretty --query="$query"
diff --git a/clickhouse-parquet-partitioned/query b/clickhouse-parquet-partitioned/query
index a157a84bf3..1a4ddd7a4d 100755
--- a/clickhouse-parquet-partitioned/query
+++ b/clickhouse-parquet-partitioned/query
@@ -5,4 +5,4 @@
set -e
query=$(cat)
-./clickhouse local --time --query="$(cat create.sql); ${query}"
+./clickhouse local --time --format=Pretty --query="$(cat create.sql); ${query}"
diff --git a/clickhouse-parquet/query b/clickhouse-parquet/query
index a157a84bf3..1a4ddd7a4d 100755
--- a/clickhouse-parquet/query
+++ b/clickhouse-parquet/query
@@ -5,4 +5,4 @@
set -e
query=$(cat)
-./clickhouse local --time --query="$(cat create.sql); ${query}"
+./clickhouse local --time --format=Pretty --query="$(cat create.sql); ${query}"
diff --git a/clickhouse-tencent/query b/clickhouse-tencent/query
index 9ef756b1f8..aae8e3b6c4 100755
--- a/clickhouse-tencent/query
+++ b/clickhouse-tencent/query
@@ -6,4 +6,4 @@
set -e
query=$(cat)
-clickhouse-client --time --query="$query"
+clickhouse-client --time --format=Pretty --query="$query"
diff --git a/clickhouse-web/query b/clickhouse-web/query
index 72a6eda1e8..105302f569 100755
--- a/clickhouse-web/query
+++ b/clickhouse-web/query
@@ -9,4 +9,4 @@ set -e
query=$(cat)
clickhouse-client --query "SYSTEM DROP FILESYSTEM CACHE" >/dev/null
-clickhouse-client --time --query="$query"
+clickhouse-client --time --format=Pretty --query="$query"
diff --git a/clickhouse/query b/clickhouse/query
index c6abe5b818..6d5c47fb11 100755
--- a/clickhouse/query
+++ b/clickhouse/query
@@ -5,4 +5,4 @@
# Exit non-zero on error.
set -e
-clickhouse-client --time
+clickhouse-client --time --format=Pretty
diff --git a/duckdb-datalake-partitioned/start b/duckdb-datalake-partitioned/start
index 06bd986563..e7a4fb1f74 100755
--- a/duckdb-datalake-partitioned/start
+++ b/duckdb-datalake-partitioned/start
@@ -1,2 +1,9 @@
#!/bin/bash
+# duckdb is embedded — no daemon to start. We do clean any stale WAL
+# files that may have been captured in the snapshot: a .wal whose footer
+# wasn't fully flushed pre-snapshot can leave duckdb refusing to open
+# the database ("IO Error: ... Bad message"). The .wal is regenerated
+# on the next write, so dropping it is safe between query sessions.
+shopt -s nullglob
+rm -f ./*.wal ./*.db.wal
exit 0
diff --git a/duckdb-datalake/start b/duckdb-datalake/start
index 06bd986563..e7a4fb1f74 100755
--- a/duckdb-datalake/start
+++ b/duckdb-datalake/start
@@ -1,2 +1,9 @@
#!/bin/bash
+# duckdb is embedded — no daemon to start. We do clean any stale WAL
+# files that may have been captured in the snapshot: a .wal whose footer
+# wasn't fully flushed pre-snapshot can leave duckdb refusing to open
+# the database ("IO Error: ... Bad message"). The .wal is regenerated
+# on the next write, so dropping it is safe between query sessions.
+shopt -s nullglob
+rm -f ./*.wal ./*.db.wal
exit 0
diff --git a/duckdb-parquet-partitioned/start b/duckdb-parquet-partitioned/start
index 06bd986563..e7a4fb1f74 100755
--- a/duckdb-parquet-partitioned/start
+++ b/duckdb-parquet-partitioned/start
@@ -1,2 +1,9 @@
#!/bin/bash
+# duckdb is embedded — no daemon to start. We do clean any stale WAL
+# files that may have been captured in the snapshot: a .wal whose footer
+# wasn't fully flushed pre-snapshot can leave duckdb refusing to open
+# the database ("IO Error: ... Bad message"). The .wal is regenerated
+# on the next write, so dropping it is safe between query sessions.
+shopt -s nullglob
+rm -f ./*.wal ./*.db.wal
exit 0
diff --git a/duckdb-parquet/start b/duckdb-parquet/start
index c1d4b2fca8..e7a4fb1f74 100755
--- a/duckdb-parquet/start
+++ b/duckdb-parquet/start
@@ -1,3 +1,9 @@
#!/bin/bash
-# duckdb is an embedded CLI tool — no daemon to start.
+# duckdb is embedded — no daemon to start. We do clean any stale WAL
+# files that may have been captured in the snapshot: a .wal whose footer
+# wasn't fully flushed pre-snapshot can leave duckdb refusing to open
+# the database ("IO Error: ... Bad message"). The .wal is regenerated
+# on the next write, so dropping it is safe between query sessions.
+shopt -s nullglob
+rm -f ./*.wal ./*.db.wal
exit 0
diff --git a/duckdb-vortex-partitioned/start b/duckdb-vortex-partitioned/start
index 06bd986563..e7a4fb1f74 100755
--- a/duckdb-vortex-partitioned/start
+++ b/duckdb-vortex-partitioned/start
@@ -1,2 +1,9 @@
#!/bin/bash
+# duckdb is embedded — no daemon to start. We do clean any stale WAL
+# files that may have been captured in the snapshot: a .wal whose footer
+# wasn't fully flushed pre-snapshot can leave duckdb refusing to open
+# the database ("IO Error: ... Bad message"). The .wal is regenerated
+# on the next write, so dropping it is safe between query sessions.
+shopt -s nullglob
+rm -f ./*.wal ./*.db.wal
exit 0
diff --git a/duckdb-vortex/start b/duckdb-vortex/start
index 06bd986563..e7a4fb1f74 100755
--- a/duckdb-vortex/start
+++ b/duckdb-vortex/start
@@ -1,2 +1,9 @@
#!/bin/bash
+# duckdb is embedded — no daemon to start. We do clean any stale WAL
+# files that may have been captured in the snapshot: a .wal whose footer
+# wasn't fully flushed pre-snapshot can leave duckdb refusing to open
+# the database ("IO Error: ... Bad message"). The .wal is regenerated
+# on the next write, so dropping it is safe between query sessions.
+shopt -s nullglob
+rm -f ./*.wal ./*.db.wal
exit 0
diff --git a/duckdb/start b/duckdb/start
index c1d4b2fca8..e7a4fb1f74 100755
--- a/duckdb/start
+++ b/duckdb/start
@@ -1,3 +1,9 @@
#!/bin/bash
-# duckdb is an embedded CLI tool — no daemon to start.
+# duckdb is embedded — no daemon to start. We do clean any stale WAL
+# files that may have been captured in the snapshot: a .wal whose footer
+# wasn't fully flushed pre-snapshot can leave duckdb refusing to open
+# the database ("IO Error: ... Bad message"). The .wal is regenerated
+# on the next write, so dropping it is safe between query sessions.
+shopt -s nullglob
+rm -f ./*.wal ./*.db.wal
exit 0
diff --git a/mongodb/start b/mongodb/start
index 9e8bafc100..cbaa3c83ba 100755
--- a/mongodb/start
+++ b/mongodb/start
@@ -1,15 +1,28 @@
#!/bin/bash
set -e
-sudo systemctl start mongod
+sudo systemctl start mongod || true
-# Enable the planner option used by ClickBench (covered whole-index scans).
-# This is a runtime parameter that resets on restart, so we re-apply on every
-# start. Wait briefly for the server to accept connections first.
-for _ in $(seq 1 60); do
+# Wait up to ~3 minutes for mongod to accept connections. mongod can take a
+# while on cold start (oplog init, etc.). If it never comes up, dump the
+# unit status and log tail so the failure is actionable.
+ok=0
+for _ in $(seq 1 180); do
if mongosh --quiet --eval "db.runCommand('ping').ok" >/dev/null 2>&1; then
+ ok=1
break
fi
sleep 1
done
+
+if [ "$ok" != 1 ]; then
+ echo "mongod did not become reachable on 127.0.0.1:27017 after 180 s" >&2
+ sudo systemctl status mongod --no-pager -l 2>&1 | tail -30 >&2 || true
+ echo "---mongod log tail---" >&2
+  sudo tail -60 /var/log/mongodb/mongod.log >&2 2>&1 || true
+ exit 1
+fi
+
+# Enable the planner option used by ClickBench (covered whole-index scans).
+# Runtime parameter — resets on each restart so we re-apply.
mongosh --quiet --eval 'db.adminCommand({setParameter: 1, internalQueryPlannerGenerateCoveredWholeIndexScans: true});' >/dev/null
diff --git a/parseable/load b/parseable/load
index 3f74150940..2f10233c25 100755
--- a/parseable/load
+++ b/parseable/load
@@ -3,11 +3,18 @@ set -eu
NUM_CORES=$(nproc)
-wget --continue --progress=dot:giga \
- 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
+# In the playground, the gzipped JSON dataset is shipped read-only at
+# /opt/clickbench/datasets_ro/hits.json.gz; symlink it instead of wget'ing
+# 4.6 GB on every parseable provision.
+if [ -f /opt/clickbench/datasets_ro/hits.json.gz ]; then
+ ln -sf /opt/clickbench/datasets_ro/hits.json.gz hits.json.gz
+else
+ wget --continue --progress=dot:giga \
+ 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
+fi
# Decompress with progress.
-FILE_SIZE=$(stat -c %s hits.json.gz)
+FILE_SIZE=$(stat -L -c %s hits.json.gz)
pv -s "$FILE_SIZE" hits.json.gz | pigz -d > hits.json
# Split into chunks wrapped in [ ... , ... ] arrays for parseable's ingest API.
diff --git a/playground/agent/agent.py b/playground/agent/agent.py
index 0ec850d630..b5a2074b95 100644
--- a/playground/agent/agent.py
+++ b/playground/agent/agent.py
@@ -53,11 +53,15 @@
# any well-known ephemeral range too. 50080 keeps a vague "HTTP-ish" feel.
LISTEN_PORT = int(os.environ.get("CLICKBENCH_AGENT_PORT", "50080"))
# 10 KB cap, matching the spec. Configurable for testing.
-OUTPUT_LIMIT = int(os.environ.get("CLICKBENCH_OUTPUT_LIMIT", "10240"))
+OUTPUT_LIMIT = int(os.environ.get("CLICKBENCH_OUTPUT_LIMIT", "65536"))
# Per-query wall-clock cap so a runaway query can't tie up a VM forever.
-QUERY_TIMEOUT = int(os.environ.get("CLICKBENCH_QUERY_TIMEOUT", "600"))
+QUERY_TIMEOUT = int(os.environ.get("CLICKBENCH_QUERY_TIMEOUT", "60"))
# Provision (install/start/load) can legitimately take an hour for some systems.
-PROVISION_TIMEOUT = int(os.environ.get("CLICKBENCH_PROVISION_TIMEOUT", "7200"))
+# Per-step timeout for install/start/load. Some real-world systems load
+# 100 M rows over many hours (postgres + indexes, cratedb, cockroachdb,
+# yugabyte, etc.). 7 days covers anything reasonable without being
+# unbounded.
+PROVISION_TIMEOUT = int(os.environ.get("CLICKBENCH_PROVISION_TIMEOUT", str(7 * 86400)))
STATE_DIR.mkdir(parents=True, exist_ok=True)
PROVISION_DONE = STATE_DIR / "provisioned"
@@ -188,9 +192,21 @@ def _run_query(sql: bytes) -> tuple[int, bytes, bytes, float]:
stdout: result (whatever format the system uses)
stderr: timing in fractional seconds on the LAST numeric line
exit code: 0 on success
+
+ Stops reading stdout once we've buffered OUTPUT_LIMIT+1 bytes (one
+ extra so _cap can detect the overflow) and kills the process group —
+ "SELECT * FROM hits" generates ~14 GB of output and we don't want
+ the agent to spin buffering it. Stderr is read on a background
+ thread so a chatty stderr can't deadlock the stdout pipe.
"""
+ import select
+ import threading
script = _system_script("query")
t0 = time.monotonic()
+ deadline = t0 + QUERY_TIMEOUT
+ cap = OUTPUT_LIMIT + 1 # +1 byte so _cap() can detect overflow
+ stdout_buf = bytearray()
+ stderr_buf = bytearray()
try:
p = subprocess.Popen(
[str(script)],
@@ -200,18 +216,59 @@ def _run_query(sql: bytes) -> tuple[int, bytes, bytes, float]:
cwd=str(SYSTEM_DIR),
preexec_fn=os.setsid,
)
- try:
- stdout, stderr = p.communicate(input=sql, timeout=QUERY_TIMEOUT)
- rc = p.returncode
- except subprocess.TimeoutExpired:
- # The system might still be inside its query; kill the whole group.
- with contextlib.suppress(ProcessLookupError):
- os.killpg(p.pid, signal.SIGKILL)
- stdout, stderr = p.communicate()
- rc = -9
except Exception as e:
return 255, b"", f"agent: failed to invoke ./query: {e}\n".encode(), 0.0
- return rc, stdout, stderr, time.monotonic() - t0
+
+ def _drain_stderr() -> None:
+ for chunk in iter(lambda: p.stderr.read(8192), b""):
+ stderr_buf.extend(chunk)
+ err_thread = threading.Thread(target=_drain_stderr, daemon=True)
+ err_thread.start()
+
+ try:
+ if sql:
+ p.stdin.write(sql)
+ p.stdin.close()
+ except BrokenPipeError:
+ pass
+
+ killed_for = "" # "timeout", "cap", or ""
+ while True:
+ remaining = deadline - time.monotonic()
+ if remaining <= 0:
+ killed_for = "timeout"
+ break
+ if len(stdout_buf) >= cap:
+ killed_for = "cap"
+ break
+ r, _, _ = select.select([p.stdout], [], [], min(remaining, 0.5))
+ if r:
+ chunk = p.stdout.read1(min(8192, cap - len(stdout_buf)))
+ if not chunk:
+ break # EOF
+ stdout_buf.extend(chunk)
+ elif p.poll() is not None:
+ break
+
+ if killed_for:
+ with contextlib.suppress(ProcessLookupError):
+ os.killpg(p.pid, signal.SIGKILL)
+
+ try:
+ rc = p.wait(timeout=5)
+ except subprocess.TimeoutExpired:
+ with contextlib.suppress(ProcessLookupError):
+ os.killpg(p.pid, signal.SIGKILL)
+ rc = -9
+
+ if killed_for == "timeout":
+ rc = -9
+ err_thread.join(timeout=2)
+ with contextlib.suppress(Exception):
+ p.stdout.close()
+ with contextlib.suppress(Exception):
+ p.stderr.close()
+ return rc, bytes(stdout_buf), bytes(stderr_buf), time.monotonic() - t0
def _extract_script_timing(stderr: bytes) -> float | None:
@@ -320,24 +377,22 @@ def _provision() -> tuple[int, bytes]:
PROVISION_LOG.write_bytes(b"".join(log_lines))
return r.returncode, b"".join(log_lines)
- # Pre-snapshot trim. The host /sync's the FS right before pausing
- # the vcpus, so any on-disk data the daemon has already committed
- # is durable. That means we're free to stop the daemon here:
- # ClickHouse's MergeTree (and equivalent on-disk stores) never
- # produce inconsistent on-disk state regardless of when the
- # process exits — only an unflushed *filesystem* can. With the
- # host-side /sync in place, we can shut the daemon down to evict
- # its private heap (merge thread arenas, query cache, mark cache,
- # uncompressed cache, parquet ingest buffers, …) and snapshot a
- # mostly-zero RAM image. The agent's startup path
- # (_kick_daemon_if_provisioned) brings it back up on every
- # restore, so the first query in a restored VM pays a 1-2 s
- # daemon-start cost instead of carrying 8-12 GB of memory in
- # every snapshot.
- #
- # Skip for in-process / stateless tools where stop/start is a
- # no-op AND the data lives in process memory; wiping it would
- # defeat the point. Those systems can rely on drop_caches alone.
+ # Pre-snapshot housekeeping. Order:
+ # 1) ./stop — drop the daemon's heap (merge arenas, query cache,
+ # mark cache, parquet ingest buffers, ...) so we can fstrim
+ # and drop_caches against a quiet system.
+ # 2) sync + drop_caches — flush dirty pages, evict the page
+ # cache, so init_on_free=1 zeroes everything that was
+ # cache. Snapshot then sees a mostly-zero free pool.
+ # 3) fstrim — DISCARD free blocks on the per-VM disks so the
+ # sparse backing file punches holes for bytes the load
+ # script `mv`'d in and `rm`'d (14-75 GB of dataset).
+ # 4) ./start + ./check — bring the daemon back up *into* the
+ # snapshot. Restore then resumes a daemon that's already
+ # serving, paying zero cold-start cost.
+ # Skip stop/start for systems without a real daemon (chdb,
+ # polars, duckdb): they're in-process tools with no separate
+ # process to manage.
stop = SYSTEM_DIR / "stop"
start = SYSTEM_DIR / "start"
check = SYSTEM_DIR / "check"
@@ -363,33 +418,59 @@ def _provision() -> tuple[int, bytes]:
time.sleep(0.5)
log_lines.append(b"=== pre-snapshot stop done ===\n")
- # Drop the page+dentry+inode cache. With init_on_free=1 set in the
- # guest kernel cmdline (see vm_manager._kernel_cmdline), every page
- # the kernel frees gets zero-filled before going back on the free
- # list. After daemon stop + drop_caches, the entire free pool
- # is genuinely zero-filled, and the snapshot's RAM dump compresses
- # ~300:1 instead of the ~3:1 we got without init_on_free.
+ # Drop the page+dentry+inode cache. With init_on_free=1 set in
+ # the guest kernel cmdline (see vm_manager._kernel_cmdline), every
+ # page the kernel frees gets zero-filled before going back on the
+ # free list, so what we snapshot is mostly-zero.
subprocess.run(["sync"], check=False)
try:
Path("/proc/sys/vm/drop_caches").write_text("3\n")
except Exception:
pass
- # fstrim the per-VM disks. Load scripts typically do `mv hits.parquet
- # /var/lib//user_files/` (which on overlay/cross-FS copies the
- # 14-75 GB dataset into the writable per-VM disk) and then `rm` it
- # after the INSERT. ext4 marks those blocks free but the underlying
- # virtio-blk file still holds the bytes — the snapshot's golden disk
- # then carries a full copy of the dataset that the load script
- # already discarded. `fstrim` sends DISCARD for free blocks; the
- # host loop driver responds by punching holes in the sparse backing
- # file, so the golden ends up holding only the bytes the engine
- # actually keeps (MergeTree parts, hits.db, etc.).
+ # fstrim the per-VM disks so transient dataset bytes from
+ # `mv hits.parquet ... ; rm` don't end up in the golden disk.
for mnt in ("/opt/clickbench/sysdisk", "/"):
subprocess.run(["fstrim", mnt],
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
timeout=300, check=False)
+ # Restart the daemon so the snapshot captures it *running*. The
+ # restored VM then doesn't pay any cold-start cost; the daemon's
+ # process state, JIT/class-cache, connection pools, etc. all
+ # come back live.
+ if has_daemon:
+ log_lines.append(b"\n=== pre-snapshot start ===\n")
+ r = subprocess.run([str(start)], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+ timeout=PROVISION_TIMEOUT, check=False)
+ log_lines.append(r.stdout or b"")
+ log_lines.append(b"start: rc=" + str(r.returncode).encode() + b"\n")
+ # Wait for ./check before snapshotting — we want the daemon
+ # actually accepting queries when the memory image is captured.
+ ok = False
+ t0 = time.monotonic()
+ while time.monotonic() - t0 < 900:
+ rc = subprocess.run([str(check)], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ timeout=10, check=False).returncode
+ if rc == 0:
+ ok = True
+ break
+ time.sleep(0.5)
+ if ok:
+ log_lines.append(b"=== pre-snapshot start ok ===\n")
+ _daemon_started.set() # the snapshot ships a running daemon
+ else:
+ log_lines.append(b"=== pre-snapshot start: check did not "
+ b"succeed in 900 s; snapshot will need a "
+ b"cold start on restore ===\n")
+ # Sync once more so any data the just-started daemon wrote
+ # (lock files, sockets, recovery markers) is on disk before
+ # the host snapshots the rootfs/sysdisk.
+ subprocess.run(["sync"], check=False)
+
PROVISION_DONE.write_text(f"ok {time.time()}\n")
PROVISION_LOG.write_bytes(b"".join(log_lines))
return 0, b"".join(log_lines)
@@ -419,6 +500,15 @@ def do_GET(self) -> None:
self._send_json(200, {"ok": True, "system": SYSTEM_NAME,
"provisioned": PROVISION_DONE.exists()})
return
+ if self.path == "/ready":
+ # True when the system's daemon is fully accepting queries.
+ # The host uses this at restore time to gate VM-state="ready"
+ # for slow daemons (Doris, Druid, Trino, etc.); without it
+ # the first user query arrives mid-start and times out.
+ ready = _daemon_started.is_set()
+ self._send_json(200 if ready else 503,
+ {"ready": ready, "system": SYSTEM_NAME})
+ return
if self.path == "/stats":
self._send_json(200, _stats_snapshot())
return
@@ -489,6 +579,40 @@ class ReusableServer(socketserver.ThreadingTCPServer):
daemon_threads = True
+def _reconcile_docker_after_restore() -> None:
+ """Restart dockerd if it's active, to recover from snapshot-restore
+ skew.
+
+ Why: after a Firecracker memory snapshot+restore, dockerd is resumed
+ in userspace but the (also-restored) kernel-side networking and cgroup
+ state is in flux. Symptom: `docker run` either fails or starts a
+ container that's unreachable on its mapped port (cedardb, byconity,
+ trino, etc.). `systemctl restart docker` reconciles the daemon to the
+ current kernel state. No-op on systems that don't use docker, and a
+ cheap ~2 s on initial provision (docker was just started anyway).
+ """
+ rc = subprocess.run(
+ ["systemctl", "is-active", "--quiet", "docker"],
+ check=False,
+ ).returncode
+ if rc != 0:
+ return # docker isn't installed / not active
+ try:
+ subprocess.run(["sudo", "systemctl", "restart", "docker"],
+ check=False, timeout=60)
+ # Wait for the daemon to come back.
+ for _ in range(30):
+ r = subprocess.run(["sudo", "docker", "info"],
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ check=False, timeout=5).returncode
+ if r == 0:
+ return
+ time.sleep(1)
+ except Exception as e:
+ sys.stderr.write(f"[agent] docker reconcile failed: {e}\n")
+
+
def _kick_daemon_if_provisioned() -> None:
"""On every agent boot, if the system has been provisioned, make sure
the daemon is also running.
@@ -513,11 +637,33 @@ def _kick_daemon_if_provisioned() -> None:
def _bg() -> None:
try:
+ # Slow daemons (Doris, Druid, Trino) can take >5 min to come
+ # up. The host's /ready poll has its own deadline; here we
+ # only need a generous upper bound to prevent an infinite
+ # hang.
subprocess.run([str(start)], cwd=str(SYSTEM_DIR),
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
- timeout=300, check=False)
+ timeout=900, check=False)
+ check = SYSTEM_DIR / "check"
+ if check.exists():
+ # Poll ./check until it succeeds — that's the daemon's
+ # own definition of "ready", and the host probes /ready
+ # for this flag.
+ for _ in range(240):
+ rc = subprocess.run([str(check)], cwd=str(SYSTEM_DIR),
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ timeout=10, check=False).returncode
+ if rc == 0:
+ break
+ time.sleep(0.5)
+ _daemon_started.set()
except Exception as e:
sys.stderr.write(f"[agent] daemon-kick failed: {e}\n")
+ # Still mark started so /query is unblocked even if the
+ # daemon never comes up — the query will fail with a real
+ # error rather than hang waiting for /ready forever.
+ _daemon_started.set()
threading.Thread(target=_bg, daemon=True, name="daemon-kick").start()
@@ -526,6 +672,7 @@ def main() -> None:
addr = ("0.0.0.0", LISTEN_PORT)
print(f"agent: system={SYSTEM_NAME} listen={addr[0]}:{addr[1]} "
f"dir={SYSTEM_DIR} data={DATASETS_DIR}", flush=True)
+ _reconcile_docker_after_restore()
_kick_daemon_if_provisioned()
with ReusableServer(addr, Handler) as srv:
srv.serve_forever()
diff --git a/playground/docs/architecture.md b/playground/docs/architecture.md
index 0507740c41..dfbbab7c19 100644
--- a/playground/docs/architecture.md
+++ b/playground/docs/architecture.md
@@ -58,16 +58,30 @@ on-disk snapshot.
## Snapshots
-Created the first time a system is requested. Two artifacts:
-
- `/systems/<name>/snapshot.state` — Firecracker VM state metadata
- `/systems/<name>/snapshot.bin` — guest memory dump (16 GB in
- size as configured, but sparse)
-
-The `rootfs.ext4` and `system.ext4` files persist across snapshots and are
-re-attached at restore time. Drive paths in the snapshot are remapped to
-their current host locations on restore so we don't have to re-snapshot if
-the playground gets moved or rebooted.
+Created the first time a system is requested. Three artifacts:
+
+- `/systems/<name>/snapshot.state` — Firecracker VM metadata
+- `/systems/<name>/snapshot.bin` — guest memory dump
+  (mmap'd by Firecracker on restore — left uncompressed so restore is
+  O(1) host work; pages fault in lazily)
+- `/systems/<name>/{rootfs,system}.golden.ext4` — frozen disk
+ state at snapshot time, reflink-cloned at restore
+
+The host filesystem at `<STATE_DIR>` **must support reflinks** (XFS, or
+ext4 with `shared_blocks`). `_snapshot_disks` and `_restore_disks` both
+use `cp --reflink=always` so cloning the golden into a working disk is
+a constant-time extent-list copy regardless of how much data the system
+actually wrote. Without reflinks the playground still works, but every
+restore pays a full sparse-cp of the working set.
+
+Snapshots are taken with the daemon **running** (`./start` is invoked
+after the pre-snapshot `./stop` + `fstrim` + `drop_caches`), so a
+restored VM resumes with the daemon already serving — no cold-start
+cost on the first query.
+
+Drive paths in the snapshot are remapped to their current host locations
+on restore so we don't have to re-snapshot if the playground gets moved
+or rebooted.
## Networking
@@ -88,7 +102,7 @@ deleted — outbound traffic is dropped, the host↔guest link remains.
Truncation is applied **inside the agent**, before bytes leave the VM:
- Stdout from the system's `./query` script is capped at
- `CLICKBENCH_OUTPUT_LIMIT` bytes (default 10 KB).
+ `CLICKBENCH_OUTPUT_LIMIT` bytes (default 64 KB).
- The agent's response sets `X-Output-Truncated: 1` and
  `X-Output-Bytes: <N>` so the client can show "this is a
partial result of N bytes."
diff --git a/playground/scripts/download-datasets.sh b/playground/scripts/download-datasets.sh
index b30fff4473..ae0fc7f23c 100755
--- a/playground/scripts/download-datasets.sh
+++ b/playground/scripts/download-datasets.sh
@@ -60,5 +60,16 @@ else
step " cached"
fi
+step "json.gz"
+# Used by parseable. The full hits.json.gz is ~4.6 GB on
+# datasets.clickhouse.com.
+if [ ! -f "$DATASETS/hits.json.gz" ] || [ "$(stat -c%s "$DATASETS/hits.json.gz" 2>/dev/null || echo 0)" -lt 3500000000 ]; then
+ wget --continue --progress=dot:giga \
+ -O "$DATASETS/hits.json.gz" \
+ 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz'
+else
+ step " cached"
+fi
+
step "done"
du -sh "$DATASETS"/*
diff --git a/playground/scripts/install-firecracker.sh b/playground/scripts/install-firecracker.sh
index f2dfe9cd84..f5738511c2 100755
--- a/playground/scripts/install-firecracker.sh
+++ b/playground/scripts/install-firecracker.sh
@@ -9,7 +9,39 @@ FC_VERSION="${FIRECRACKER_VERSION:-v1.13.1}"
KERNEL_URL="${GUEST_KERNEL_URL:-https://s3.amazonaws.com/spec.ccfc.min/firecracker-ci/v1.13/x86_64/vmlinux-6.1.141}"
sudo mkdir -p "$STATE_DIR"/{bin,kernel,datasets,systems,vms,logs,run,snapshots,tmp,cache}
-sudo chown -R "$(id -u):$(id -g)" "$STATE_DIR"
+# Only chown the top-level subdirs we created. `chown -R` on $STATE_DIR
+# would descend into any live mount underneath it — notably the loop-
+# mounted rootfs that build-base-rootfs.sh keeps open under tmp/base-build
+# while it's running — and flip /etc/sudoers inside the future VM image
+# to uid 1000, breaking sudo on every subsequent provision.
+sudo chown "$(id -u):$(id -g)" \
+ "$STATE_DIR" \
+ "$STATE_DIR"/{bin,kernel,datasets,systems,vms,logs,run,snapshots,tmp,cache}
+
+# The playground relies on reflink (cp --reflink=always) to clone
+# 200 GB-apparent / multi-GB-real per-VM disks in milliseconds instead
+# of seconds. ext4 ships reflink support behind the `shared_blocks`
+# feature flag, but mke2fs in Ubuntu 22.04 / 24.04 doesn't expose it
+# yet — so we format the playground volume as XFS, which has reflink
+# enabled by default since mkfs.xfs 4.18 (2018). If you're staging the
+# host yourself, set this up before running install-firecracker.sh:
+#
+#   sudo mkfs.xfs -L cbplayground -f /dev/<device>
+# echo 'LABEL=cbplayground /opt/clickbench-playground xfs \
+# defaults,noatime,discard,nofail 0 2' | sudo tee -a /etc/fstab
+# sudo mount /opt/clickbench-playground
+#
+# Sanity-check at install time so a missing reflink is loud:
+if ! ( cd "$STATE_DIR" && tmp1="$(mktemp -p .)" && \
+ tmp2="$(mktemp -p . -u)" && \
+ cp --reflink=always "$tmp1" "$tmp2" 2>/dev/null; rc=$? ; \
+ rm -f "$tmp1" "$tmp2"; exit "$rc" ); then
+ echo "[install] ERROR: $STATE_DIR does not support reflink. The" >&2
+ echo "playground needs cp --reflink=always to clone per-VM disks" >&2
+ echo "fast. Reformat the volume as XFS (or ext4 with shared_blocks)" >&2
+ echo "and re-run this script. See the comment block above." >&2
+ exit 1
+fi
if [ ! -x "$STATE_DIR/bin/firecracker" ]; then
arch="$(uname -m)"
diff --git a/playground/server/main.py b/playground/server/main.py
index fe6cc86274..8ea74f2343 100644
--- a/playground/server/main.py
+++ b/playground/server/main.py
@@ -76,6 +76,34 @@ async def handle_system(self, req: web.Request) -> web.Response:
"agent_url": self.vmm.agent_url(vm),
})
+ async def handle_queries(self, req: web.Request) -> web.Response:
+ """Return example queries for a system from its queries.sql.
+
+        Splits on `;\n` so multi-line queries stay together. Caps the list
+        at 200 — ClickBench ships 43 per system, so the cap never fires,
+        but a fork that ships thousands of queries shouldn't push them
+        all to the browser.
+ """
+ name = req.match_info["name"]
+ if name not in self.systems:
+ raise web.HTTPNotFound()
+ path = self.cfg.repo_dir / name / "queries.sql"
+ if not path.exists():
+ return web.json_response([])
+ text = path.read_text(errors="replace")
+ # Split on `;\n` then trim. Drop empties.
+ out = []
+ for chunk in text.split(";\n"):
+ q = chunk.strip()
+ if not q:
+ continue
+ if not q.endswith(";"):
+ q += ";"
+ out.append(q)
+ if len(out) >= 200:
+ break
+ return web.json_response(out)
+
async def handle_provision_log(self, req: web.Request) -> web.Response:
name = req.match_info["name"]
if name not in self.systems:
@@ -175,7 +203,7 @@ async def _dispatch_query(self, system_name: str, sql: bytes
try:
async with aiohttp.ClientSession() as s:
async with s.post(url, data=sql,
- timeout=aiohttp.ClientTimeout(total=600)) as r:
+ timeout=aiohttp.ClientTimeout(total=60)) as r:
body = await r.read()
headers = {k: r.headers[k] for k in r.headers if k.startswith("X-")}
headers.setdefault("X-Output-Bytes", str(len(body)))
@@ -203,6 +231,7 @@ def build_app() -> web.Application:
app.router.add_get("/api/systems", obj.handle_systems)
app.router.add_get("/api/state", obj.handle_state)
app.router.add_get("/api/system/{name}", obj.handle_system)
+ app.router.add_get("/api/queries/{name}", obj.handle_queries)
app.router.add_get("/api/provision-log/{name}", obj.handle_provision_log)
app.router.add_post("/api/admin/provision/{name}", obj.handle_admin_provision)
app.router.add_post("/api/query", obj.handle_query)
@@ -214,8 +243,18 @@ async def root_redirect(_r: web.Request) -> web.Response:
raise web.HTTPFound("/ui/")
async def ui_index(_r: web.Request) -> web.FileResponse:
- return web.FileResponse(web_dir / "index.html")
+ resp = web.FileResponse(web_dir / "index.html")
+ resp.headers["Cache-Control"] = "no-store"
+ return resp
+
+ @web.middleware
+ async def no_cache_static(request: web.Request, handler):
+ resp = await handler(request)
+ if request.path.startswith("/ui/"):
+ resp.headers["Cache-Control"] = "no-store"
+ return resp
+ app.middlewares.append(no_cache_static)
app.router.add_get("/", root_redirect)
app.router.add_get("/ui/", ui_index)
app.router.add_get("/ui", ui_index)
diff --git a/playground/server/systems.py b/playground/server/systems.py
index 49fa76cf15..347de92924 100644
--- a/playground/server/systems.py
+++ b/playground/server/systems.py
@@ -48,8 +48,30 @@
# not "broken", just over-provisioned for shared use.
"chdb-dataframe", "duckdb-dataframe", "duckdb-memory",
"polars-dataframe", "daft-parquet", "daft-parquet-partitioned",
+ # Upstream is broken or asks for credentials we don't have.
+ # - paradedb-partitioned: install script aborts ("pg_lakehouse was
+ # removed from ParadeDB after 0.10.x"); historical benchmark only.
+ # - pg_duckdb-motherduck: requires MOTHERDUCK_TOKEN (cloud creds).
+ "paradedb-partitioned", "pg_duckdb-motherduck",
}
+# Systems we trust to keep outbound internet access *after* the snapshot,
+# i.e. at query time. Used by datalake-style benchmarks that read live S3
+# during the query; without internet they fail with a DNS error. Stays
+# tight on purpose — adding a system here means user queries from that
+# VM can reach the wider internet, so only put ClickHouse-family engines
+# here (per request).
+TRUSTED_INTERNET: frozenset[str] = frozenset({
+ "clickhouse",
+ "clickhouse-datalake",
+ "clickhouse-datalake-partitioned",
+ "clickhouse-parquet",
+ "clickhouse-parquet-partitioned",
+ "chdb",
+ "chdb-parquet",
+ "chdb-parquet-partitioned",
+})
+
@dataclass(frozen=True)
class System:
diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py
index d28c98b0cc..b338c224d7 100644
--- a/playground/server/vm_manager.py
+++ b/playground/server/vm_manager.py
@@ -36,7 +36,7 @@
from . import firecracker as fc
from . import net
from .config import Config
-from .systems import System
+from .systems import System, TRUSTED_INTERNET
log = logging.getLogger("vm_manager")
@@ -69,6 +69,9 @@ class VM:
# Provision metadata
provisioned_at: Optional[float] = None
last_used: float = 0.0
+ # Set when state transitions to "ready" (after restore or initial
+ # provision). Reset on teardown. Used by the UI to show uptime.
+ ready_since: Optional[float] = None
last_error: Optional[str] = None
lock: asyncio.Lock = dataclasses.field(default_factory=asyncio.Lock)
# Runtime stats refreshed by the monitor
@@ -147,10 +150,14 @@ async def ensure_ready_for_query(self, system: str) -> VM:
vm.state = "snapshotted"
if vm.state == "down":
if not _has_snapshot(vm):
- # No snapshot (raw or compressed) yet — full provision.
- await self._initial_provision(vm)
- else:
- await self._restore_snapshot(vm)
+ # No snapshot yet, and /query is not a provisioning
+ # trigger — the operator has to /api/admin/provision
+ # explicitly. Refuse here so a stray query doesn't
+ # spin up a 30-min initial install.
+ raise RuntimeError(
+ f"{system}: no snapshot — POST /api/admin/provision"
+ f"/{system} to build one")
+ await self._restore_snapshot(vm)
elif vm.state == "snapshotted":
await self._restore_snapshot(vm)
elif vm.state == "provisioning":
@@ -275,7 +282,8 @@ async def _initial_provision(self, vm: VM) -> None:
await self._call_agent_provision(vm)
await self._snapshot(vm)
await self._shutdown(vm)
- await net.disable_internet(vm.slot)
+ if vm.system.name not in TRUSTED_INTERNET:
+ await net.disable_internet(vm.slot)
vm.state = "snapshotted"
vm.provisioned_at = time.time()
log.info("[%s] initial provision complete", vm.system.name)
@@ -413,28 +421,28 @@ async def _snapshot(self, vm: VM) -> None:
"snapshot_path": str(vm.snapshot_state),
"mem_file_path": str(vm.snapshot_bin),
}, timeout=3600.0)
+ # Capture the *disk* state while the VM is still paused —
+ # the memory snapshot has in-flight references to specific
+ # inodes / file positions / mmap'd ranges on the rootfs and
+ # system disks, and any post-pause writes (journal commits,
+        # atime updates, etc.) by Firecracker on resume tear the
+ # golden disk relative to the memory image and surface as
+ # ext4 EBADMSG on restore for whichever file's metadata
+ # got dirtied. Reflink-clone keeps the working disks live
+ # for the clean shutdown that follows.
+ await self._snapshot_disks(vm)
finally:
# Try to resume so we can shut down cleanly; ignore failures.
with contextlib.suppress(Exception):
await fc.patch(sock, "/vm", {"state": "Resumed"})
- # Capture the *disk* state too. The memory snapshot is meaningless on
- # its own: it has in-flight references to specific inodes / file
- # positions / mmap'd ranges on the rootfs and system disks, and if
- # those move under it the restored process malfunctions. We sparse-
- # copy the disks into a parallel "golden" path; every subsequent
- # restore boots off a fresh copy of the golden, so background work
- # the daemon does after restore (clickhouse merges, log writes,
- # /tmp churn) never persists into the next session.
- await self._snapshot_disks(vm)
-
- # Compress the memory dump with parallel zstd. Firecracker writes the
- # *full* 16 GB of guest memory regardless of how much was actually
- # used; zstd at -3 with -T0 turns that into ~10-12 GB in a few
- # seconds (most of the savings come from the agent's drop_caches
- # right before /snapshot — page cache zero-fills compress 50:1).
- # snapshot.state stays as-is; it's tiny (~60 KB).
- await self._compress_snapshot(vm)
+ # We no longer compress the memory dump. Firecracker mmaps
+ # snapshot.bin on restore, so leaving it uncompressed means a
+ # restore is O(1) for memory (the kernel page-faults pages in
+ # lazily). The cost is disk: ~16 GB nominal per system. Sparse-
+ # write + init_on_free=1 + pre-snapshot drop_caches+fstrim keep
+ # the actual on-disk size to ~5-10% of the apparent size for
+ # most systems. snapshot.state stays as-is; it's tiny (~60 KB).
async def _compress_snapshot(self, vm: VM) -> None:
bin_path = vm.snapshot_bin
@@ -492,6 +500,15 @@ async def _decompress_snapshot(self, vm: VM) -> None:
async def _restore_snapshot(self, vm: VM) -> None:
log.info("[%s] restore from snapshot", vm.system.name)
+ # Restore is the only auto-recovery path from a user /query. If
+ # the on-disk snapshot is gone (manual wipe, half-built artifact,
+ # ...) we fail loudly here; the operator has to kick a fresh
+        # provision via /api/admin/provision/<name>.
+ if not _has_snapshot(vm):
+ vm.state = "down"
+ raise RuntimeError(
+ f"[{vm.system.name}] snapshot on disk is missing; "
+ f"POST /api/admin/provision/{vm.system.name} to rebuild")
# Always boot from a *fresh copy* of the golden disks captured at
# snapshot time. Restore #N inherits zero state from restore #N-1,
# which is what makes the playground safe to expose to arbitrary
@@ -502,9 +519,17 @@ async def _restore_snapshot(self, vm: VM) -> None:
# Firecracker tries to mmap it.
await self._decompress_snapshot(vm)
await net.ensure_tap(vm.slot)
- # internet stays OFF post-snapshot
+ # Trusted systems (e.g. ClickHouse variants that read live S3 at
+ # query time) keep outbound internet after restore. Everything
+ # else stays offline.
+ if vm.system.name in TRUSTED_INTERNET:
+ await net.enable_internet(vm.slot)
await self._boot(vm, restore_snapshot=True)
await self._wait_for_agent(vm, timeout=60)
+ # Block here until the system's daemon reports ready, so the
+ # first user query doesn't time out mid-startup. Big upper bound
+ # for slow JVMs (Doris/Druid/Trino).
+ await self._wait_for_daemon_ready(vm, timeout=600)
vm.state = "ready"
def _golden_paths(self, vm: VM) -> tuple[Path, Path, Path, Path]:
@@ -519,15 +544,31 @@ def _golden_paths(self, vm: VM) -> tuple[Path, Path, Path, Path]:
async def _snapshot_disks(self, vm: VM) -> None:
rootfs, sysdisk, rootfs_gold, sysdisk_gold = self._golden_paths(vm)
- # Atomically swap: rename the working images into the golden slot.
- # Both disks were sync'd via /sync before /snapshot/create, so
- # what's on disk is consistent with what's in the memory snapshot.
- # We'll re-create the working images by cloning from the golden
- # on every restore (see _restore_disks).
- for src, dst in ((rootfs, rootfs_gold), (sysdisk, sysdisk_gold)):
+ # Reflink-clone the working images into the golden slot. We can't
+ # rename: the working file stays bound to Firecracker's open
+ # virtio-blk fd through the post-snapshot resume + shutdown, and
+ # any writes during that window would leak into the golden (we
+ # observed restored systems hitting ext4 EBADMSG on small files
+ # like duckdb's hits.db.wal and a venv activate script). With
+ # reflink the snapshot is near-instant; the working file's
+ # post-snapshot writes diverge into its own extents and don't
+ # touch the golden.
+ async def _clone(src: Path, dst: Path) -> None:
if dst.exists():
dst.unlink()
- os.replace(src, dst)
+ proc = await asyncio.create_subprocess_exec(
+ "cp", "--reflink=always", str(src), str(dst),
+ stderr=asyncio.subprocess.PIPE,
+ )
+ _, err = await proc.communicate()
+ if proc.returncode != 0:
+ raise RuntimeError(
+ f"reflink snapshot cp {src} -> {dst} failed: "
+ f"{err.decode(errors='replace')[-400:]}")
+ await asyncio.gather(
+ _clone(rootfs, rootfs_gold),
+ _clone(sysdisk, sysdisk_gold),
+ )
log.info("[%s] golden disks saved (%s, %s)", vm.system.name,
_fmt_size(rootfs_gold.stat().st_size),
_fmt_size(sysdisk_gold.stat().st_size))
@@ -537,19 +578,32 @@ async def _restore_disks(self, vm: VM) -> None:
if not rootfs_gold.exists() or not sysdisk_gold.exists():
raise RuntimeError(
f"[{vm.system.name}] missing golden disks; cannot restore")
- # Clone the goldens into fresh working copies. `cp --sparse=always`
- # only writes the non-zero blocks, so the cost is proportional to
- # the actual data on each disk, not its apparent 200 GB.
- for src, dst in ((rootfs_gold, rootfs), (sysdisk_gold, sysdisk)):
+        # Reflink-clone the goldens into fresh working copies. The host
+        # filesystem must support reflinks — XFS in practice, or btrfs /
+        # ext4 with `shared_blocks` / any other CoW-capable fs; see
+        # playground/scripts/install-firecracker.sh. Clones are O(1)
+ # extent-list copies; the real cost is paid lazily on first
+ # write to a shared block. With reflink, a restore goes from
+ # 5-30 s (full sparse-cp) to a few ms.
+ # Both clones can run concurrently; they touch disjoint files.
+ async def _clone(src: Path, dst: Path) -> None:
if dst.exists():
dst.unlink()
proc = await asyncio.create_subprocess_exec(
- "cp", "--sparse=always", str(src), str(dst),
+ "cp", "--reflink=always", str(src), str(dst),
+ stderr=asyncio.subprocess.PIPE,
)
- rc = await proc.wait()
- if rc != 0:
- raise RuntimeError(f"cp {src} -> {dst} failed rc={rc}")
- log.info("[%s] working disks cloned from golden", vm.system.name)
+ _, err = await proc.communicate()
+ if proc.returncode != 0:
+ raise RuntimeError(
+ f"reflink cp {src} -> {dst} failed: "
+ f"{err.decode(errors='replace')[-400:]}")
+ await asyncio.gather(
+ _clone(rootfs_gold, rootfs),
+ _clone(sysdisk_gold, sysdisk),
+ )
+ log.info("[%s] working disks reflink-cloned from golden",
+ vm.system.name)
async def _shutdown(self, vm: VM) -> None:
"""Best-effort clean shutdown of the firecracker process.
@@ -630,11 +684,43 @@ async def _wait_for_agent(self, vm: VM, *, timeout: float) -> None:
await asyncio.sleep(0.5)
raise RuntimeError(f"agent unreachable after {timeout}s: {last_err!r}")
+ async def _wait_for_daemon_ready(self, vm: VM, *, timeout: float) -> None:
+ """Wait for the system's daemon to start serving (post-restore).
+
+ Slow JVM daemons (Doris, Druid, Trino) can take several minutes to
+ come up after a snapshot restore. The agent's daemon-kick thread
+ runs ./start + ./check in the background; /ready flips to 200 once
+ that completes. Without this gate, the first user query lands
+ mid-start and times out at the host's 60 s query budget.
+ """
+ url = self.agent_url(vm) + "/ready"
+ t0 = time.monotonic()
+ async with aiohttp.ClientSession() as s:
+ while time.monotonic() - t0 < timeout:
+ try:
+ async with s.get(url, timeout=aiohttp.ClientTimeout(total=2)) as r:
+ if r.status == 200:
+ return
+ except Exception:
+ pass
+ await asyncio.sleep(1.0)
+ log.warning("[%s] daemon not ready after %s s; serving queries anyway",
+ vm.system.name, timeout)
+
async def _call_agent_provision(self, vm: VM) -> None:
url = self.agent_url(vm) + "/provision"
+ # No fast idle check — /provision is a single POST that returns
+ # only when install+load is fully done. The TCP connection sits
+ # idle (no body streaming) for the entire run. Some systems take
+ # many hours to load 100 M rows; we just set a generous total
+ # deadline so a genuinely stuck call eventually breaks.
async with aiohttp.ClientSession() as s:
- # Provision can take a very long time (apt-get install jdk, etc.)
- async with s.post(url, timeout=aiohttp.ClientTimeout(total=7200)) as r:
+ async with s.post(
+ url,
+ timeout=aiohttp.ClientTimeout(
+ total=7 * 86400, sock_connect=30,
+ ),
+ ) as r:
body = await r.read()
if r.status >= 300:
raise RuntimeError(f"agent /provision failed: {r.status}: "
diff --git a/playground/web/app.js b/playground/web/app.js
index fb29eb0d7f..ca72780f53 100644
--- a/playground/web/app.js
+++ b/playground/web/app.js
@@ -1,114 +1,225 @@
// ClickBench Playground — minimal vanilla-JS client.
//
-// Talks to the host API. Three things happen here:
-// 1. On load, fetch /api/systems and populate the system dropdown. Pre-select
-// whatever's in the URL hash (e.g. #clickhouse) or the first one.
-// 2. On selection change, poll /api/system/ every 2s and update the
-// state pill so the user can see when provisioning finishes / a VM is
-// restarted by the watchdog.
-//   3. On "Run query", POST the SQL to /api/query?system=<name>, parse the
-// response headers for timing, render bytes as text (best-effort UTF-8).
+// Talks to the host API.
+// 1. On load, fetch /api/systems for the catalog and /api/state for live
+// states. Render systems as a vertical list, colored by current state.
+// 2. Re-poll /api/state every 2 s and re-color the list. The currently
+// selected system also re-renders its status JSON blob below.
+// 3. On click of a system row, select it. On "Run query", POST the SQL to
+//    /api/query?system=<name> and render output as plain text in a <pre>.
const $ = (sel) => document.querySelector(sel);
-const sysSelect = $("#system");
+const listEl = $("#system-list");
const queryEl = $("#query");
const runBtn = $("#run");
-const statePill = $("#state-pill");
+const selectedEl = $("#selected-system");
const outEl = $("#output");
+const outLabelEl = $("#output-label");
const timeEl = $("#time");
-const wallEl = $("#wall");
-const bytesEl = $("#bytes");
-const truncEl = $("#truncated");
-const exitEl = $("#exit");
const stateBlob = $("#state-blob");
+const lastErrorEl = $("#last-error");
+const exampleSel = $("#example");
+const uiActive = ["#ui-active", "#ui-query", "#ui-stats", "#ui-output"].map($);
+const uiDown = $("#ui-down");
+let catalog = []; // [{name, display_name, data_format, ...}]
+let stateByName = {}; // {name: {state, ...}}
+let selected = null; // selected system name
let pollTimer = null;
-let knownSystems = [];
+let resultsByName = {}; // {name: {output, time, wall, bytes, truncated, exit}}
+let queriesByName = {}; // {name: [q1, q2, ...]}
-async function loadSystems() {
+async function loadCatalog() {
const r = await fetch("/api/systems");
- knownSystems = await r.json();
- knownSystems.sort((a, b) => a.display_name.localeCompare(b.display_name));
- sysSelect.innerHTML = "";
- for (const s of knownSystems) {
+ catalog = await r.json();
+ catalog.sort((a, b) => a.display_name.localeCompare(b.display_name));
+ renderList();
+ const hash = (location.hash || "").slice(1);
+ if (hash && catalog.some(s => s.name === hash)) {
+ select(hash);
+ } else if (catalog.length) {
+ select(catalog[0].name);
+ }
+}
+
+function renderList() {
+ listEl.innerHTML = "";
+ for (const s of catalog) {
+ const st = (stateByName[s.name] && stateByName[s.name].state) || "down";
+ const row = document.createElement("div");
+ row.className = `system-item state-${st}` + (s.name === selected ? " selected" : "");
+ row.dataset.name = s.name;
+ row.textContent = s.display_name;
+ row.addEventListener("click", () => select(s.name));
+ listEl.appendChild(row);
+ }
+}
+
+function select(name) {
+ selected = name;
+ location.hash = name;
+ selectedEl.textContent = name;
+ for (const row of listEl.children) {
+ row.classList.toggle("selected", row.dataset.name === name);
+ }
+ if (stateByName[name]) {
+ stateBlob.textContent = JSON.stringify(stateByName[name], null, 2);
+ }
+ showResult(resultsByName[name]);
+ // If the user has typed something, keep it across system switches —
+ // they're likely composing one query against multiple systems. Only
+ // when the textarea is empty does loadExamples populate Q1.
+ loadExamples(name);
+ refreshDownUI();
+}
+
+async function loadExamples(name) {
+ let qs = queriesByName[name];
+ if (!qs) {
+ try {
+ const r = await fetch(`/api/queries/${encodeURIComponent(name)}`);
+ qs = r.ok ? await r.json() : [];
+ } catch (e) {
+ qs = [];
+ }
+ queriesByName[name] = qs;
+ }
+ if (selected !== name) return; // user moved on
+ exampleSel.innerHTML = "";
+ if (!qs.length) {
const o = document.createElement("option");
- o.value = s.name;
- o.textContent = `${s.display_name} (${s.data_format})`;
- sysSelect.appendChild(o);
+ o.textContent = "(no examples)";
+ o.disabled = true;
+ exampleSel.appendChild(o);
+ } else {
+ for (let i = 0; i < qs.length; i++) {
+ const o = document.createElement("option");
+ o.value = String(i);
+ // Single-line label: first 90 chars of the query.
+ const label = qs[i].replace(/\s+/g, " ").slice(0, 90);
+ o.textContent = `Q${i + 1}: ${label}`;
+ exampleSel.appendChild(o);
+ }
}
- // Allow #clickhouse style deep links
- const hash = (location.hash || "").slice(1);
- if (hash && knownSystems.some(s => s.name === hash)) {
- sysSelect.value = hash;
+ // Only populate the first example if the textarea is empty —
+ // anything the user has typed stays put when switching systems.
+ if (!queryEl.value.trim() && qs.length) {
+ queryEl.value = qs[0];
}
- onSystemChange();
+}
+
+let lastDownShownName = null;
+
+function refreshDownUI() {
+ const s = stateByName[selected];
+ const isDown = s && s.state === "down";
+ for (const el of uiActive) {
+ if (el) el.style.display = isDown ? "none" : "";
+ }
+ uiDown.style.display = isDown ? "" : "none";
+ if (isDown) {
+ // Render the last error once per selection. If poll picks up a
+ // new last_error for the same system later, leave the UI alone
+ // — the user is reading the text, we shouldn't move it under
+ // their eyes.
+ if (lastDownShownName !== selected) {
+ const raw = (s && s.last_error) || "(no error recorded)";
+ lastErrorEl.textContent = raw
+ .replace(/\\n/g, "\n")
+ .replace(/\\t/g, "\t")
+ .replace(/\\r/g, "");
+ lastDownShownName = selected;
+ }
+ } else {
+ lastDownShownName = null;
+ }
+}
+
+function showResult(r) {
+ if (!r) {
+ outEl.textContent = "";
+ timeEl.textContent = "—";
+ outLabelEl.textContent = "Output";
+ return;
+ }
+ outEl.textContent = r.output;
+ timeEl.textContent = r.time;
+ outLabelEl.textContent = r.truncated === "yes" ? "Output (truncated)" : "Output";
}
async function pollState() {
- const name = sysSelect.value;
- if (!name) return;
try {
- const r = await fetch(`/api/system/${encodeURIComponent(name)}`);
+ const r = await fetch("/api/state");
if (!r.ok) throw new Error(`HTTP ${r.status}`);
- const j = await r.json();
- statePill.textContent = j.state || "?";
- statePill.className = `pill ${j.state || ""}`;
- stateBlob.textContent = JSON.stringify(j, null, 2);
+ const arr = await r.json();
+ stateByName = {};
+ for (const s of arr) stateByName[s.name] = s;
+ // Update each row's color + state badge without rebuilding the DOM
+ for (const row of listEl.children) {
+ const s = stateByName[row.dataset.name];
+ const st = (s && s.state) || "down";
+ row.className = `system-item state-${st}` +
+ (row.dataset.name === selected ? " selected" : "");
+ }
+ if (selected && stateByName[selected]) {
+ stateBlob.textContent = JSON.stringify(stateByName[selected], null, 2);
+ }
+ refreshDownUI();
} catch (e) {
- statePill.textContent = "err";
- statePill.className = "pill down";
stateBlob.textContent = String(e);
}
}
-function onSystemChange() {
- if (pollTimer) clearInterval(pollTimer);
- location.hash = sysSelect.value;
- pollState();
- pollTimer = setInterval(pollState, 2000);
-}
-
async function runQuery() {
- const name = sysSelect.value;
+ if (!selected) return;
const sql = queryEl.value;
if (!sql.trim()) return;
runBtn.disabled = true;
outEl.textContent = "(running …)";
timeEl.textContent = "…";
- wallEl.textContent = "…";
- bytesEl.textContent = "—";
- truncEl.textContent = "—";
- exitEl.textContent = "—";
+ outLabelEl.textContent = "Output";
+ const target = selected; // capture in case the user switches mid-flight
const t0 = performance.now();
+ let payload = null;
try {
- const r = await fetch(`/api/query?system=${encodeURIComponent(name)}`, {
+ const r = await fetch(`/api/query?system=${encodeURIComponent(target)}`, {
method: "POST",
body: sql,
headers: {"Content-Type": "application/octet-stream"},
});
const body = await r.arrayBuffer();
- const txt = bytesToText(body);
- outEl.textContent = txt || "(no output)";
-
+ const txt = bytesToText(body) || "(no output)";
const h = (k) => r.headers.get(k);
const qt = h("X-Query-Time");
const wt = h("X-Wall-Time");
- timeEl.textContent = qt ? `${parseFloat(qt).toFixed(3)} s (script)` : "—";
- wallEl.textContent = wt ? `${parseFloat(wt).toFixed(3)} s` : `${((performance.now() - t0) / 1000).toFixed(3)} s`;
- bytesEl.textContent = h("X-Output-Bytes") || body.byteLength;
- truncEl.textContent = h("X-Output-Truncated") === "1" ? "yes" : "no";
- exitEl.textContent = h("X-Exit-Code") || r.status;
+ let output = txt;
if (r.status >= 400) {
const err = h("X-Error");
- if (err) outEl.textContent = `(error)\n${err}\n\n` + outEl.textContent;
+ if (err) {
+ const trailer = `\n\n(error)\n${err}`;
+ output = (txt === "(no output)" ? "" : txt) + trailer;
+ }
}
+ payload = {
+ output,
+ time: qt ? `${parseFloat(qt).toFixed(3)} s (script)` : "—",
+ wall: wt ? `${parseFloat(wt).toFixed(3)} s` : `${((performance.now() - t0) / 1000).toFixed(3)} s`,
+ bytes: h("X-Output-Bytes") || String(body.byteLength),
+ truncated: h("X-Output-Truncated") === "1" ? "yes" : "no",
+ exit: h("X-Exit-Code") || String(r.status),
+ };
} catch (e) {
- outEl.textContent = `(client error)\n${e}`;
+ payload = {
+ output: `(client error)\n${e}`,
+ time: "—", wall: "—", bytes: "—", truncated: "—", exit: "err",
+ };
} finally {
runBtn.disabled = false;
}
+ resultsByName[target] = payload;
+ if (selected === target) showResult(payload);
}
function bytesToText(buf) {
@@ -119,10 +230,20 @@ function bytesToText(buf) {
}
}
-sysSelect.addEventListener("change", onSystemChange);
runBtn.addEventListener("click", runQuery);
+exampleSel.addEventListener("change", () => {
+ const i = parseInt(exampleSel.value, 10);
+ const qs = queriesByName[selected];
+ if (qs && !isNaN(i) && i >= 0 && i < qs.length) {
+ queryEl.value = qs[i];
+ }
+});
queryEl.addEventListener("keydown", (e) => {
if ((e.metaKey || e.ctrlKey) && e.key === "Enter") runQuery();
});
-loadSystems();
+(async function init() {
+ await loadCatalog();
+ await pollState();
+ pollTimer = setInterval(pollState, 2000);
+})();
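
The rewritten client above assumes two read-only host endpoints: GET /api/state, returning one JSON object per system with at least name, state, and last_error, and GET /api/queries/<name>, returning the system's example queries as a JSON array of strings. A rough aiohttp sketch of that contract; the vm_manager registry key, its all() method, and the queries.sql location are assumptions, not the actual main.py handlers.

    from pathlib import Path
    from aiohttp import web

    async def api_state(request: web.Request) -> web.Response:
        # One entry per system; the client keys on "name" and colors rows by "state".
        vms = request.app["vm_manager"]  # assumed app-level registry
        return web.json_response([
            {"name": vm.system.name, "state": vm.state, "last_error": vm.last_error}
            for vm in vms.all()
        ])

    async def api_queries(request: web.Request) -> web.Response:
        # Example queries, one per non-empty line of the system's queries.sql.
        name = request.match_info["name"]
        path = Path("systems") / name / "queries.sql"  # hypothetical location
        if not path.exists():
            return web.json_response([])
        queries = [q.strip() for q in path.read_text().splitlines() if q.strip()]
        return web.json_response(queries)

    app = web.Application()
    app.add_routes([
        web.get("/api/state", api_state),
        web.get("/api/queries/{name}", api_queries),
    ])
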
diff --git a/playground/web/index.html b/playground/web/index.html
index e415a5ecff..8f09884572 100644
--- a/playground/web/index.html
+++ b/playground/web/index.html
@@ -4,7 +4,7 @@
ClickBench Playground
-
+
@@ -13,50 +13,60 @@
ClickBench Playground
Run SQL against any of the database systems in
ClickBench,
each isolated in its own Firecracker microVM. The dataset is the
- standard hits table — 100 M rows.
+ standard hits table — 100 M rows.