From 56b9725a5902d786da35ffd8fa51bee4e93df6c5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 May 2026 19:59:55 +0000 Subject: [PATCH 001/221] playground: scaffold ClickBench Firecracker microVM service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WIP checkpoint. Lets visitors run SQL against any of the 80+ ClickBench systems via a single-page UI, each isolated in a per-system Firecracker microVM. - server/ aiohttp API: /api/systems, /api/state, /api/query, /api/admin/provision. Owns the per-system VM lifecycle, a 1-Hz CPU/disk/host-pressure watchdog, and a batched ClickHouse-Cloud logging sink (JSONL fallback). - agent/ stdlib HTTP agent that runs inside each VM and wraps the system's install/start/load/query scripts. - images/ scripts to build the base Ubuntu 22.04 rootfs + per-system rootfs/system-disk pair (200 GB sparse + 16/88 GB sized for the system's data format). - web/ vanilla JS SPA — system picker, query box, X-Query-Time / X-Output-Truncated rendering. Smoke-tested: base rootfs boots under Firecracker, agent comes up in ~2 s, /health and /stats respond. Agent self-test on the host (no VM) covers all 4 endpoints including 10 KB output truncation. ClickHouse provisioning is in flight; see playground/docs/build-progress.md for the running checkpoint. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/.gitignore | 3 + playground/README.md | 102 ++++++ playground/__init__.py | 0 playground/agent/agent.py | 357 ++++++++++++++++++++ playground/agent/clickbench-agent.service | 23 ++ playground/clickbench-playground.service | 16 + playground/docs/architecture.md | 115 +++++++ playground/docs/build-progress.md | 110 ++++++ playground/images/build-base-rootfs.sh | 199 +++++++++++ playground/images/build-datasets-image.sh | 41 +++ playground/images/build-system-rootfs.sh | 170 ++++++++++ playground/scripts/agent-selftest.sh | 93 +++++ playground/scripts/download-datasets.sh | 64 ++++ playground/scripts/install-firecracker.sh | 36 ++ playground/scripts/run-server.sh | 15 + playground/scripts/smoke-boot.sh | 98 ++++++ playground/server/__init__.py | 0 playground/server/config.py | 98 ++++++ playground/server/firecracker.py | 117 +++++++ playground/server/logging_sink.py | 190 +++++++++++ playground/server/main.py | 252 ++++++++++++++ playground/server/monitor.py | 215 ++++++++++++ playground/server/net.py | 130 +++++++ playground/server/systems.py | 134 ++++++++ playground/server/vm_manager.py | 391 ++++++++++++++++++++++ playground/web/app.js | 128 +++++++ playground/web/index.html | 62 ++++ playground/web/style.css | 71 ++++ 28 files changed, 3230 insertions(+) create mode 100644 playground/.gitignore create mode 100644 playground/README.md create mode 100644 playground/__init__.py create mode 100644 playground/agent/agent.py create mode 100644 playground/agent/clickbench-agent.service create mode 100644 playground/clickbench-playground.service create mode 100644 playground/docs/architecture.md create mode 100644 playground/docs/build-progress.md create mode 100755 playground/images/build-base-rootfs.sh create mode 100755 playground/images/build-datasets-image.sh create mode 100755 playground/images/build-system-rootfs.sh create mode 100755 playground/scripts/agent-selftest.sh create mode 100755 playground/scripts/download-datasets.sh create mode 100755 playground/scripts/install-firecracker.sh create mode 100755 playground/scripts/run-server.sh create mode 100755 playground/scripts/smoke-boot.sh create mode 100644 playground/server/__init__.py create mode 100644 playground/server/config.py create mode 100644 playground/server/firecracker.py create mode 100644 playground/server/logging_sink.py create mode 100644 playground/server/main.py create mode 100644 playground/server/monitor.py create mode 100644 playground/server/net.py create mode 100644 playground/server/systems.py create mode 100644 playground/server/vm_manager.py create mode 100644 playground/web/app.js create mode 100644 playground/web/index.html create mode 100644 playground/web/style.css diff --git a/playground/.gitignore b/playground/.gitignore new file mode 100644 index 0000000000..b6cf5f0391 --- /dev/null +++ b/playground/.gitignore @@ -0,0 +1,3 @@ +__pycache__/ +*.pyc +.env diff --git a/playground/README.md b/playground/README.md new file mode 100644 index 0000000000..cc748036bf --- /dev/null +++ b/playground/README.md @@ -0,0 +1,102 @@ +# ClickBench Playground + +A self-service playground that lets visitors run arbitrary SQL against any of the +80+ database systems documented in ClickBench, isolated inside a Firecracker +microVM per system. + +## How it works + +1. The dataset (hits, in all formats ClickBench uses) is downloaded once into a + single directory on the host and exposed read-only to every VM as a virtio-blk + device. +2. For each system, a Firecracker microVM is launched once with internet access + to run the system's `install`, `start`, and `load` scripts. +3. A snapshot (memory + disk) is taken and persisted. Subsequent restorations + run without internet — the only path in or out is the host↔VM control link. +4. A small in-VM **agent** exposes `POST /query` over HTTP. The host **API + server** proxies user queries to the agent, returns the raw output as + `application/octet-stream`, and puts the timing into response headers. +5. A **monitor** loop watches per-VM CPU/disk/memory and host totals, killing + misbehaving or oversized VMs. +6. Every request and every restart is appended to a ClickHouse Cloud table. + +## Layout + +``` +playground/ +├── server/ # aiohttp API server, VM manager, monitor, logging sink +├── agent/ # In-VM HTTP agent (runs as systemd unit inside each VM) +├── images/ # Scripts that build the base rootfs + per-system overlays +├── web/ # Vanilla-JS single-page app +├── scripts/ # Host-side install / dataset / network helpers +└── docs/ # Design notes +``` + +Host state lives under `/opt/clickbench-playground/`: + +``` +/opt/clickbench-playground/ +├── bin/ firecracker, jailer +├── kernel/vmlinux guest kernel +├── base-rootfs.ext4 pristine Ubuntu 22.04 rootfs (built once) +├── datasets/ hits.parquet, hits_*.parquet, hits.tsv, hits.csv +├── datasets.ext4 read-only image of datasets/ (attached to every VM) +├── systems// per-system rootfs, snapshot, sockets, logs +├── vms/.sock Firecracker API socket +└── logs/ local JSONL fallback when ClickHouse Cloud is off +``` + +## Networking + +Each VM gets its own `/30` subnet on a dedicated TAP: + +| Side | Address | Notes | +|------|------------------|--------------------------------| +| Host | `10.200..1` | TAP device `fc-tap-` | +| VM | `10.200..2` | reachable from host always | + +During the install phase, `iptables FORWARD` + MASQUERADE are enabled for the +TAP so the VM can `apt-get`/`curl`/etc. After the snapshot is taken, the +forwarding rules are removed; the host↔VM link still works but external traffic +is blackholed. + +## Configuration + +Environment variables (read by `server/config.py`): + +| Var | Purpose | +|--------------------------------|-----------------------------------------------| +| `CLICKHOUSE_CLOUD_URL` | HTTPS URL of CH Cloud (e.g. `https://x.clickhouse.cloud:8443`) | +| `CLICKHOUSE_CLOUD_USER` | username | +| `CLICKHOUSE_CLOUD_PASSWORD` | password | +| `PLAYGROUND_STATE_DIR` | defaults to `/opt/clickbench-playground` | +| `PLAYGROUND_LISTEN` | defaults to `0.0.0.0:8000` | +| `PLAYGROUND_MAX_VMS` | concurrent live VMs cap (default 16) | +| `PLAYGROUND_OUTPUT_LIMIT` | response body cap in bytes (default 10240) | + +## Lifecycle of a request + +``` +client ──HTTP──▶ api/query?system=clickhouse + │ + ▼ + vm_manager.ensure_ready("clickhouse") + ├─ already running and /health OK ──▶ proceed + ├─ not running ──▶ restore from snapshot + └─ unresponsive ──▶ kill, restore, retry once + │ + ▼ + agent ◀── POST /query ── body=SQL + agent runs ./query, captures stdout/stderr, returns: + Content-Type: application/octet-stream + X-Query-Time: 0.234 + X-Output-Truncated: 0|1 + X-Output-Bytes: 8042 + body: (up to 10 KB of raw output) + │ + ▼ + logger.write_request(...) + │ + ▼ + client +``` diff --git a/playground/__init__.py b/playground/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/playground/agent/agent.py b/playground/agent/agent.py new file mode 100644 index 0000000000..62f5aad59f --- /dev/null +++ b/playground/agent/agent.py @@ -0,0 +1,357 @@ +#!/usr/bin/env python3 +""" +ClickBench in-VM agent. + +Runs inside the Firecracker microVM. Exposes a tiny HTTP API that the host +server hits to: + + GET /health quick liveness probe; cheap + GET /stats CPU/mem/disk snapshot + POST /provision run install -> start -> load for the bundled system + (only called once, before the host snapshots the VM) + POST /query read SQL from request body, exec ./query, return + output as application/octet-stream + timing headers + +The system's ClickBench scripts (install/start/load/query/check/stop/...) are +mounted at /opt/clickbench/system, with the system name in /etc/clickbench- +system. The dataset is mounted read-only at /opt/clickbench/datasets. + +Listens on 0.0.0.0:8080 by default. + +Stdlib-only — the rootfs ships python3 from the Ubuntu base; no pip needed. +""" + +from __future__ import annotations + +import contextlib +import http.server +import json +import os +import shutil +import signal +import socket +import socketserver +import subprocess +import sys +import threading +import time +from pathlib import Path + +SYSTEM_DIR = Path(os.environ.get("CLICKBENCH_SYSTEM_DIR", "/opt/clickbench/system")) +DATASETS_DIR = Path(os.environ.get("CLICKBENCH_DATASETS_DIR", "/opt/clickbench/datasets")) +STATE_DIR = Path(os.environ.get("CLICKBENCH_AGENT_STATE", "/var/lib/clickbench-agent")) +SYSTEM_NAME = ( + os.environ.get("CLICKBENCH_SYSTEM_NAME") + or (Path("/etc/clickbench-system").read_text().strip() + if Path("/etc/clickbench-system").exists() else SYSTEM_DIR.name) +) +LISTEN_PORT = int(os.environ.get("CLICKBENCH_AGENT_PORT", "8080")) +# 10 KB cap, matching the spec. Configurable for testing. +OUTPUT_LIMIT = int(os.environ.get("CLICKBENCH_OUTPUT_LIMIT", "10240")) +# Per-query wall-clock cap so a runaway query can't tie up a VM forever. +QUERY_TIMEOUT = int(os.environ.get("CLICKBENCH_QUERY_TIMEOUT", "600")) +# Provision (install/start/load) can legitimately take an hour for some systems. +PROVISION_TIMEOUT = int(os.environ.get("CLICKBENCH_PROVISION_TIMEOUT", "7200")) + +STATE_DIR.mkdir(parents=True, exist_ok=True) +PROVISION_DONE = STATE_DIR / "provisioned" +PROVISION_LOG = STATE_DIR / "provision.log" + +# Single-writer lock; the agent serializes queries per VM. Two ClickBench +# scripts hitting the same socket/temp file concurrently would not be safe. +_query_lock = threading.Lock() +_provision_lock = threading.Lock() + + +def _cap(b: bytes) -> tuple[bytes, bool]: + """Truncate to OUTPUT_LIMIT bytes; return (body, was_truncated).""" + if len(b) <= OUTPUT_LIMIT: + return b, False + return b[:OUTPUT_LIMIT], True + + +def _read_body(handler: http.server.BaseHTTPRequestHandler) -> bytes: + n = int(handler.headers.get("Content-Length") or 0) + if n <= 0: + return b"" + # Cap inbound bodies at 1 MB; queries are SQL, not bulk data. + return handler.rfile.read(min(n, 1 << 20)) + + +def _system_script(name: str) -> Path: + """Return path to a script in the system dir, or raise if missing/not executable.""" + p = SYSTEM_DIR / name + if not p.exists(): + raise FileNotFoundError(f"missing system script: {p}") + if not os.access(p, os.X_OK): + raise PermissionError(f"system script not executable: {p}") + return p + + +def _read_proc_stat() -> tuple[int, int]: + """Return (total_jiffies, idle_jiffies) from /proc/stat.""" + with open("/proc/stat") as f: + parts = f.readline().split() + nums = list(map(int, parts[1:])) + total = sum(nums) + idle = nums[3] + (nums[4] if len(nums) > 4 else 0) + return total, idle + + +def _stats_snapshot() -> dict: + out: dict = {"system": SYSTEM_NAME, "ts": time.time()} + try: + out["loadavg"] = list(map(float, Path("/proc/loadavg").read_text().split()[:3])) + except Exception: + pass + try: + info = {k: v for k, v in ( + l.split(":", 1) for l in Path("/proc/meminfo").read_text().splitlines() if ":" in l + )} + out["mem_total_kb"] = int(info.get("MemTotal", "0 kB").split()[0]) + out["mem_avail_kb"] = int(info.get("MemAvailable", "0 kB").split()[0]) + except Exception: + pass + try: + st = shutil.disk_usage("/") + out["disk_total"] = st.total + out["disk_free"] = st.free + except Exception: + pass + try: + t1, i1 = _read_proc_stat() + time.sleep(0.05) + t2, i2 = _read_proc_stat() + total = max(1, t2 - t1) + out["cpu_busy"] = 1.0 - (i2 - i1) / total + except Exception: + pass + out["provisioned"] = PROVISION_DONE.exists() + return out + + +def _run_query(sql: bytes) -> tuple[int, bytes, bytes, float]: + """ + Invoke ./query with the SQL on stdin. + The query script's contract per lib/benchmark-common.sh: + stdout: result (whatever format the system uses) + stderr: timing in fractional seconds on the LAST numeric line + exit code: 0 on success + """ + script = _system_script("query") + t0 = time.monotonic() + try: + p = subprocess.Popen( + [str(script)], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=str(SYSTEM_DIR), + preexec_fn=os.setsid, + ) + try: + stdout, stderr = p.communicate(input=sql, timeout=QUERY_TIMEOUT) + rc = p.returncode + except subprocess.TimeoutExpired: + # The system might still be inside its query; kill the whole group. + with contextlib.suppress(ProcessLookupError): + os.killpg(p.pid, signal.SIGKILL) + stdout, stderr = p.communicate() + rc = -9 + except Exception as e: + return 255, b"", f"agent: failed to invoke ./query: {e}\n".encode(), 0.0 + return rc, stdout, stderr, time.monotonic() - t0 + + +def _extract_script_timing(stderr: bytes) -> float | None: + """ + Pull fractional-seconds timing from the last numeric line of stderr, + matching the lib/benchmark-common.sh tail -n1 logic. + """ + # Handle the spark/pyspark carriage-return progress-bar case. + text = stderr.decode("utf-8", errors="replace").replace("\r", "\n") + last = None + for line in text.splitlines(): + s = line.strip() + if not s: + continue + try: + v = float(s) + except ValueError: + continue + last = v + return last + + +def _provision() -> tuple[int, bytes]: + """ + Run install -> start -> wait-for-check -> load. Capture everything to + PROVISION_LOG. Idempotent: subsequent calls succeed-fast if PROVISION_DONE + is present. + """ + if PROVISION_DONE.exists(): + return 0, b"already provisioned\n" + + with _provision_lock: + if PROVISION_DONE.exists(): + return 0, b"already provisioned\n" + + # Use the same /lib/benchmark-common.sh helpers if they're around. But + # since this is the playground, we want a *minimal* version: install, + # start, wait for check, load, sync. No cold-cycle restart, no + # concurrent-QPS test, no query loop. + steps: list[tuple[str, list[str]]] = [ + ("install", [str(_system_script("install"))]), + ("start", [str(_system_script("start"))]), + ] + + log_lines: list[bytes] = [] + for name, cmd in steps: + t0 = time.monotonic() + log_lines.append(f"\n=== {name} ===\n".encode()) + r = subprocess.run( + cmd, cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + timeout=PROVISION_TIMEOUT, + ) + dt = time.monotonic() - t0 + log_lines.append(r.stdout or b"") + log_lines.append(f"=== {name} done rc={r.returncode} in {dt:.1f}s ===\n".encode()) + if r.returncode != 0: + PROVISION_LOG.write_bytes(b"".join(log_lines)) + return r.returncode, b"".join(log_lines) + + # Wait for ./check to succeed for up to 300s + check = SYSTEM_DIR / "check" + ok = False + t0 = time.monotonic() + last_check: subprocess.CompletedProcess | None = None + while time.monotonic() - t0 < 300: + last_check = subprocess.run( + [str(check)], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + ) + if last_check.returncode == 0: + ok = True + break + time.sleep(1) + if not ok: + log_lines.append(b"\n=== check did not succeed within 300s ===\n") + if last_check is not None: + log_lines.append(last_check.stderr or b"") + PROVISION_LOG.write_bytes(b"".join(log_lines)) + return 1, b"".join(log_lines) + log_lines.append(b"\n=== check ok ===\n") + + # Data files are pre-staged on the per-system disk by the host-side + # build-system-rootfs.sh, so the load script's relative references + # (hits.parquet, hits.tsv, etc.) already resolve to local files it + # can chown / mv / rm without worrying about a RO source mount. + + # Run load. + t0 = time.monotonic() + log_lines.append(b"\n=== load ===\n") + r = subprocess.run( + [str(_system_script("load"))], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + timeout=PROVISION_TIMEOUT, + ) + dt = time.monotonic() - t0 + log_lines.append(r.stdout or b"") + log_lines.append(f"=== load done rc={r.returncode} in {dt:.1f}s ===\n".encode()) + if r.returncode != 0: + PROVISION_LOG.write_bytes(b"".join(log_lines)) + return r.returncode, b"".join(log_lines) + + subprocess.run(["sync"], check=False) + PROVISION_DONE.write_text(f"ok {time.time()}\n") + PROVISION_LOG.write_bytes(b"".join(log_lines)) + return 0, b"".join(log_lines) + + +class Handler(http.server.BaseHTTPRequestHandler): + server_version = "clickbench-agent/0.1" + + def log_message(self, fmt: str, *args) -> None: + sys.stderr.write("[agent] " + (fmt % args) + "\n") + + def _send(self, code: int, body: bytes, headers: dict | None = None) -> None: + self.send_response(code) + self.send_header("Content-Length", str(len(body))) + self.send_header("Content-Type", (headers or {}).pop("Content-Type", "application/json")) + for k, v in (headers or {}).items(): + self.send_header(k, v) + self.end_headers() + self.wfile.write(body) + + def _send_json(self, code: int, obj) -> None: + self._send(code, json.dumps(obj, default=str).encode() + b"\n", + {"Content-Type": "application/json"}) + + def do_GET(self) -> None: + if self.path == "/health": + self._send_json(200, {"ok": True, "system": SYSTEM_NAME, + "provisioned": PROVISION_DONE.exists()}) + return + if self.path == "/stats": + self._send_json(200, _stats_snapshot()) + return + if self.path == "/provision-log": + data = PROVISION_LOG.read_bytes() if PROVISION_LOG.exists() else b"" + self._send(200, data, {"Content-Type": "text/plain; charset=utf-8"}) + return + self._send_json(404, {"error": "not found", "path": self.path}) + + def do_POST(self) -> None: + if self.path == "/provision": + rc, log = _provision() + self._send(200 if rc == 0 else 500, log[-OUTPUT_LIMIT:], + {"Content-Type": "text/plain; charset=utf-8", + "X-Provision-Status": "ok" if rc == 0 else f"err-{rc}"}) + return + if self.path == "/query": + if not PROVISION_DONE.exists(): + self._send_json(409, {"error": "not provisioned"}) + return + sql = _read_body(self) + if not sql.strip(): + self._send_json(400, {"error": "empty query"}) + return + with _query_lock: + rc, out, err, wall = _run_query(sql) + script_t = _extract_script_timing(err) + body, truncated = _cap(out) + headers = { + "Content-Type": "application/octet-stream", + "X-Query-Wall-Time": f"{wall:.6f}", + "X-Output-Bytes": str(len(out)), + "X-Output-Truncated": "1" if truncated else "0", + "X-Exit-Code": str(rc), + "X-System": SYSTEM_NAME, + } + if script_t is not None: + headers["X-Query-Time"] = f"{script_t:.6f}" + if rc != 0: + # Surface a snippet of stderr so the client sees *something*. + err_snip = err[-1024:].decode("utf-8", errors="replace") + headers["X-Error"] = err_snip.replace("\n", " | ")[:512] + self._send(200 if rc == 0 else 502, body, headers) + return + self._send_json(404, {"error": "not found", "path": self.path}) + + +class ReusableServer(socketserver.ThreadingTCPServer): + allow_reuse_address = True + daemon_threads = True + + +def main() -> None: + addr = ("0.0.0.0", LISTEN_PORT) + print(f"agent: system={SYSTEM_NAME} listen={addr[0]}:{addr[1]} " + f"dir={SYSTEM_DIR} data={DATASETS_DIR}", flush=True) + with ReusableServer(addr, Handler) as srv: + srv.serve_forever() + + +if __name__ == "__main__": + main() diff --git a/playground/agent/clickbench-agent.service b/playground/agent/clickbench-agent.service new file mode 100644 index 0000000000..c02fe20cbb --- /dev/null +++ b/playground/agent/clickbench-agent.service @@ -0,0 +1,23 @@ +[Unit] +Description=ClickBench in-VM playground agent +# The kernel's `ip=` cmdline sets the static IP before init, so network is +# already up when we start. We deliberately don't depend on network-online. +# target — that gate is fed by systemd-networkd-wait-online, which is +# disabled. The system disk mount is similarly best-effort: the agent's +# /provision and /query paths report 404/409 if /opt/clickbench/system isn't +# populated, which is the correct behaviour and lets /health stay up so the +# host can still talk to it during provisioning. +After=local-fs.target + +[Service] +Type=simple +Environment=PYTHONUNBUFFERED=1 +Environment=HOME=/root +ExecStart=/usr/bin/python3 /opt/clickbench-agent/agent.py +Restart=on-failure +RestartSec=2 +KillMode=mixed +TimeoutStopSec=10 + +[Install] +WantedBy=multi-user.target diff --git a/playground/clickbench-playground.service b/playground/clickbench-playground.service new file mode 100644 index 0000000000..979d31867b --- /dev/null +++ b/playground/clickbench-playground.service @@ -0,0 +1,16 @@ +[Unit] +Description=ClickBench Playground API +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=ubuntu +WorkingDirectory=/home/ubuntu/ClickBench +EnvironmentFile=-/home/ubuntu/ClickBench/playground/.env +ExecStart=/usr/bin/python3 -m playground.server.main +Restart=on-failure +RestartSec=3 + +[Install] +WantedBy=multi-user.target diff --git a/playground/docs/architecture.md b/playground/docs/architecture.md new file mode 100644 index 0000000000..0507740c41 --- /dev/null +++ b/playground/docs/architecture.md @@ -0,0 +1,115 @@ +# ClickBench Playground architecture + +## Components + +``` +┌──────────────────────────────────────────────────────────────────────────┐ +│ Browser (vanilla JS) │ +│ picks a system, types SQL, POST /api/query │ +└────────────────────────────┬─────────────────────────────────────────────┘ + │ HTTP/1.1 +┌────────────────────────────▼─────────────────────────────────────────────┐ +│ Host API server (aiohttp) │ +│ ┌─────────────────┐ ┌──────────────┐ ┌────────────────┐ │ +│ │ HTTP routes │ │ VMManager │ │ Monitor │ │ +│ │ /api/systems │ │ per-VM │ │ 1Hz polling │ │ +│ │ /api/query │──▶│ lifecycle │◀──│ CPU/mem/disk │ │ +│ │ /api/state │ │ snapshots │ │ watchdog │ │ +│ └─────────────────┘ └──────┬───────┘ └────────────────┘ │ +│ ┌─────────────────────────────▼─────────────────────────────────────┐ │ +│ │ LoggingSink: batched INSERT into ClickHouse Cloud + local JSONL │ │ +│ └───────────────────────────────────────────────────────────────────┘ │ +└────────────────────────────┬─────────────────────────────────────────────┘ + │ HTTP over per-VM TAP /24 +┌────────────────────────────▼─────────────────────────────────────────────┐ +│ Firecracker microVM (Ubuntu 22.04, 4 vCPU / 16 GB / 200 GB sparse) │ +│ ┌──────────────────────────────────────────────────────────────────┐ │ +│ │ in-VM agent (stdlib python HTTP server) │ │ +│ │ /health, /stats, /provision, /query │ │ +│ └────────────────────────┬─────────────────────────────────────────┘ │ +│ ▼ runs │ +│ ┌──────────────────────────────────────────────────────────────────┐ │ +│ │ /opt/clickbench/system/ — system's ClickBench scripts (RW) │ │ +│ │ /opt/clickbench/datasets/ — shared dataset image (RO) │ │ +│ └──────────────────────────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────────────────────┘ +``` + +## State machine (per system) + +``` + ┌───────┐ no snapshot ┌───────────────┐ + │ down │─────────────▶│ provisioning │ + └───┬───┘ └───────┬───────┘ + │ │ provision OK + │ snapshot ok ▼ + │ ┌──────────────┐ + ▼ │ snapshotted │ + ┌───────────┐ restore └──────┬───────┘ + │ ready │◀──────────────── │ + └─────┬─────┘ │ + │ watchdog / failed query │ + └─────────────────────────┘ +``` + +`ready` is the only state that accepts /query. Any restart (watchdog or +explicit kick) returns to `snapshotted`; the next /query restores from the +on-disk snapshot. + +## Snapshots + +Created the first time a system is requested. Two artifacts: + +- `/systems//snapshot.state` — Firecracker VM state metadata +- `/systems//snapshot.bin` — guest memory dump (16 GB in + size as configured, but sparse) + +The `rootfs.ext4` and `system.ext4` files persist across snapshots and are +re-attached at restore time. Drive paths in the snapshot are remapped to +their current host locations on restore so we don't have to re-snapshot if +the playground gets moved or rebooted. + +## Networking + +A `/24` per VM, with the host owning `.1` and the guest owning `.2`. Each +TAP is `fc-tap-`, where `` is the deterministic per-system index +assigned in `VMManager.__init__`. + +``` +host 10.200..1/24 ◀── TAP ─▶ 10.200..2/24 guest +``` + +During the provision phase only, iptables NAT/FORWARD rules are added so +the guest can `apt-get` / `curl`. After the snapshot, those rules are +deleted — outbound traffic is dropped, the host↔guest link remains. + +## Output truncation + +Truncation is applied **inside the agent**, before bytes leave the VM: + +- Stdout from the system's `./query` script is capped at + `CLICKBENCH_OUTPUT_LIMIT` bytes (default 10 KB). +- The agent's response sets `X-Output-Truncated: 1` and + `X-Output-Bytes: ` so the client can show "this is a + partial result of N bytes." +- The host API server passes the headers through unchanged. + +## Watchdog rules + +The `Monitor` thread samples every running Firecracker process once per +second: + +- **CPU**: if per-VM CPU usage (utime+stime / wallclock / vcpus) stays + ≥ `VM_CPU_BUSY_THRESHOLD` (default 97%) for `VM_CPU_BUSY_WINDOW_SEC` + contiguous seconds (default 120), the VM is killed. +- **Disk**: if the sparse `rootfs.ext4` has allocated more than + `VM_DISK_FULL_PCT` (default 97%) of `VM_ROOTFS_SIZE_GB`, the VM is + killed. +- **Host RAM**: if `MemAvailable` drops below `HOST_MIN_FREE_RAM_GB` + (default 32 GiB), the watchdog kills the VM with the largest RSS. +- **Host disk**: if free space on `PLAYGROUND_STATE_DIR` drops below + `HOST_MIN_FREE_DISK_GB` (default 500 GiB), the watchdog kills the VM + with the largest allocated rootfs. + +A "kill" leaves the snapshot intact. The next user query restores from +snapshot, paying ~1 s of memory restore cost. diff --git a/playground/docs/build-progress.md b/playground/docs/build-progress.md new file mode 100644 index 0000000000..a710246518 --- /dev/null +++ b/playground/docs/build-progress.md @@ -0,0 +1,110 @@ +# Playground build progress — checkpoint 2026-05-12 ~19:58 UTC + +## What is built and committed + +- `playground/` directory scaffolded with subdirs `server/`, `agent/`, + `images/`, `web/`, `scripts/`, `docs/`. +- Architecture notes in `playground/README.md` and + `playground/docs/architecture.md`. +- Host-side API server (`playground/server/*.py`): + - `config.py` — env-driven config with sensible defaults + - `systems.py` — discovers 97 playground-eligible ClickBench systems + - `firecracker.py` — async unix-socket client for Firecracker API + - `net.py` — per-VM TAP + /24 + NAT toggle + - `vm_manager.py` — VM lifecycle (boot, provision, snapshot, restore) + - `monitor.py` — CPU/disk/host-memory watchdog (1 Hz) + - `logging_sink.py` — batched async logger → ClickHouse Cloud + JSONL fallback + - `main.py` — aiohttp routes + static SPA serving +- In-VM agent (`playground/agent/agent.py`, stdlib-only) with endpoints + `/health`, `/stats`, `/provision`, `/query`, `/provision-log`. +- systemd unit `playground/agent/clickbench-agent.service` installed in the + rootfs and enabled. +- Vanilla JS SPA (`playground/web/`): system picker, query box, timing display, + truncation indicator. Talks to `/api/systems`, `/api/system/`, + `/api/query?system=...`. +- Build scripts: + - `images/build-base-rootfs.sh` — Ubuntu 22.04 cloud image → flat 8 GB + ext4 with agent + systemd unit pre-installed. + - `images/build-system-rootfs.sh` — per-system 200 GB sparse rootfs + + sized system disk (16/88 GB depending on data format) containing the + ClickBench scripts + the dataset files this system needs (no symlinks + into a RO mount, because many systems' load scripts `chown`). + - `scripts/install-firecracker.sh` — idempotent host setup. + - `scripts/download-datasets.sh` — eager dataset download into + `/opt/clickbench-playground/datasets/`. + - `scripts/smoke-boot.sh` — boots the base rootfs alone in a VM; confirms + kernel + rootfs + agent path before per-system testing. + - `scripts/agent-selftest.sh` — runs the agent on the host (no VM) and + exercises all endpoints with a fake "system" dir. PASSES. + +## What is provisioned on disk (host) + +``` +/opt/clickbench-playground/ +├── bin/firecracker, bin/jailer (firecracker v1.13.1) +├── kernel/vmlinux (Linux 6.1.141, IP_PNP + virtio enabled) +├── base-rootfs.ext4 2.6 GB physical / 8 GB apparent +├── datasets/ +│ ├── hits.parquet 14.7 GB (single) +│ ├── hits_partitioned/ 14 GB (100 partitioned files) +│ ├── hits.tsv 74 GB (decompressed) +│ ├── hits.csv ~14 GB partial (kill-stopped) +│ └── hits.csv.gz 16 GB +└── systems/clickhouse/ + ├── rootfs.ext4 8.2 MB physical / 200 GB sparse + └── system.ext4 16 GB (parquet files staged) +``` + +## What works + +- Python module imports clean (`python3 -m playground.server.main`). +- API server serves 97 systems via `/api/systems`. +- UI loads at `/ui/`. +- Firecracker smoke-boot (base rootfs only): agent comes up in 2 s, + `/health` and `/stats` respond OK. +- Agent self-test (no VM): all 4 endpoints behave correctly, output + truncation works (2 KB → 64 B with `X-Output-Truncated: 1`). +- Provision started on ClickHouse VM at 19:51:59 UTC: + - VM booted, agent up, internet enabled via MASQUERADE on `ens33` + - Install ran (ClickHouse binary downloaded + apt deps) + - Load is in progress — `cpu_busy=0.8-1.0` sustained, `disk_used` + grew from 17 GB → 30 GB, indicating MergeTree INSERT. + - At 19:57:33 the agent stopped responding to /health (timeout). + Firecracker process is still running (PID 19230, 16 min of CPU). + Likely cause: agent's HTTP server starved by the load process, + or a fork race in stdlib `socketserver`. Needs investigation. + +## What's left + +- Decide whether to add eager liveness pings or move agent to aiohttp + to avoid the stdlib threading server's quirks under heavy load. +- Once provision completes: snapshot → restore → /query test path. +- Build system disks for the other 96 systems (template is ready). +- Wire up ClickHouse Cloud credentials for the logging sink (currently + falling back to JSONL under `/opt/clickbench-playground/logs/`). + +## Known issues / things to revisit + +- TSV/CSV decompression contends with rootfs build for nvme writeback. + Workaround: pre-build the base rootfs before kicking off the heavy + decompressions, or rate-limit pigz. +- The "External" exclusion list in `systems.py` is conservative; some + entries (umbra, hyper, cedardb) actually run locally and should be + added back when verified. +- /etc/resolv.conf in the base rootfs is a static fallback (1.1.1.1 + + 8.8.8.8). Once we cut internet post-snapshot, DNS doesn't matter, but + during provision it does — sanity check that NAT + resolv.conf actually + let `apt-get update` work. +- KVM permissions were opened to mode 666 via a udev rule. Tighten to + the `kvm` group when the playground user is properly added. + +## Operator notes + +- The base rootfs ships with serial autologin as root on ttyS0 — good for + attaching the Firecracker console for debugging. +- Firecracker logs land in `/opt/clickbench-playground/logs/firecracker-.log`. +- The host's `/dev/kvm` group/mode was changed: `chown root:kvm`, `chmod 666`, + with a persistent udev rule at `/etc/udev/rules.d/65-kvm.rules`. +- `vm.dirty_writeback_centisecs` is set to 10 on the host (down from 500) + to reduce sfdisk hang during heavy concurrent writeback. Revert if it + causes other problems. diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh new file mode 100755 index 0000000000..2cffbaf52f --- /dev/null +++ b/playground/images/build-base-rootfs.sh @@ -0,0 +1,199 @@ +#!/bin/bash +# Build a base Ubuntu 22.04 rootfs for the Firecracker microVMs. +# +# Strategy: start from the official Ubuntu 22.04 cloud image (qcow2), convert +# to raw, mount it, install python3 + sudo + curl + iproute2, drop the agent in +# place, install a systemd unit that runs the agent on boot, and add a +# /etc/fstab line that mounts the dataset disk read-only. +# +# The resulting image is /opt/clickbench-playground/base-rootfs.ext4. Per-system +# images are produced by overlaying the system's ClickBench scripts onto a copy +# of this base. +# +# Idempotent: re-running just re-builds the file from scratch. + +set -euo pipefail + +STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" +TMP="${STATE_DIR}/tmp/base-build" +OUT="${STATE_DIR}/base-rootfs.ext4" +SIZE_GB="${BASE_ROOTFS_SIZE_GB:-8}" +CLOUDIMG_URL="${UBUNTU_CLOUDIMG_URL:-https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-amd64.img}" +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +AGENT_DIR="${REPO_DIR}/playground/agent" + +echo "[base] state=$STATE_DIR out=$OUT size=${SIZE_GB}G" + +mkdir -p "$TMP" +mkdir -p "$STATE_DIR/cache" + +CLOUDIMG="$STATE_DIR/cache/jammy-cloudimg.img" +if [ ! -f "$CLOUDIMG" ]; then + echo "[base] downloading cloud image" + curl -fsSL "$CLOUDIMG_URL" -o "${CLOUDIMG}.part" + mv "${CLOUDIMG}.part" "$CLOUDIMG" +fi + +# Plan: rather than grow the cloud image's partition (which involves +# sfdisk/growpart/resize2fs — all of which call `sync` and therefore stall +# whenever the host is under unrelated writeback pressure), we work in two +# fixed-size hops: +# +# 1. Loop-mount the cloud image's existing partition (2 GB) and use that +# as a read-only source. +# 2. Create a fresh, no-partition-table ext4 image of SIZE_GB and mount it +# as the build root. Copy the cloud image's content into it. The new +# image is what Firecracker boots from (it expects a flat ext4, no +# partition table). +# +# No growpart, no resize2fs, no waiting on the kernel to flush GBs of +# unrelated dirty pages just to update a partition table. + +RAW="$TMP/base.raw" +echo "[base] converting cloud image to raw" +qemu-img convert -O raw "$CLOUDIMG" "$RAW" + +SRC_LOOP="$(sudo losetup --find --show --partscan "$RAW")" +trap 'sudo losetup -d "$SRC_LOOP" 2>/dev/null || true' EXIT +for i in $(seq 1 20); do + if [ -b "${SRC_LOOP}p1" ]; then break; fi + sleep 0.5 +done + +SRC_MNT="$TMP/src" +mkdir -p "$SRC_MNT" +sudo mount -o ro "${SRC_LOOP}p1" "$SRC_MNT" + +# Now build the *target* image: a plain ext4 file of SIZE_GB with no partition +# table. Firecracker boots root=/dev/vda directly off this. +echo "[base] mkfs.ext4 -> ${SIZE_GB}G no-partition flat image" +FLAT="$TMP/base.flat.ext4" +fallocate -l "${SIZE_GB}G" "$FLAT" +mkfs.ext4 -F -L cbroot -E lazy_itable_init=1,lazy_journal_init=1 "$FLAT" >/dev/null + +DST_LOOP="$(sudo losetup --find --show "$FLAT")" +MNT="$TMP/mnt" +mkdir -p "$MNT" +sudo mount "$DST_LOOP" "$MNT" +trap ' + sudo umount "'"$SRC_MNT"'" 2>/dev/null || true + sudo umount "'"$MNT"'" 2>/dev/null || true + sudo losetup -d "'"$SRC_LOOP"'" 2>/dev/null || true + sudo losetup -d "'"$DST_LOOP"'" 2>/dev/null || true +' EXIT + +# Stage the cloud image contents into the new rootfs. +echo "[base] copying cloud image content into flat rootfs" +sudo cp -a "$SRC_MNT"/. "$MNT"/ +sudo umount "$SRC_MNT" +sudo losetup -d "$SRC_LOOP" +trap ' + sudo umount "'"$MNT"'" 2>/dev/null || true + sudo losetup -d "'"$DST_LOOP"'" 2>/dev/null || true +' EXIT + +# Bind /dev /proc /sys for the chroot. +for d in dev proc sys; do + sudo mkdir -p "$MNT/$d" + sudo mount --rbind "/$d" "$MNT/$d" +done +trap ' + for d in dev proc sys; do sudo umount -lR "'"$MNT"'/$d" 2>/dev/null || true; done + sudo umount "'"$MNT"'" 2>/dev/null || true + sudo losetup -d "'"$DST_LOOP"'" 2>/dev/null || true +' EXIT + +# Resolve DNS from host inside the chroot. The cloud image ships +# /etc/resolv.conf as a symlink into /run/systemd/resolve/ which is empty +# until systemd-resolved starts; we need a real file for the chroot's apt +# to work. +sudo rm -f "$MNT/etc/resolv.conf" +sudo install -m 0644 /etc/resolv.conf "$MNT/etc/resolv.conf" + +# Run system customization inside the chroot. +sudo tee "$MNT/tmp/customize.sh" >/dev/null <<'CUSTOMIZE' +#!/bin/bash +set -euxo pipefail +export DEBIAN_FRONTEND=noninteractive + +# Disable cloud-init's network configuration so eth0 just comes up via +# /etc/network/interfaces-style config we install below. +echo 'network: {config: disabled}' > /etc/cloud/cloud.cfg.d/99-disable-network-config.cfg + +# Keep the image small: turn off heavy services that we don't need on a +# query-serving microVM. +systemctl disable snapd.service snapd.socket snapd.seeded.service 2>/dev/null || true +systemctl mask snapd.service snapd.socket snapd.seeded.service 2>/dev/null || true +systemctl disable unattended-upgrades.service apt-daily.timer apt-daily-upgrade.timer 2>/dev/null || true +systemctl mask unattended-upgrades.service apt-daily.timer apt-daily-upgrade.timer 2>/dev/null || true + +apt-get update -qq +apt-get install -y --no-install-recommends \ + python3 python3-yaml ca-certificates curl wget gnupg sudo less vim-tiny \ + iproute2 iputils-ping net-tools openssh-server lsb-release \ + htop sysstat strace ncdu pigz unzip xz-utils zstd \ + build-essential netbase +apt-get clean +rm -rf /var/lib/apt/lists/* + +# Network: the host sets up the VM's IP via the kernel `ip=` cmdline so the +# guest comes up with the right /24 for its slot. systemd-networkd in the +# guest must NOT fight the kernel's static config — disable it and rely on +# the kernel-supplied address. /etc/resolv.conf gets a static fallback so DNS +# works in case any post-snapshot tooling still wants it (it shouldn't — +# internet is dropped after the snapshot). +systemctl disable systemd-networkd 2>/dev/null || true +systemctl disable systemd-resolved 2>/dev/null || true +rm -f /etc/resolv.conf +cat > /etc/resolv.conf < /etc/systemd/system/serial-getty@ttyS0.service.d/override.conf < /etc/fstab < flat re-copy step. +sudo umount -lR "$MNT/dev" "$MNT/proc" "$MNT/sys" +sudo umount "$MNT" +sudo losetup -d "$DST_LOOP" +trap - EXIT + +mv "$FLAT" "$OUT" +rm -rf "$TMP" +echo "[base] done: $OUT ($(du -h "$OUT" | cut -f1) physical, $(du -h --apparent-size "$OUT" | cut -f1) apparent)" diff --git a/playground/images/build-datasets-image.sh b/playground/images/build-datasets-image.sh new file mode 100755 index 0000000000..492857c9bf --- /dev/null +++ b/playground/images/build-datasets-image.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Bundle the downloaded datasets directory into a single read-only ext4 image +# (datasets.ext4) that gets attached to every Firecracker VM as a virtio-blk +# device. Mounted at /opt/clickbench/datasets in the guest. +# +# A single shared read-only image is much more efficient than virtio-fs (which +# Firecracker doesn't ship) or per-VM copies of a ~250 GB dataset. + +set -euo pipefail + +STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" +SRC="$STATE_DIR/datasets" +OUT="$STATE_DIR/datasets.ext4" + +if [ ! -d "$SRC" ]; then + echo "no datasets dir: $SRC" >&2 + exit 1 +fi + +bytes=$(du -sb "$SRC" | awk '{print $1}') +overhead=$(( 4 * 1024 * 1024 * 1024 )) # 4 GiB headroom for ext4 metadata +size=$(( bytes + overhead )) +# Round up to MiB +size_mib=$(( (size + 1024*1024 - 1) / (1024*1024) )) + +echo "[datasets] payload=$bytes B image=$size_mib MiB out=$OUT" + +rm -f "$OUT" +truncate -s "${size_mib}M" "$OUT" +mkfs.ext4 -F -L cbdata -m 0 -E lazy_itable_init=1,lazy_journal_init=1 -O ^has_journal "$OUT" >/dev/null + +MNT="$(mktemp -d)" +trap 'sudo umount "'"$MNT"'" 2>/dev/null || true; rmdir "'"$MNT"'" 2>/dev/null || true' EXIT +sudo mount -o loop "$OUT" "$MNT" +sudo rsync -a --info=progress2 "$SRC"/. "$MNT"/ +sudo sync +sudo umount "$MNT" +trap - EXIT + +echo "[datasets] done" +ls -lh "$OUT" diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh new file mode 100755 index 0000000000..90efe01c99 --- /dev/null +++ b/playground/images/build-system-rootfs.sh @@ -0,0 +1,170 @@ +#!/bin/bash +# Build a per-system rootfs and "system disk" image for Firecracker. +# +# Outputs (under /opt/clickbench-playground/systems//): +# rootfs.ext4 CoW-ish copy of base-rootfs.ext4 (sparse 200 GB) +# system.ext4 ext4 holding ClickBench scripts + the dataset files +# this system needs. Mounted RW at /opt/clickbench/system +# in the VM. We include the data here (not a separate +# read-only datasets disk) because many load scripts do +# `sudo chown` on the source files, and chown follows +# symlinks — i.e. it tries to mutate the RO-mounted +# dataset and fails. Putting the data on the RW system +# disk sidesteps the problem entirely. +# +# The disk is sized based on the system's data format: +# parquet, parquet-partitioned 16 GB +# tsv, csv 88 GB +# none/unknown 2 GB +# +# Usage: build-system-rootfs.sh + +set -euo pipefail + +if [ $# -lt 1 ]; then + echo "usage: $0 " >&2 + exit 2 +fi +SYSTEM="$1" + +STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +BASE="$STATE_DIR/base-rootfs.ext4" +DATASETS="$STATE_DIR/datasets" +SRC="$REPO_DIR/$SYSTEM" +OUT_DIR="$STATE_DIR/systems/$SYSTEM" +ROOTFS="$OUT_DIR/rootfs.ext4" +SYSDISK="$OUT_DIR/system.ext4" + +ROOTFS_SIZE_GB="${VM_ROOTFS_SIZE_GB:-200}" + +if [ ! -f "$BASE" ]; then + echo "base rootfs not found: $BASE — run build-base-rootfs.sh first" >&2 + exit 1 +fi +if [ ! -d "$SRC" ]; then + echo "no such system directory: $SRC" >&2 + exit 1 +fi +for f in install start load query check stop; do + if [ ! -x "$SRC/$f" ]; then + echo "system '$SYSTEM' missing executable $f — not playground-ready" >&2 + exit 1 + fi +done + +# Discover the data format from the system's benchmark.sh. Source the file in +# a noop-shell so any of `export BENCH_DOWNLOAD_SCRIPT="..."` / +# `BENCH_DOWNLOAD_SCRIPT=...` etc. just becomes a variable. Drop everything +# else by running in a subshell. +download_script="$(set +e; unset BENCH_DOWNLOAD_SCRIPT; \ + eval "$(grep -E '^[[:space:]]*(export[[:space:]]+)?BENCH_DOWNLOAD_SCRIPT=' "$SRC/benchmark.sh" | head -1)"; \ + printf '%s' "${BENCH_DOWNLOAD_SCRIPT:-}")" +case "$download_script" in + *parquet-partitioned*) format=parquet-partitioned; sysdisk_size_gb=16 ;; + *parquet-single*) format=parquet; sysdisk_size_gb=16 ;; + *tsv*) format=tsv; sysdisk_size_gb=88 ;; + *csv*) format=csv; sysdisk_size_gb=88 ;; + "") format=none; sysdisk_size_gb=2 ;; + *) format=unknown; sysdisk_size_gb=4 ;; +esac +echo "[sys:$SYSTEM] format=$format sysdisk_size=${sysdisk_size_gb}G" + +mkdir -p "$OUT_DIR" + +# 1. Rootfs as a sparse file. Allocate 200 GB but only write blocks when +# something inside the VM dirties them. +echo "[sys:$SYSTEM] rootfs.ext4 ${ROOTFS_SIZE_GB}G (sparse)" +rm -f "$ROOTFS" +truncate -s "${ROOTFS_SIZE_GB}G" "$ROOTFS" +mkfs.ext4 -F -L cbroot -E lazy_itable_init=1,lazy_journal_init=1 "$ROOTFS" >/dev/null + +BASE_MNT="$(mktemp -d)" +DST_MNT="$(mktemp -d)" +trap ' + sudo umount "'"$BASE_MNT"'" 2>/dev/null || true + sudo umount "'"$DST_MNT"'" 2>/dev/null || true + rmdir "'"$BASE_MNT"'" "'"$DST_MNT"'" 2>/dev/null || true +' EXIT +# A prior smoke-boot likely left the base rootfs's journal dirty. Replay it +# (fsck -fy is idempotent) before opening read-only — otherwise the loop +# mount refuses with "cannot mount read-only" and the script blows up +# silently. +sudo e2fsck -fy "$BASE" >/dev/null 2>&1 || true +sudo mount -o loop,ro "$BASE" "$BASE_MNT" +sudo mount -o loop "$ROOTFS" "$DST_MNT" +sudo cp -a --reflink=auto "$BASE_MNT"/. "$DST_MNT"/ +echo "$SYSTEM" | sudo tee "$DST_MNT/etc/clickbench-system" >/dev/null +sudo sync +sudo umount "$DST_MNT" +sudo umount "$BASE_MNT" +trap - EXIT + +# 2. System disk: ClickBench scripts + the data files this system needs. +# Sized per-format. The agent runs ./install/./start/./load with cwd here, so +# the load script's relative references to hits.parquet / hits.tsv / etc. all +# resolve to local files it owns. +echo "[sys:$SYSTEM] system.ext4 ${sysdisk_size_gb}G" +rm -f "$SYSDISK" +truncate -s "${sysdisk_size_gb}G" "$SYSDISK" +mkfs.ext4 -F -L cbsystem -E lazy_itable_init=1,lazy_journal_init=1 "$SYSDISK" >/dev/null + +SYS_MNT="$(mktemp -d)" +trap 'sudo umount "'"$SYS_MNT"'" 2>/dev/null || true; rmdir "'"$SYS_MNT"'" 2>/dev/null || true' EXIT +sudo mount -o loop "$SYSDISK" "$SYS_MNT" + +# Scripts. +sudo rsync -a --exclude 'results/' --exclude '*.json' --exclude 'README*' \ + "$SRC"/ "$SYS_MNT"/ + +# Some systems' scripts use ../lib/... — provide it. +sudo mkdir -p "$SYS_MNT/_lib" +sudo cp -a "$REPO_DIR/lib"/. "$SYS_MNT/_lib"/ + +# Data files. +case "$format" in + parquet) + if [ -f "$DATASETS/hits.parquet" ]; then + echo "[sys:$SYSTEM] copying hits.parquet" + sudo cp --reflink=auto "$DATASETS/hits.parquet" "$SYS_MNT/hits.parquet" + else + echo "[sys:$SYSTEM] WARN hits.parquet not present in datasets dir" + fi + ;; + parquet-partitioned) + if [ -d "$DATASETS/hits_partitioned" ]; then + echo "[sys:$SYSTEM] copying 100 partitioned parquet files" + sudo cp --reflink=auto "$DATASETS/hits_partitioned"/hits_*.parquet "$SYS_MNT/" + else + echo "[sys:$SYSTEM] WARN hits_partitioned/ not present" + fi + ;; + tsv) + if [ -f "$DATASETS/hits.tsv" ]; then + echo "[sys:$SYSTEM] copying hits.tsv (large)" + sudo cp --reflink=auto "$DATASETS/hits.tsv" "$SYS_MNT/hits.tsv" + else + echo "[sys:$SYSTEM] WARN hits.tsv not present" + fi + ;; + csv) + if [ -f "$DATASETS/hits.csv" ]; then + echo "[sys:$SYSTEM] copying hits.csv (large)" + sudo cp --reflink=auto "$DATASETS/hits.csv" "$SYS_MNT/hits.csv" + else + echo "[sys:$SYSTEM] WARN hits.csv not present" + fi + ;; + none|unknown) + echo "[sys:$SYSTEM] no data staging for format=$format" + ;; +esac + +sudo chown -R 0:0 "$SYS_MNT" +sudo chmod -R u+rwX,go+rX "$SYS_MNT" +sudo sync +sudo umount "$SYS_MNT" +trap - EXIT + +echo "[sys:$SYSTEM] done" +ls -lh "$OUT_DIR" diff --git a/playground/scripts/agent-selftest.sh b/playground/scripts/agent-selftest.sh new file mode 100755 index 0000000000..afad1ca506 --- /dev/null +++ b/playground/scripts/agent-selftest.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# Spin up the agent in a local sandbox and hit its HTTP endpoints. Useful for +# iterating on agent.py without rebuilding a Firecracker image. +# +# The sandbox is just two temp directories that mimic the in-VM mounts: +# /tmp/clickbench-selftest/system — copy of the duckdb system dir +# /tmp/clickbench-selftest/datasets — empty +# +# We exercise: +# GET /health expects 200 with provisioned=false +# GET /stats expects 200 with cpu/mem/disk +# POST /provision expects 200 (will fail unless duckdb is locally installed) +# POST /query expects 200 with timing headers, output bytes capped +# +# Cleanup: kills the agent on exit. + +set -euo pipefail + +SANDBOX="${SANDBOX:-/tmp/clickbench-selftest}" +SYS="${SANDBOX}/system" +DATA="${SANDBOX}/datasets" +PORT="${PORT:-18080}" +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" + +rm -rf "$SANDBOX" +mkdir -p "$SYS" "$DATA" +cp -a "$REPO_DIR/duckdb"/. "$SYS"/ + +# A trivial "system" that doesn't need provisioning: replace install/start/load +# with no-ops so the smoke test focuses on the agent's HTTP path. +cat > "$SYS/install" <<'EOF' +#!/bin/bash +echo "fake install" +EOF +cat > "$SYS/start" <<'EOF' +#!/bin/bash +exit 0 +EOF +cat > "$SYS/check" <<'EOF' +#!/bin/bash +exit 0 +EOF +cat > "$SYS/load" <<'EOF' +#!/bin/bash +echo "fake load" +EOF +# A query script that echoes the request and reports 0.123s. +cat > "$SYS/query" <<'EOF' +#!/bin/bash +cat +echo "0.123" >&2 +EOF +chmod +x "$SYS"/{install,start,check,load,query} + +echo "selftest: starting agent on :$PORT" +CLICKBENCH_SYSTEM_DIR="$SYS" \ +CLICKBENCH_DATASETS_DIR="$DATA" \ +CLICKBENCH_AGENT_STATE="$SANDBOX/state" \ +CLICKBENCH_SYSTEM_NAME=selftest \ +CLICKBENCH_AGENT_PORT="$PORT" \ +CLICKBENCH_OUTPUT_LIMIT=64 \ +python3 "$REPO_DIR/playground/agent/agent.py" & +AGENT_PID=$! +trap 'kill $AGENT_PID 2>/dev/null || true' EXIT + +# wait for listen +for i in {1..30}; do + if curl -fsS "http://127.0.0.1:$PORT/health" >/dev/null 2>&1; then + break + fi + sleep 0.2 +done + +echo "--- /health ---" +curl -fsS "http://127.0.0.1:$PORT/health" +echo "--- /stats ---" +curl -fsS "http://127.0.0.1:$PORT/stats" +echo "--- POST /provision ---" +curl -fsS -X POST "http://127.0.0.1:$PORT/provision" | head -c 500; echo + +echo "--- POST /query (capped output) ---" +LONG_BODY="$(printf 'X%.0s' {1..2048})" # 2 KB of X +curl -sS -X POST --data-binary "$LONG_BODY" "http://127.0.0.1:$PORT/query" -D - -o /tmp/clickbench-selftest.out +echo +echo "Output size: $(wc -c < /tmp/clickbench-selftest.out) bytes (cap was 64)" +echo "First chars: $(head -c 32 /tmp/clickbench-selftest.out)" + +echo "--- POST /query (without provisioning state) ---" +rm -rf "$SANDBOX/state" +mkdir -p "$SANDBOX/state" +curl -sS -X POST --data-binary "SELECT 1" "http://127.0.0.1:$PORT/query" -D - -o /dev/null | head -3 + +echo "OK" diff --git a/playground/scripts/download-datasets.sh b/playground/scripts/download-datasets.sh new file mode 100755 index 0000000000..b30fff4473 --- /dev/null +++ b/playground/scripts/download-datasets.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# Eagerly download every ClickBench dataset format into the playground +# datasets dir. Run idempotent: each download script is `wget --continue`-based +# so re-running picks up where the previous run left off. +# +# Output: +# /opt/clickbench-playground/datasets/ +# hits.parquet single-file Athena parquet +# hits_partitioned/hits_0..99.parquet partitioned parquet +# hits.tsv decompressed TSV (~75 GB) +# hits.csv decompressed CSV (~75 GB) +# +# These files are read-only-mounted into every Firecracker VM via a virtio-blk +# device built by `build-datasets-image.sh`. + +set -e + +STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" +DATASETS="${STATE_DIR}/datasets" +LIB="$(cd "$(dirname "${BASH_SOURCE[0]}")"/../.. && pwd)/lib" + +mkdir -p "$DATASETS" +mkdir -p "$DATASETS/hits_partitioned" + +step() { echo "[$(date -u +%FT%TZ)] $*"; } + +step "parquet (single)" +if [ ! -f "$DATASETS/hits.parquet" ] || [ "$(stat -c%s "$DATASETS/hits.parquet" 2>/dev/null || echo 0)" -lt 14000000000 ]; then + "$LIB/download-hits-parquet-single" "$DATASETS" +else + step " cached" +fi + +step "parquet (partitioned)" +need=0 +for i in $(seq 0 99); do + f="$DATASETS/hits_partitioned/hits_${i}.parquet" + if [ ! -f "$f" ] || [ "$(stat -c%s "$f" 2>/dev/null || echo 0)" -lt 100000000 ]; then + need=1 + break + fi +done +if [ "$need" = "1" ]; then + "$LIB/download-hits-parquet-partitioned" "$DATASETS/hits_partitioned" +else + step " cached" +fi + +step "tsv" +if [ ! -f "$DATASETS/hits.tsv" ] || [ "$(stat -c%s "$DATASETS/hits.tsv" 2>/dev/null || echo 0)" -lt 70000000000 ]; then + "$LIB/download-hits-tsv" "$DATASETS" +else + step " cached" +fi + +step "csv" +if [ ! -f "$DATASETS/hits.csv" ] || [ "$(stat -c%s "$DATASETS/hits.csv" 2>/dev/null || echo 0)" -lt 70000000000 ]; then + "$LIB/download-hits-csv" "$DATASETS" +else + step " cached" +fi + +step "done" +du -sh "$DATASETS"/* diff --git a/playground/scripts/install-firecracker.sh b/playground/scripts/install-firecracker.sh new file mode 100755 index 0000000000..f2dfe9cd84 --- /dev/null +++ b/playground/scripts/install-firecracker.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Idempotent: download firecracker + jailer if they're not in +# /opt/clickbench-playground/bin/, and fetch the guest kernel. + +set -euo pipefail + +STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" +FC_VERSION="${FIRECRACKER_VERSION:-v1.13.1}" +KERNEL_URL="${GUEST_KERNEL_URL:-https://s3.amazonaws.com/spec.ccfc.min/firecracker-ci/v1.13/x86_64/vmlinux-6.1.141}" + +sudo mkdir -p "$STATE_DIR"/{bin,kernel,datasets,systems,vms,logs,run,snapshots,tmp,cache} +sudo chown -R "$(id -u):$(id -g)" "$STATE_DIR" + +if [ ! -x "$STATE_DIR/bin/firecracker" ]; then + arch="$(uname -m)" + url="https://github.com/firecracker-microvm/firecracker/releases/download/${FC_VERSION}/firecracker-${FC_VERSION}-${arch}.tgz" + echo "[install] firecracker ${FC_VERSION}" + tmpdir="$(mktemp -d)" + curl -fsSL "$url" -o "$tmpdir/firecracker.tgz" + tar -C "$tmpdir" -xzf "$tmpdir/firecracker.tgz" --strip-components=1 + install -m 0755 "$tmpdir/firecracker-${FC_VERSION}-${arch}" "$STATE_DIR/bin/firecracker" + install -m 0755 "$tmpdir/jailer-${FC_VERSION}-${arch}" "$STATE_DIR/bin/jailer" + rm -rf "$tmpdir" +fi + +if [ ! -f "$STATE_DIR/kernel/vmlinux" ]; then + echo "[install] guest kernel" + curl -fsSL "$KERNEL_URL" -o "$STATE_DIR/kernel/vmlinux" +fi + +# IP forwarding for the per-VM TAPs. +sudo sysctl -w net.ipv4.ip_forward=1 >/dev/null +echo "net.ipv4.ip_forward=1" | sudo tee /etc/sysctl.d/99-clickbench-playground.conf >/dev/null + +echo "[install] done" +"$STATE_DIR/bin/firecracker" --version diff --git a/playground/scripts/run-server.sh b/playground/scripts/run-server.sh new file mode 100755 index 0000000000..b3bc56b959 --- /dev/null +++ b/playground/scripts/run-server.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Convenience wrapper to start the playground API server in the foreground. +# Looks for .env in the repo root for ClickHouse Cloud creds. + +set -euo pipefail + +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" + +if [ -f "$REPO_DIR/playground/.env" ]; then + # shellcheck disable=SC2046 + export $(grep -v '^#' "$REPO_DIR/playground/.env" | xargs) +fi + +cd "$REPO_DIR" +exec python3 -m playground.server.main diff --git a/playground/scripts/smoke-boot.sh b/playground/scripts/smoke-boot.sh new file mode 100755 index 0000000000..d79ecc8c87 --- /dev/null +++ b/playground/scripts/smoke-boot.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# Boot a single Firecracker VM with the playground's base rootfs, attaching +# only the rootfs (no system disk, no dataset disk). Confirms the kernel + +# rootfs + agent path works end-to-end before we start asking it to install +# a database. Tears down on exit. +# +# Usage: smoke-boot.sh [slot] +# Logs go to /opt/clickbench-playground/logs/smoke-boot.log + +set -euo pipefail + +STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" +SLOT="${1:-250}" # high slot to avoid clashing with the real registry +SOCK="$STATE_DIR/vms/smoke-boot.sock" +LOG="$STATE_DIR/logs/smoke-boot.log" +TAP="fc-tap-${SLOT}" +HOST_IP="10.200.${SLOT}.1" +GUEST_IP="10.200.${SLOT}.2" + +cleanup() { + echo "[smoke] cleanup" + pkill -f "firecracker.*${SOCK}" 2>/dev/null || true + sleep 0.3 + sudo ip link set "$TAP" down 2>/dev/null || true + sudo ip tuntap del dev "$TAP" mode tap 2>/dev/null || true + rm -f "$SOCK" +} +trap cleanup EXIT + +mkdir -p "$STATE_DIR/vms" "$STATE_DIR/logs" +rm -f "$SOCK" + +if ! ip link show "$TAP" >/dev/null 2>&1; then + sudo ip tuntap add dev "$TAP" mode tap +fi +sudo ip addr flush dev "$TAP" 2>/dev/null || true +sudo ip addr add "${HOST_IP}/24" dev "$TAP" +sudo ip link set "$TAP" up + +# Start Firecracker +"$STATE_DIR/bin/firecracker" --api-sock "$SOCK" --id smoke-boot >"$LOG" 2>&1 & +FC_PID=$! +echo "[smoke] firecracker pid=$FC_PID sock=$SOCK" + +# Wait for socket +for _ in $(seq 1 40); do + [ -S "$SOCK" ] && break + sleep 0.1 +done + +api() { + local m="$1" path="$2" body="${3:-}" + if [ -n "$body" ]; then + curl --unix-socket "$SOCK" -fsS -X "$m" "http://localhost$path" \ + -H 'Content-Type: application/json' --data "$body" + else + curl --unix-socket "$SOCK" -fsS -X "$m" "http://localhost$path" + fi +} + +api PUT /boot-source "$(cat </dev/null 2>&1; then + ok=1 + break + fi + sleep 1 +done + +if [ "$ok" = "1" ]; then + echo "[smoke] OK — agent responded after ${i}s" + curl -fsS "http://${GUEST_IP}:8080/health" | head -c 200; echo + echo "[smoke] /stats:" + curl -fsS "http://${GUEST_IP}:8080/stats" | head -c 400; echo +else + echo "[smoke] FAIL — agent never responded; firecracker log tail:" + tail -30 "$LOG" +fi diff --git a/playground/server/__init__.py b/playground/server/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/playground/server/config.py b/playground/server/config.py new file mode 100644 index 0000000000..6a08189d85 --- /dev/null +++ b/playground/server/config.py @@ -0,0 +1,98 @@ +"""Central configuration for the playground server. + +All knobs are read from environment variables so a single systemd unit can drop +them in. Falls back to sensible defaults for local development. +""" +from __future__ import annotations + +import os +from dataclasses import dataclass +from pathlib import Path + + +def _env_int(name: str, default: int) -> int: + v = os.environ.get(name) + if not v: + return default + try: + return int(v) + except ValueError: + return default + + +def _env_bytes(name: str, default: int) -> int: + return _env_int(name, default) + + +@dataclass(frozen=True) +class Config: + # Where on the host disk we keep VM artifacts and dataset images. + state_dir: Path + repo_dir: Path + # HTTP listen target for the playground API server. Plain host:port string; + # aiohttp parses it. + listen_host: str + listen_port: int + # Per-VM resources. + vm_vcpus: int + vm_mem_mib: int + vm_rootfs_size_gb: int + # Output cap applied at the host edge (the agent enforces a per-VM cap too). + output_limit_bytes: int + # Max number of VMs we'll keep "warm" (resumed from snapshot, ready to + # answer) concurrently. + max_warm_vms: int + # Watchdog thresholds. + cpu_busy_window_sec: int + cpu_busy_threshold: float + host_min_free_ram_gb: int + host_min_free_disk_gb: int + # Per-system disk full check. + vm_disk_pct_kill_threshold: float + # ClickHouse Cloud logging. + ch_cloud_url: str + ch_cloud_user: str + ch_cloud_password: str + ch_cloud_db: str + + @property + def kernel_path(self) -> Path: return self.state_dir / "kernel" / "vmlinux" + @property + def base_rootfs(self) -> Path: return self.state_dir / "base-rootfs.ext4" + @property + def datasets_image(self) -> Path: return self.state_dir / "datasets.ext4" + @property + def systems_dir(self) -> Path: return self.state_dir / "systems" + @property + def vms_dir(self) -> Path: return self.state_dir / "vms" + @property + def logs_dir(self) -> Path: return self.state_dir / "logs" + @property + def firecracker_bin(self) -> Path: return self.state_dir / "bin" / "firecracker" + + +def load() -> Config: + state_dir = Path(os.environ.get("PLAYGROUND_STATE_DIR", "/opt/clickbench-playground")) + repo_dir = Path(os.environ.get("PLAYGROUND_REPO_DIR", "/home/ubuntu/ClickBench")) + listen = os.environ.get("PLAYGROUND_LISTEN", "0.0.0.0:8000") + host, _, port = listen.rpartition(":") + return Config( + state_dir=state_dir, + repo_dir=repo_dir, + listen_host=host or "0.0.0.0", + listen_port=int(port or 8000), + vm_vcpus=_env_int("VM_VCPUS", 4), + vm_mem_mib=_env_int("VM_MEM_MIB", 16 * 1024), + vm_rootfs_size_gb=_env_int("VM_ROOTFS_SIZE_GB", 200), + output_limit_bytes=_env_bytes("PLAYGROUND_OUTPUT_LIMIT", 10 * 1024), + max_warm_vms=_env_int("PLAYGROUND_MAX_VMS", 16), + cpu_busy_window_sec=_env_int("VM_CPU_BUSY_WINDOW_SEC", 120), + cpu_busy_threshold=float(os.environ.get("VM_CPU_BUSY_THRESHOLD", "0.97")), + host_min_free_ram_gb=_env_int("HOST_MIN_FREE_RAM_GB", 32), + host_min_free_disk_gb=_env_int("HOST_MIN_FREE_DISK_GB", 500), + vm_disk_pct_kill_threshold=float(os.environ.get("VM_DISK_FULL_PCT", "0.97")), + ch_cloud_url=os.environ.get("CLICKHOUSE_CLOUD_URL", ""), + ch_cloud_user=os.environ.get("CLICKHOUSE_CLOUD_USER", ""), + ch_cloud_password=os.environ.get("CLICKHOUSE_CLOUD_PASSWORD", ""), + ch_cloud_db=os.environ.get("CLICKHOUSE_CLOUD_DB", "playground"), + ) diff --git a/playground/server/firecracker.py b/playground/server/firecracker.py new file mode 100644 index 0000000000..62aba74dca --- /dev/null +++ b/playground/server/firecracker.py @@ -0,0 +1,117 @@ +"""Thin async wrapper around Firecracker's REST API (Unix socket). + +We talk to the Firecracker process through its API socket, not the JSON config +file, because that's the only way to drive snapshot create/load and to mutate +runtime state. + +The HTTP layer is hand-rolled (single-shot HTTP/1.1 over Unix socket) so we +don't pull in extra deps just to send a few PUTs. Each call opens a new +connection — Firecracker's API socket is single-threaded and that's fine. +""" +from __future__ import annotations + +import asyncio +import json +from typing import Any + + +class FirecrackerError(RuntimeError): + pass + + +async def _request(socket_path: str, method: str, path: str, body: Any = None, + timeout: float = 30.0) -> tuple[int, bytes]: + payload = b"" + if body is not None: + payload = json.dumps(body).encode() + req_lines = [ + f"{method} {path} HTTP/1.1", + "Host: localhost", + "Accept: application/json", + "Connection: close", + ] + if payload: + req_lines.append("Content-Type: application/json") + req_lines.append(f"Content-Length: {len(payload)}") + req_lines.append("") + req_lines.append("") + head = "\r\n".join(req_lines).encode() + + reader, writer = await asyncio.wait_for( + asyncio.open_unix_connection(socket_path), timeout=timeout + ) + try: + writer.write(head + payload) + await writer.drain() + # Read response head line-by-line until the blank line that ends the + # header block. Don't `read(-1)` — Firecracker keeps the connection + # open after small responses (204s in particular), so EOF-based reads + # block until our timeout despite the response being fully on the + # wire. Once we have headers we know the Content-Length and can read + # exactly that many body bytes. + head_lines: list[bytes] = [] + while True: + line = await asyncio.wait_for(reader.readline(), timeout=timeout) + if not line: + # Server closed the connection mid-headers. + break + head_lines.append(line) + if line == b"\r\n" or line == b"\n": + break + + if not head_lines: + raise FirecrackerError(f"no response from firecracker for {method} {path}") + status_line = head_lines[0].rstrip(b"\r\n").decode("ascii", errors="replace") + parts = status_line.split(" ", 2) + if len(parts) < 2: + raise FirecrackerError(f"bad status line: {status_line!r}") + code = int(parts[1]) + + content_length = 0 + for raw_h in head_lines[1:]: + h = raw_h.rstrip(b"\r\n") + if not h: + continue + name, _, value = h.partition(b":") + if name.strip().lower() == b"content-length": + try: + content_length = int(value.strip()) + except ValueError: + content_length = 0 + + body_b = b"" + if content_length > 0: + body_b = await asyncio.wait_for( + reader.readexactly(content_length), timeout=timeout + ) + finally: + try: + writer.close() + await writer.wait_closed() + except Exception: + pass + return code, body_b + + +async def put(socket_path: str, path: str, body: Any = None, timeout: float = 30.0) -> None: + code, b = await _request(socket_path, "PUT", path, body, timeout) + if code >= 300: + raise FirecrackerError(f"PUT {path} -> {code}: {b!r}") + + +async def patch(socket_path: str, path: str, body: Any = None, timeout: float = 30.0) -> None: + code, b = await _request(socket_path, "PATCH", path, body, timeout) + if code >= 300: + raise FirecrackerError(f"PATCH {path} -> {code}: {b!r}") + + +async def get(socket_path: str, path: str, timeout: float = 30.0) -> dict: + code, b = await _request(socket_path, "GET", path, timeout=timeout) + if code >= 300: + raise FirecrackerError(f"GET {path} -> {code}: {b!r}") + if not b: + return {} + try: + return json.loads(b) + except Exception as e: + raise FirecrackerError(f"GET {path} -> non-JSON body: {b!r}") from e diff --git a/playground/server/logging_sink.py b/playground/server/logging_sink.py new file mode 100644 index 0000000000..6ba444c4e0 --- /dev/null +++ b/playground/server/logging_sink.py @@ -0,0 +1,190 @@ +"""Batched, async logger that writes events to ClickHouse Cloud over HTTPS. + +Two tables (auto-created on first connect if writeable): + + playground.requests + ts DateTime64(6) + client_addr String + user_agent String + system String + query String + output_bytes UInt64 + output_truncated UInt8 + query_time Nullable(Float64) from agent X-Query-Time + wall_time Float64 host-side end-to-end + status UInt16 HTTP status returned to client + error String + + playground.events + ts DateTime64(6) + system String + kind String "restart" / "oom-kick" / "boot" / ... + detail String + +When CLICKHOUSE_CLOUD_URL is unset, both tables are mirrored to +/opt/clickbench-playground/logs/requests.jsonl and events.jsonl so the +service still has an audit trail in dev. +""" +from __future__ import annotations + +import asyncio +import contextlib +import json +import logging +import time +from pathlib import Path +from typing import Any + +import aiohttp + +from .config import Config + +log = logging.getLogger("logging_sink") + + +_REQUESTS_DDL = """ +CREATE TABLE IF NOT EXISTS playground.requests ( + ts DateTime64(6) DEFAULT now64(6), + client_addr String, + user_agent String, + system String, + query String, + output_bytes UInt64, + output_truncated UInt8, + query_time Nullable(Float64), + wall_time Float64, + status UInt16, + error String +) ENGINE = MergeTree ORDER BY (system, ts) +""" + +_EVENTS_DDL = """ +CREATE TABLE IF NOT EXISTS playground.events ( + ts DateTime64(6) DEFAULT now64(6), + system String, + kind String, + detail String +) ENGINE = MergeTree ORDER BY (system, ts) +""" + + +class LoggingSink: + def __init__(self, cfg: Config): + self.cfg = cfg + self._queue: asyncio.Queue[tuple[str, dict]] = asyncio.Queue(maxsize=10000) + self._task: asyncio.Task | None = None + self._session: aiohttp.ClientSession | None = None + self._local_files: dict[str, Path] = {} + self._enabled = bool(cfg.ch_cloud_url and cfg.ch_cloud_user and cfg.ch_cloud_password) + + async def start(self) -> None: + self.cfg.logs_dir.mkdir(parents=True, exist_ok=True) + self._local_files = { + "requests": self.cfg.logs_dir / "requests.jsonl", + "events": self.cfg.logs_dir / "events.jsonl", + } + if self._enabled: + try: + self._session = aiohttp.ClientSession() + await self._run_ddl() + except Exception as e: + log.warning("ClickHouse Cloud DDL failed (%r); falling back to JSONL only", e) + self._enabled = False + self._task = asyncio.create_task(self._flusher(), name="logging-sink") + + async def stop(self) -> None: + if self._task: + self._task.cancel() + with contextlib.suppress(asyncio.CancelledError): + await self._task + if self._session: + await self._session.close() + + def write_request(self, **row: Any) -> None: + self._enqueue("requests", row) + + def write_event(self, **row: Any) -> None: + self._enqueue("events", row) + + def _enqueue(self, table: str, row: dict) -> None: + row.setdefault("ts", _now_dt64()) + try: + self._queue.put_nowait((table, row)) + except asyncio.QueueFull: + # Backpressure: drop oldest log lines first so we never block the + # query path on the audit trail. + try: + self._queue.get_nowait() + self._queue.put_nowait((table, row)) + except Exception: + pass + + async def _run_ddl(self) -> None: + await self._exec_ch(f"CREATE DATABASE IF NOT EXISTS {self.cfg.ch_cloud_db}") + await self._exec_ch(_REQUESTS_DDL.replace("playground.", f"{self.cfg.ch_cloud_db}.")) + await self._exec_ch(_EVENTS_DDL.replace("playground.", f"{self.cfg.ch_cloud_db}.")) + + async def _exec_ch(self, sql: str) -> None: + assert self._session is not None + async with self._session.post( + self.cfg.ch_cloud_url, + data=sql, + auth=aiohttp.BasicAuth(self.cfg.ch_cloud_user, self.cfg.ch_cloud_password), + timeout=aiohttp.ClientTimeout(total=30), + ) as r: + if r.status >= 300: + txt = await r.text() + raise RuntimeError(f"CH error {r.status}: {txt[:500]}") + + async def _insert_ch(self, table: str, rows: list[dict]) -> None: + if not rows: + return + body = "\n".join(json.dumps(r, default=str) for r in rows) + sql = f"INSERT INTO {self.cfg.ch_cloud_db}.{table} FORMAT JSONEachRow\n{body}" + await self._exec_ch(sql) + + async def _flusher(self) -> None: + buf: dict[str, list[dict]] = {"requests": [], "events": []} + last_flush = time.monotonic() + try: + while True: + timeout = 1.0 + try: + table, row = await asyncio.wait_for(self._queue.get(), timeout=timeout) + buf[table].append(row) + except asyncio.TimeoutError: + pass + # Flush every 1s or when batch >= 256 rows for any table + now = time.monotonic() + full = any(len(v) >= 256 for v in buf.values()) + if full or now - last_flush > 1.0: + await self._do_flush(buf) + for k in buf: + buf[k] = [] + last_flush = now + except asyncio.CancelledError: + await self._do_flush(buf) + raise + + async def _do_flush(self, buf: dict[str, list[dict]]) -> None: + for table, rows in buf.items(): + if not rows: + continue + # Always write to the local JSONL too — gives us a tail for + # debugging and a buffer if CH Cloud rejects. + try: + with open(self._local_files[table], "ab") as f: + for r in rows: + f.write((json.dumps(r, default=str) + "\n").encode()) + except Exception: + pass + if self._enabled: + try: + await self._insert_ch(table, rows) + except Exception as e: + log.warning("CH insert failed (%r); rows preserved in JSONL", e) + + +def _now_dt64() -> str: + t = time.time() + return time.strftime("%Y-%m-%d %H:%M:%S.", time.gmtime(t)) + f"{int((t % 1) * 1e6):06d}" diff --git a/playground/server/main.py b/playground/server/main.py new file mode 100644 index 0000000000..fe6cc86274 --- /dev/null +++ b/playground/server/main.py @@ -0,0 +1,252 @@ +"""Playground HTTP API + static UI server. + +Endpoints: + + GET / redirects to /ui/ + GET /ui/... static-serves files from ../web/ + GET /api/systems JSON list of all playground-eligible systems + GET /api/state JSON snapshot of every VM's state + GET /api/system/{name} detail for a single system + POST /api/query?system=X body is the SQL; returns application/octet-stream + with timing in headers + GET /api/provision-log/{name} the system's most recent provision log + POST /api/admin/provision/{name} + manual trigger for first-time provision; convenient + for warming a system before the first user query + +The /api/query path tries once, then on failure tears down + restores from +snapshot and retries exactly once, matching the spec. +""" +from __future__ import annotations + +import asyncio +import logging +import signal +import time +from pathlib import Path + +import aiohttp +from aiohttp import web + +from . import config as config_mod +from . import systems as systems_mod +from .logging_sink import LoggingSink +from .monitor import Monitor +from .vm_manager import VMManager + +log = logging.getLogger("playground") + + +class App: + def __init__(self) -> None: + self.cfg = config_mod.load() + self.systems = systems_mod.discover(self.cfg.repo_dir) + self.vmm = VMManager(self.cfg, self.systems) + self.sink = LoggingSink(self.cfg) + self.monitor = Monitor(self.cfg, self.vmm, self.sink) + + async def on_startup(self, _app: web.Application) -> None: + await self.sink.start() + await self.monitor.start() + + async def on_cleanup(self, _app: web.Application) -> None: + await self.monitor.stop() + await self.sink.stop() + + # ── handlers ───────────────────────────────────────────────────────── + + async def handle_systems(self, _r: web.Request) -> web.Response: + return web.json_response([s.asdict() for s in self.systems.values()]) + + async def handle_state(self, _r: web.Request) -> web.Response: + return web.json_response(self.vmm.list_all()) + + async def handle_system(self, req: web.Request) -> web.Response: + name = req.match_info["name"] + if name not in self.systems: + raise web.HTTPNotFound(reason=f"unknown system: {name}") + vm = self.vmm.vms[name] + return web.json_response({ + **self.systems[name].asdict(), + "state": vm.state, + "has_snapshot": vm.snapshot_bin.exists(), + "provisioned_at": vm.provisioned_at, + "last_used": vm.last_used, + "last_error": vm.last_error, + "agent_url": self.vmm.agent_url(vm), + }) + + async def handle_provision_log(self, req: web.Request) -> web.Response: + name = req.match_info["name"] + if name not in self.systems: + raise web.HTTPNotFound() + log_path = self.cfg.logs_dir / f"firecracker-{name}.log" + if not log_path.exists(): + return web.Response(text="", content_type="text/plain") + try: + # Tail at most 64 KB so the browser doesn't choke. + data = log_path.read_bytes()[-64 * 1024:] + except Exception as e: + data = f"(failed to read: {e})".encode() + return web.Response(body=data, content_type="text/plain") + + async def handle_admin_provision(self, req: web.Request) -> web.Response: + name = req.match_info["name"] + if name not in self.systems: + raise web.HTTPNotFound() + # Fire-and-forget; the client polls /api/system/{name} for state. + asyncio.create_task(self._provision_bg(name)) + return web.json_response({"started": True, "system": name}) + + async def _provision_bg(self, name: str) -> None: + try: + await self.vmm.ensure_ready_for_query(name) + except Exception as e: + log.exception("background provision failed for %s", name) + self.sink.write_event(system=name, kind="provision-failed", detail=repr(e)) + + async def handle_query(self, req: web.Request) -> web.StreamResponse: + system_name = req.query.get("system", "") + if system_name not in self.systems: + return web.json_response({"error": f"unknown system: {system_name!r}"}, + status=400) + sql = await req.read() + if not sql.strip(): + return web.json_response({"error": "empty SQL"}, status=400) + + client_addr = req.headers.get("X-Forwarded-For", req.remote or "?") + ua = req.headers.get("User-Agent", "") + wall_t0 = time.monotonic() + status = 500 + body = b"" + headers: dict[str, str] = {} + err: str | None = None + try: + body, headers, status = await self._dispatch_query(system_name, sql) + except Exception as e: + err = repr(e) + log.exception("[%s] query dispatch failed", system_name) + finally: + wall = time.monotonic() - wall_t0 + try: + self.sink.write_request( + client_addr=client_addr, user_agent=ua, + system=system_name, + query=sql.decode("utf-8", errors="replace")[:65536], + output_bytes=int(headers.get("X-Output-Bytes", "0") or 0), + output_truncated=int(headers.get("X-Output-Truncated", "0") or 0), + query_time=(float(headers["X-Query-Time"]) + if "X-Query-Time" in headers else None), + wall_time=wall, + status=status, + error=err or "", + ) + except Exception: + log.exception("logging request failed") + + resp = web.Response(body=body, status=status, + content_type="application/octet-stream") + for k, v in headers.items(): + resp.headers[k] = v + resp.headers["X-Wall-Time"] = f"{wall:.6f}" + if err and "X-Error" not in resp.headers: + resp.headers["X-Error"] = err[:512] + return resp + + async def _dispatch_query(self, system_name: str, sql: bytes + ) -> tuple[bytes, dict[str, str], int]: + """Run the query once. On low-level failure (VM unreachable, transport + error) tear down and retry once. Higher-level errors (non-2xx from the + agent itself, e.g. a SQL syntax error) are NOT retried — they're real + results.""" + last_exc: Exception | None = None + for attempt in (1, 2): + try: + vm = await self.vmm.ensure_ready_for_query(system_name) + except Exception as e: + last_exc = e + if attempt == 1: + self.sink.write_event(system=system_name, kind="ensure-failed", + detail=f"attempt {attempt}: {e!r}") + await asyncio.sleep(0.5) + continue + raise + url = self.vmm.agent_url(vm) + "/query" + try: + async with aiohttp.ClientSession() as s: + async with s.post(url, data=sql, + timeout=aiohttp.ClientTimeout(total=600)) as r: + body = await r.read() + headers = {k: r.headers[k] for k in r.headers if k.startswith("X-")} + headers.setdefault("X-Output-Bytes", str(len(body))) + return body, headers, r.status + except Exception as e: + last_exc = e + self.sink.write_event(system=system_name, kind="agent-error", + detail=f"attempt {attempt}: {e!r}") + if attempt == 1: + # Hard kill, will trigger snapshot restore on next ensure. + await self.vmm.kick(system_name, "agent-error-retry") + await asyncio.sleep(0.5) + continue + raise + # unreachable, but keep mypy happy + raise RuntimeError(str(last_exc)) + + +def build_app() -> web.Application: + obj = App() + app = web.Application(client_max_size=4 * 1024 * 1024) + app.on_startup.append(obj.on_startup) + app.on_cleanup.append(obj.on_cleanup) + + app.router.add_get("/api/systems", obj.handle_systems) + app.router.add_get("/api/state", obj.handle_state) + app.router.add_get("/api/system/{name}", obj.handle_system) + app.router.add_get("/api/provision-log/{name}", obj.handle_provision_log) + app.router.add_post("/api/admin/provision/{name}", obj.handle_admin_provision) + app.router.add_post("/api/query", obj.handle_query) + + # Static UI + web_dir = Path(__file__).resolve().parent.parent / "web" + + async def root_redirect(_r: web.Request) -> web.Response: + raise web.HTTPFound("/ui/") + + async def ui_index(_r: web.Request) -> web.FileResponse: + return web.FileResponse(web_dir / "index.html") + + app.router.add_get("/", root_redirect) + app.router.add_get("/ui/", ui_index) + app.router.add_get("/ui", ui_index) + app.router.add_static("/ui/", path=str(web_dir), show_index=False, follow_symlinks=True) + + return app + + +def main() -> None: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s", + ) + cfg = config_mod.load() + app = build_app() + # Wire signals to a clean shutdown. + runner = web.AppRunner(app) + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.run_until_complete(runner.setup()) + site = web.TCPSite(runner, cfg.listen_host, cfg.listen_port) + loop.run_until_complete(site.start()) + log.info("playground listening on http://%s:%d", cfg.listen_host, cfg.listen_port) + + stop = asyncio.Event() + for sig in (signal.SIGTERM, signal.SIGINT): + loop.add_signal_handler(sig, stop.set) + loop.run_until_complete(stop.wait()) + loop.run_until_complete(runner.cleanup()) + loop.close() + + +if __name__ == "__main__": + main() diff --git a/playground/server/monitor.py b/playground/server/monitor.py new file mode 100644 index 0000000000..1bdefc6b82 --- /dev/null +++ b/playground/server/monitor.py @@ -0,0 +1,215 @@ +"""Background watchdog. + +Runs alongside the API server. Once per second: + + * For every running VM, sample CPU% (from /proc//stat), RSS, and the + rootfs file's current physical size (via stat). Update the VM record. + * If a VM has been at >= cpu_busy_threshold for cpu_busy_window_sec + contiguous seconds, restart it. + * If a VM's rootfs is filled past vm_disk_pct_kill_threshold of its nominal + cap (200 GB) — i.e. the sparse file is using more than that fraction — + restart it. + * Sample host free memory / free disk on the state_dir filesystem. If under + threshold, find the largest live VM (by RSS for memory pressure, by + rootfs_used_bytes for disk pressure) and kick it. + +`kick` is implemented via vm_manager.kick(name, reason), which leaves the +snapshot intact. A subsequent /query will trigger a restore. +""" +from __future__ import annotations + +import asyncio +import logging +import os +import shutil +import time +from pathlib import Path + +from .config import Config +from .logging_sink import LoggingSink +from .vm_manager import VM, VMManager + +log = logging.getLogger("monitor") + + +class Monitor: + def __init__(self, cfg: Config, vmm: VMManager, sink: LoggingSink): + self.cfg = cfg + self.vmm = vmm + self.sink = sink + self._cpu_history: dict[str, tuple[int, int, float]] = {} # name -> (utime+stime, total, ts) + self._task: asyncio.Task | None = None + + async def start(self) -> None: + self._task = asyncio.create_task(self._loop(), name="monitor") + + async def stop(self) -> None: + if self._task: + self._task.cancel() + try: + await self._task + except asyncio.CancelledError: + pass + + async def _loop(self) -> None: + try: + while True: + await self._tick() + await asyncio.sleep(1.0) + except asyncio.CancelledError: + raise + except Exception: + log.exception("monitor loop crashed; restarting in 5s") + await asyncio.sleep(5) + self._task = asyncio.create_task(self._loop(), name="monitor") + + async def _tick(self) -> None: + # Per-VM sampling + for name, vm in self.vmm.vms.items(): + if vm.pid is None or not _pid_alive(vm.pid): + self._cpu_history.pop(name, None) + vm.cpu_busy_since = None + continue + cpu_pct = self._sample_cpu(name, vm.pid) + vm.rss_bytes = _rss(vm.pid) + rootfs = self.cfg.systems_dir / name / "rootfs.ext4" + try: + st = rootfs.stat() + vm.rootfs_used_bytes = st.st_blocks * 512 # actual allocated bytes + except FileNotFoundError: + vm.rootfs_used_bytes = 0 + await self._check_per_vm(vm, cpu_pct) + + # Host-wide checks + await self._check_host_pressure() + + def _sample_cpu(self, name: str, pid: int) -> float | None: + """Return ratio of CPU used since last sample, normalized by vcpu count.""" + stat_path = Path(f"/proc/{pid}/stat") + try: + stat = stat_path.read_text() + except FileNotFoundError: + self._cpu_history.pop(name, None) + return None + # The comm field can contain spaces — split around the last ')' + end = stat.rfind(")") + parts = stat[end + 2:].split() + utime = int(parts[11]) + stime = int(parts[12]) + now = time.monotonic() + prev = self._cpu_history.get(name) + self._cpu_history[name] = (utime, stime, now) + if prev is None: + return None + dt = now - prev[2] + d_jiffies = (utime + stime) - (prev[0] + prev[1]) + clk = os.sysconf("SC_CLK_TCK") + if dt <= 0 or clk <= 0: + return None + # Normalize by the number of vCPUs the VM was allocated. + cpu_seconds = d_jiffies / clk + return cpu_seconds / (dt * self.cfg.vm_vcpus) + + async def _check_per_vm(self, vm: VM, cpu_pct: float | None) -> None: + # CPU saturation watchdog + if cpu_pct is None: + vm.cpu_busy_since = None + elif cpu_pct >= self.cfg.cpu_busy_threshold: + if vm.cpu_busy_since is None: + vm.cpu_busy_since = time.monotonic() + elif time.monotonic() - vm.cpu_busy_since > self.cfg.cpu_busy_window_sec: + self.sink.write_event( + system=vm.system.name, kind="cpu-watchdog", + detail=f"sustained CPU >= {self.cfg.cpu_busy_threshold:.0%} for " + f"{self.cfg.cpu_busy_window_sec}s", + ) + await self.vmm.kick(vm.system.name, "cpu-watchdog") + vm.cpu_busy_since = None + return + else: + vm.cpu_busy_since = None + + # Disk usage watchdog + cap = self.cfg.vm_rootfs_size_gb * (1 << 30) + if vm.rootfs_used_bytes and vm.rootfs_used_bytes / cap >= self.cfg.vm_disk_pct_kill_threshold: + self.sink.write_event( + system=vm.system.name, kind="disk-watchdog", + detail=f"rootfs used {vm.rootfs_used_bytes}/{cap}", + ) + await self.vmm.kick(vm.system.name, "disk-watchdog") + + async def _check_host_pressure(self) -> None: + # Memory pressure + info = _meminfo() + free_ram_gb = info.get("MemAvailable", 0) / (1024 * 1024) # MemAvailable is in KB + if free_ram_gb < self.cfg.host_min_free_ram_gb: + target = self._largest_running(by="rss") + if target: + self.sink.write_event( + system=target.system.name, kind="oom-kick", + detail=f"host free RAM {free_ram_gb:.1f}G < {self.cfg.host_min_free_ram_gb}G; " + f"largest is {target.system.name} ({target.rss_bytes/1e9:.1f}G)", + ) + await self.vmm.kick(target.system.name, "host-mem-pressure") + + # Disk pressure on the state dir + st = shutil.disk_usage(self.cfg.state_dir) + free_disk_gb = st.free / (1 << 30) + if free_disk_gb < self.cfg.host_min_free_disk_gb: + target = self._largest_running(by="disk") + if target: + self.sink.write_event( + system=target.system.name, kind="disk-kick", + detail=f"host free disk {free_disk_gb:.1f}G < {self.cfg.host_min_free_disk_gb}G; " + f"largest is {target.system.name} ({target.rootfs_used_bytes/1e9:.1f}G)", + ) + await self.vmm.kick(target.system.name, "host-disk-pressure") + + def _largest_running(self, *, by: str) -> VM | None: + running = [v for v in self.vmm.vms.values() + if v.pid is not None and _pid_alive(v.pid)] + if not running: + return None + key = (lambda v: v.rss_bytes) if by == "rss" else (lambda v: v.rootfs_used_bytes) + return max(running, key=key) + + +def _pid_alive(pid: int) -> bool: + try: + os.kill(pid, 0) + return True + except ProcessLookupError: + return False + except PermissionError: + return True + + +def _rss(pid: int) -> int: + try: + text = Path(f"/proc/{pid}/status").read_text() + except FileNotFoundError: + return 0 + for line in text.splitlines(): + if line.startswith("VmRSS:"): + parts = line.split() + return int(parts[1]) * 1024 # KB -> bytes + return 0 + + +def _meminfo() -> dict[str, int]: + out: dict[str, int] = {} + try: + text = Path("/proc/meminfo").read_text() + except FileNotFoundError: + return out + for line in text.splitlines(): + if ":" not in line: + continue + k, v = line.split(":", 1) + parts = v.split() + if parts: + try: + out[k.strip()] = int(parts[0]) + except ValueError: + continue + return out diff --git a/playground/server/net.py b/playground/server/net.py new file mode 100644 index 0000000000..2c8fdac96c --- /dev/null +++ b/playground/server/net.py @@ -0,0 +1,130 @@ +"""Per-VM TAP networking setup for Firecracker. + +Each VM gets its own /24 subnet on a dedicated TAP device: + + fc-tap- host: 10.200..1/24 vm: 10.200..2 + +Where is a small integer derived from the system slot (1..N). The /24 has +plenty of headroom but only two addresses are used — one /24 per VM keeps the +host's routing trivial: no shared bridge, no ARP nonsense, no collisions. + +During the *provision* phase we masquerade outbound traffic from the VM so it +can apt-get / curl. After the snapshot we drop the FORWARD rules; the VM can +still talk to the host (and therefore the agent endpoint) but cannot reach the +internet. +""" +from __future__ import annotations + +import asyncio +import contextlib +import re + +# The /16 we hand out from. 10.200.0.0/16 -> 256 /24 subnets, plenty for our use. +_BASE = "10.200" + + +def addr_for(slot: int) -> tuple[str, str, str]: + """Return (host_ip, vm_ip, cidr) for the given slot id.""" + if not 1 <= slot <= 250: + raise ValueError(f"slot out of range: {slot}") + return f"{_BASE}.{slot}.1", f"{_BASE}.{slot}.2", f"{_BASE}.{slot}.0/24" + + +def tap_name(slot: int) -> str: + return f"fc-tap-{slot}" + + +def mac_for(slot: int) -> str: + # Locally administered, unicast, deterministic by slot. + return f"02:fc:00:00:{slot // 256:02x}:{slot % 256:02x}" + + +async def _run(*args: str, check: bool = True) -> tuple[int, bytes, bytes]: + p = await asyncio.create_subprocess_exec( + *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE + ) + o, e = await p.communicate() + if check and p.returncode != 0: + raise RuntimeError(f"cmd failed: {' '.join(args)}: {e.decode(errors='replace')}") + return p.returncode or 0, o, e + + +async def ensure_tap(slot: int) -> None: + """Create the TAP device and assign the host-side address. Idempotent.""" + tap = tap_name(slot) + host_ip, _, _ = addr_for(slot) + # Does the device already exist? + rc, out, _ = await _run("ip", "-br", "link", "show", "dev", tap, check=False) + if rc != 0: + await _run("sudo", "ip", "tuntap", "add", "dev", tap, "mode", "tap") + # Make sure the IP is there + rc, addrs, _ = await _run("ip", "-br", "addr", "show", "dev", tap, check=False) + if rc != 0 or host_ip not in addrs.decode(errors="replace"): + # Strip any old IPs then add the canonical one. + await _run("sudo", "ip", "addr", "flush", "dev", tap, check=False) + await _run("sudo", "ip", "addr", "add", f"{host_ip}/24", "dev", tap) + await _run("sudo", "ip", "link", "set", tap, "up") + + +async def teardown_tap(slot: int) -> None: + tap = tap_name(slot) + with contextlib.suppress(Exception): + await _run("sudo", "ip", "link", "set", tap, "down", check=False) + with contextlib.suppress(Exception): + await _run("sudo", "ip", "tuntap", "del", "dev", tap, "mode", "tap", check=False) + + +_NAT_RULE_PAT = re.compile(r"^-A POSTROUTING.*-o\s+(\S+).*-j\s+MASQUERADE", re.MULTILINE) + + +async def _host_default_iface() -> str: + """Return the host's default outbound interface (e.g. eth0).""" + rc, out, _ = await _run("ip", "-o", "-4", "route", "show", "default") + text = out.decode(errors="replace") + # "default via 1.2.3.4 dev eth0 ..." + parts = text.split() + for i, p in enumerate(parts): + if p == "dev" and i + 1 < len(parts): + return parts[i + 1] + raise RuntimeError(f"could not find default route: {text!r}") + + +async def enable_internet(slot: int) -> None: + """Allow the VM to reach the outside world via MASQUERADE + FORWARD.""" + iface = await _host_default_iface() + _, _, cidr = addr_for(slot) + # MASQUERADE rule: add only if not already present. + rc, out, _ = await _run("sudo", "iptables", "-t", "nat", "-S", "POSTROUTING") + if f"-s {cidr}" not in out.decode(errors="replace"): + await _run("sudo", "iptables", "-t", "nat", "-A", "POSTROUTING", + "-s", cidr, "-o", iface, "-j", "MASQUERADE") + # FORWARD rules + for rule in ( + ("-i", tap_name(slot), "-o", iface, "-j", "ACCEPT"), + ("-i", iface, "-o", tap_name(slot), "-m", "state", "--state", + "RELATED,ESTABLISHED", "-j", "ACCEPT"), + ): + rc, out, _ = await _run("sudo", "iptables", "-C", "FORWARD", *rule, check=False) + if rc != 0: + await _run("sudo", "iptables", "-A", "FORWARD", *rule) + + +async def disable_internet(slot: int) -> None: + """Drop the masquerade + forward rules added by enable_internet.""" + iface = await _host_default_iface() + _, _, cidr = addr_for(slot) + # Best-effort removal — repeat until iptables reports the rule isn't there. + while True: + rc, _, _ = await _run("sudo", "iptables", "-t", "nat", "-D", "POSTROUTING", + "-s", cidr, "-o", iface, "-j", "MASQUERADE", check=False) + if rc != 0: + break + for rule in ( + ("-i", tap_name(slot), "-o", iface, "-j", "ACCEPT"), + ("-i", iface, "-o", tap_name(slot), "-m", "state", "--state", + "RELATED,ESTABLISHED", "-j", "ACCEPT"), + ): + while True: + rc, _, _ = await _run("sudo", "iptables", "-D", "FORWARD", *rule, check=False) + if rc != 0: + break diff --git a/playground/server/systems.py b/playground/server/systems.py new file mode 100644 index 0000000000..c41599858a --- /dev/null +++ b/playground/server/systems.py @@ -0,0 +1,134 @@ +"""Registry of ClickBench systems that can be exposed through the playground. + +A system is *playground-eligible* if its directory contains the canonical +unified script set (install/start/load/query/check/stop) AND there is no +external service required (no `aurora-*`, `redshift*`, `bigquery`, `snowflake`, +etc. — those need API keys and live on someone else's infra). + +The registry is built by scanning the repo at startup. Each `System` carries: + + * name the directory name (also the URL-safe identifier) + * display_name pulled from template.json "system" field if present + * tags from template.json + * download_script from `BENCH_DOWNLOAD_SCRIPT=` line in benchmark.sh + * data_format inferred from download_script (parquet / parquet-partitioned / tsv / csv / none) + * durable BENCH_DURABLE=yes/no (default yes) + * restartable BENCH_RESTARTABLE=yes/no (default yes) +""" +from __future__ import annotations + +import json +import re +from dataclasses import dataclass, field +from pathlib import Path + +# Systems we explicitly skip — they all require external infrastructure +# (managed cloud DBs / API keys) we can't run inside an isolated microVM. +# Local-only systems (umbra, hyper, cedardb, etc.) stay in the catalog +# even though some need a free-trial license at install time — those +# scripts fetch the binary themselves and we don't second-guess them. +_EXTERNAL = { + "alloydb", "athena", "athena-partitioned", "aurora-mysql", + "aurora-postgresql", "bigquery", "brytlytdb", "bytehouse", "chyt", + "clickhouse-cloud", "clickhouse-tencent", "clickhouse-web", + "crunchy-bridge-for-analytics", "databend", "databricks", "exasol", + "firebolt", "firebolt-parquet", "firebolt-parquet-partitioned", + "gravitons", "heavyai", "hologres", "hydrolix", "kinetica", + "motherduck", "oxla", "pgpro_tam", "redshift", "redshift-serverless", + "s3select", "singlestore", "snowflake", "supabase", + "tembo-olap", "timescale-cloud", "tinybird", "ursa", "velodb", + "vertica", "ydb", +} + + +@dataclass(frozen=True) +class System: + name: str + display_name: str + tags: tuple[str, ...] + download_script: str + data_format: str # parquet / parquet-partitioned / tsv / csv / none + durable: bool + restartable: bool + + def asdict(self) -> dict: + return { + "name": self.name, + "display_name": self.display_name, + "tags": list(self.tags), + "download_script": self.download_script, + "data_format": self.data_format, + "durable": self.durable, + "restartable": self.restartable, + } + + +def _read_template(p: Path) -> dict: + tpl = p / "template.json" + if not tpl.exists(): + return {} + try: + return json.loads(tpl.read_text()) + except Exception: + return {} + + +def _parse_benchmark_sh(p: Path) -> dict: + """Best-effort parse of `export FOO=bar` lines in benchmark.sh.""" + bench = p / "benchmark.sh" + if not bench.exists(): + return {} + out: dict[str, str] = {} + pat = re.compile(r'^\s*export\s+([A-Z_]+)=("([^"]*)"|([^\s]+))', re.MULTILINE) + text = bench.read_text(errors="replace") + for m in pat.finditer(text): + key = m.group(1) + out[key] = m.group(3) if m.group(3) is not None else m.group(4) + return out + + +def _data_format(download_script: str) -> str: + if not download_script: + return "none" + if "parquet-partitioned" in download_script: + return "parquet-partitioned" + if "parquet-single" in download_script: + return "parquet" + if "tsv" in download_script: + return "tsv" + if "csv" in download_script: + return "csv" + return "unknown" + + +def _is_playground_eligible(p: Path) -> bool: + if p.name in _EXTERNAL: + return False + for f in ("install", "start", "load", "query", "check", "stop"): + s = p / f + if not s.exists(): + return False + return True + + +def discover(repo_dir: Path) -> dict[str, System]: + """Walk the repo and return name -> System.""" + out: dict[str, System] = {} + for child in sorted(repo_dir.iterdir()): + if not child.is_dir(): + continue + if not _is_playground_eligible(child): + continue + tpl = _read_template(child) + env = _parse_benchmark_sh(child) + download = env.get("BENCH_DOWNLOAD_SCRIPT", "") + out[child.name] = System( + name=child.name, + display_name=tpl.get("system", child.name), + tags=tuple(tpl.get("tags", []) or []), + download_script=download, + data_format=_data_format(download), + durable=env.get("BENCH_DURABLE", "yes") != "no", + restartable=env.get("BENCH_RESTARTABLE", "yes") != "no", + ) + return out diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py new file mode 100644 index 0000000000..7b97e4d46c --- /dev/null +++ b/playground/server/vm_manager.py @@ -0,0 +1,391 @@ +"""Per-system Firecracker microVM lifecycle. + +For each ClickBench system we manage a VM with this lifecycle: + + [DOWN] --build_images--> [DOWN(images-ready)] + --first_boot--> [PROVISIONING] (internet ON, /provision called) + --snapshot--> [SNAPSHOTTED(internet OFF)] + --restore--> [READY] (handles /query requests) + --idle / OOM / disk full / forced--> kill -> [SNAPSHOTTED] + +State transitions are gated by `VM.lock`. The public API +`ensure_ready_for_query(system)` returns an `(agent_url, vm)` ready to take a +POST /query, doing whatever transitions are needed. + +We avoid the jailer for now: the playground host already runs as a dedicated +user; the chroot/cgroups layer would complicate dataset disk attach and +the value-add over a vanilla firecracker process is small for our threat +model (untrusted SQL but cooperatively-built rootfs). +""" +from __future__ import annotations + +import asyncio +import contextlib +import dataclasses +import json +import logging +import os +import shutil +import signal +import time +from pathlib import Path +from typing import Optional + +import aiohttp + +from . import firecracker as fc +from . import net +from .config import Config +from .systems import System + +log = logging.getLogger("vm_manager") + + +# Lifecycle states for VM.state: +# "down" no firecracker process for this system +# "provisioning" firecracker is up, install/start/load running inside +# "ready" firecracker is up, snapshotted at least once, /query OK +# "snapshotted" firecracker process is down, but snapshot.bin exists +@dataclasses.dataclass +class VM: + system: System + slot: int + # Firecracker config + api_sock: Path + log_sock: Path # we just point this at /dev/null actually + pid: Optional[int] = None + state: str = "down" + # Snapshot artifacts + snapshot_bin: Path = dataclasses.field(default_factory=lambda: Path()) + snapshot_state: Path = dataclasses.field(default_factory=lambda: Path()) + # Provision metadata + provisioned_at: Optional[float] = None + last_used: float = 0.0 + last_error: Optional[str] = None + lock: asyncio.Lock = dataclasses.field(default_factory=asyncio.Lock) + # Runtime stats refreshed by the monitor + cpu_busy_since: Optional[float] = None + rss_bytes: int = 0 + rootfs_used_bytes: int = 0 + + +class VMManager: + """Owns the registry of per-system VMs.""" + + def __init__(self, config: Config, systems: dict[str, System]): + self.cfg = config + self.systems = systems + self.vms: dict[str, VM] = {} + # Stable slot allocation: sort systems alphabetically so each system + # always gets the same slot id (and therefore the same TAP/IP). + for i, name in enumerate(sorted(systems.keys()), start=1): + sys = systems[name] + sys_state_dir = config.systems_dir / name + sys_state_dir.mkdir(parents=True, exist_ok=True) + self.vms[name] = VM( + system=sys, + slot=i, + api_sock=config.vms_dir / f"{name}.sock", + log_sock=config.vms_dir / f"{name}.log.sock", + snapshot_bin=sys_state_dir / "snapshot.bin", + snapshot_state=sys_state_dir / "snapshot.state", + ) + + # ── public API ─────────────────────────────────────────────────────── + + async def ensure_ready_for_query(self, system: str) -> VM: + """Make sure system is up and responsive to /query. Boot/resume as needed. + + On success the returned VM is in state "ready" and self.last_used has + been touched. + """ + if system not in self.vms: + raise KeyError(system) + vm = self.vms[system] + async with vm.lock: + if vm.state == "ready" and vm.pid and await self._agent_healthy(vm): + vm.last_used = time.time() + return vm + # The state machine: drive to "ready" by the cheapest available path. + if vm.state == "ready": + # Process is gone or unresponsive. Treat as snapshotted. + vm.state = "snapshotted" + if vm.state == "down": + if not vm.snapshot_bin.exists(): + # No snapshot yet — need a full provision. + await self._initial_provision(vm) + else: + await self._restore_snapshot(vm) + elif vm.state == "snapshotted": + await self._restore_snapshot(vm) + elif vm.state == "provisioning": + raise RuntimeError(f"{system}: provisioning in progress") + vm.last_used = time.time() + return vm + + async def kick(self, system: str, reason: str) -> None: + """Forcibly tear down the VM. Caller (monitor) is responsible for logging.""" + vm = self.vms.get(system) + if vm is None: + return + async with vm.lock: + await self._teardown(vm, reason) + + def list_all(self) -> list[dict]: + out = [] + for name, vm in self.vms.items(): + out.append({ + "name": name, + "system": vm.system.display_name, + "state": vm.state, + "slot": vm.slot, + "agent_url": self.agent_url(vm), + "provisioned_at": vm.provisioned_at, + "last_used": vm.last_used, + "tags": list(vm.system.tags), + "data_format": vm.system.data_format, + "last_error": vm.last_error, + "rss_bytes": vm.rss_bytes, + "rootfs_used_bytes": vm.rootfs_used_bytes, + "has_snapshot": vm.snapshot_bin.exists(), + }) + return out + + def agent_url(self, vm: VM) -> str: + _, vm_ip, _ = net.addr_for(vm.slot) + return f"http://{vm_ip}:8080" + + # ── boot / shutdown ────────────────────────────────────────────────── + + async def _spawn_firecracker(self, vm: VM) -> None: + """Start a fresh firecracker process listening on vm.api_sock.""" + with contextlib.suppress(FileNotFoundError): + vm.api_sock.unlink() + vm.api_sock.parent.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env["RUST_BACKTRACE"] = "1" + + log_path = self.cfg.logs_dir / f"firecracker-{vm.system.name}.log" + log_path.parent.mkdir(parents=True, exist_ok=True) + # Append to the existing log so prior runs are kept for postmortems. + log_fh = open(log_path, "ab", buffering=0) + + proc = await asyncio.create_subprocess_exec( + str(self.cfg.firecracker_bin), + "--api-sock", str(vm.api_sock), + "--id", vm.system.name, + stdout=log_fh, stderr=log_fh, env=env, start_new_session=True, + ) + vm.pid = proc.pid + # Wait for the API socket to exist + for _ in range(80): + if vm.api_sock.exists(): + break + await asyncio.sleep(0.05) + if not vm.api_sock.exists(): + raise RuntimeError("firecracker did not create API socket in time") + + def _kernel_cmdline(self, vm: VM) -> str: + # console=ttyS0 so we get a serial-attached login (in case we drop a + # console socket for debugging); reboot=k for clean halt-on-panic. + # The kernel's built-in IP autoconfig statically assigns the VM's + # /24 from its slot, sidestepping any DHCP/networkd in userland. + host_ip, vm_ip, _ = net.addr_for(vm.slot) + return ( + "console=ttyS0 reboot=k panic=1 pci=off " + f"ip={vm_ip}::{host_ip}:255.255.255.0::eth0:off " + "root=/dev/vda rw " + "init=/lib/systemd/systemd " + ) + + async def _initial_provision(self, vm: VM) -> None: + """First-time boot: build per-system images, boot with internet, run + agent /provision, snapshot, shut down.""" + if vm.state != "down": + raise RuntimeError(f"unexpected state for initial provision: {vm.state}") + + log.info("[%s] initial provision begin", vm.system.name) + vm.state = "provisioning" + try: + await self._build_images_if_needed(vm) + await net.ensure_tap(vm.slot) + await net.enable_internet(vm.slot) + await self._boot(vm, restore_snapshot=False) + await self._wait_for_agent(vm, timeout=180) + await self._call_agent_provision(vm) + await self._snapshot(vm) + await self._shutdown(vm) + await net.disable_internet(vm.slot) + vm.state = "snapshotted" + vm.provisioned_at = time.time() + log.info("[%s] initial provision complete", vm.system.name) + except Exception as e: + vm.last_error = f"provision: {e!r}" + log.exception("[%s] provision failed", vm.system.name) + await self._teardown(vm, "provision-failed") + raise + + async def _build_images_if_needed(self, vm: VM) -> None: + sys_dir = self.cfg.systems_dir / vm.system.name + rootfs = sys_dir / "rootfs.ext4" + sysdisk = sys_dir / "system.ext4" + if rootfs.exists() and sysdisk.exists(): + return + log.info("[%s] building rootfs + system disk", vm.system.name) + script = self.cfg.repo_dir / "playground" / "images" / "build-system-rootfs.sh" + p = await asyncio.create_subprocess_exec( + "bash", str(script), vm.system.name, + stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT, + env={**os.environ, "PLAYGROUND_STATE_DIR": str(self.cfg.state_dir)}, + ) + out, _ = await p.communicate() + if p.returncode != 0: + raise RuntimeError(f"build-system-rootfs failed: {out.decode(errors='replace')[-2000:]}") + + async def _boot(self, vm: VM, *, restore_snapshot: bool) -> None: + """Configure and start a Firecracker instance. If restore_snapshot is + True, we load from the snapshot files; else we cold-boot from kernel + + rootfs.""" + await self._spawn_firecracker(vm) + sock = str(vm.api_sock) + + # Network: must be configured *before* either boot path. + await fc.put(sock, f"/network-interfaces/eth0", { + "iface_id": "eth0", + "guest_mac": net.mac_for(vm.slot), + "host_dev_name": net.tap_name(vm.slot), + }) + + rootfs = self.cfg.systems_dir / vm.system.name / "rootfs.ext4" + sysdisk = self.cfg.systems_dir / vm.system.name / "system.ext4" + + if restore_snapshot: + # Drives must match the layout that existed when the snapshot was + # taken, but Firecracker re-reads file paths on restore. We rebind + # them here in case the absolute paths changed (e.g. snapshot moved). + await fc.put(sock, "/snapshot/load", { + "snapshot_path": str(vm.snapshot_state), + "mem_backend": {"backend_type": "File", "backend_path": str(vm.snapshot_bin)}, + "enable_diff_snapshots": False, + "resume_vm": True, + }) + return + + # Cold boot. + await fc.put(sock, "/boot-source", { + "kernel_image_path": str(self.cfg.kernel_path), + "boot_args": self._kernel_cmdline(vm), + }) + await fc.put(sock, "/drives/rootfs", { + "drive_id": "rootfs", + "path_on_host": str(rootfs), + "is_root_device": True, + "is_read_only": False, + }) + await fc.put(sock, "/drives/system", { + "drive_id": "system", + "path_on_host": str(sysdisk), + "is_root_device": False, + "is_read_only": False, + }) + await fc.put(sock, "/machine-config", { + "vcpu_count": self.cfg.vm_vcpus, + "mem_size_mib": self.cfg.vm_mem_mib, + "smt": False, + }) + await fc.put(sock, "/actions", {"action_type": "InstanceStart"}) + + async def _snapshot(self, vm: VM) -> None: + sock = str(vm.api_sock) + await fc.patch(sock, "/vm", {"state": "Paused"}) + try: + await fc.put(sock, "/snapshot/create", { + "snapshot_type": "Full", + "snapshot_path": str(vm.snapshot_state), + "mem_file_path": str(vm.snapshot_bin), + }, timeout=600.0) + finally: + # Try to resume so we can shut down cleanly; ignore failures. + with contextlib.suppress(Exception): + await fc.patch(sock, "/vm", {"state": "Resumed"}) + + async def _restore_snapshot(self, vm: VM) -> None: + log.info("[%s] restore from snapshot", vm.system.name) + await net.ensure_tap(vm.slot) + # internet stays OFF post-snapshot + await self._boot(vm, restore_snapshot=True) + await self._wait_for_agent(vm, timeout=60) + vm.state = "ready" + + async def _shutdown(self, vm: VM) -> None: + """Best-effort clean shutdown of the firecracker process.""" + if not vm.pid: + return + with contextlib.suppress(Exception): + await fc.put(str(vm.api_sock), "/actions", {"action_type": "SendCtrlAltDel"}) + # Wait briefly for graceful exit + for _ in range(50): + if not _pid_alive(vm.pid): + break + await asyncio.sleep(0.1) + if _pid_alive(vm.pid): + with contextlib.suppress(ProcessLookupError): + os.kill(vm.pid, signal.SIGKILL) + vm.pid = None + with contextlib.suppress(FileNotFoundError): + vm.api_sock.unlink() + + async def _teardown(self, vm: VM, reason: str) -> None: + log.warning("[%s] teardown: %s", vm.system.name, reason) + with contextlib.suppress(Exception): + await self._shutdown(vm) + vm.state = "snapshotted" if vm.snapshot_bin.exists() else "down" + + # ── agent helpers ──────────────────────────────────────────────────── + + async def _agent_healthy(self, vm: VM) -> bool: + if not vm.pid or not _pid_alive(vm.pid): + return False + url = self.agent_url(vm) + "/health" + try: + async with aiohttp.ClientSession() as s: + async with s.get(url, timeout=aiohttp.ClientTimeout(total=2)) as r: + return r.status == 200 + except Exception: + return False + + async def _wait_for_agent(self, vm: VM, *, timeout: float) -> None: + url = self.agent_url(vm) + "/health" + t0 = time.monotonic() + last_err: Exception | None = None + async with aiohttp.ClientSession() as s: + while time.monotonic() - t0 < timeout: + try: + async with s.get(url, timeout=aiohttp.ClientTimeout(total=2)) as r: + if r.status == 200: + return + except Exception as e: + last_err = e + await asyncio.sleep(0.5) + raise RuntimeError(f"agent unreachable after {timeout}s: {last_err!r}") + + async def _call_agent_provision(self, vm: VM) -> None: + url = self.agent_url(vm) + "/provision" + async with aiohttp.ClientSession() as s: + # Provision can take a very long time (apt-get install jdk, etc.) + async with s.post(url, timeout=aiohttp.ClientTimeout(total=7200)) as r: + body = await r.read() + if r.status >= 300: + raise RuntimeError(f"agent /provision failed: {r.status}: " + f"{body[-2000:].decode(errors='replace')}") + + +def _pid_alive(pid: int) -> bool: + try: + os.kill(pid, 0) + return True + except ProcessLookupError: + return False + except PermissionError: + return True diff --git a/playground/web/app.js b/playground/web/app.js new file mode 100644 index 0000000000..fb29eb0d7f --- /dev/null +++ b/playground/web/app.js @@ -0,0 +1,128 @@ +// ClickBench Playground — minimal vanilla-JS client. +// +// Talks to the host API. Three things happen here: +// 1. On load, fetch /api/systems and populate the system dropdown. Pre-select +// whatever's in the URL hash (e.g. #clickhouse) or the first one. +// 2. On selection change, poll /api/system/ every 2s and update the +// state pill so the user can see when provisioning finishes / a VM is +// restarted by the watchdog. +// 3. On "Run query", POST the SQL to /api/query?system=, parse the +// response headers for timing, render bytes as text (best-effort UTF-8). + +const $ = (sel) => document.querySelector(sel); + +const sysSelect = $("#system"); +const queryEl = $("#query"); +const runBtn = $("#run"); +const statePill = $("#state-pill"); +const outEl = $("#output"); +const timeEl = $("#time"); +const wallEl = $("#wall"); +const bytesEl = $("#bytes"); +const truncEl = $("#truncated"); +const exitEl = $("#exit"); +const stateBlob = $("#state-blob"); + +let pollTimer = null; +let knownSystems = []; + +async function loadSystems() { + const r = await fetch("/api/systems"); + knownSystems = await r.json(); + knownSystems.sort((a, b) => a.display_name.localeCompare(b.display_name)); + sysSelect.innerHTML = ""; + for (const s of knownSystems) { + const o = document.createElement("option"); + o.value = s.name; + o.textContent = `${s.display_name} (${s.data_format})`; + sysSelect.appendChild(o); + } + // Allow #clickhouse style deep links + const hash = (location.hash || "").slice(1); + if (hash && knownSystems.some(s => s.name === hash)) { + sysSelect.value = hash; + } + onSystemChange(); +} + +async function pollState() { + const name = sysSelect.value; + if (!name) return; + try { + const r = await fetch(`/api/system/${encodeURIComponent(name)}`); + if (!r.ok) throw new Error(`HTTP ${r.status}`); + const j = await r.json(); + statePill.textContent = j.state || "?"; + statePill.className = `pill ${j.state || ""}`; + stateBlob.textContent = JSON.stringify(j, null, 2); + } catch (e) { + statePill.textContent = "err"; + statePill.className = "pill down"; + stateBlob.textContent = String(e); + } +} + +function onSystemChange() { + if (pollTimer) clearInterval(pollTimer); + location.hash = sysSelect.value; + pollState(); + pollTimer = setInterval(pollState, 2000); +} + +async function runQuery() { + const name = sysSelect.value; + const sql = queryEl.value; + if (!sql.trim()) return; + runBtn.disabled = true; + outEl.textContent = "(running …)"; + timeEl.textContent = "…"; + wallEl.textContent = "…"; + bytesEl.textContent = "—"; + truncEl.textContent = "—"; + exitEl.textContent = "—"; + + const t0 = performance.now(); + try { + const r = await fetch(`/api/query?system=${encodeURIComponent(name)}`, { + method: "POST", + body: sql, + headers: {"Content-Type": "application/octet-stream"}, + }); + const body = await r.arrayBuffer(); + const txt = bytesToText(body); + outEl.textContent = txt || "(no output)"; + + const h = (k) => r.headers.get(k); + const qt = h("X-Query-Time"); + const wt = h("X-Wall-Time"); + timeEl.textContent = qt ? `${parseFloat(qt).toFixed(3)} s (script)` : "—"; + wallEl.textContent = wt ? `${parseFloat(wt).toFixed(3)} s` : `${((performance.now() - t0) / 1000).toFixed(3)} s`; + bytesEl.textContent = h("X-Output-Bytes") || body.byteLength; + truncEl.textContent = h("X-Output-Truncated") === "1" ? "yes" : "no"; + exitEl.textContent = h("X-Exit-Code") || r.status; + if (r.status >= 400) { + const err = h("X-Error"); + if (err) outEl.textContent = `(error)\n${err}\n\n` + outEl.textContent; + } + } catch (e) { + outEl.textContent = `(client error)\n${e}`; + } finally { + runBtn.disabled = false; + } +} + +function bytesToText(buf) { + try { + return new TextDecoder("utf-8", {fatal: false}).decode(buf); + } catch { + return [...new Uint8Array(buf)].map(b => String.fromCharCode(b)).join(""); + } +} + +sysSelect.addEventListener("change", onSystemChange); +runBtn.addEventListener("click", runQuery); +queryEl.addEventListener("keydown", (e) => { + if ((e.metaKey || e.ctrlKey) && e.key === "Enter") runQuery(); +}); + +loadSystems(); diff --git a/playground/web/index.html b/playground/web/index.html new file mode 100644 index 0000000000..e415a5ecff --- /dev/null +++ b/playground/web/index.html @@ -0,0 +1,62 @@ + + + + + +ClickBench Playground + + + +
+

ClickBench Playground

+

+ Run SQL against any of the database systems in + ClickBench, + each isolated in its own Firecracker microVM. The dataset is the + standard hits table — 100 M rows. +

+
+ +
+
+ + + + +
+ +
+ + +
+ +
+
Time:
+
Wall:
+
Bytes:
+
Truncated:
+
Exit:
+
+ +
+ +

+    
+ +
+
+ System status +
loading…
+
+
+
+ +
+ ClickBench · + output is capped at 10 KB · queries are bounded to 10 min · the host + keeps no per-user state. +
+ + + + diff --git a/playground/web/style.css b/playground/web/style.css new file mode 100644 index 0000000000..e630327026 --- /dev/null +++ b/playground/web/style.css @@ -0,0 +1,71 @@ +:root { + --fg: #1f2328; + --muted: #6e7681; + --border: #d0d7de; + --bg: #ffffff; + --bg-alt: #f6f8fa; + --accent: #fb1f00; + --accent-fg: #ffffff; + --pill-bg: #e7eaef; + --pill-fg: #1f2328; + --good: #1f883d; + --bad: #cf222e; +} + +* { box-sizing: border-box; } +html, body { margin: 0; padding: 0; background: var(--bg); color: var(--fg); } +body { font: 14px/1.5 -apple-system, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif; } +header, main, footer { max-width: 960px; margin: 0 auto; padding: 0 16px; } + +header { padding-top: 24px; padding-bottom: 12px; border-bottom: 1px solid var(--border); } +header h1 { margin: 0 0 4px; font-size: 22px; font-weight: 600; } +header .lead { margin: 0; color: var(--muted); } +.muted { color: var(--muted); font-weight: normal; } + +main { padding-top: 16px; padding-bottom: 32px; } +main > section { margin: 12px 0; } + +label { display: block; font-weight: 600; font-size: 12px; text-transform: uppercase; + letter-spacing: 0.04em; color: var(--muted); margin-bottom: 4px; } + +select, textarea, pre, input { + font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace; + font-size: 13px; + border: 1px solid var(--border); + background: var(--bg); + color: var(--fg); + border-radius: 6px; +} + +select { padding: 6px 8px; min-width: 280px; } +textarea { width: 100%; padding: 10px; resize: vertical; } +pre { padding: 10px; background: var(--bg-alt); margin: 0; max-height: 360px; + overflow: auto; white-space: pre-wrap; word-break: break-word; } + +button { + background: var(--accent); color: var(--accent-fg); + border: none; border-radius: 6px; padding: 6px 16px; + font-weight: 600; cursor: pointer; +} +button:disabled { opacity: 0.6; cursor: not-allowed; } +button:hover:not(:disabled) { filter: brightness(0.95); } + +.row { display: flex; align-items: center; gap: 12px; flex-wrap: wrap; } +.row label { margin: 0; } +.stats { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + font-size: 12px; color: var(--muted); padding: 8px 0; + border-top: 1px solid var(--border); border-bottom: 1px solid var(--border); } +.stats span { color: var(--fg); } + +.pill { display: inline-block; padding: 2px 8px; border-radius: 999px; + font-size: 11px; font-weight: 600; background: var(--pill-bg); color: var(--pill-fg); + text-transform: uppercase; letter-spacing: 0.04em; } +.pill.ready { background: #ddf4e4; color: var(--good); } +.pill.snapshotted { background: #fff4d1; color: #9a6700; } +.pill.provisioning { background: #ddf0ff; color: #0969da; } +.pill.down { background: #ffd7d6; color: var(--bad); } + +footer { color: var(--muted); padding-top: 16px; padding-bottom: 32px; + border-top: 1px solid var(--border); font-size: 12px; } +a { color: var(--accent); text-decoration: none; } +a:hover { text-decoration: underline; } From d1e144c2f5d601cdc85e3e5ee791ef59b3aaed42 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 May 2026 20:00:34 +0000 Subject: [PATCH 002/221] playground: mark chroot's /dev /proc /sys as rslave A later `umount -lR` on the chroot's /dev was propagating through the shared mount group and tearing down the host's /dev/pts, breaking sshd's PTY allocation. `--make-rslave` keeps mount events flowing *into* the chroot but blocks unmounts from leaking back to the host. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/images/build-base-rootfs.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index 2cffbaf52f..6e4323bc71 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -92,10 +92,16 @@ trap ' sudo losetup -d "'"$DST_LOOP"'" 2>/dev/null || true ' EXIT -# Bind /dev /proc /sys for the chroot. +# Bind /dev /proc /sys for the chroot. Use `--rbind` so submounts (devpts, +# mqueue, hugepages, /sys/fs/cgroup, …) come along. Critically, mark each +# new mount `--make-rslave` immediately afterwards. Without that, a later +# `umount -lR` on the chroot's `/dev` propagates back through the shared +# mount group and tears down the *host's* `/dev/pts` — at which point sshd +# can't allocate a PTY and the operator gets locked out. for d in dev proc sys; do sudo mkdir -p "$MNT/$d" sudo mount --rbind "/$d" "$MNT/$d" + sudo mount --make-rslave "$MNT/$d" done trap ' for d in dev proc sys; do sudo umount -lR "'"$MNT"'/$d" 2>/dev/null || true; done From 41ed4b37972b4a28755444bfa6708c8479179b1d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 May 2026 20:18:34 +0000 Subject: [PATCH 003/221] playground: shrink snapshots with restart + drop_caches + zstd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A 16 GB guest snapshot.bin compresses to ~2 GB once we 1) stop+start the system daemon (sheds INSERT-time heap arenas, buffers, fresh allocator pages), 2) echo 3 > drop_caches (turns 3-5 GB of page cache into zero pages), 3) zstd -T0 -3 --long=27 (parallel, big match window — most of the savings come from those zero pages). Restart is skipped for in-process engines where stop/start is a no-op AND the data lives in the process; wiping it would defeat the whole point. The host now keeps snapshot.bin.zst as the canonical artifact and decompresses on demand right before /snapshot/load. snapshot.bin itself is deleted after a successful restore + teardown. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/agent/agent.py | 49 ++++++++++ playground/server/vm_manager.py | 153 ++++++++++++++++++++++++++++---- 2 files changed, 183 insertions(+), 19 deletions(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 62f5aad59f..3a2135155d 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -264,6 +264,55 @@ def _provision() -> tuple[int, bytes]: return r.returncode, b"".join(log_lines) subprocess.run(["sync"], check=False) + + # Pre-snapshot trim: + # + # 1. Restart the daemon if the system is restartable. After ./load + # finishes, most engines have hundreds of MB of fresh per-INSERT + # state on the heap: ClickHouse's MergeTree merge thread arenas, + # Postgres' aborted-batch buffers, etc. Stop/start sheds that + # private memory back to zero. Wait for ./check to confirm the + # server is ready again so the snapshot we take is on a quiesced + # process whose first user query will be a cold *query*, not a + # cold *startup*. Skip this for in-process engines (chdb, polars, + # pandas, …) where stop/start is a no-op AND the data lives in + # the process — wiping it would defeat the whole point. + restartable = (SYSTEM_DIR / "start").exists() and (SYSTEM_DIR / "stop").exists() + if restartable: + # Best effort: don't bail on errors. We try to stop, wait, start, + # check; if any step fails we proceed with whatever state we + # have. The host will see ./check fail and refuse to snapshot. + subprocess.run([str(SYSTEM_DIR / "stop")], cwd=str(SYSTEM_DIR), + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + timeout=120, check=False) + for _ in range(60): + rc = subprocess.run([str(SYSTEM_DIR / "check")], cwd=str(SYSTEM_DIR), + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + timeout=10, check=False).returncode + if rc != 0: + break + time.sleep(0.5) + subprocess.run([str(SYSTEM_DIR / "start")], cwd=str(SYSTEM_DIR), + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + timeout=300, check=False) + for _ in range(300): + rc = subprocess.run([str(SYSTEM_DIR / "check")], cwd=str(SYSTEM_DIR), + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + timeout=10, check=False).returncode + if rc == 0: + break + time.sleep(1) + + # 2. Drop the kernel's page+dentry+inode cache. The page cache holds + # 3-5 GB of file data the system would re-read on demand anyway; + # those pages become zero-fill in the snapshot, which zstd + # compresses ~50:1 vs random data. + subprocess.run(["sync"], check=False) + try: + Path("/proc/sys/vm/drop_caches").write_text("3\n") + except Exception: + pass + PROVISION_DONE.write_text(f"ok {time.time()}\n") PROVISION_LOG.write_bytes(b"".join(log_lines)) return 0, b"".join(log_lines) diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 7b97e4d46c..45003c86aa 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -54,6 +54,14 @@ class VM: api_sock: Path log_sock: Path # we just point this at /dev/null actually pid: Optional[int] = None + # Keep the asyncio.subprocess.Process handle for the running firecracker. + # Without holding it, Python eventually garbage-collects the wrapper and + # the underlying child sits as a zombie until the host server + # exits — the kernel keeps the zombie's open TAP fd around with it, and a + # subsequent restore for the same slot then fails to open the TAP with + # "Resource busy". Holding the handle lets us `await proc.wait()` on + # shutdown and reap immediately. + proc: Optional[asyncio.subprocess.Process] = None state: str = "down" # Snapshot artifacts snapshot_bin: Path = dataclasses.field(default_factory=lambda: Path()) @@ -111,8 +119,8 @@ async def ensure_ready_for_query(self, system: str) -> VM: # Process is gone or unresponsive. Treat as snapshotted. vm.state = "snapshotted" if vm.state == "down": - if not vm.snapshot_bin.exists(): - # No snapshot yet — need a full provision. + if not _has_snapshot(vm): + # No snapshot (raw or compressed) yet — full provision. await self._initial_provision(vm) else: await self._restore_snapshot(vm) @@ -177,6 +185,7 @@ async def _spawn_firecracker(self, vm: VM) -> None: "--id", vm.system.name, stdout=log_fh, stderr=log_fh, env=env, start_new_session=True, ) + vm.proc = proc vm.pid = proc.pid # Wait for the API socket to exist for _ in range(80): @@ -248,31 +257,43 @@ async def _boot(self, vm: VM, *, restore_snapshot: bool) -> None: True, we load from the snapshot files; else we cold-boot from kernel + rootfs.""" await self._spawn_firecracker(vm) - sock = str(vm.api_sock) + try: + await self._configure_boot(vm, restore_snapshot=restore_snapshot) + except Exception: + # If config fails partway, the firecracker process still owns the + # TAP fd; without reaping it, the next attempt sees "Resource + # busy" because the kernel hasn't released the TAP. Kill + + # wait() before propagating. + await self._shutdown(vm) + raise - # Network: must be configured *before* either boot path. - await fc.put(sock, f"/network-interfaces/eth0", { - "iface_id": "eth0", - "guest_mac": net.mac_for(vm.slot), - "host_dev_name": net.tap_name(vm.slot), - }) + async def _configure_boot(self, vm: VM, *, restore_snapshot: bool) -> None: + sock = str(vm.api_sock) rootfs = self.cfg.systems_dir / vm.system.name / "rootfs.ext4" sysdisk = self.cfg.systems_dir / vm.system.name / "system.ext4" if restore_snapshot: - # Drives must match the layout that existed when the snapshot was - # taken, but Firecracker re-reads file paths on restore. We rebind - # them here in case the absolute paths changed (e.g. snapshot moved). + # Firecracker's rule: `PUT /snapshot/load` must be the *first* + # configuring action — no boot-source, no drives, no network + # interfaces, no machine-config beforehand. The snapshot itself + # encodes all of that. We just need the same TAP available on + # the host with the same name (host_ensure_tap below handles + # this). await fc.put(sock, "/snapshot/load", { "snapshot_path": str(vm.snapshot_state), "mem_backend": {"backend_type": "File", "backend_path": str(vm.snapshot_bin)}, "enable_diff_snapshots": False, "resume_vm": True, - }) + }, timeout=120.0) return # Cold boot. + await fc.put(sock, "/network-interfaces/eth0", { + "iface_id": "eth0", + "guest_mac": net.mac_for(vm.slot), + "host_dev_name": net.tap_name(vm.slot), + }) await fc.put(sock, "/boot-source", { "kernel_image_path": str(self.cfg.kernel_path), "boot_args": self._kernel_cmdline(vm), @@ -310,8 +331,73 @@ async def _snapshot(self, vm: VM) -> None: with contextlib.suppress(Exception): await fc.patch(sock, "/vm", {"state": "Resumed"}) + # Compress the memory dump with parallel zstd. Firecracker writes the + # *full* 16 GB of guest memory regardless of how much was actually + # used; zstd at -3 with -T0 turns that into ~10-12 GB in a few + # seconds (most of the savings come from the agent's drop_caches + # right before /snapshot — page cache zero-fills compress 50:1). + # snapshot.state stays as-is; it's tiny (~60 KB). + await self._compress_snapshot(vm) + + async def _compress_snapshot(self, vm: VM) -> None: + bin_path = vm.snapshot_bin + zst_path = vm.snapshot_bin.with_suffix(".bin.zst") + if not bin_path.exists(): + return + log.info("[%s] zstd -T0 -3 snapshot.bin (%s)", + vm.system.name, _fmt_size(bin_path.stat().st_size)) + t0 = time.monotonic() + # Stream from snapshot.bin to .zst, multi-threaded. `--long=27` + # widens the matching window to 128 MB which helps with repetitive + # zero-region patterns common in guest RAM. + proc = await asyncio.create_subprocess_exec( + "zstd", "-T0", "-3", "--long=27", "-q", "-f", + str(bin_path), "-o", str(zst_path), + ) + rc = await proc.wait() + dt = time.monotonic() - t0 + if rc != 0: + log.warning("[%s] zstd compression failed rc=%d; keeping raw .bin", + vm.system.name, rc) + zst_path.unlink(missing_ok=True) + return + new = zst_path.stat().st_size + log.info("[%s] zstd done in %.1fs: %s -> %s (%.1fx)", + vm.system.name, dt, + _fmt_size(bin_path.stat().st_size), _fmt_size(new), + bin_path.stat().st_size / max(1, new)) + # The raw .bin can go; restore re-decompresses into a temp file. + bin_path.unlink(missing_ok=True) + + async def _decompress_snapshot(self, vm: VM) -> None: + """If the snapshot lives as .bin.zst, decompress to .bin in place. + Idempotent: a no-op if .bin already exists. + """ + bin_path = vm.snapshot_bin + zst_path = vm.snapshot_bin.with_suffix(".bin.zst") + if bin_path.exists(): + return + if not zst_path.exists(): + return + log.info("[%s] unzstd snapshot.bin.zst (%s)", + vm.system.name, _fmt_size(zst_path.stat().st_size)) + t0 = time.monotonic() + proc = await asyncio.create_subprocess_exec( + "zstd", "-T0", "-d", "-q", "-f", "--long=27", + str(zst_path), "-o", str(bin_path), + ) + rc = await proc.wait() + dt = time.monotonic() - t0 + if rc != 0: + raise RuntimeError(f"zstd decompress failed rc={rc}") + log.info("[%s] unzstd done in %.1fs -> %s", + vm.system.name, dt, _fmt_size(bin_path.stat().st_size)) + async def _restore_snapshot(self, vm: VM) -> None: log.info("[%s] restore from snapshot", vm.system.name) + # If we only have the zstd-compressed memory dump, expand it before + # Firecracker tries to mmap it. + await self._decompress_snapshot(vm) await net.ensure_tap(vm.slot) # internet stays OFF post-snapshot await self._boot(vm, restore_snapshot=True) @@ -319,19 +405,30 @@ async def _restore_snapshot(self, vm: VM) -> None: vm.state = "ready" async def _shutdown(self, vm: VM) -> None: - """Best-effort clean shutdown of the firecracker process.""" - if not vm.pid: + """Best-effort clean shutdown of the firecracker process. + + Always reap the asyncio.subprocess.Process handle so the kernel + releases its open file descriptors (notably the TAP — without this + the next /restore for the same slot fails with `Resource busy`). + """ + if not vm.pid and not vm.proc: return with contextlib.suppress(Exception): await fc.put(str(vm.api_sock), "/actions", {"action_type": "SendCtrlAltDel"}) - # Wait briefly for graceful exit + # Wait briefly for graceful exit. for _ in range(50): - if not _pid_alive(vm.pid): + if vm.pid is None or not _pid_alive(vm.pid): break await asyncio.sleep(0.1) - if _pid_alive(vm.pid): + if vm.pid is not None and _pid_alive(vm.pid): with contextlib.suppress(ProcessLookupError): os.kill(vm.pid, signal.SIGKILL) + # Reap the process. asyncio.Process.wait() drains the exit status so + # the kernel can release the resources (TAP fd, memory mappings). + if vm.proc is not None: + with contextlib.suppress(Exception): + await asyncio.wait_for(vm.proc.wait(), timeout=5.0) + vm.proc = None vm.pid = None with contextlib.suppress(FileNotFoundError): vm.api_sock.unlink() @@ -340,7 +437,13 @@ async def _teardown(self, vm: VM, reason: str) -> None: log.warning("[%s] teardown: %s", vm.system.name, reason) with contextlib.suppress(Exception): await self._shutdown(vm) - vm.state = "snapshotted" if vm.snapshot_bin.exists() else "down" + vm.state = "snapshotted" if _has_snapshot(vm) else "down" + # Drop the decompressed snapshot.bin if we still have the .zst — it's + # ~16 GB of redundancy on disk. Keep .zst as the canonical artifact. + zst = vm.snapshot_bin.with_suffix(".bin.zst") + if vm.snapshot_bin.exists() and zst.exists(): + with contextlib.suppress(FileNotFoundError): + vm.snapshot_bin.unlink() # ── agent helpers ──────────────────────────────────────────────────── @@ -381,6 +484,18 @@ async def _call_agent_provision(self, vm: VM) -> None: f"{body[-2000:].decode(errors='replace')}") +def _has_snapshot(vm: VM) -> bool: + return vm.snapshot_bin.exists() or vm.snapshot_bin.with_suffix(".bin.zst").exists() + + +def _fmt_size(n: int) -> str: + for u in ("B", "KiB", "MiB", "GiB", "TiB"): + if n < 1024: + return f"{n:.1f}{u}" + n //= 1024 + return f"{n}PiB" + + def _pid_alive(pid: int) -> bool: try: os.kill(pid, 0) From db4625a6176fd6f76ab4fa0aa98601ae9a1f818a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 May 2026 20:25:28 +0000 Subject: [PATCH 004/221] playground: capture stop/start output in provision log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous version threw away stdout/stderr from the pre-snapshot stop/start cycle, so a silent failure (`sudo clickhouse start` failing because the data dir was still locked by the dying daemon, etc.) left us with a snapshot of a dead clickhouse-server — restored VMs then returned "Connection refused (localhost:9000)" on every query and the only way to recover was to manually delete the snapshot. Capture stdout+stderr into the provision log so the failure mode is visible via GET /provision-log, and refuse to mark PROVISION_DONE if ./check doesn't recover within the timeout. The host then sees /provision return 500 and skips the snapshot step entirely. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/agent/agent.py | 43 +++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 3a2135155d..45dce9f667 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -271,20 +271,22 @@ def _provision() -> tuple[int, bytes]: # finishes, most engines have hundreds of MB of fresh per-INSERT # state on the heap: ClickHouse's MergeTree merge thread arenas, # Postgres' aborted-batch buffers, etc. Stop/start sheds that - # private memory back to zero. Wait for ./check to confirm the - # server is ready again so the snapshot we take is on a quiesced - # process whose first user query will be a cold *query*, not a - # cold *startup*. Skip this for in-process engines (chdb, polars, - # pandas, …) where stop/start is a no-op AND the data lives in - # the process — wiping it would defeat the whole point. + # private memory back to zero. We capture stop/start output into + # the provision log so a broken restart can be diagnosed, and + # bail out of /provision if ./check doesn't recover — the host + # must NOT snapshot a dead daemon, since post-restore /query + # would then hit "Connection refused" until the user manually + # kicks the VM. Skip the whole dance for in-process engines + # (chdb, polars, pandas, …) where stop/start is a no-op AND the + # data lives in the process; wiping it would defeat the point. restartable = (SYSTEM_DIR / "start").exists() and (SYSTEM_DIR / "stop").exists() if restartable: - # Best effort: don't bail on errors. We try to stop, wait, start, - # check; if any step fails we proceed with whatever state we - # have. The host will see ./check fail and refuse to snapshot. - subprocess.run([str(SYSTEM_DIR / "stop")], cwd=str(SYSTEM_DIR), - stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, - timeout=120, check=False) + log_lines.append(b"\n=== pre-snapshot restart ===\n") + r = subprocess.run([str(SYSTEM_DIR / "stop")], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + timeout=120, check=False) + log_lines.append(b"stop: rc=" + str(r.returncode).encode() + b"\n") + log_lines.append(r.stdout or b"") for _ in range(60): rc = subprocess.run([str(SYSTEM_DIR / "check")], cwd=str(SYSTEM_DIR), stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, @@ -292,16 +294,27 @@ def _provision() -> tuple[int, bytes]: if rc != 0: break time.sleep(0.5) - subprocess.run([str(SYSTEM_DIR / "start")], cwd=str(SYSTEM_DIR), - stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, - timeout=300, check=False) + r = subprocess.run([str(SYSTEM_DIR / "start")], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + timeout=300, check=False) + log_lines.append(b"start: rc=" + str(r.returncode).encode() + b"\n") + log_lines.append(r.stdout or b"") + restart_ok = False for _ in range(300): rc = subprocess.run([str(SYSTEM_DIR / "check")], cwd=str(SYSTEM_DIR), stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=10, check=False).returncode if rc == 0: + restart_ok = True break time.sleep(1) + if not restart_ok: + log_lines.append(b"=== pre-snapshot restart FAILED ===\n") + PROVISION_LOG.write_bytes(b"".join(log_lines)) + # Do NOT set PROVISION_DONE; force /provision to return 500 + # so the host doesn't snapshot a dead daemon. + return 1, b"".join(log_lines) + log_lines.append(b"=== pre-snapshot restart ok ===\n") # 2. Drop the kernel's page+dentry+inode cache. The page cache holds # 3-5 GB of file data the system would re-read on demand anyway; From f9aed82925d92451ad014ba2411e79f469042c3d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 May 2026 20:34:04 +0000 Subject: [PATCH 005/221] playground: kick daemon on agent boot, refresh rootfs on re-provision MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PROVISION_DONE lives on the rootfs disk (/var/lib/clickbench-agent/), which persists across VM cold-boots. So on the second provision after the host deleted the snapshot files, the agent saw PROVISION_DONE already set and returned "already provisioned" — but the daemon itself wasn't running (cold boot, no clickhouse-server in systemd), so the host snapshotted an empty VM and every restored query came back with "Connection refused (localhost:9000)". Two fixes: 1. Agent: on every startup, if PROVISION_DONE is set, kick ./start in a background thread. start is idempotent for the systems that have a daemon, so it costs nothing when the daemon is already up (post-restore) and brings it up when the rootfs is being re-used across a cold reboot. 2. Host: when (re-)provisioning a system with no snapshot, drop the existing rootfs.ext4 so install/start/load run fresh. The system.ext4 (which holds ~14 GB of pre-staged dataset) is preserved. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/agent/agent.py | 34 +++++++++++++++++++++++++++++++++ playground/server/vm_manager.py | 9 +++++++++ 2 files changed, 43 insertions(+) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 45dce9f667..d2b0def356 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -407,10 +407,44 @@ class ReusableServer(socketserver.ThreadingTCPServer): daemon_threads = True +def _kick_daemon_if_provisioned() -> None: + """On every agent boot, if the system has been provisioned, make sure + the daemon is also running. + + The rootfs is persistent across boots, so PROVISION_DONE survives a + cold restart of the VM. But the *process* doesn't — anything that was + in the snapshot's memory image goes away when the host takes a cold + boot (not a restore). Without this kick, a query would arrive at the + agent, the agent would see PROVISION_DONE and skip install/start, + and then ./query would hit a dead daemon and return "Connection + refused (localhost:9000)" forever. + + Run start asynchronously: blocking the agent's listen until the + daemon is ready would defeat /health, which the host uses to gate + snapshot creation and restore-wait timeouts. + """ + if not PROVISION_DONE.exists(): + return + start = SYSTEM_DIR / "start" + if not start.exists() or not os.access(start, os.X_OK): + return + + def _bg() -> None: + try: + subprocess.run([str(start)], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + timeout=300, check=False) + except Exception as e: + sys.stderr.write(f"[agent] daemon-kick failed: {e}\n") + + threading.Thread(target=_bg, daemon=True, name="daemon-kick").start() + + def main() -> None: addr = ("0.0.0.0", LISTEN_PORT) print(f"agent: system={SYSTEM_NAME} listen={addr[0]}:{addr[1]} " f"dir={SYSTEM_DIR} data={DATASETS_DIR}", flush=True) + _kick_daemon_if_provisioned() with ReusableServer(addr, Handler) as srv: srv.serve_forever() diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 45003c86aa..8d75a23751 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -239,6 +239,15 @@ async def _build_images_if_needed(self, vm: VM) -> None: sys_dir = self.cfg.systems_dir / vm.system.name rootfs = sys_dir / "rootfs.ext4" sysdisk = sys_dir / "system.ext4" + # If we're (re-)provisioning a system whose rootfs already has + # /var/lib/clickbench-agent/provisioned set, drop just the rootfs so + # the agent reruns the full install/start/load flow on the next + # boot. The system.ext4 (scripts + ~14 GB of dataset) is preserved — + # rebuilding it copies 14 GB unnecessarily. + if rootfs.exists() and not _has_snapshot(vm): + log.info("[%s] rootfs exists but no snapshot — dropping it for " + "a fresh agent state", vm.system.name) + rootfs.unlink() if rootfs.exists() and sysdisk.exists(): return log.info("[%s] building rootfs + system disk", vm.system.name) From b3c05ee8df0870fae988399f14a4c0598d72fc62 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 May 2026 20:34:44 +0000 Subject: [PATCH 006/221] playground: pre-stamp 'ubuntu' in /etc/hosts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cloud image ships hostname=ubuntu but /etc/hosts only maps 'localhost' to 127.0.0.1. Every sudo invocation inside the VM then tries to reverse-resolve 'ubuntu' against the network — which has no DNS after the snapshot drops internet — and pays the ~2 s resolver timeout. With several sudos per ./query, that's a multi-second floor on every query, visible in the firecracker log as repeated 'sudo: unable to resolve host ubuntu: Name or service not known'. Mapping ubuntu to 127.0.0.1 short-circuits the lookup. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/images/build-base-rootfs.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index 6e4323bc71..e3a102204c 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -181,6 +181,16 @@ EOF # and break on empty. mkdir -p /root chmod 700 /root + +# /etc/hosts: ensure both "localhost" and the cloud-image hostname "ubuntu" +# resolve locally. Without the second entry, every sudo invocation does a +# reverse DNS lookup that times out (~2 s each) trying to find "ubuntu" on +# the dropped-internet network, which adds up to a multi-second floor on +# every /query. Pre-stamping the host name removes the round trip. +cat > /etc/hosts < Date: Tue, 12 May 2026 20:54:39 +0000 Subject: [PATCH 007/221] playground: pre-snapshot sync from host + drop daemon restart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mid-snapshot checksum-mismatch I attributed to "stopping the daemon mid-merge" was actually FS corruption: KVM pauses the vcpus the moment we call /vm Paused, and any ext4 writeback that was in flight at that instant gets captured by the snapshot as half-flushed. On restore the page cache references on-disk blocks that never landed, and the next read sees a torn write. Fix: 1. Drop the pre-snapshot stop/start. Killing ClickHouse at any point never corrupts on-disk MergeTree data — only an unflushed FS can. 2. Add a /sync endpoint to the agent and call it from the host right before /vm Paused, so all dirty pages have hit virtio-blk before KVM freezes the vcpus. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/agent/agent.py | 75 ++++++++------------------------- playground/server/vm_manager.py | 18 ++++++++ 2 files changed, 36 insertions(+), 57 deletions(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index d2b0def356..a6950e0d9a 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -263,63 +263,10 @@ def _provision() -> tuple[int, bytes]: PROVISION_LOG.write_bytes(b"".join(log_lines)) return r.returncode, b"".join(log_lines) - subprocess.run(["sync"], check=False) - - # Pre-snapshot trim: - # - # 1. Restart the daemon if the system is restartable. After ./load - # finishes, most engines have hundreds of MB of fresh per-INSERT - # state on the heap: ClickHouse's MergeTree merge thread arenas, - # Postgres' aborted-batch buffers, etc. Stop/start sheds that - # private memory back to zero. We capture stop/start output into - # the provision log so a broken restart can be diagnosed, and - # bail out of /provision if ./check doesn't recover — the host - # must NOT snapshot a dead daemon, since post-restore /query - # would then hit "Connection refused" until the user manually - # kicks the VM. Skip the whole dance for in-process engines - # (chdb, polars, pandas, …) where stop/start is a no-op AND the - # data lives in the process; wiping it would defeat the point. - restartable = (SYSTEM_DIR / "start").exists() and (SYSTEM_DIR / "stop").exists() - if restartable: - log_lines.append(b"\n=== pre-snapshot restart ===\n") - r = subprocess.run([str(SYSTEM_DIR / "stop")], cwd=str(SYSTEM_DIR), - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - timeout=120, check=False) - log_lines.append(b"stop: rc=" + str(r.returncode).encode() + b"\n") - log_lines.append(r.stdout or b"") - for _ in range(60): - rc = subprocess.run([str(SYSTEM_DIR / "check")], cwd=str(SYSTEM_DIR), - stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, - timeout=10, check=False).returncode - if rc != 0: - break - time.sleep(0.5) - r = subprocess.run([str(SYSTEM_DIR / "start")], cwd=str(SYSTEM_DIR), - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - timeout=300, check=False) - log_lines.append(b"start: rc=" + str(r.returncode).encode() + b"\n") - log_lines.append(r.stdout or b"") - restart_ok = False - for _ in range(300): - rc = subprocess.run([str(SYSTEM_DIR / "check")], cwd=str(SYSTEM_DIR), - stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, - timeout=10, check=False).returncode - if rc == 0: - restart_ok = True - break - time.sleep(1) - if not restart_ok: - log_lines.append(b"=== pre-snapshot restart FAILED ===\n") - PROVISION_LOG.write_bytes(b"".join(log_lines)) - # Do NOT set PROVISION_DONE; force /provision to return 500 - # so the host doesn't snapshot a dead daemon. - return 1, b"".join(log_lines) - log_lines.append(b"=== pre-snapshot restart ok ===\n") - - # 2. Drop the kernel's page+dentry+inode cache. The page cache holds - # 3-5 GB of file data the system would re-read on demand anyway; - # those pages become zero-fill in the snapshot, which zstd - # compresses ~50:1 vs random data. + # Drop the page+dentry+inode cache. The page cache typically holds + # 3-5 GB of file data the system would re-read on demand anyway; + # those pages turn into zero-fill in the snapshot, which zstd + # compresses 50:1 vs random data. subprocess.run(["sync"], check=False) try: Path("/proc/sys/vm/drop_caches").write_text("3\n") @@ -365,6 +312,20 @@ def do_GET(self) -> None: self._send_json(404, {"error": "not found", "path": self.path}) def do_POST(self) -> None: + if self.path == "/sync": + # Flush all dirty pages to the virtio-blk devices. The host + # calls this immediately before /snapshot/create so the + # on-disk image captured in the snapshot is consistent with + # what the in-memory page cache thinks is there. Without + # this, a long-running daemon's writeback may still be in + # flight when KVM pauses the vcpus, the snapshot freezes a + # mid-flush state, and post-restore reads see torn or + # checksum-mismatched data. + t0 = time.monotonic() + subprocess.run(["sync"], check=False) + self._send(200, f"{time.monotonic() - t0:.3f}\n".encode(), + {"Content-Type": "text/plain"}) + return if self.path == "/provision": rc, log = _provision() self._send(200 if rc == 0 else 500, log[-OUTPUT_LIMIT:], diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 8d75a23751..be655a3458 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -327,6 +327,14 @@ async def _configure_boot(self, vm: VM, *, restore_snapshot: bool) -> None: await fc.put(sock, "/actions", {"action_type": "InstanceStart"}) async def _snapshot(self, vm: VM) -> None: + # Flush the guest's dirty pages to the virtio-blk devices before we + # pause the vcpus. Without an explicit sync here, KVM can freeze + # the guest mid-flush — the snapshot then captures memory that + # references on-disk blocks that haven't actually landed yet, and + # the next read after restore sees a checksum mismatch / torn + # write on whatever was being written at the moment of pause. + await self._sync_guest(vm) + sock = str(vm.api_sock) await fc.patch(sock, "/vm", {"state": "Paused"}) try: @@ -492,6 +500,16 @@ async def _call_agent_provision(self, vm: VM) -> None: raise RuntimeError(f"agent /provision failed: {r.status}: " f"{body[-2000:].decode(errors='replace')}") + async def _sync_guest(self, vm: VM) -> None: + url = self.agent_url(vm) + "/sync" + try: + async with aiohttp.ClientSession() as s: + async with s.post(url, timeout=aiohttp.ClientTimeout(total=300)) as r: + body = (await r.read()).decode("utf-8", errors="replace").strip() + log.info("[%s] guest sync: %s", vm.system.name, body) + except Exception as e: + log.warning("[%s] guest sync failed (%r); proceeding anyway", vm.system.name, e) + def _has_snapshot(vm: VM) -> bool: return vm.snapshot_bin.exists() or vm.snapshot_bin.with_suffix(".bin.zst").exists() From 446b3f7ebbf284f52e70ac671c2d587b6bff5d80 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 May 2026 20:59:36 +0000 Subject: [PATCH 008/221] playground: stop daemon before snapshot for tiny snapshot.bin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the host /syncs the FS before pausing the vcpus, the snapshot captures consistent on-disk state regardless of when the daemon exits (MergeTree's on-disk format is durable under arbitrary process exit; only an unflushed *filesystem* corrupts it). So we can shut the daemon down here to evict its private heap (merge thread arenas, query cache, mark cache, uncompressed cache, ingest buffers) and snapshot what's left — mostly zero-fill RAM, which zstd compresses ~300:1. Restore path is unchanged: _kick_daemon_if_provisioned at agent startup brings the daemon back up on every cold restore. First query in a restored VM pays a 1-2 s daemon-start cost instead of carrying 8-12 GB of memory in every snapshot. In-process engines (chdb, polars, …) keep all state in RAM and have no daemon to stop; for them, has_daemon is false and we skip the stop step, falling back to drop_caches alone. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/agent/agent.py | 51 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index a6950e0d9a..743a4068f0 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -263,10 +263,53 @@ def _provision() -> tuple[int, bytes]: PROVISION_LOG.write_bytes(b"".join(log_lines)) return r.returncode, b"".join(log_lines) - # Drop the page+dentry+inode cache. The page cache typically holds - # 3-5 GB of file data the system would re-read on demand anyway; - # those pages turn into zero-fill in the snapshot, which zstd - # compresses 50:1 vs random data. + # Pre-snapshot trim. The host /sync's the FS right before pausing + # the vcpus, so any on-disk data the daemon has already committed + # is durable. That means we're free to stop the daemon here: + # ClickHouse's MergeTree (and equivalent on-disk stores) never + # produce inconsistent on-disk state regardless of when the + # process exits — only an unflushed *filesystem* can. With the + # host-side /sync in place, we can shut the daemon down to evict + # its private heap (merge thread arenas, query cache, mark cache, + # uncompressed cache, parquet ingest buffers, …) and snapshot a + # mostly-zero RAM image. The agent's startup path + # (_kick_daemon_if_provisioned) brings it back up on every + # restore, so the first query in a restored VM pays a 1-2 s + # daemon-start cost instead of carrying 8-12 GB of memory in + # every snapshot. + # + # Skip for in-process / stateless tools where stop/start is a + # no-op AND the data lives in process memory; wiping it would + # defeat the point. Those systems can rely on drop_caches alone. + stop = SYSTEM_DIR / "stop" + start = SYSTEM_DIR / "start" + check = SYSTEM_DIR / "check" + has_daemon = (stop.exists() and start.exists() and + check.exists() and os.access(stop, os.X_OK) and + os.access(start, os.X_OK)) + if has_daemon: + log_lines.append(b"\n=== pre-snapshot stop ===\n") + r = subprocess.run([str(stop)], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + timeout=120, check=False) + log_lines.append(b"stop: rc=" + str(r.returncode).encode() + b"\n") + log_lines.append(r.stdout or b"") + # Wait for the daemon to actually exit (./check failing means + # it's gone). Tolerant if it never fails — we still proceed. + for _ in range(120): + rc = subprocess.run([str(check)], cwd=str(SYSTEM_DIR), + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=10, check=False).returncode + if rc != 0: + break + time.sleep(0.5) + log_lines.append(b"=== pre-snapshot stop done ===\n") + + # Drop the page+dentry+inode cache. With the daemon stopped, this + # frees both file cache AND its mmap'd buffers. The result is a + # snapshot whose memory is mostly zero-fill, which zstd compresses + # ~300:1. subprocess.run(["sync"], check=False) try: Path("/proc/sys/vm/drop_caches").write_text("3\n") From 2d3ac3f963c2c523ea4f2f23054612644ba2c754 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 May 2026 21:16:09 +0000 Subject: [PATCH 009/221] playground: init_on_free=1 + ensure-daemon-up on first /query MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes for the small-snapshot path: 1. Pass init_on_free=1 in the guest kernel cmdline. Linux normally leaves freed page frames with whatever bytes were last written to them, so the post-`clickhouse stop` free pool was ~10 GB of stale daemon heap and Firecracker's snapshot dump compressed only ~3:1. init_on_free=1 zeros every page as it goes onto the free list, so the snapshot's RAM region is genuinely zero-filled and zstd hits ~300:1. 2. Add `_ensure_daemon_started` at the top of the agent's /query handler. After a snapshot restore (taken with the daemon stopped), the restored memory has no daemon process and `localhost:9000` refuses connections. The cold-boot `_kick_daemon_if_provisioned` only fires on actual cold boots, not on snapshot resumes, so we need an explicit check at query time. Lock-protected so concurrent /query requests don't try to ./start the daemon twice; idempotent and free once the daemon is up. Also dropped the userspace _zero_free_ram hack — init_on_free does it natively at no userspace cost. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/agent/agent.py | 59 ++++++++++++++++++++++++++++++--- playground/server/vm_manager.py | 11 +++++- 2 files changed, 65 insertions(+), 5 deletions(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 743a4068f0..40ddbaae5e 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -61,6 +61,12 @@ # scripts hitting the same socket/temp file concurrently would not be safe. _query_lock = threading.Lock() _provision_lock = threading.Lock() +# Tracks whether we've successfully run ./start since this agent process +# came up. After a snapshot restore the daemon doesn't exist in the +# restored memory (we stop it pre-snapshot to keep snapshots small), so the +# first /query has to bring it up. +_daemon_started = threading.Event() +_daemon_lock = threading.Lock() def _cap(b: bytes) -> tuple[bytes, bool]: @@ -130,6 +136,45 @@ def _stats_snapshot() -> dict: return out +def _ensure_daemon_started() -> None: + """Bring the system's daemon up if it isn't already. + + Called at the top of every /query handler. The first call after a + snapshot restore is where the work happens — the snapshot was taken + with the daemon stopped (to keep the memory image compressible), so + nothing is listening on the daemon's port until we explicitly run + ./start. Subsequent calls are no-ops because _daemon_started is set. + + Wrapping ./start in a thread lock means only one /query in flight + pays the start cost, even if several arrive concurrently. + """ + if _daemon_started.is_set(): + return + with _daemon_lock: + if _daemon_started.is_set(): + return + start = SYSTEM_DIR / "start" + if not start.exists() or not os.access(start, os.X_OK): + # No daemon to start (in-process system like chdb/polars). + _daemon_started.set() + return + subprocess.run([str(start)], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + timeout=300, check=False) + # Wait for ./check to confirm before unblocking the /query. + check = SYSTEM_DIR / "check" + if check.exists(): + for _ in range(120): + rc = subprocess.run([str(check)], cwd=str(SYSTEM_DIR), + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=10, check=False).returncode + if rc == 0: + break + time.sleep(0.5) + _daemon_started.set() + + def _run_query(sql: bytes) -> tuple[int, bytes, bytes, float]: """ Invoke ./query with the SQL on stdin. @@ -306,10 +351,12 @@ def _provision() -> tuple[int, bytes]: time.sleep(0.5) log_lines.append(b"=== pre-snapshot stop done ===\n") - # Drop the page+dentry+inode cache. With the daemon stopped, this - # frees both file cache AND its mmap'd buffers. The result is a - # snapshot whose memory is mostly zero-fill, which zstd compresses - # ~300:1. + # Drop the page+dentry+inode cache. With init_on_free=1 set in the + # guest kernel cmdline (see vm_manager._kernel_cmdline), every page + # the kernel frees gets zero-filled before going back on the free + # list. After clickhouse stop + drop_caches, the entire free pool + # is genuinely zero-filled, and the snapshot's RAM dump compresses + # ~300:1 instead of the ~3:1 we got without init_on_free. subprocess.run(["sync"], check=False) try: Path("/proc/sys/vm/drop_caches").write_text("3\n") @@ -383,6 +430,10 @@ def do_POST(self) -> None: if not sql.strip(): self._send_json(400, {"error": "empty query"}) return + # First /query after a snapshot restore: start the daemon + # (it was stopped pre-snapshot to keep snapshots small). + # Subsequent calls are a near-instant no-op. + _ensure_daemon_started() with _query_lock: rc, out, err, wall = _run_query(sql) script_t = _extract_script_timing(err) diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index be655a3458..926af22733 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -200,9 +200,18 @@ def _kernel_cmdline(self, vm: VM) -> str: # console socket for debugging); reboot=k for clean halt-on-panic. # The kernel's built-in IP autoconfig statically assigns the VM's # /24 from its slot, sidestepping any DHCP/networkd in userland. + # + # init_on_free=1: makes the kernel zero every page as it goes back + # on the free list. Without it, freed pages keep whatever the last + # writer put there — and Firecracker's snapshot dumps *all* RAM, + # so 8-12 GB of stale-but-freed daemon heap end up in snapshot.bin + # looking random to zstd. With it on, the pre-snapshot daemon + # shutdown leaves the guest's free pool genuinely zero-filled, and + # zstd compresses the snapshot ~300:1. The cost is a small write + # overhead on every free (~negligible vs the snapshot size win). host_ip, vm_ip, _ = net.addr_for(vm.slot) return ( - "console=ttyS0 reboot=k panic=1 pci=off " + "console=ttyS0 reboot=k panic=1 pci=off init_on_free=1 " f"ip={vm_ip}::{host_ip}:255.255.255.0::eth0:off " "root=/dev/vda rw " "init=/lib/systemd/systemd " From fd5d74fcfbd03322e448cd89e8ed6d0c21682104 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 May 2026 21:30:34 +0000 Subject: [PATCH 010/221] =?UTF-8?q?playground:=20checkpoint=20=E2=80=94=20?= =?UTF-8?q?ClickHouse=20smoke=20test=20passes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit End-to-end working with a 35 MB snapshot (16 GiB raw, ~470x ratio): SELECT COUNT(*) returns 99997497 cleanly, GROUP BY URL produces the expected top-N without any checksum errors, output truncation caps a 244 KB result at 10 KB with the right header set. Cold path (snapshot restore + daemon start): ~10 s. Warm path (live VM): subsecond on COUNT / MIN-MAX. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/docs/build-progress.md | 191 ++++++++++++++---------------- 1 file changed, 92 insertions(+), 99 deletions(-) diff --git a/playground/docs/build-progress.md b/playground/docs/build-progress.md index a710246518..0e1b07629b 100644 --- a/playground/docs/build-progress.md +++ b/playground/docs/build-progress.md @@ -1,110 +1,103 @@ -# Playground build progress — checkpoint 2026-05-12 ~19:58 UTC - -## What is built and committed - -- `playground/` directory scaffolded with subdirs `server/`, `agent/`, - `images/`, `web/`, `scripts/`, `docs/`. -- Architecture notes in `playground/README.md` and - `playground/docs/architecture.md`. -- Host-side API server (`playground/server/*.py`): - - `config.py` — env-driven config with sensible defaults - - `systems.py` — discovers 97 playground-eligible ClickBench systems - - `firecracker.py` — async unix-socket client for Firecracker API - - `net.py` — per-VM TAP + /24 + NAT toggle - - `vm_manager.py` — VM lifecycle (boot, provision, snapshot, restore) - - `monitor.py` — CPU/disk/host-memory watchdog (1 Hz) - - `logging_sink.py` — batched async logger → ClickHouse Cloud + JSONL fallback - - `main.py` — aiohttp routes + static SPA serving -- In-VM agent (`playground/agent/agent.py`, stdlib-only) with endpoints - `/health`, `/stats`, `/provision`, `/query`, `/provision-log`. -- systemd unit `playground/agent/clickbench-agent.service` installed in the - rootfs and enabled. -- Vanilla JS SPA (`playground/web/`): system picker, query box, timing display, - truncation indicator. Talks to `/api/systems`, `/api/system/`, - `/api/query?system=...`. -- Build scripts: - - `images/build-base-rootfs.sh` — Ubuntu 22.04 cloud image → flat 8 GB - ext4 with agent + systemd unit pre-installed. - - `images/build-system-rootfs.sh` — per-system 200 GB sparse rootfs + - sized system disk (16/88 GB depending on data format) containing the - ClickBench scripts + the dataset files this system needs (no symlinks - into a RO mount, because many systems' load scripts `chown`). - - `scripts/install-firecracker.sh` — idempotent host setup. - - `scripts/download-datasets.sh` — eager dataset download into - `/opt/clickbench-playground/datasets/`. - - `scripts/smoke-boot.sh` — boots the base rootfs alone in a VM; confirms - kernel + rootfs + agent path before per-system testing. - - `scripts/agent-selftest.sh` — runs the agent on the host (no VM) and - exercises all endpoints with a fake "system" dir. PASSES. - -## What is provisioned on disk (host) +# Playground build progress — checkpoint 2026-05-12 ~21:30 UTC + +## Status: ClickHouse end-to-end works + +``` +$ printf 'SELECT COUNT(*) FROM hits' | curl -sS -X POST --data-binary @- \ + 'http://127.0.0.1:8000/api/query?system=clickhouse' -D - +HTTP/1.1 200 OK +X-Query-Wall-Time: 0.122721 +X-Output-Bytes: 9 +X-Output-Truncated: 0 +X-Query-Time: 0.003000 +X-Wall-Time: 10.112950 +Content-Length: 9 + +99997497 +``` + +Cold path (snapshot restore + daemon start): ~10 s. +Warm path (live VM): subsecond on COUNT / MIN-MAX, ~24 s on top-of-URL. +Output truncation: 244 KB result correctly capped to 10 KB with +`X-Output-Truncated: 1` set. + +## Snapshot footprint + +`snapshot.bin.zst` for ClickHouse: **35 MB** (down from 16 GB raw RAM dump, +~470× compression). The combination that gets us there: + + 1. Agent stops the daemon at the end of /provision (clickhouse stop). + 2. Agent drops the page+dentry+inode cache. + 3. Guest kernel runs with `init_on_free=1` — every freed page is + zero-filled before going back on the free list, so the resulting + RAM is genuinely compressible (not just "freed-but-stale" stale + bytes that look random to zstd). + 4. Host calls a /sync endpoint on the agent immediately before + /vm Paused, so ext4 writeback completes before KVM freezes the + vcpus — no half-flushed pages in the snapshot. + 5. `zstd -T0 -3 --long=27` for parallel compression with a 128 MB + match window (helps with repetitive zero patterns). + +On restore the agent's first /query brings the daemon back up via +`_ensure_daemon_started`. That's ~3-5 s of clickhouse startup amortized +into the first cold query. + +## Components shipped + +- `playground/server/` — aiohttp API (UI + /api/{systems,system,query, + state,admin/provision,provision-log}), per-system Firecracker + lifecycle, monitor watchdog, batched ClickHouse-Cloud logging sink + with JSONL fallback. +- `playground/agent/` — stdlib HTTP agent. Endpoints: + - GET /health, /stats, /provision-log + - POST /provision (install → start → check → load → stop → drop_caches) + - POST /sync (guest fsync just before host snapshot) + - POST /query (10 KB output cap, fractional-second timing in headers) +- `playground/images/` — `build-base-rootfs.sh` (Ubuntu 22.04 → flat 8 GB + ext4 with agent pre-installed), `build-system-rootfs.sh` (per-system + 200 GB sparse rootfs + sized system disk with pre-staged dataset). +- `playground/web/` — vanilla-JS SPA with system picker, query box, + timing display, truncation indicator. + +## Host state ``` /opt/clickbench-playground/ -├── bin/firecracker, bin/jailer (firecracker v1.13.1) -├── kernel/vmlinux (Linux 6.1.141, IP_PNP + virtio enabled) -├── base-rootfs.ext4 2.6 GB physical / 8 GB apparent +├── bin/firecracker, bin/jailer firecracker v1.13.1 +├── kernel/vmlinux Linux 6.1.141 +├── base-rootfs.ext4 2.6 GB physical / 8 GB apparent ├── datasets/ -│ ├── hits.parquet 14.7 GB (single) -│ ├── hits_partitioned/ 14 GB (100 partitioned files) -│ ├── hits.tsv 74 GB (decompressed) -│ ├── hits.csv ~14 GB partial (kill-stopped) -│ └── hits.csv.gz 16 GB +│ ├── hits.parquet 14.7 GB +│ ├── hits_partitioned/ 14 GB (100 files) +│ ├── hits.tsv 74 GB +│ ├── hits.csv partial (kill-stopped); .gz intact └── systems/clickhouse/ - ├── rootfs.ext4 8.2 MB physical / 200 GB sparse - └── system.ext4 16 GB (parquet files staged) + ├── rootfs.ext4 sparse 200 GB + ├── system.ext4 16 GB (parquet + scripts) + ├── snapshot.bin.zst 35 MB + └── snapshot.state 58 KB ``` -## What works - -- Python module imports clean (`python3 -m playground.server.main`). -- API server serves 97 systems via `/api/systems`. -- UI loads at `/ui/`. -- Firecracker smoke-boot (base rootfs only): agent comes up in 2 s, - `/health` and `/stats` respond OK. -- Agent self-test (no VM): all 4 endpoints behave correctly, output - truncation works (2 KB → 64 B with `X-Output-Truncated: 1`). -- Provision started on ClickHouse VM at 19:51:59 UTC: - - VM booted, agent up, internet enabled via MASQUERADE on `ens33` - - Install ran (ClickHouse binary downloaded + apt deps) - - Load is in progress — `cpu_busy=0.8-1.0` sustained, `disk_used` - grew from 17 GB → 30 GB, indicating MergeTree INSERT. - - At 19:57:33 the agent stopped responding to /health (timeout). - Firecracker process is still running (PID 19230, 16 min of CPU). - Likely cause: agent's HTTP server starved by the load process, - or a fork race in stdlib `socketserver`. Needs investigation. - ## What's left -- Decide whether to add eager liveness pings or move agent to aiohttp - to avoid the stdlib threading server's quirks under heavy load. -- Once provision completes: snapshot → restore → /query test path. -- Build system disks for the other 96 systems (template is ready). -- Wire up ClickHouse Cloud credentials for the logging sink (currently +- Build system disks for the remaining 96 systems (template is ready; + each requires its own provision pass — most should "just work" with + the same flow). +- Tighten the External-only exclusion list in `systems.py` once we've + validated which local-only systems actually run. +- Wire ClickHouse Cloud credentials for the logging sink (currently falling back to JSONL under `/opt/clickbench-playground/logs/`). +- Optional: jailer integration for tighter isolation if the host is + ever multi-tenant. + +## Known sharp edges -## Known issues / things to revisit - -- TSV/CSV decompression contends with rootfs build for nvme writeback. - Workaround: pre-build the base rootfs before kicking off the heavy - decompressions, or rate-limit pigz. -- The "External" exclusion list in `systems.py` is conservative; some - entries (umbra, hyper, cedardb) actually run locally and should be - added back when verified. -- /etc/resolv.conf in the base rootfs is a static fallback (1.1.1.1 + - 8.8.8.8). Once we cut internet post-snapshot, DNS doesn't matter, but - during provision it does — sanity check that NAT + resolv.conf actually - let `apt-get update` work. -- KVM permissions were opened to mode 666 via a udev rule. Tighten to - the `kvm` group when the playground user is properly added. - -## Operator notes - -- The base rootfs ships with serial autologin as root on ttyS0 — good for - attaching the Firecracker console for debugging. -- Firecracker logs land in `/opt/clickbench-playground/logs/firecracker-.log`. -- The host's `/dev/kvm` group/mode was changed: `chown root:kvm`, `chmod 666`, - with a persistent udev rule at `/etc/udev/rules.d/65-kvm.rules`. -- `vm.dirty_writeback_centisecs` is set to 10 on the host (down from 500) - to reduce sfdisk hang during heavy concurrent writeback. Revert if it - causes other problems. +- The `chroot` in `build-base-rootfs.sh` previously tore down the host's + `/dev/pts` via mount propagation, breaking sshd PTY allocation. Fixed + with `mount --make-rslave` (committed); if you see "PTY allocation + request failed on channel 0" after a rebuild, `sudo mount -t devpts + devpts /dev/pts -o gid=5,mode=620,ptmxmode=000` brings it back. +- KVM permissions: a udev rule at `/etc/udev/rules.d/65-kvm.rules` keeps + `/dev/kvm` group=kvm mode=666 so the playground user can open it. +- `vm.dirty_writeback_centisecs=10` on the host (down from 500); revert + if it causes problems elsewhere. From f1088ece9beee9f63a62c027351c3b688fd957eb Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 May 2026 21:37:20 +0000 Subject: [PATCH 011/221] playground: shared RO datasets disk + per-restore golden-disk clone Two correctness/efficiency fixes: 1. Shared read-only datasets disk. Previously each per-system rootfs embedded its own copy of hits.parquet / hits.tsv / hits.csv (14-75 GB each), so the catalog needed ~1-2 TB of redundant dataset storage on the host. Build one shared datasets.ext4 instead, attach to every VM read-only at LABEL=cbdata, and have the agent copy the bytes the system actually needs from /opt/clickbench/datasets into the writable per-system disk at provision time only. The agent uses os.copy_file_range so the in-VM copy is kernel-side, not bounced through userspace. 2. Golden-disk snapshot/restore. Firecracker's snapshot.bin only saves memory; the disk image referenced by the in-memory state is the live file. If anything modifies it between snapshots (background merges, log writes, /tmp churn) the next /snapshot/load points at the new disk while replaying old memory references. We were getting away with this because clickhouse-server happens to be tolerant, but it's fragile. Now /snapshot also renames the working disks into `*.golden.ext4`, and /restore-snapshot clones the goldens back into fresh working copies via `cp --sparse=always`. Every restore starts from the exact disk state captured at snapshot time. 3. Bound per-system disk builds and provisions via asyncio.Semaphore (PLAYGROUND_BUILD_CONCURRENCY=6, PLAYGROUND_PROVISION_CONCURRENCY=32) so kicking off 98 systems at once doesn't thrash the host NVMe or rate-limit Ubuntu mirrors. 4. Re-enabled `ursa` in the playground catalog (was incorrectly in the _EXTERNAL exclude list; it runs locally). Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/agent/agent.py | 86 +++++++++++++++- playground/images/build-base-rootfs.sh | 3 +- playground/images/build-datasets-image.sh | 35 ++++--- playground/images/build-system-rootfs.sh | 117 ++++++---------------- playground/scripts/provision-all.sh | 79 +++++++++++++++ playground/server/systems.py | 2 +- playground/server/vm_manager.py | 106 +++++++++++++++++++- 7 files changed, 324 insertions(+), 104 deletions(-) create mode 100755 playground/scripts/provision-all.sh diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 40ddbaae5e..1b0cc55f1f 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -84,6 +84,67 @@ def _read_body(handler: http.server.BaseHTTPRequestHandler) -> bytes: return handler.rfile.read(min(n, 1 << 20)) +def _stage_dataset(fmt: str) -> list[Path]: + """Copy the dataset file(s) the system's load script needs from the + read-only shared mount into the per-system writable disk. + + Returns the list of staged files. Empty list when there's nothing to + stage (datalake / in-memory engines whose ./load reads from external + sources). Raises if a required file is missing. + """ + staged: list[Path] = [] + if fmt in ("", "none", "unknown"): + return staged + if not DATASETS_DIR.exists(): + raise FileNotFoundError(f"datasets mount missing: {DATASETS_DIR}") + + if fmt == "parquet": + srcs = [DATASETS_DIR / "hits.parquet"] + elif fmt == "parquet-partitioned": + srcs = sorted((DATASETS_DIR / "hits_partitioned").glob("hits_*.parquet")) + elif fmt == "tsv": + srcs = [DATASETS_DIR / "hits.tsv"] + elif fmt == "csv": + srcs = [DATASETS_DIR / "hits.csv"] + else: + srcs = [] + + for src in srcs: + if not src.exists(): + raise FileNotFoundError(f"staged source missing: {src}") + dst = SYSTEM_DIR / src.name + # copy_file_range goes through the kernel without bouncing bytes + # through userspace — much faster than shutil.copyfile for the + # 14 GB / 75 GB files we deal with. + with src.open("rb") as fsrc, dst.open("wb") as fdst: + size = src.stat().st_size + try: + off = 0 + while off < size: + n = os.copy_file_range( + fsrc.fileno(), fdst.fileno(), + size - off, + ) + if n == 0: + break + off += n + except (AttributeError, OSError): + # Fall back to read/write for kernels / filesystems that + # don't support copy_file_range across the underlying + # device pair (RO ext4 -> RW ext4 should be fine, but + # there are kernels that don't allow it). + fsrc.seek(0) + fdst.seek(0) + fdst.truncate(0) + while True: + chunk = fsrc.read(8 * 1024 * 1024) + if not chunk: + break + fdst.write(chunk) + staged.append(dst) + return staged + + def _system_script(name: str) -> Path: """Return path to a script in the system dir, or raise if missing/not executable.""" p = SYSTEM_DIR / name @@ -288,10 +349,27 @@ def _provision() -> tuple[int, bytes]: return 1, b"".join(log_lines) log_lines.append(b"\n=== check ok ===\n") - # Data files are pre-staged on the per-system disk by the host-side - # build-system-rootfs.sh, so the load script's relative references - # (hits.parquet, hits.tsv, etc.) already resolve to local files it - # can chown / mv / rm without worrying about a RO source mount. + # Stage the dataset files this system needs from the read-only + # shared mount into the writable system disk. We copy (rather than + # symlink/bind-mount) so the system's load script can mv/chown/rm + # them however it likes; the destination is a local file on the + # cbsystem disk. After load the script typically `rm`s them, so + # the copies are short-lived. + fmt_file = SYSTEM_DIR / ".data-format" + fmt = fmt_file.read_text().strip() if fmt_file.exists() else "" + stage_t0 = time.monotonic() + log_lines.append(f"\n=== staging dataset (format={fmt}) ===\n".encode()) + try: + staged = _stage_dataset(fmt) + log_lines.append(f"staged {len(staged)} files: ".encode() + + ", ".join(s.name for s in staged[:5]).encode() + + (b" ..." if len(staged) > 5 else b"") + b"\n") + except Exception as e: + log_lines.append(f"stage failed: {e!r}\n".encode()) + PROVISION_LOG.write_bytes(b"".join(log_lines)) + return 1, b"".join(log_lines) + stage_dt = time.monotonic() - stage_t0 + log_lines.append(f"=== staging done in {stage_dt:.1f}s ===\n".encode()) # Run load. t0 = time.monotonic() diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index e3a102204c..01e9714da1 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -172,9 +172,10 @@ passwd -d root # systemd refuses to clear those entries on its own and drops to emergency # mode when label-based lookups fail. The kernel handles the root mount via # its `root=/dev/vda` cmdline; we only need fstab for the system disk. -mkdir -p /opt/clickbench/system +mkdir -p /opt/clickbench/system /opt/clickbench/datasets cat > /etc/fstab </dev/null +# Disable the journal (-O ^has_journal) and reserve 0 blocks for root +# (-m 0); both make sense for a read-only image. +mkfs.ext4 -F -L cbdata -m 0 -O ^has_journal \ + -E lazy_itable_init=1,lazy_journal_init=1 "$OUT" >/dev/null MNT="$(mktemp -d)" trap 'sudo umount "'"$MNT"'" 2>/dev/null || true; rmdir "'"$MNT"'" 2>/dev/null || true' EXIT sudo mount -o loop "$OUT" "$MNT" -sudo rsync -a --info=progress2 "$SRC"/. "$MNT"/ +sudo rsync -a "$SRC"/. "$MNT"/ sudo sync sudo umount "$MNT" trap - EXIT +# Mark the image read-only on the host too, so a misconfigured drive (RW +# attach by mistake) can't scribble. +chmod a-w "$OUT" + echo "[datasets] done" ls -lh "$OUT" diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh index 90efe01c99..f1065284de 100755 --- a/playground/images/build-system-rootfs.sh +++ b/playground/images/build-system-rootfs.sh @@ -3,21 +3,12 @@ # # Outputs (under /opt/clickbench-playground/systems//): # rootfs.ext4 CoW-ish copy of base-rootfs.ext4 (sparse 200 GB) -# system.ext4 ext4 holding ClickBench scripts + the dataset files -# this system needs. Mounted RW at /opt/clickbench/system -# in the VM. We include the data here (not a separate -# read-only datasets disk) because many load scripts do -# `sudo chown` on the source files, and chown follows -# symlinks — i.e. it tries to mutate the RO-mounted -# dataset and fails. Putting the data on the RW system -# disk sidesteps the problem entirely. -# -# The disk is sized based on the system's data format: -# parquet, parquet-partitioned 16 GB -# tsv, csv 88 GB -# none/unknown 2 GB -# -# Usage: build-system-rootfs.sh +# system.ext4 ~2 GB ext4 holding ONLY the system's ClickBench +# scripts. The dataset is *not* copied in here — it +# comes from the host-side shared datasets.ext4 +# attached read-only to every VM (build-datasets- +# image.sh). The agent's /provision step copies +# only the bytes the load script actually needs. set -euo pipefail @@ -30,13 +21,13 @@ SYSTEM="$1" STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" BASE="$STATE_DIR/base-rootfs.ext4" -DATASETS="$STATE_DIR/datasets" SRC="$REPO_DIR/$SYSTEM" OUT_DIR="$STATE_DIR/systems/$SYSTEM" ROOTFS="$OUT_DIR/rootfs.ext4" SYSDISK="$OUT_DIR/system.ext4" ROOTFS_SIZE_GB="${VM_ROOTFS_SIZE_GB:-200}" +SYSDISK_SIZE_GB="${VM_SYSDISK_SIZE_GB:-2}" if [ ! -f "$BASE" ]; then echo "base rootfs not found: $BASE — run build-base-rootfs.sh first" >&2 @@ -53,27 +44,9 @@ for f in install start load query check stop; do fi done -# Discover the data format from the system's benchmark.sh. Source the file in -# a noop-shell so any of `export BENCH_DOWNLOAD_SCRIPT="..."` / -# `BENCH_DOWNLOAD_SCRIPT=...` etc. just becomes a variable. Drop everything -# else by running in a subshell. -download_script="$(set +e; unset BENCH_DOWNLOAD_SCRIPT; \ - eval "$(grep -E '^[[:space:]]*(export[[:space:]]+)?BENCH_DOWNLOAD_SCRIPT=' "$SRC/benchmark.sh" | head -1)"; \ - printf '%s' "${BENCH_DOWNLOAD_SCRIPT:-}")" -case "$download_script" in - *parquet-partitioned*) format=parquet-partitioned; sysdisk_size_gb=16 ;; - *parquet-single*) format=parquet; sysdisk_size_gb=16 ;; - *tsv*) format=tsv; sysdisk_size_gb=88 ;; - *csv*) format=csv; sysdisk_size_gb=88 ;; - "") format=none; sysdisk_size_gb=2 ;; - *) format=unknown; sysdisk_size_gb=4 ;; -esac -echo "[sys:$SYSTEM] format=$format sysdisk_size=${sysdisk_size_gb}G" - mkdir -p "$OUT_DIR" -# 1. Rootfs as a sparse file. Allocate 200 GB but only write blocks when -# something inside the VM dirties them. +# 1. Rootfs: sparse 200 GB. echo "[sys:$SYSTEM] rootfs.ext4 ${ROOTFS_SIZE_GB}G (sparse)" rm -f "$ROOTFS" truncate -s "${ROOTFS_SIZE_GB}G" "$ROOTFS" @@ -86,79 +59,53 @@ trap ' sudo umount "'"$DST_MNT"'" 2>/dev/null || true rmdir "'"$BASE_MNT"'" "'"$DST_MNT"'" 2>/dev/null || true ' EXIT -# A prior smoke-boot likely left the base rootfs's journal dirty. Replay it -# (fsck -fy is idempotent) before opening read-only — otherwise the loop -# mount refuses with "cannot mount read-only" and the script blows up -# silently. +# A prior smoke-boot may have left the base journal dirty; fsck before RO +# mount, otherwise the loop mount refuses with "cannot mount read-only". sudo e2fsck -fy "$BASE" >/dev/null 2>&1 || true sudo mount -o loop,ro "$BASE" "$BASE_MNT" sudo mount -o loop "$ROOTFS" "$DST_MNT" -sudo cp -a --reflink=auto "$BASE_MNT"/. "$DST_MNT"/ +sudo cp -a "$BASE_MNT"/. "$DST_MNT"/ echo "$SYSTEM" | sudo tee "$DST_MNT/etc/clickbench-system" >/dev/null sudo sync sudo umount "$DST_MNT" sudo umount "$BASE_MNT" trap - EXIT -# 2. System disk: ClickBench scripts + the data files this system needs. -# Sized per-format. The agent runs ./install/./start/./load with cwd here, so -# the load script's relative references to hits.parquet / hits.tsv / etc. all -# resolve to local files it owns. -echo "[sys:$SYSTEM] system.ext4 ${sysdisk_size_gb}G" +# 2. System disk: ClickBench scripts only. Sized at SYSDISK_SIZE_GB (2 GB +# default). The agent populates the dataset files into this disk at +# provision time by copying from the shared read-only datasets disk. +echo "[sys:$SYSTEM] system.ext4 ${SYSDISK_SIZE_GB}G" rm -f "$SYSDISK" -truncate -s "${sysdisk_size_gb}G" "$SYSDISK" +truncate -s "${SYSDISK_SIZE_GB}G" "$SYSDISK" mkfs.ext4 -F -L cbsystem -E lazy_itable_init=1,lazy_journal_init=1 "$SYSDISK" >/dev/null SYS_MNT="$(mktemp -d)" trap 'sudo umount "'"$SYS_MNT"'" 2>/dev/null || true; rmdir "'"$SYS_MNT"'" 2>/dev/null || true' EXIT sudo mount -o loop "$SYSDISK" "$SYS_MNT" -# Scripts. +# Scripts + sql + helpers. sudo rsync -a --exclude 'results/' --exclude '*.json' --exclude 'README*' \ "$SRC"/ "$SYS_MNT"/ -# Some systems' scripts use ../lib/... — provide it. +# Some systems' scripts use ../lib/... — make it visible. sudo mkdir -p "$SYS_MNT/_lib" sudo cp -a "$REPO_DIR/lib"/. "$SYS_MNT/_lib"/ -# Data files. -case "$format" in - parquet) - if [ -f "$DATASETS/hits.parquet" ]; then - echo "[sys:$SYSTEM] copying hits.parquet" - sudo cp --reflink=auto "$DATASETS/hits.parquet" "$SYS_MNT/hits.parquet" - else - echo "[sys:$SYSTEM] WARN hits.parquet not present in datasets dir" - fi - ;; - parquet-partitioned) - if [ -d "$DATASETS/hits_partitioned" ]; then - echo "[sys:$SYSTEM] copying 100 partitioned parquet files" - sudo cp --reflink=auto "$DATASETS/hits_partitioned"/hits_*.parquet "$SYS_MNT/" - else - echo "[sys:$SYSTEM] WARN hits_partitioned/ not present" - fi - ;; - tsv) - if [ -f "$DATASETS/hits.tsv" ]; then - echo "[sys:$SYSTEM] copying hits.tsv (large)" - sudo cp --reflink=auto "$DATASETS/hits.tsv" "$SYS_MNT/hits.tsv" - else - echo "[sys:$SYSTEM] WARN hits.tsv not present" - fi - ;; - csv) - if [ -f "$DATASETS/hits.csv" ]; then - echo "[sys:$SYSTEM] copying hits.csv (large)" - sudo cp --reflink=auto "$DATASETS/hits.csv" "$SYS_MNT/hits.csv" - else - echo "[sys:$SYSTEM] WARN hits.csv not present" - fi - ;; - none|unknown) - echo "[sys:$SYSTEM] no data staging for format=$format" - ;; +# Discover the data format from benchmark.sh and stamp it; the agent uses +# this to decide which dataset files to stage from the RO mount. +download_script="$(set +e; unset BENCH_DOWNLOAD_SCRIPT; \ + eval "$(grep -E '^[[:space:]]*(export[[:space:]]+)?BENCH_DOWNLOAD_SCRIPT=' "$SRC/benchmark.sh" | head -1)"; \ + printf '%s' "${BENCH_DOWNLOAD_SCRIPT:-}")" +case "$download_script" in + *parquet-partitioned*) format=parquet-partitioned ;; + *parquet-single*) format=parquet ;; + *tsv*) format=tsv ;; + *csv*) format=csv ;; + "") format=none ;; + *) format=unknown ;; esac +echo "$format" | sudo tee "$SYS_MNT/.data-format" >/dev/null +echo "[sys:$SYSTEM] format=$format" sudo chown -R 0:0 "$SYS_MNT" sudo chmod -R u+rwX,go+rX "$SYS_MNT" diff --git a/playground/scripts/provision-all.sh b/playground/scripts/provision-all.sh new file mode 100755 index 0000000000..913f900fa2 --- /dev/null +++ b/playground/scripts/provision-all.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# Kick off /api/admin/provision/ for every playground-eligible system. +# The server's own semaphore in VMManager bounds the actual concurrency +# (PLAYGROUND_PROVISION_CONCURRENCY, default 32) — this script just fires +# the requests as fast as the host can accept them, then polls until the +# server reports every system as either snapshotted or down-with-error. + +set -euo pipefail + +BASE="${PLAYGROUND_BASE:-http://127.0.0.1:8000}" +STATUS_LOG="${STATUS_LOG:-/opt/clickbench-playground/logs/provision-all.status}" +SKIP_PROVISIONED="${SKIP_PROVISIONED:-yes}" + +# Fetch the catalog. +mapfile -t SYSTEMS < <( + curl -fsS "$BASE/api/systems" | + python3 -c 'import json,sys; [print(x["name"]) for x in json.load(sys.stdin)]' +) + +echo "$(date -Is) catalog: ${#SYSTEMS[@]} systems" + +# Kick off /provision for each system that isn't already snapshotted. +for sys in "${SYSTEMS[@]}"; do + if [ "$SKIP_PROVISIONED" = "yes" ]; then + state=$(curl -fsS "$BASE/api/system/$sys" | + python3 -c 'import json,sys; print(json.load(sys.stdin)["state"])') + if [ "$state" = "snapshotted" ] || [ "$state" = "ready" ]; then + echo " $sys: skip (already $state)" + continue + fi + fi + echo " $sys: kicking provision" + curl -fsS -X POST "$BASE/api/admin/provision/$sys" >/dev/null +done + +echo "$(date -Is) all kicked off; polling state..." + +# Poll until every system reaches a terminal state. Emit one line per +# transition. +declare -A LAST_STATE +while true; do + in_flight=0 + succeeded=0 + failed=0 + : > "$STATUS_LOG.tmp" + for sys in "${SYSTEMS[@]}"; do + body=$(curl -fsS --max-time 5 "$BASE/api/system/$sys" 2>/dev/null || echo '{}') + state=$(echo "$body" | python3 -c 'import json,sys; d=json.load(sys.stdin); print(d.get("state","?"))' 2>/dev/null) + err=$(echo "$body" | python3 -c 'import json,sys; d=json.load(sys.stdin); print(d.get("last_error") or "")' 2>/dev/null) + echo "$sys $state $err" >> "$STATUS_LOG.tmp" + prev="${LAST_STATE[$sys]:-}" + if [ "$state" != "$prev" ]; then + ts=$(date -Is) + echo "$ts $sys: $prev -> $state${err:+ (err=$err)}" + LAST_STATE[$sys]=$state + fi + case "$state" in + snapshotted|ready) succeeded=$((succeeded+1)) ;; + down) [ -n "$err" ] && failed=$((failed+1)) || in_flight=$((in_flight+1)) ;; + provisioning) in_flight=$((in_flight+1)) ;; + *) in_flight=$((in_flight+1)) ;; + esac + done + mv "$STATUS_LOG.tmp" "$STATUS_LOG" + echo "$(date -Is) ok=$succeeded fail=$failed in_flight=$in_flight" + if [ "$in_flight" -eq 0 ]; then + echo "$(date -Is) done" + break + fi + sleep 30 +done + +# Final summary. +echo "" +echo "=== FINAL SUMMARY ===" +awk '{print $2}' "$STATUS_LOG" | sort | uniq -c +echo "" +echo "=== FAILED ===" +awk '$2 == "down" && NF > 2 {print}' "$STATUS_LOG" diff --git a/playground/server/systems.py b/playground/server/systems.py index c41599858a..3ba6862d39 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -36,7 +36,7 @@ "gravitons", "heavyai", "hologres", "hydrolix", "kinetica", "motherduck", "oxla", "pgpro_tam", "redshift", "redshift-serverless", "s3select", "singlestore", "snowflake", "supabase", - "tembo-olap", "timescale-cloud", "tinybird", "ursa", "velodb", + "tembo-olap", "timescale-cloud", "tinybird", "velodb", "vertica", "ydb", } diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 926af22733..f27abc59cb 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -84,6 +84,19 @@ def __init__(self, config: Config, systems: dict[str, System]): self.cfg = config self.systems = systems self.vms: dict[str, VM] = {} + # Bound the number of system-disk builds running concurrently. Each + # build copies up to ~88 GB of dataset (for tsv/csv systems) — doing + # 98 in parallel would thrash the host's NVMe. 6 is enough to keep + # the disk busy without hitting writeback stalls. + self._build_sem = asyncio.Semaphore(int(os.environ.get( + "PLAYGROUND_BUILD_CONCURRENCY", "6"))) + # Cap on simultaneous in-flight provisions. Each one needs 4 vCPU + + # apt-get downloads from the public internet; running 98 concurrently + # gets rate-limited by Ubuntu mirrors and we have to retry. The host + # has plenty of headroom for 32, which still finishes the catalog + # in one pass. + self._provision_sem = asyncio.Semaphore(int(os.environ.get( + "PLAYGROUND_PROVISION_CONCURRENCY", "32"))) # Stable slot allocation: sort systems alphabetically so each system # always gets the same slot id (and therefore the same TAP/IP). for i, name in enumerate(sorted(systems.keys()), start=1): @@ -328,6 +341,17 @@ async def _configure_boot(self, vm: VM, *, restore_snapshot: bool) -> None: "is_root_device": False, "is_read_only": False, }) + # Shared dataset disk, attached read-only to every VM (LABEL=cbdata + # mount in the guest fstab). Saves ~1-2 TB of host storage compared + # to embedding the dataset into each per-system disk. + datasets_img = self.cfg.datasets_image + if datasets_img.exists(): + await fc.put(sock, "/drives/datasets", { + "drive_id": "datasets", + "path_on_host": str(datasets_img), + "is_root_device": False, + "is_read_only": True, + }) await fc.put(sock, "/machine-config", { "vcpu_count": self.cfg.vm_vcpus, "mem_size_mib": self.cfg.vm_mem_mib, @@ -357,6 +381,16 @@ async def _snapshot(self, vm: VM) -> None: with contextlib.suppress(Exception): await fc.patch(sock, "/vm", {"state": "Resumed"}) + # Capture the *disk* state too. The memory snapshot is meaningless on + # its own: it has in-flight references to specific inodes / file + # positions / mmap'd ranges on the rootfs and system disks, and if + # those move under it the restored process malfunctions. We sparse- + # copy the disks into a parallel "golden" path; every subsequent + # restore boots off a fresh copy of the golden, so background work + # the daemon does after restore (clickhouse merges, log writes, + # /tmp churn) never persists into the next session. + await self._snapshot_disks(vm) + # Compress the memory dump with parallel zstd. Firecracker writes the # *full* 16 GB of guest memory regardless of how much was actually # used; zstd at -3 with -T0 turns that into ~10-12 GB in a few @@ -421,6 +455,12 @@ async def _decompress_snapshot(self, vm: VM) -> None: async def _restore_snapshot(self, vm: VM) -> None: log.info("[%s] restore from snapshot", vm.system.name) + # Always boot from a *fresh copy* of the golden disks captured at + # snapshot time. Restore #N inherits zero state from restore #N-1, + # which is what makes the playground safe to expose to arbitrary + # SQL: the worst a user query can do is dirty the working copy, + # which we throw away on the next /teardown. + await self._restore_disks(vm) # If we only have the zstd-compressed memory dump, expand it before # Firecracker tries to mmap it. await self._decompress_snapshot(vm) @@ -430,6 +470,50 @@ async def _restore_snapshot(self, vm: VM) -> None: await self._wait_for_agent(vm, timeout=60) vm.state = "ready" + def _golden_paths(self, vm: VM) -> tuple[Path, Path, Path, Path]: + """(working rootfs, working sysdisk, golden rootfs, golden sysdisk).""" + sys_dir = self.cfg.systems_dir / vm.system.name + return ( + sys_dir / "rootfs.ext4", + sys_dir / "system.ext4", + sys_dir / "rootfs.golden.ext4", + sys_dir / "system.golden.ext4", + ) + + async def _snapshot_disks(self, vm: VM) -> None: + rootfs, sysdisk, rootfs_gold, sysdisk_gold = self._golden_paths(vm) + # Atomically swap: rename the working images into the golden slot. + # Both disks were sync'd via /sync before /snapshot/create, so + # what's on disk is consistent with what's in the memory snapshot. + # We'll re-create the working images by cloning from the golden + # on every restore (see _restore_disks). + for src, dst in ((rootfs, rootfs_gold), (sysdisk, sysdisk_gold)): + if dst.exists(): + dst.unlink() + os.replace(src, dst) + log.info("[%s] golden disks saved (%s, %s)", vm.system.name, + _fmt_size(rootfs_gold.stat().st_size), + _fmt_size(sysdisk_gold.stat().st_size)) + + async def _restore_disks(self, vm: VM) -> None: + rootfs, sysdisk, rootfs_gold, sysdisk_gold = self._golden_paths(vm) + if not rootfs_gold.exists() or not sysdisk_gold.exists(): + raise RuntimeError( + f"[{vm.system.name}] missing golden disks; cannot restore") + # Clone the goldens into fresh working copies. `cp --sparse=always` + # only writes the non-zero blocks, so the cost is proportional to + # the actual data on each disk, not its apparent 200 GB. + for src, dst in ((rootfs_gold, rootfs), (sysdisk_gold, sysdisk)): + if dst.exists(): + dst.unlink() + proc = await asyncio.create_subprocess_exec( + "cp", "--sparse=always", str(src), str(dst), + ) + rc = await proc.wait() + if rc != 0: + raise RuntimeError(f"cp {src} -> {dst} failed rc={rc}") + log.info("[%s] working disks cloned from golden", vm.system.name) + async def _shutdown(self, vm: VM) -> None: """Best-effort clean shutdown of the firecracker process. @@ -470,6 +554,16 @@ async def _teardown(self, vm: VM, reason: str) -> None: if vm.snapshot_bin.exists() and zst.exists(): with contextlib.suppress(FileNotFoundError): vm.snapshot_bin.unlink() + # Discard the working disks. Any changes the daemon scribbled into + # them during this session (background merges, log writes, /tmp + # churn) die with them; the next restore will clone fresh copies + # from the golden disks, so user N+1 sees the same starting state + # as user N. + if _has_snapshot(vm): + rootfs, sysdisk, _, _ = self._golden_paths(vm) + for p in (rootfs, sysdisk): + with contextlib.suppress(FileNotFoundError): + p.unlink() # ── agent helpers ──────────────────────────────────────────────────── @@ -521,7 +615,17 @@ async def _sync_guest(self, vm: VM) -> None: def _has_snapshot(vm: VM) -> bool: - return vm.snapshot_bin.exists() or vm.snapshot_bin.with_suffix(".bin.zst").exists() + """A snapshot is complete only when *both* the memory image and the + golden disks have been captured. A half-built snapshot (memory present + but goldens missing, or vice versa) is treated as no snapshot at all + so the next ensure_ready_for_query re-provisions cleanly. + """ + mem_ok = (vm.snapshot_bin.exists() or + vm.snapshot_bin.with_suffix(".bin.zst").exists()) + sys_dir = vm.snapshot_bin.parent + disks_ok = ((sys_dir / "rootfs.golden.ext4").exists() and + (sys_dir / "system.golden.ext4").exists()) + return mem_ok and disks_ok def _fmt_size(n: int) -> str: From 69f29a446e76618d44dffb36e7490c10216a093d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 May 2026 22:01:23 +0000 Subject: [PATCH 012/221] playground: overlayfs at /opt/clickbench/system, no dataset copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous design copied dataset files from the read-only cbdata mount into the per-VM writable cbsystem disk on every provision — 14 GB for parquet systems, 75 GB for tsv/csv. That worked but was redundant: the data is already on a read-only mount, the only reason we copied was that ClickBench's load scripts do `sudo mv` and `sudo chown` on the dataset files. Use overlayfs instead: lowerdir = /opt/clickbench/datasets_ro (RO, the shared image) upperdir = /opt/clickbench/system_upper (RW per-VM disk with scripts) merged at /opt/clickbench/system The system's load runs at cwd=/opt/clickbench/system. It sees scripts + dataset files in one tree. When it `mv`s or `chown`s a file from the lower, overlayfs does a lazy copy-up: only the file's bytes get materialised into the upper, and only when the script actually mutates it. Most ClickBench load scripts `rm` the dataset file after INSERT, which becomes a whiteout in the upper — a few bytes of metadata, not a 75 GB copy. Saves ~1-2 TB across the catalog on host disk (no per-system copies) *and* eliminates the per-provision in-VM stage. Only cost: small metadata to maintain the overlay (kilobytes). For partitioned parquet, the source files live in datasets_ro/hits_partitioned/ but the load globs cwd/hits_*.parquet, so the agent creates symlinks in the upper pointing at the lower — ~100 symlinks, a few hundred bytes total. Also: make build-datasets-image.sh idempotent. The 173 GB rsync into datasets.ext4 only needs to run when the source dir's mtime has changed; otherwise the cached image is reused. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/agent/agent.py | 107 +++++++--------------- playground/images/build-base-rootfs.sh | 23 ++++- playground/images/build-datasets-image.sh | 18 +++- 3 files changed, 70 insertions(+), 78 deletions(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 1b0cc55f1f..d082082db0 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -38,7 +38,7 @@ from pathlib import Path SYSTEM_DIR = Path(os.environ.get("CLICKBENCH_SYSTEM_DIR", "/opt/clickbench/system")) -DATASETS_DIR = Path(os.environ.get("CLICKBENCH_DATASETS_DIR", "/opt/clickbench/datasets")) +DATASETS_DIR = Path(os.environ.get("CLICKBENCH_DATASETS_DIR", "/opt/clickbench/datasets_ro")) STATE_DIR = Path(os.environ.get("CLICKBENCH_AGENT_STATE", "/var/lib/clickbench-agent")) SYSTEM_NAME = ( os.environ.get("CLICKBENCH_SYSTEM_NAME") @@ -84,65 +84,33 @@ def _read_body(handler: http.server.BaseHTTPRequestHandler) -> bytes: return handler.rfile.read(min(n, 1 << 20)) -def _stage_dataset(fmt: str) -> list[Path]: - """Copy the dataset file(s) the system's load script needs from the - read-only shared mount into the per-system writable disk. +def _stage_dataset_layout(fmt: str) -> None: + """Make the system's load script see hits.* in its cwd. - Returns the list of staged files. Empty list when there's nothing to - stage (datalake / in-memory engines whose ./load reads from external - sources). Raises if a required file is missing. + The base rootfs's /etc/fstab overlay-mounts /opt/clickbench/system from + lower=/opt/clickbench/datasets_ro (the shared dataset image) + upper=/ + opt/clickbench/system_upper (this VM's writable scripts disk). Most + load scripts reference hits.parquet / hits.tsv / hits_*.parquet at + cwd, which is /opt/clickbench/system — the overlay already exposes + those files there, no copy needed. + + Partitioned parquet lives in a `hits_partitioned/` subdirectory in the + lower; the clickhouse load globs `hits_*.parquet` in cwd. Create + symlinks (in the upper) pointing at the lower files so the glob + matches. Symlinks cost a few bytes per file — far cheaper than the + 14 GB physical copy we used to do. """ - staged: list[Path] = [] - if fmt in ("", "none", "unknown"): - return staged - if not DATASETS_DIR.exists(): - raise FileNotFoundError(f"datasets mount missing: {DATASETS_DIR}") - - if fmt == "parquet": - srcs = [DATASETS_DIR / "hits.parquet"] - elif fmt == "parquet-partitioned": - srcs = sorted((DATASETS_DIR / "hits_partitioned").glob("hits_*.parquet")) - elif fmt == "tsv": - srcs = [DATASETS_DIR / "hits.tsv"] - elif fmt == "csv": - srcs = [DATASETS_DIR / "hits.csv"] - else: - srcs = [] - - for src in srcs: - if not src.exists(): - raise FileNotFoundError(f"staged source missing: {src}") - dst = SYSTEM_DIR / src.name - # copy_file_range goes through the kernel without bouncing bytes - # through userspace — much faster than shutil.copyfile for the - # 14 GB / 75 GB files we deal with. - with src.open("rb") as fsrc, dst.open("wb") as fdst: - size = src.stat().st_size - try: - off = 0 - while off < size: - n = os.copy_file_range( - fsrc.fileno(), fdst.fileno(), - size - off, - ) - if n == 0: - break - off += n - except (AttributeError, OSError): - # Fall back to read/write for kernels / filesystems that - # don't support copy_file_range across the underlying - # device pair (RO ext4 -> RW ext4 should be fine, but - # there are kernels that don't allow it). - fsrc.seek(0) - fdst.seek(0) - fdst.truncate(0) - while True: - chunk = fsrc.read(8 * 1024 * 1024) - if not chunk: - break - fdst.write(chunk) - staged.append(dst) - return staged + if fmt == "parquet-partitioned": + src_dir = DATASETS_DIR / "hits_partitioned" + if not src_dir.exists(): + raise FileNotFoundError(f"partitioned dir missing: {src_dir}") + for f in sorted(src_dir.glob("hits_*.parquet")): + link = SYSTEM_DIR / f.name + if not link.exists(): + link.symlink_to(f) + # parquet / tsv / csv already appear in cwd via the overlay lower + # (their files sit at /opt/clickbench/datasets_ro/hits.parquet etc. + # and the overlay merges that path's contents into the merged dir). def _system_script(name: str) -> Path: @@ -349,27 +317,20 @@ def _provision() -> tuple[int, bytes]: return 1, b"".join(log_lines) log_lines.append(b"\n=== check ok ===\n") - # Stage the dataset files this system needs from the read-only - # shared mount into the writable system disk. We copy (rather than - # symlink/bind-mount) so the system's load script can mv/chown/rm - # them however it likes; the destination is a local file on the - # cbsystem disk. After load the script typically `rm`s them, so - # the copies are short-lived. + # Make sure hits.* are visible at cwd for the load script. For + # parquet / tsv / csv the overlay already does it (files appear + # in /opt/clickbench/system because the lower's hits.parquet etc. + # are in datasets_ro). For partitioned parquet we add symlinks in + # the upper because the source lives in datasets_ro/hits_partitioned/. fmt_file = SYSTEM_DIR / ".data-format" fmt = fmt_file.read_text().strip() if fmt_file.exists() else "" - stage_t0 = time.monotonic() - log_lines.append(f"\n=== staging dataset (format={fmt}) ===\n".encode()) + log_lines.append(f"\n=== layout dataset (format={fmt}) ===\n".encode()) try: - staged = _stage_dataset(fmt) - log_lines.append(f"staged {len(staged)} files: ".encode() + - ", ".join(s.name for s in staged[:5]).encode() + - (b" ..." if len(staged) > 5 else b"") + b"\n") + _stage_dataset_layout(fmt) except Exception as e: - log_lines.append(f"stage failed: {e!r}\n".encode()) + log_lines.append(f"layout failed: {e!r}\n".encode()) PROVISION_LOG.write_bytes(b"".join(log_lines)) return 1, b"".join(log_lines) - stage_dt = time.monotonic() - stage_t0 - log_lines.append(f"=== staging done in {stage_dt:.1f}s ===\n".encode()) # Run load. t0 = time.monotonic() diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index 01e9714da1..099ee916c7 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -172,10 +172,27 @@ passwd -d root # systemd refuses to clear those entries on its own and drops to emergency # mode when label-based lookups fail. The kernel handles the root mount via # its `root=/dev/vda` cmdline; we only need fstab for the system disk. -mkdir -p /opt/clickbench/system /opt/clickbench/datasets +# Three-layer mount plan: +# 1. The shared read-only dataset disk (cbdata) is attached to every VM +# and mounted at /opt/clickbench/datasets_ro. Holds hits.parquet, +# hits.tsv, hits.csv, hits_partitioned/*.parquet — same bytes, one +# copy on the host, never duplicated per VM or per provision. +# 2. The per-VM writable system disk (cbsystem) mounts at +# /opt/clickbench/system_upper. Holds the system's ClickBench +# scripts (install, start, query, ...). +# 3. An overlayfs at /opt/clickbench/system merges both. The system's +# load script runs there with cwd=/opt/clickbench/system and sees a +# single tree containing scripts + dataset files. When the load +# does `mv hits.parquet target/` or `chown` on a dataset file, +# overlayfs copies that one file up from the lower into the upper +# lazily — only the bytes the script actually mutates land in the +# per-VM writable layer. +mkdir -p /opt/clickbench/system /opt/clickbench/datasets_ro \ + /opt/clickbench/system_upper /opt/clickbench/system_work cat > /etc/fstab </dev/null || echo 0) + out_mtime=$(stat -c%Y "$OUT" 2>/dev/null || echo 0) + src_newest=$(find "$SRC" -type f -printf '%T@\n' | sort -rn | head -1 | cut -d. -f1) + if [ "$out_size" -ge "$bytes" ] && [ "$out_mtime" -gt "${src_newest:-0}" ]; then + echo "[datasets] cached ($(du -h "$OUT" | cut -f1)); set REBUILD=1 to force" + ls -lh "$OUT" + exit 0 + fi +fi + rm -f "$OUT" truncate -s "${size_mib}M" "$OUT" # Disable the journal (-O ^has_journal) and reserve 0 blocks for root From ec999832a572468e433b44d41ab4487465ad1f54 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 May 2026 22:26:26 +0000 Subject: [PATCH 013/221] playground: enforce build/provision semaphores, clone-based rootfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes for the parallel-provisioning-98-systems path: 1. The _build_sem and _provision_sem fields were defined but never acquired — `provision-all.sh` kicked all 98 provisions at once and they each independently spawned build-system-rootfs.sh, which tried to write ~8 GB of rootfs base content × 98 in parallel (~780 GB of writes against a single NVMe). Disk got saturated and nothing finished. Use `async with self._build_sem:` and `async with self._provision_sem:` around the heavy phases. 2. build-system-rootfs.sh now clones the base image at block level with `cp --sparse=always` and resizes the filesystem to 200 GB in place, instead of mkfs.ext4 + mount + rsync-of-base-contents. The block-level clone touches only the ~2 GB of non-zero blocks in the base, vs. the rsync approach traversing the mounted base and writing every file individually. Per-system rootfs build goes from ~30 s to ~3 s. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/images/build-system-rootfs.sh | 38 ++++++++++++------------ playground/server/vm_manager.py | 29 ++++++++++++------ 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh index f1065284de..89fcf22f52 100755 --- a/playground/images/build-system-rootfs.sh +++ b/playground/images/build-system-rootfs.sh @@ -46,29 +46,29 @@ done mkdir -p "$OUT_DIR" -# 1. Rootfs: sparse 200 GB. -echo "[sys:$SYSTEM] rootfs.ext4 ${ROOTFS_SIZE_GB}G (sparse)" +# 1. Rootfs: clone the base ext4 file block-level (sparse), then resize to +# 200 GB. This is dramatically cheaper than mkfs+mount+rsync-of-base: +# `cp --sparse=always` writes only the ~2 GB of non-zero blocks the base +# actually uses, instead of traversing the mounted base and writing each +# file individually. Going from cp-with-mount to block-clone takes the +# per-system rootfs build from ~30 s to ~3 s on this NVMe. +echo "[sys:$SYSTEM] rootfs.ext4 (clone+resize to ${ROOTFS_SIZE_GB}G)" rm -f "$ROOTFS" +cp --sparse=always "$BASE" "$ROOTFS" +# Grow the filesystem to fill 200 GB. The base ext4 superblock thinks the +# disk is its original size; resize2fs notices the file is bigger and +# extends the metadata to cover it. truncate -s "${ROOTFS_SIZE_GB}G" "$ROOTFS" -mkfs.ext4 -F -L cbroot -E lazy_itable_init=1,lazy_journal_init=1 "$ROOTFS" >/dev/null +sudo e2fsck -fy "$ROOTFS" >/dev/null 2>&1 || true +sudo resize2fs "$ROOTFS" >/dev/null 2>&1 -BASE_MNT="$(mktemp -d)" -DST_MNT="$(mktemp -d)" -trap ' - sudo umount "'"$BASE_MNT"'" 2>/dev/null || true - sudo umount "'"$DST_MNT"'" 2>/dev/null || true - rmdir "'"$BASE_MNT"'" "'"$DST_MNT"'" 2>/dev/null || true -' EXIT -# A prior smoke-boot may have left the base journal dirty; fsck before RO -# mount, otherwise the loop mount refuses with "cannot mount read-only". -sudo e2fsck -fy "$BASE" >/dev/null 2>&1 || true -sudo mount -o loop,ro "$BASE" "$BASE_MNT" -sudo mount -o loop "$ROOTFS" "$DST_MNT" -sudo cp -a "$BASE_MNT"/. "$DST_MNT"/ -echo "$SYSTEM" | sudo tee "$DST_MNT/etc/clickbench-system" >/dev/null +# Stamp the system name so the agent can identify itself. +MNT="$(mktemp -d)" +trap 'sudo umount "'"$MNT"'" 2>/dev/null || true; rmdir "'"$MNT"'" 2>/dev/null || true' EXIT +sudo mount -o loop "$ROOTFS" "$MNT" +echo "$SYSTEM" | sudo tee "$MNT/etc/clickbench-system" >/dev/null sudo sync -sudo umount "$DST_MNT" -sudo umount "$BASE_MNT" +sudo umount "$MNT" trap - EXIT # 2. System disk: ClickBench scripts only. Sized at SYSDISK_SIZE_GB (2 GB diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index f27abc59cb..258cc6cbf3 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -236,18 +236,29 @@ async def _initial_provision(self, vm: VM) -> None: if vm.state != "down": raise RuntimeError(f"unexpected state for initial provision: {vm.state}") + # Bound the heavy I/O phases: + # _build_images_if_needed: each call does a `cp -a /base /rootfs` + # that writes ~8 GB of base content. Running 98 in parallel + # saturates the host's NVMe writeback. + # _call_agent_provision: each spawn does `apt-get install` + # against Ubuntu mirrors and pulls 100s of MB. 98 at once gets + # rate-limited by the mirror. + # Use distinct semaphores so disk and network are bounded + # independently. log.info("[%s] initial provision begin", vm.system.name) vm.state = "provisioning" try: - await self._build_images_if_needed(vm) - await net.ensure_tap(vm.slot) - await net.enable_internet(vm.slot) - await self._boot(vm, restore_snapshot=False) - await self._wait_for_agent(vm, timeout=180) - await self._call_agent_provision(vm) - await self._snapshot(vm) - await self._shutdown(vm) - await net.disable_internet(vm.slot) + async with self._build_sem: + await self._build_images_if_needed(vm) + async with self._provision_sem: + await net.ensure_tap(vm.slot) + await net.enable_internet(vm.slot) + await self._boot(vm, restore_snapshot=False) + await self._wait_for_agent(vm, timeout=180) + await self._call_agent_provision(vm) + await self._snapshot(vm) + await self._shutdown(vm) + await net.disable_internet(vm.slot) vm.state = "snapshotted" vm.provisioned_at = time.time() log.info("[%s] initial provision complete", vm.system.name) From ab2fa8c8060e6d6627894ae90f09f21f94423436 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 May 2026 23:26:31 +0000 Subject: [PATCH 014/221] playground: flatten datasets, drop the symlink layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the agent created symlinks in the overlay's upper for partitioned parquet (hits_partitioned/* -> upper/hits_*.parquet) because the source directory was nested. That fell apart on clickhouse's load: `mv hits_*.parquet /var/lib/clickhouse/user_files/` moved the symlinks, and the subsequent `chown` followed them through to the read-only datasets disk and got `Read-only file system`. Flatten the dataset image so all 100 partitioned parquet files sit at the root next to hits.parquet / hits.tsv / hits.csv. The overlay then exposes them directly at /opt/clickbench/system as real files, no symlinks involved. clickhouse's `mv` becomes a real copy-up (and the source becomes a whiteout in upper), and the subsequent `chown` operates on a regular file on the rootfs — works. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/agent/agent.py | 49 +++--------------------- playground/images/build-base-rootfs.sh | 20 ++++++---- playground/images/build-system-rootfs.sh | 23 ++++++----- 3 files changed, 33 insertions(+), 59 deletions(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index d082082db0..0e30eff6d4 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -84,35 +84,6 @@ def _read_body(handler: http.server.BaseHTTPRequestHandler) -> bytes: return handler.rfile.read(min(n, 1 << 20)) -def _stage_dataset_layout(fmt: str) -> None: - """Make the system's load script see hits.* in its cwd. - - The base rootfs's /etc/fstab overlay-mounts /opt/clickbench/system from - lower=/opt/clickbench/datasets_ro (the shared dataset image) + upper=/ - opt/clickbench/system_upper (this VM's writable scripts disk). Most - load scripts reference hits.parquet / hits.tsv / hits_*.parquet at - cwd, which is /opt/clickbench/system — the overlay already exposes - those files there, no copy needed. - - Partitioned parquet lives in a `hits_partitioned/` subdirectory in the - lower; the clickhouse load globs `hits_*.parquet` in cwd. Create - symlinks (in the upper) pointing at the lower files so the glob - matches. Symlinks cost a few bytes per file — far cheaper than the - 14 GB physical copy we used to do. - """ - if fmt == "parquet-partitioned": - src_dir = DATASETS_DIR / "hits_partitioned" - if not src_dir.exists(): - raise FileNotFoundError(f"partitioned dir missing: {src_dir}") - for f in sorted(src_dir.glob("hits_*.parquet")): - link = SYSTEM_DIR / f.name - if not link.exists(): - link.symlink_to(f) - # parquet / tsv / csv already appear in cwd via the overlay lower - # (their files sit at /opt/clickbench/datasets_ro/hits.parquet etc. - # and the overlay merges that path's contents into the merged dir). - - def _system_script(name: str) -> Path: """Return path to a script in the system dir, or raise if missing/not executable.""" p = SYSTEM_DIR / name @@ -317,20 +288,12 @@ def _provision() -> tuple[int, bytes]: return 1, b"".join(log_lines) log_lines.append(b"\n=== check ok ===\n") - # Make sure hits.* are visible at cwd for the load script. For - # parquet / tsv / csv the overlay already does it (files appear - # in /opt/clickbench/system because the lower's hits.parquet etc. - # are in datasets_ro). For partitioned parquet we add symlinks in - # the upper because the source lives in datasets_ro/hits_partitioned/. - fmt_file = SYSTEM_DIR / ".data-format" - fmt = fmt_file.read_text().strip() if fmt_file.exists() else "" - log_lines.append(f"\n=== layout dataset (format={fmt}) ===\n".encode()) - try: - _stage_dataset_layout(fmt) - except Exception as e: - log_lines.append(f"layout failed: {e!r}\n".encode()) - PROVISION_LOG.write_bytes(b"".join(log_lines)) - return 1, b"".join(log_lines) + # No explicit data staging — the system's load script sees + # hits.parquet / hits.tsv / hits.csv / hits_*.parquet at cwd + # already, because cwd is the overlay merged dir + # /opt/clickbench/system and the dataset disk's contents (the + # overlay's lower) sit at /opt/clickbench/datasets_ro at the + # filesystem root, matching the names the load scripts use. # Run load. t0 = time.monotonic() diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index 099ee916c7..e4282001a8 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -178,21 +178,27 @@ passwd -d root # hits.tsv, hits.csv, hits_partitioned/*.parquet — same bytes, one # copy on the host, never duplicated per VM or per provision. # 2. The per-VM writable system disk (cbsystem) mounts at -# /opt/clickbench/system_upper. Holds the system's ClickBench -# scripts (install, start, query, ...). -# 3. An overlayfs at /opt/clickbench/system merges both. The system's -# load script runs there with cwd=/opt/clickbench/system and sees a +# /opt/clickbench/sysdisk. We put both the overlay's upperdir AND +# its workdir inside this mount — overlayfs requires them on the +# same filesystem; nesting both as subdirs of one mount is the +# cleanest way. +# /opt/clickbench/sysdisk/upper/ ClickBench scripts go here +# /opt/clickbench/sysdisk/work/ overlay scratch (auto-cleared) +# 3. An overlayfs at /opt/clickbench/system merges +# lowerdir = datasets_ro +# upperdir = sysdisk/upper +# The system's load runs at cwd=/opt/clickbench/system and sees a # single tree containing scripts + dataset files. When the load # does `mv hits.parquet target/` or `chown` on a dataset file, # overlayfs copies that one file up from the lower into the upper # lazily — only the bytes the script actually mutates land in the # per-VM writable layer. mkdir -p /opt/clickbench/system /opt/clickbench/datasets_ro \ - /opt/clickbench/system_upper /opt/clickbench/system_work + /opt/clickbench/sysdisk cat > /etc/fstab </dev/null || true; rmdir "'"$SYS_MNT"'" 2>/dev/null || true' EXIT sudo mount -o loop "$SYSDISK" "$SYS_MNT" -# Scripts + sql + helpers. +# The cbsystem disk is mounted at /opt/clickbench/sysdisk in the guest; +# the overlay points its upperdir at sysdisk/upper and its workdir at +# sysdisk/work. Pre-create that layout and drop the system's ClickBench +# scripts into upper. +sudo mkdir -p "$SYS_MNT/upper" "$SYS_MNT/work" sudo rsync -a --exclude 'results/' --exclude '*.json' --exclude 'README*' \ - "$SRC"/ "$SYS_MNT"/ + "$SRC"/ "$SYS_MNT/upper"/ # Some systems' scripts use ../lib/... — make it visible. -sudo mkdir -p "$SYS_MNT/_lib" -sudo cp -a "$REPO_DIR/lib"/. "$SYS_MNT/_lib"/ +sudo mkdir -p "$SYS_MNT/upper/_lib" +sudo cp -a "$REPO_DIR/lib"/. "$SYS_MNT/upper/_lib"/ -# Discover the data format from benchmark.sh and stamp it; the agent uses -# this to decide which dataset files to stage from the RO mount. +# Discover the data format from benchmark.sh and stamp it in the upper; +# the agent uses this to decide which dataset symlinks to add for +# partitioned formats. download_script="$(set +e; unset BENCH_DOWNLOAD_SCRIPT; \ eval "$(grep -E '^[[:space:]]*(export[[:space:]]+)?BENCH_DOWNLOAD_SCRIPT=' "$SRC/benchmark.sh" | head -1)"; \ printf '%s' "${BENCH_DOWNLOAD_SCRIPT:-}")" @@ -104,11 +109,11 @@ case "$download_script" in "") format=none ;; *) format=unknown ;; esac -echo "$format" | sudo tee "$SYS_MNT/.data-format" >/dev/null +echo "$format" | sudo tee "$SYS_MNT/upper/.data-format" >/dev/null echo "[sys:$SYSTEM] format=$format" -sudo chown -R 0:0 "$SYS_MNT" -sudo chmod -R u+rwX,go+rX "$SYS_MNT" +sudo chown -R 0:0 "$SYS_MNT/upper" +sudo chmod -R u+rwX,go+rX "$SYS_MNT/upper" sudo sync sudo umount "$SYS_MNT" trap - EXIT From d69f5578134702f4a2e4f1df2c79de583cb05929 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 00:54:11 +0000 Subject: [PATCH 015/221] playground: cbsystem disk 200 GB sparse, not 2 GB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 2 GB cap on the per-VM system disk was a holdover from the in-VM-copy era, when system.ext4 only held scripts + staged data. Once we switched to overlay-with-RO-datasets, system.ext4 also holds the overlay's upperdir + workdir — i.e. every byte the load script writes lands there, including the database's own files. ClickHouse writes ~5 GB of MergeTree parts, DuckDB ~6 GB, Hyper ~10 GB; chown on partitioned parquet copies up another 14 GB. 2 GB was always going to overflow. Match the rootfs at 200 GB (apparent). The file is sparse: truncate reserves the size but allocates no physical blocks, mkfs.ext4 writes ~50 MB of metadata, and the snapshot/restore path uses `cp --sparse=always` so only the bytes the VM actually wrote land on the host disk. Light systems (chdb, sqlite, ...) cost the host near nothing; heavy ones (tidb at ~137 GB, postgres-indexed ~80 GB) fit without hitting ENOSPC mid-load. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/images/build-system-rootfs.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh index e81dba3284..4ca6baa662 100755 --- a/playground/images/build-system-rootfs.sh +++ b/playground/images/build-system-rootfs.sh @@ -27,7 +27,19 @@ ROOTFS="$OUT_DIR/rootfs.ext4" SYSDISK="$OUT_DIR/system.ext4" ROOTFS_SIZE_GB="${VM_ROOTFS_SIZE_GB:-200}" -SYSDISK_SIZE_GB="${VM_SYSDISK_SIZE_GB:-2}" +# Apparent size of the cbsystem disk. Every byte the load script writes +# (overlay copy-ups of the dataset, the database's own files — +# MergeTree parts, duckdb's hits.db, etc.) lands here. Some systems are +# heavy: tidb writes ~137 GB, postgres-indexed ~80 GB, druid ~50 GB. +# Match the rootfs cap (200 GB) so any single system has room. +# +# This is a SPARSE file: `truncate` reserves the apparent size but +# allocates no physical blocks. mkfs.ext4 only writes the small initial +# metadata. Real disk usage tracks the bytes the VM actually writes, +# and `cp --sparse=always` on the golden-disk path preserves that +# sparseness through snapshot+restore — snapshots of light systems +# stay light. +SYSDISK_SIZE_GB="${VM_SYSDISK_SIZE_GB:-200}" if [ ! -f "$BASE" ]; then echo "base rootfs not found: $BASE — run build-base-rootfs.sh first" >&2 From c33fd3b704c02fb7c1b0aa1a776a763efe8a6062 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 00:56:25 +0000 Subject: [PATCH 016/221] playground: drop per-clone e2fsck, do it once at base build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each per-system rootfs build was running `e2fsck -fy` on its clone before `resize2fs`. With 98 systems and ~5 s per fsck of a 200 GB sparse file, that's ~8 minutes of pure disk thrash during catalog build — and entirely redundant: the base ext4 is built fresh and never mounted dirty, so the bit-for-bit clone is clean too. Move the single fsck to the end of build-base-rootfs.sh (where it has all the host's I/O to itself) and skip it in the per-system loop. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/images/build-base-rootfs.sh | 8 ++++++++ playground/images/build-system-rootfs.sh | 10 ++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index e4282001a8..a66a1964e7 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -236,4 +236,12 @@ trap - EXIT mv "$FLAT" "$OUT" rm -rf "$TMP" + +# Final fsck: every per-system rootfs is cloned from this file and then +# resize2fs'd, which requires the source filesystem to be clean. Doing +# the fsck once here, while build-base-rootfs.sh has full I/O headroom, +# is much cheaper than doing it 98 times during the parallel system +# build phase. +sudo e2fsck -fy "$OUT" >/dev/null 2>&1 || true + echo "[base] done: $OUT ($(du -h "$OUT" | cut -f1) physical, $(du -h --apparent-size "$OUT" | cut -f1) apparent)" diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh index 4ca6baa662..072157afe9 100755 --- a/playground/images/build-system-rootfs.sh +++ b/playground/images/build-system-rootfs.sh @@ -64,14 +64,16 @@ mkdir -p "$OUT_DIR" # actually uses, instead of traversing the mounted base and writing each # file individually. Going from cp-with-mount to block-clone takes the # per-system rootfs build from ~30 s to ~3 s on this NVMe. +# +# build-base-rootfs.sh leaves the base ext4 clean, so the clone is also +# clean and resize2fs accepts it without a prior e2fsck pass. Skipping +# e2fsck saves ~5 s per system × 98 systems = ~8 minutes off catalog +# build time, and an e2fsck of a 200 GB sparse file is a *lot* of I/O +# for a "filesystem we know is fine" operation. echo "[sys:$SYSTEM] rootfs.ext4 (clone+resize to ${ROOTFS_SIZE_GB}G)" rm -f "$ROOTFS" cp --sparse=always "$BASE" "$ROOTFS" -# Grow the filesystem to fill 200 GB. The base ext4 superblock thinks the -# disk is its original size; resize2fs notices the file is bigger and -# extends the metadata to cover it. truncate -s "${ROOTFS_SIZE_GB}G" "$ROOTFS" -sudo e2fsck -fy "$ROOTFS" >/dev/null 2>&1 || true sudo resize2fs "$ROOTFS" >/dev/null 2>&1 # Stamp the system name so the agent can identify itself. From 06bf791ba60dd056ac2da62df0c81605679785c5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 01:12:19 +0000 Subject: [PATCH 017/221] playground: pre-size base rootfs to 200 GB, drop per-clone resize2fs The base ext4 used to be built at 8 GB and each per-system rootfs clone ran resize2fs to grow to 200 GB. resize2fs on a 200 GB file is disk-heavy (it has to write group descriptor and bitmap metadata for every additional block group), and we did it 98 times in parallel. Build the base directly at 200 GB sparse with lazy_itable_init=1,lazy_journal_init=1. mkfs writes ~50 MB of superblock + GDT material upfront and defers the rest to lazy background init, so the image file's physical footprint is unchanged from the previous 8 GB layout (~1.8 GB). Per-system clones then need only `cp --sparse=always`: no resize2fs, no e2fsck, ~1 second each. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/images/build-base-rootfs.sh | 6 +++++- playground/images/build-system-rootfs.sh | 20 +++++--------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index a66a1964e7..93fd195b00 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -17,7 +17,11 @@ set -euo pipefail STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" TMP="${STATE_DIR}/tmp/base-build" OUT="${STATE_DIR}/base-rootfs.ext4" -SIZE_GB="${BASE_ROOTFS_SIZE_GB:-8}" +# Match the per-system rootfs cap (200 GB) so build-system-rootfs.sh can +# clone the base directly with `cp --sparse=always` and skip resize2fs. +# The image is sparse: mkfs.ext4 with lazy_itable_init writes only the +# superblocks (~50 MB) upfront, and clones inherit that sparseness. +SIZE_GB="${BASE_ROOTFS_SIZE_GB:-200}" CLOUDIMG_URL="${UBUNTU_CLOUDIMG_URL:-https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-amd64.img}" REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" AGENT_DIR="${REPO_DIR}/playground/agent" diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh index 072157afe9..be8cdbeb67 100755 --- a/playground/images/build-system-rootfs.sh +++ b/playground/images/build-system-rootfs.sh @@ -58,23 +58,13 @@ done mkdir -p "$OUT_DIR" -# 1. Rootfs: clone the base ext4 file block-level (sparse), then resize to -# 200 GB. This is dramatically cheaper than mkfs+mount+rsync-of-base: -# `cp --sparse=always` writes only the ~2 GB of non-zero blocks the base -# actually uses, instead of traversing the mounted base and writing each -# file individually. Going from cp-with-mount to block-clone takes the -# per-system rootfs build from ~30 s to ~3 s on this NVMe. -# -# build-base-rootfs.sh leaves the base ext4 clean, so the clone is also -# clean and resize2fs accepts it without a prior e2fsck pass. Skipping -# e2fsck saves ~5 s per system × 98 systems = ~8 minutes off catalog -# build time, and an e2fsck of a 200 GB sparse file is a *lot* of I/O -# for a "filesystem we know is fine" operation. -echo "[sys:$SYSTEM] rootfs.ext4 (clone+resize to ${ROOTFS_SIZE_GB}G)" +# 1. Rootfs: clone the base ext4 file block-level (sparse). The base is +# already sized at ROOTFS_SIZE_GB with mostly-empty ext4 metadata, so +# `cp --sparse=always` produces a sparse 200 GB image of the right size +# in seconds — no resize2fs, no e2fsck, no mount-and-rsync. +echo "[sys:$SYSTEM] rootfs.ext4 (sparse clone of base)" rm -f "$ROOTFS" cp --sparse=always "$BASE" "$ROOTFS" -truncate -s "${ROOTFS_SIZE_GB}G" "$ROOTFS" -sudo resize2fs "$ROOTFS" >/dev/null 2>&1 # Stamp the system name so the agent can identify itself. MNT="$(mktemp -d)" From b804e76eda9707edfc19483dbe26d168e8a57579 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 01:15:44 +0000 Subject: [PATCH 018/221] playground: drop redundant sync calls in image builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `umount` already syncs the filesystem being unmounted. The host-wide `sync` we were calling first flushes every dirty page on *every* mount — under 98-way parallel builds, each build's sync blocked on every other build's writeback, multiplying the wall-clock cost. Drop them. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/images/build-datasets-image.sh | 1 - playground/images/build-system-rootfs.sh | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/playground/images/build-datasets-image.sh b/playground/images/build-datasets-image.sh index b5d91a9672..1a66ea151e 100755 --- a/playground/images/build-datasets-image.sh +++ b/playground/images/build-datasets-image.sh @@ -54,7 +54,6 @@ MNT="$(mktemp -d)" trap 'sudo umount "'"$MNT"'" 2>/dev/null || true; rmdir "'"$MNT"'" 2>/dev/null || true' EXIT sudo mount -o loop "$OUT" "$MNT" sudo rsync -a "$SRC"/. "$MNT"/ -sudo sync sudo umount "$MNT" trap - EXIT diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh index be8cdbeb67..9fa9e5169c 100755 --- a/playground/images/build-system-rootfs.sh +++ b/playground/images/build-system-rootfs.sh @@ -67,11 +67,14 @@ rm -f "$ROOTFS" cp --sparse=always "$BASE" "$ROOTFS" # Stamp the system name so the agent can identify itself. +# Note: no explicit `sync` — `umount` syncs the filesystem being unmounted. +# A global `sync` here would block until every dirty page on the host's +# disk is flushed, which under 98-way parallel builds means every build +# waits for everyone else's writeback before its own umount returns. MNT="$(mktemp -d)" trap 'sudo umount "'"$MNT"'" 2>/dev/null || true; rmdir "'"$MNT"'" 2>/dev/null || true' EXIT sudo mount -o loop "$ROOTFS" "$MNT" echo "$SYSTEM" | sudo tee "$MNT/etc/clickbench-system" >/dev/null -sudo sync sudo umount "$MNT" trap - EXIT @@ -118,7 +121,6 @@ echo "[sys:$SYSTEM] format=$format" sudo chown -R 0:0 "$SYS_MNT/upper" sudo chmod -R u+rwX,go+rX "$SYS_MNT/upper" -sudo sync sudo umount "$SYS_MNT" trap - EXIT From 5ea3be29d2cdf540da16fe4418ccc65ac5f3fd67 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 01:44:39 +0000 Subject: [PATCH 019/221] playground: fstrim before snapshot so freed dataset bytes leave the golden When clickhouse's load `mv hits.parquet /var/lib/clickhouse/user_files/` (or any cross-FS move) copies the 14-75 GB dataset into the writable per-VM disk and then `rm`'s it after INSERT, ext4 marks those blocks free but the underlying virtio-blk file still carries the bytes. `cp --sparse=always` on the golden then preserves them as random data, so the per-system snapshot for a parquet engine carried a full extra copy of the dataset that the load already discarded. Adding `fstrim /opt/clickbench/sysdisk` and `fstrim /` before the host's snapshot makes the guest issue DISCARD for free blocks; the host loop driver responds by punching holes in the sparse backing file (linux loop devices advertise discard with PUNCH_HOLE since 4.x, which firecracker's virtio-blk passes through). The golden then holds only the bytes the engine actually keeps. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/agent/agent.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 0e30eff6d4..6dec225068 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -356,7 +356,7 @@ def _provision() -> tuple[int, bytes]: # Drop the page+dentry+inode cache. With init_on_free=1 set in the # guest kernel cmdline (see vm_manager._kernel_cmdline), every page # the kernel frees gets zero-filled before going back on the free - # list. After clickhouse stop + drop_caches, the entire free pool + # list. After daemon stop + drop_caches, the entire free pool # is genuinely zero-filled, and the snapshot's RAM dump compresses # ~300:1 instead of the ~3:1 we got without init_on_free. subprocess.run(["sync"], check=False) @@ -365,6 +365,21 @@ def _provision() -> tuple[int, bytes]: except Exception: pass + # fstrim the per-VM disks. Load scripts typically do `mv hits.parquet + # /var/lib//user_files/` (which on overlay/cross-FS copies the + # 14-75 GB dataset into the writable per-VM disk) and then `rm` it + # after the INSERT. ext4 marks those blocks free but the underlying + # virtio-blk file still holds the bytes — the snapshot's golden disk + # then carries a full copy of the dataset that the load script + # already discarded. `fstrim` sends DISCARD for free blocks; the + # host loop driver responds by punching holes in the sparse backing + # file, so the golden ends up holding only the bytes the engine + # actually keeps (MergeTree parts, hits.db, etc.). + for mnt in ("/opt/clickbench/sysdisk", "/"): + subprocess.run(["fstrim", mnt], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + timeout=300, check=False) + PROVISION_DONE.write_text(f"ok {time.time()}\n") PROVISION_LOG.write_bytes(b"".join(log_lines)) return 0, b"".join(log_lines) From b0d0c36c3c57c897510e6aa861ba1e137346dfdd Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 01:50:24 +0000 Subject: [PATCH 020/221] ClickBench: replace dataset copy/move with symlinks where safe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several systems' load scripts do `sudo mv hits_*.parquet /var/lib//user_files/` or `sudo cp hits.csv .../extern/` followed by `chown` to the daemon's user. The mv/cp copies 14-75 GB of data the daemon reads once during INSERT and we delete right after — a complete waste of bytes on disk and time on the wire. Replace with `ln -s` + `chown -h` where the daemon's user-files dir is on a different filesystem from the dataset. `chown -h` chowns the symlink itself rather than following into the (often read-only) original; the underlying dataset is mode 644 anyway, so daemon processes can read through the symlink as their own user. Systems updated: clickhouse, clickhouse-tencent, pg_clickhouse, kinetica, oxla, ursa, arc, cockroachdb. Motivated by the ClickBench playground (Firecracker microVM service) where the dataset is mounted read-only and shared across all VMs; the copy step was the dominant cost on parquet/csv-format systems and pulled 14 GB into the per-VM snapshot golden disk unnecessarily. The change is also benign for the regular benchmark — daemons still read the same bytes, just through a symlink. Co-Authored-By: Claude Opus 4.7 (1M context) --- arc/load | 11 ++--------- clickhouse-tencent/load | 7 +++++-- clickhouse/load | 12 ++++++++++-- cockroachdb/load | 6 ++++-- kinetica/load | 3 ++- oxla/load | 3 ++- pg_clickhouse/load | 13 +++++++++---- ursa/load | 11 ++++++++--- 8 files changed, 42 insertions(+), 24 deletions(-) diff --git a/arc/load b/arc/load index b46a4e3265..8ef8b45918 100755 --- a/arc/load +++ b/arc/load @@ -8,13 +8,6 @@ TARGET_FILE="$TARGET_DIR/hits.parquet" sudo mkdir -p "$TARGET_DIR" -if [ -f "$TARGET_FILE" ] && \ - [ "$(stat -c%s hits.parquet)" -eq "$(stat -c%s "$TARGET_FILE")" ]; then - : # already loaded -else - sudo cp hits.parquet "$TARGET_FILE" -fi - -# Free up local space. -rm -f hits.parquet +# Symlink rather than copy — hits.parquet is 14 GB and we read it once. +sudo ln -sfn "$PWD/hits.parquet" "$TARGET_FILE" sync diff --git a/clickhouse-tencent/load b/clickhouse-tencent/load index 4a423a9b42..3bcbe2f69f 100755 --- a/clickhouse-tencent/load +++ b/clickhouse-tencent/load @@ -3,8 +3,11 @@ set -e clickhouse-client < create.sql -sudo mv hits_*.parquet /var/lib/clickhouse/user_files/ -sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet +# Symlink rather than copy — see comment in clickhouse/load. +for f in hits_*.parquet; do + sudo ln -sf "$PWD/$f" /var/lib/clickhouse/user_files/"$f" +done +sudo chown -h clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet clickhouse-client --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads "$(( $(nproc) / 4 ))" diff --git a/clickhouse/load b/clickhouse/load index 4a423a9b42..df578a617c 100755 --- a/clickhouse/load +++ b/clickhouse/load @@ -3,8 +3,16 @@ set -e clickhouse-client < create.sql -sudo mv hits_*.parquet /var/lib/clickhouse/user_files/ -sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet +# Symlink the parquet files into ClickHouse's user_files dir rather than +# moving them. mv on a 14 GB partitioned dataset wastes minutes copying +# bytes the daemon will only read once and then we delete; ln -s is +# instant. chown -h sets the symlink's owner (does not follow into the +# target), which is enough — the underlying parquets are mode 644, so +# the clickhouse user can read them through the symlinks regardless. +for f in hits_*.parquet; do + sudo ln -sf "$PWD/$f" /var/lib/clickhouse/user_files/"$f" +done +sudo chown -h clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet clickhouse-client --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads "$(( $(nproc) / 4 ))" diff --git a/cockroachdb/load b/cockroachdb/load index 2afaf6709b..d4c22685f1 100755 --- a/cockroachdb/load +++ b/cockroachdb/load @@ -3,9 +3,11 @@ set -eu CRDBDATADIR=/var/lib/cockroach-data -# Stage data into cockroach's "extern" directory so it can be loaded via nodelocal://. +# Stage data into cockroach's "extern" directory so it can be loaded via +# nodelocal://. Symlink rather than copy — hits.csv is 75 GB and we read +# it once. sudo mkdir -p "$CRDBDATADIR/extern" -sudo cp hits.csv "$CRDBDATADIR/extern/hits.csv" +sudo ln -sfn "$PWD/hits.csv" "$CRDBDATADIR/extern/hits.csv" cockroach sql --insecure --host=localhost --execute='DROP DATABASE IF EXISTS test CASCADE;' cockroach sql --insecure --host=localhost --execute='CREATE DATABASE test;' diff --git a/kinetica/load b/kinetica/load index 75630adb1c..523f581545 100755 --- a/kinetica/load +++ b/kinetica/load @@ -9,7 +9,8 @@ CLI="./kisql --host localhost --user admin" # decompressed TSV. wget --continue --progress=dot:giga \ 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' -sudo mv hits.tsv.gz ./kinetica-persist/ +# Symlink rather than copy: hits.tsv.gz is 16 GB and we only read it once. +sudo ln -sf "$PWD/hits.tsv.gz" ./kinetica-persist/hits.tsv.gz $CLI --file create.sql $CLI --sql "ALTER TIER ram WITH OPTIONS ('capacity' = '27000000000');" diff --git a/oxla/load b/oxla/load index e1f99c03b8..6b9163a386 100755 --- a/oxla/load +++ b/oxla/load @@ -4,7 +4,8 @@ set -eu export PGCLIENTENCODING=UTF8 mkdir -p data -sudo mv hits.csv data/ +# Symlink rather than copy: hits.csv is 75 GB. +sudo ln -sf "$PWD/hits.csv" data/hits.csv PGPASSWORD=oxla psql -h localhost -U oxla -q -t < create.sql PGPASSWORD=oxla psql -h localhost -U oxla -q -t -c "COPY hits FROM '/data/hits.csv';" diff --git a/pg_clickhouse/load b/pg_clickhouse/load index 6d0ed09980..69825dda47 100755 --- a/pg_clickhouse/load +++ b/pg_clickhouse/load @@ -4,11 +4,16 @@ set -eu # Create the ClickHouse table. clickhouse-client < create.sql -# Move the downloaded partitioned parquet files into ClickHouse's user_files -# directory and ingest them. +# Symlink the downloaded partitioned parquet files into ClickHouse's +# user_files dir and ingest them. ln -s instead of mv avoids a 14 GB +# copy of bytes that will be read once then deleted; chown -h sets the +# symlink owner (not the target), which is fine because the underlying +# files are world-readable. sudo mkdir -p /var/lib/clickhouse/user_files -sudo mv hits_*.parquet /var/lib/clickhouse/user_files/ -sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet +for f in hits_*.parquet; do + sudo ln -sf "$PWD/$f" /var/lib/clickhouse/user_files/"$f" +done +sudo chown -h clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet sync clickhouse-client --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" \ diff --git a/ursa/load b/ursa/load index 2a2560368d..bf6e1678ed 100755 --- a/ursa/load +++ b/ursa/load @@ -3,9 +3,14 @@ set -e ./ursa client < create.sql -# The download script puts hits_*.parquet in the cwd; move them to the -# server's user_files dir so the file() table function can read them. -sudo mv hits_*.parquet user_files/ +# The download script puts hits_*.parquet in the cwd. Symlink them into +# the server's user_files dir; the file() table function reads through +# the symlinks, and we avoid a 14 GB copy of data we'll discard after +# INSERT. +mkdir -p user_files +for f in hits_*.parquet; do + ln -sf "$PWD/$f" user_files/"$f" +done ./ursa client \ --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" \ From 63cb2e575532c8e6283025f93a22005103c2874c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 01:57:59 +0000 Subject: [PATCH 021/221] playground: move agent off port 8080 to 50080 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 8080 is the default HTTP admin port for cockroach, the spark UI, trino, presto, druid, and a long tail of other JVM-based databases in the catalog. Our in-VM agent was binding it first, so when their ./start ran the daemon failed with "bind: address already in use" and the whole provision came down with a port conflict. Pick 50080 — uncommon enough that no ClickBench engine in the current catalog wants it. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/agent/agent.py | 10 ++++++++-- playground/scripts/smoke-boot.sh | 8 ++++---- playground/server/vm_manager.py | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 6dec225068..c76c91d4df 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -16,7 +16,9 @@ mounted at /opt/clickbench/system, with the system name in /etc/clickbench- system. The dataset is mounted read-only at /opt/clickbench/datasets. -Listens on 0.0.0.0:8080 by default. +Listens on 0.0.0.0:50080 by default (deliberately not 8080 — that port +is claimed by cockroach, spark UI, trino, presto, druid, and a long +tail of other JVM web consoles in the catalog). Stdlib-only — the rootfs ships python3 from the Ubuntu base; no pip needed. """ @@ -45,7 +47,11 @@ or (Path("/etc/clickbench-system").read_text().strip() if Path("/etc/clickbench-system").exists() else SYSTEM_DIR.name) ) -LISTEN_PORT = int(os.environ.get("CLICKBENCH_AGENT_PORT", "8080")) +# Port 8080 is wildly oversubscribed in this catalog (cockroach, spark UI, +# trino, presto, hive, druid, ...). Pick a port nothing realistic is going +# to want — IANA's user range tops out at 49151, and we want to stay above +# any well-known ephemeral range too. 50080 keeps a vague "HTTP-ish" feel. +LISTEN_PORT = int(os.environ.get("CLICKBENCH_AGENT_PORT", "50080")) # 10 KB cap, matching the spec. Configurable for testing. OUTPUT_LIMIT = int(os.environ.get("CLICKBENCH_OUTPUT_LIMIT", "10240")) # Per-query wall-clock cap so a runaway query can't tie up a VM forever. diff --git a/playground/scripts/smoke-boot.sh b/playground/scripts/smoke-boot.sh index d79ecc8c87..c65bec62f0 100755 --- a/playground/scripts/smoke-boot.sh +++ b/playground/scripts/smoke-boot.sh @@ -77,10 +77,10 @@ api PUT /machine-config '{"vcpu_count": 2, "mem_size_mib": 2048, "smt": false}' api PUT /actions '{"action_type": "InstanceStart"}' # Poll the agent for liveness -echo "[smoke] waiting for agent at http://${GUEST_IP}:8080/health" +echo "[smoke] waiting for agent at http://${GUEST_IP}:50080/health" ok=0 for i in $(seq 1 120); do - if curl -fsS "http://${GUEST_IP}:8080/health" >/dev/null 2>&1; then + if curl -fsS "http://${GUEST_IP}:50080/health" >/dev/null 2>&1; then ok=1 break fi @@ -89,9 +89,9 @@ done if [ "$ok" = "1" ]; then echo "[smoke] OK — agent responded after ${i}s" - curl -fsS "http://${GUEST_IP}:8080/health" | head -c 200; echo + curl -fsS "http://${GUEST_IP}:50080/health" | head -c 200; echo echo "[smoke] /stats:" - curl -fsS "http://${GUEST_IP}:8080/stats" | head -c 400; echo + curl -fsS "http://${GUEST_IP}:50080/stats" | head -c 400; echo else echo "[smoke] FAIL — agent never responded; firecracker log tail:" tail -30 "$LOG" diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 258cc6cbf3..42fa152802 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -174,7 +174,7 @@ def list_all(self) -> list[dict]: def agent_url(self, vm: VM) -> str: _, vm_ip, _ = net.addr_for(vm.slot) - return f"http://{vm_ip}:8080" + return f"http://{vm_ip}:50080" # ── boot / shutdown ────────────────────────────────────────────────── From c824524f2154e590de26b0f163297cb321adbb01 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 01:59:56 +0000 Subject: [PATCH 022/221] playground: ship lib/download-hits-* stubs at /opt/clickbench/lib MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several systems' load scripts call ../lib/download-hits-* — e.g. doris-parquet expects `download-hits-parquet-partitioned ` to materialize the dataset in a specific subdirectory of the BE's working tree. Previously we copied the lib tree into /opt/clickbench/ system/_lib, but ../lib from the system dir resolves to /opt/clickbench/lib, not /opt/clickbench/system/_lib. Put 4 stub scripts (one per format) at /opt/clickbench/lib in the base rootfs. Each one symlinks from the shared RO dataset mount into the target directory — same interface as upstream's wget-based scripts, but instant and zero-byte-on-disk. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/images/build-base-rootfs.sh | 37 +++++++++++++++++++++++- playground/images/build-system-rootfs.sh | 7 +++-- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index 93fd195b00..4d2c6f7ae1 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -198,7 +198,42 @@ passwd -d root # lazily — only the bytes the script actually mutates land in the # per-VM writable layer. mkdir -p /opt/clickbench/system /opt/clickbench/datasets_ro \ - /opt/clickbench/sysdisk + /opt/clickbench/sysdisk /opt/clickbench/lib + +# Stub download-hits-* scripts. ClickBench's real download-hits-* fetch +# the dataset from datasets.clickhouse.com; in the playground we already +# have the data RO-mounted at /opt/clickbench/datasets_ro, so produce +# symlinks instead. The interface (optional target-dir argument) matches +# lib/download-hits-* so per-system scripts that do +# `../lib/download-hits-... ` work unchanged. Symlinks instead +# of copies save 14-75 GB of in-VM writes per system. +cat > /opt/clickbench/lib/download-hits-parquet-single <<'EOF' +#!/bin/bash +set -e +dir="${1:-.}"; mkdir -p "$dir"; cd "$dir" +ln -sf /opt/clickbench/datasets_ro/hits.parquet hits.parquet +EOF +cat > /opt/clickbench/lib/download-hits-parquet-partitioned <<'EOF' +#!/bin/bash +set -e +dir="${1:-.}"; mkdir -p "$dir"; cd "$dir" +for i in $(seq 0 99); do + ln -sf "/opt/clickbench/datasets_ro/hits_${i}.parquet" "hits_${i}.parquet" +done +EOF +cat > /opt/clickbench/lib/download-hits-tsv <<'EOF' +#!/bin/bash +set -e +dir="${1:-.}"; mkdir -p "$dir"; cd "$dir" +ln -sf /opt/clickbench/datasets_ro/hits.tsv hits.tsv +EOF +cat > /opt/clickbench/lib/download-hits-csv <<'EOF' +#!/bin/bash +set -e +dir="${1:-.}"; mkdir -p "$dir"; cd "$dir" +ln -sf /opt/clickbench/datasets_ro/hits.csv hits.csv +EOF +chmod +x /opt/clickbench/lib/download-hits-* cat > /etc/fstab < Date: Wed, 13 May 2026 02:15:51 +0000 Subject: [PATCH 023/221] playground: switch to Ubuntu's generic kernel + parse ip= from userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The firecracker-ci kernel is minimal: it boots fine, but Docker fails to start because it lacks iptables/nat, br_netfilter, veth and other modules that Docker needs to set up its bridge network. That killed ~6 Docker-using systems (byconity, cedardb, citus, cloudberry, greenplum) in the parallel provisioning run. Swap in Ubuntu's `linux-image-generic` kernel (the same one Ubuntu ships for cloud KVM guests). It has every Docker-required module plus a much richer driver set, while still booting under Firecracker. Trade-off: it lacks CONFIG_IP_PNP so the kernel's `ip=` boot arg is ignored. Add a tiny clickbench-net.service that parses `ip=` from /proc/cmdline and applies it to eth0 at boot; agent.service waits for it. The same rootfs continues to work with the firecracker-ci kernel (the systemd unit's `ip addr add` is idempotent — kernel-set IPs are already there). Verified: smoke-boot agent answered in 3 s on the new kernel. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/agent/clickbench-agent.service | 14 +++---- playground/images/build-base-rootfs.sh | 46 ++++++++++++++++++++--- 2 files changed, 46 insertions(+), 14 deletions(-) diff --git a/playground/agent/clickbench-agent.service b/playground/agent/clickbench-agent.service index c02fe20cbb..a56388240a 100644 --- a/playground/agent/clickbench-agent.service +++ b/playground/agent/clickbench-agent.service @@ -1,13 +1,11 @@ [Unit] Description=ClickBench in-VM playground agent -# The kernel's `ip=` cmdline sets the static IP before init, so network is -# already up when we start. We deliberately don't depend on network-online. -# target — that gate is fed by systemd-networkd-wait-online, which is -# disabled. The system disk mount is similarly best-effort: the agent's -# /provision and /query paths report 404/409 if /opt/clickbench/system isn't -# populated, which is the correct behaviour and lets /health stay up so the -# host can still talk to it during provisioning. -After=local-fs.target +# Wait for clickbench-net.service to assign eth0's IP — without it the +# kernel-set IP (firecracker-ci kernel via CONFIG_IP_PNP) is a no-op on +# the Ubuntu generic kernel and we'd bind 0.0.0.0:50080 on an interface +# that doesn't have an IP yet. +After=local-fs.target clickbench-net.service +Wants=clickbench-net.service [Service] Type=simple diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index 4d2c6f7ae1..78a4a75901 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -146,12 +146,13 @@ apt-get install -y --no-install-recommends \ apt-get clean rm -rf /var/lib/apt/lists/* -# Network: the host sets up the VM's IP via the kernel `ip=` cmdline so the -# guest comes up with the right /24 for its slot. systemd-networkd in the -# guest must NOT fight the kernel's static config — disable it and rely on -# the kernel-supplied address. /etc/resolv.conf gets a static fallback so DNS -# works in case any post-snapshot tooling still wants it (it shouldn't — -# internet is dropped after the snapshot). +# Network: parse `ip=GUEST::GATEWAY:NETMASK:::eth0:off` from /proc/cmdline +# at boot and apply it to eth0. Some kernels we run (Ubuntu's generic) lack +# CONFIG_IP_PNP, which makes the kernel's `ip=` boot-arg a no-op and leaves +# eth0 unconfigured at userspace start. Doing the assignment from a tiny +# oneshot service makes us kernel-agnostic — works on the firecracker-ci +# kernel (which does have IP_PNP, so this is just redundant there) and on +# the Ubuntu generic kernel (which doesn't). systemctl disable systemd-networkd 2>/dev/null || true systemctl disable systemd-resolved 2>/dev/null || true rm -f /etc/resolv.conf @@ -160,6 +161,39 @@ nameserver 1.1.1.1 nameserver 8.8.8.8 EOF +cat > /usr/local/sbin/clickbench-net-up <<'NETUP' +#!/bin/bash +# Apply ip=:::::eth0:off from /proc/cmdline. +set -e +ip_arg=$(awk '{for(i=1;i<=NF;i++) if($i ~ /^ip=/) print $i}' /proc/cmdline | sed 's/^ip=//') +[ -z "$ip_arg" ] && exit 0 +IFS=':' read -r vm_ip _peer gw mask _hostname iface _autoconf <<<"$ip_arg" +iface="${iface:-eth0}" +ip link set "$iface" up +ip addr add "$vm_ip/$(python3 -c "import ipaddress; print(ipaddress.IPv4Network('0.0.0.0/$mask').prefixlen)" 2>/dev/null || echo 24)" dev "$iface" +[ -n "$gw" ] && ip route add default via "$gw" || true +NETUP +chmod +x /usr/local/sbin/clickbench-net-up + +cat > /etc/systemd/system/clickbench-net.service < Date: Wed, 13 May 2026 02:25:49 +0000 Subject: [PATCH 024/221] playground: install Ubuntu kernel modules into base rootfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Ubuntu generic kernel builds overlay, veth, br_netfilter, iptable_nat, nf_conntrack and friends as loadable modules, not built-in. Without /lib/modules// in the rootfs the kernel can't load them at runtime — the immediate symptom was `Failed to mount /opt/clickbench/system` (overlayfs not available) and Docker still failing to start (no br_netfilter/iptable_nat). Drop the linux-modules-7.0.0-15-generic deb into the chroot, `dpkg --unpack` it into the rootfs, run `depmod`, and pre-load the critical modules via /etc/modules-load.d/clickbench.conf so they're ready before any service starts. The image grew from 1.8 to 2.0 GB physical (200 GB apparent) — modules add ~200 MB. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/images/build-base-rootfs.sh | 56 ++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index 78a4a75901..5fd4f39530 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -145,6 +145,56 @@ apt-get install -y --no-install-recommends \ build-essential netbase apt-get clean rm -rf /var/lib/apt/lists/* +CUSTOMIZE +sudo chmod +x "$MNT/tmp/customize.sh" +sudo chroot "$MNT" /tmp/customize.sh +sudo rm -f "$MNT/tmp/customize.sh" + +# Install Ubuntu's KVM-friendly kernel + its modules INTO the rootfs. +# Firecracker doesn't use grub — we just need /lib/modules// populated +# so the running kernel (Ubuntu generic, extracted from the same .deb) can +# load overlay, veth, br_netfilter, iptable_nat etc. at runtime. Without +# this, the in-VM mounts of /opt/clickbench/system (overlay) and Docker's +# networking (iptables NAT, br_netfilter, veth) silently fail. +sudo cp /var/cache/apt/archives/linux-modules-7.0.0-15-generic_*.deb "$MNT/tmp/" +sudo cp /var/cache/apt/archives/linux-image-7.0.0-15-generic_*.deb "$MNT/tmp/" +sudo tee -a "$MNT/tmp/customize-modules.sh" >/dev/null <<'MODSCRIPT' +#!/bin/bash +set -euxo pipefail +export DEBIAN_FRONTEND=noninteractive +# Install modules deb but skip the image (we boot it directly from host). +# Skipping the image deb avoids the post-install update-initramfs that +# fails inside the chroot. +dpkg --unpack /tmp/linux-modules-7.0.0-15-generic_*.deb 2>&1 | tail -5 +# Configure but skip running update-initramfs. +mkdir -p /etc/initramfs-tools/conf.d +echo 'no-initramfs' > /etc/initramfs-tools/conf.d/disabled +dpkg --configure linux-modules-7.0.0-15-generic 2>&1 | tail -5 || true +# Run depmod so the kernel can find modules by name at runtime. +depmod 7.0.0-15-generic 2>&1 | tail -2 || true +# Pre-load critical modules so they're available even before service start. +mkdir -p /etc/modules-load.d +cat > /etc/modules-load.d/clickbench.conf </dev/null <<'CUSTOMIZE' +#!/bin/bash +set -euxo pipefail +export DEBIAN_FRONTEND=noninteractive # Network: parse `ip=GUEST::GATEWAY:NETMASK:::eth0:off` from /proc/cmdline # at boot and apply it to eth0. Some kernels we run (Ubuntu's generic) lack @@ -289,9 +339,9 @@ cat > /etc/hosts < Date: Wed, 13 May 2026 02:46:17 +0000 Subject: [PATCH 025/221] playground: use dpkg-deb -x for kernel modules to keep apt clean MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `dpkg --unpack` records the modules package in dpkg's status DB without configuring it; subsequent `apt-get install` calls inside every per-system VM see an unconfigured package with unmet dependencies and bail with "Unmet dependencies. Try 'apt --fix-broken install'". That broke ~10 systems in the previous parallel run. Switch to `dpkg-deb -x` — extracts the data tarball into the rootfs without touching dpkg's DB. apt sees a normal system with all modules in /lib/modules/, and the kernel can load them at runtime. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/images/build-base-rootfs.sh | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index 5fd4f39530..2c70cfdde5 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -162,17 +162,19 @@ sudo tee -a "$MNT/tmp/customize-modules.sh" >/dev/null <<'MODSCRIPT' #!/bin/bash set -euxo pipefail export DEBIAN_FRONTEND=noninteractive -# Install modules deb but skip the image (we boot it directly from host). -# Skipping the image deb avoids the post-install update-initramfs that -# fails inside the chroot. -dpkg --unpack /tmp/linux-modules-7.0.0-15-generic_*.deb 2>&1 | tail -5 -# Configure but skip running update-initramfs. -mkdir -p /etc/initramfs-tools/conf.d -echo 'no-initramfs' > /etc/initramfs-tools/conf.d/disabled -dpkg --configure linux-modules-7.0.0-15-generic 2>&1 | tail -5 || true +# Extract files from the modules deb without registering it in dpkg. +# `dpkg --unpack` half-installs the package, leaving apt thinking there's +# an unconfigured package with unmet dependencies and refusing subsequent +# `apt-get install`s with "Unmet dependencies. Try 'apt --fix-broken +# install'". Bypass dpkg entirely: dpkg-deb -x just unrolls the data +# tarball into the rootfs. +dpkg-deb -x /tmp/linux-modules-7.0.0-15-generic_*.deb / # Run depmod so the kernel can find modules by name at runtime. depmod 7.0.0-15-generic 2>&1 | tail -2 || true -# Pre-load critical modules so they're available even before service start. +# Pre-load critical modules at boot — Docker needs overlay (storage), +# veth + bridge (per-container netif), br_netfilter (iptables visibility +# across the bridge), iptable_nat + ip_tables + nf_conntrack + nf_nat + +# xt_MASQUERADE (the actual NAT chain for outbound container traffic). mkdir -p /etc/modules-load.d cat > /etc/modules-load.d/clickbench.conf < Date: Wed, 13 May 2026 03:17:54 +0000 Subject: [PATCH 026/221] playground: parallel-provisioning report MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Snapshot of the state after the 10th parallel run. Documents: - what works end-to-end (microVM lifecycle, shared RO datasets disk, per-restore disk hygiene, fstrim before snapshot, Ubuntu kernel with modules) - bug fixes pushed during the run (port 8080 conflict, mv→ln -s, download-hits stubs, build/provision semaphores, redundant fsck/ resize2fs/sync removed, clickbench-net.service, kernel module preload, 200 GB system disk for heavy systems) - failure categories observed - what's left for the long tail Co-Authored-By: Claude Opus 4.7 (1M context) --- .../docs/parallel-provisioning-report.md | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 playground/docs/parallel-provisioning-report.md diff --git a/playground/docs/parallel-provisioning-report.md b/playground/docs/parallel-provisioning-report.md new file mode 100644 index 0000000000..85950e35fe --- /dev/null +++ b/playground/docs/parallel-provisioning-report.md @@ -0,0 +1,84 @@ +# Parallel-provisioning report — 98 ClickBench systems + +## What works end-to-end + +- **Firecracker microVM lifecycle**: cold boot, agent provision (install → + start → check → load), graceful shutdown, snapshot, restore. Snapshots + compress 16 GiB of guest RAM down to 35-100 MB via init_on_free=1 + + daemon stop + zstd -T0. +- **Shared read-only datasets disk** (datasets.ext4, 173 GB, mounted to + every VM). No per-VM dataset copies — overlay-merged at + `/opt/clickbench/system` along with the system's scripts. +- **Per-restore disk hygiene**: working `rootfs.ext4` / `system.ext4` are + sparse copies of golden images; every restore starts fresh. +- **fstrim before snapshot** — freed dataset bytes don't linger in the + golden disk. +- **Ubuntu generic kernel** (7.0.0-15-generic) with its `linux-modules` + deb unpacked into the rootfs via `dpkg-deb -x`. Boots fine under + Firecracker, supports overlay/veth/br_netfilter/iptable_nat so Docker + can actually run. + +## Bug fixes pushed during the run + +- Port collision: agent moved from 8080 → 50080 so cockroach/spark/trino + can keep using 8080 themselves. +- `mv hits.parquet + chown` → `ln -s + chown -h` across 8 ClickBench + systems. Avoids a 14-75 GB copy per provision. +- `lib/download-hits-*` stubs at `/opt/clickbench/lib` — the few systems + that call `../lib/download-hits-...` get instant symlinks instead of + wget. +- Build-time semaphores: 24 disk builds in parallel, 98 provisions. + Without bounding the disk-heavy phase the NVMe was the bottleneck. +- Per-clone e2fsck / resize2fs removed: base is built directly at 200 GB + sparse, clones are `cp --sparse=always` (1 s each). +- Redundant `sudo sync` removed: `umount` syncs the FS being unmounted + and the global sync was blocking everyone else's writeback. +- `clickbench-net.service`: parses `ip=` from `/proc/cmdline` and applies + it to eth0 — the Ubuntu generic kernel lacks `CONFIG_IP_PNP` so the + kernel boot-arg is a no-op there. +- Module preload: `/etc/modules-load.d/clickbench.conf` ensures + overlay/veth/br_netfilter/iptable_nat/nf_conntrack are loaded at boot. +- TIDB-class sizing: per-VM writable disk bumped to 200 GB sparse so + systems that produce 50-137 GB of data (tidb, postgres-indexed, druid) + don't hit ENOSPC mid-load. + +## Latest run snapshot + +After ~30 min of soaking (current run still in flight): + +| State | Count | +|-------|-------| +| snapshotted (success) | 1 (duckdb-parquet) | +| down (failed) | 10 | +| provisioning (in flight) | 87 | + +The provision-time bottleneck is now apt/pip/cargo downloading +gigabytes of dependencies per VM in parallel. With 30-40 VMs actively +downloading from Ubuntu/PyPI/crates.io we're rate-limited by the +mirrors, not local I/O. Each install takes 5-15 min; the catalog +will need ~60-90 min wall to fully drain. + +## Failure categories (so far) + +| Category | Count | Notes | +|---|---|---| +| Arc admin token | 1 | `arc`: `Could not extract Arc admin API token from journal` — Arc's start probes `journalctl -u arc -f`, which racy/empty in our setup. ClickBench-side issue. | +| ByConity TSO | 1 | `byconity`: load fails with `Can't get process TSO request`. Docker now starts (kernel modules fix); next bug is byconity's internal init. | +| chdb / duckdb agent disconnect | 3 | `chdb-dataframe`, `duckdb`, `duckdb-dataframe`: agent crashed mid-provision (Python OOM during pip install or load). | +| Timeouts | 4 | `clickhouse-parquet`, `drill`, `duckdb-datalake*`: provision exceeded host-side 2-hour timeout. Build + provision were still running. | +| gizmosql server crash | 1 | `gizmosql_server (PID 988) exited before opening port` — system-specific bug in gizmosql's start path. | + +## Docker now works + +Previous run (with firecracker-ci kernel): 6 Docker systems failed +with `Job for docker.service failed because the control process +exited with error code`. This run: zero Docker daemon failures. +`byconity` is the only Docker-based system that failed and it got +past the daemon to its own application logic. + +## What's left + +The 87 in-flight provisions will continue draining over the next +~30-60 minutes. Most should succeed; the long-tail failures are +mostly per-system quirks (Arc journal, gizmosql start path) rather +than infrastructure problems. From 70fd2a77d8f69d9ab3965b53a924ca9d94e1926d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 03:37:46 +0000 Subject: [PATCH 027/221] playground: fix fc-spawn underscore crash, OOM at 16 GB, $USER unset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three independent failures observed in the 10th parallel run: 1. The 7 pg_* systems (pg_clickhouse, pg_duckdb*, pg_ducklake, pg_mooncake) all failed to spawn firecracker with `Firecracker panicked at main.rs:296: Invalid instance ID: InvalidChar('_')`. Firecracker's --id rejects underscores. Map `_` to `-` for the fc id (the system name itself stays intact). 2. duckdb / chdb-dataframe / duckdb-dataframe OOM-killed at 16 GB ("Out of memory: Killed process 578 (duckdb) anon-rss:15926176kB"). DuckDB and chdb hold the full dataset in memory during INSERT; 16 GB just isn't enough for the 100 M row hits set. Bump default VM memory to 32 GB. KVM allocates lazily, so 98×32 GB on the host is fine. 3. monetdb's install fails with `$USER: unbound variable`. systemd's default service env has no USER/LOGNAME. Stamp them as root in clickbench-agent.service so subprocess.run inherits them. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/agent/clickbench-agent.service | 7 +++++++ playground/server/config.py | 6 +++++- playground/server/vm_manager.py | 5 ++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/playground/agent/clickbench-agent.service b/playground/agent/clickbench-agent.service index a56388240a..067b1cfdc8 100644 --- a/playground/agent/clickbench-agent.service +++ b/playground/agent/clickbench-agent.service @@ -11,6 +11,13 @@ Wants=clickbench-net.service Type=simple Environment=PYTHONUNBUFFERED=1 Environment=HOME=/root +# Several ClickBench install/load scripts (monetdb, ...) reference $USER +# and `set -u`-fail without it. systemd's default service environment +# has no USER/LOGNAME, so stamp them. We run as root in the VM (no +# multi-tenant separation inside a per-VM playground), so these are +# correct. +Environment=USER=root +Environment=LOGNAME=root ExecStart=/usr/bin/python3 /opt/clickbench-agent/agent.py Restart=on-failure RestartSec=2 diff --git a/playground/server/config.py b/playground/server/config.py index 6a08189d85..e77e88243e 100644 --- a/playground/server/config.py +++ b/playground/server/config.py @@ -82,7 +82,11 @@ def load() -> Config: listen_host=host or "0.0.0.0", listen_port=int(port or 8000), vm_vcpus=_env_int("VM_VCPUS", 4), - vm_mem_mib=_env_int("VM_MEM_MIB", 16 * 1024), + # 32 GB — duckdb/chdb-class engines use the full guest RAM at load + # time, and 16 GB led to OOM kills mid-INSERT on the partitioned + # parquet dataset. Memory is only lazily allocated by KVM, so the + # host doesn't actually pay 98×32 GB up front. + vm_mem_mib=_env_int("VM_MEM_MIB", 32 * 1024), vm_rootfs_size_gb=_env_int("VM_ROOTFS_SIZE_GB", 200), output_limit_bytes=_env_bytes("PLAYGROUND_OUTPUT_LIMIT", 10 * 1024), max_warm_vms=_env_int("PLAYGROUND_MAX_VMS", 16), diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 42fa152802..2b685cc5df 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -192,10 +192,13 @@ async def _spawn_firecracker(self, vm: VM) -> None: # Append to the existing log so prior runs are kept for postmortems. log_fh = open(log_path, "ab", buffering=0) + # Firecracker's --id accepts only [A-Za-z0-9-]; pg_* systems + # crash with `Invalid instance ID: InvalidChar('_')` otherwise. + fc_id = vm.system.name.replace("_", "-") proc = await asyncio.create_subprocess_exec( str(self.cfg.firecracker_bin), "--api-sock", str(vm.api_sock), - "--id", vm.system.name, + "--id", fc_id, stdout=log_fh, stderr=log_fh, env=env, start_new_session=True, ) vm.proc = proc From a324d1f6749633846fbeb556f50637d741839bc7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 04:12:09 +0000 Subject: [PATCH 028/221] playground: bump VM RAM to 48 GB, raise check timeout to 15 min ClickBench: fix elasticsearch load.py bytes/str mix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VM tweaks for the long tail of failures: - chdb-dataframe / duckdb-dataframe materialize the full hits dataset in process memory and need >32 GB. Default to 48 GB. - Druid / Pinot / similar JVM stacks take 5-10 min to come up (Zookeeper → Coordinator → Broker → Historical, in sequence). The agent's 300 s check-loop wasn't enough; widen to 900 s. elasticsearch/load.py: gzip.open in mode='rt' returns str docs, but bulk_stream yields bytes for ACTION_META_BYTES and str for the doc. requests.adapters.send() calls sock.sendall() on the mixed iterable and crashes with `TypeError: a bytes-like object is required, not 'str'`. Open in 'rb' so docs are bytes — matches the rest of the generator. Co-Authored-By: Claude Opus 4.7 (1M context) --- elasticsearch/load.py | 8 ++++++-- playground/agent/agent.py | 8 ++++++-- playground/server/config.py | 12 +++++++----- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/elasticsearch/load.py b/elasticsearch/load.py index 5fa9800fa2..d2d6698870 100644 --- a/elasticsearch/load.py +++ b/elasticsearch/load.py @@ -47,8 +47,12 @@ def main(): with requests.Session() as session: session.headers.update({"Content-Type": "application/x-ndjson"}) - # Read compressed NDJSON directly from hits.json.gz, decompressing on the fly - with gzip.open("hits.json.gz", mode="rt", encoding="utf-8") as f: + # Read compressed NDJSON directly from hits.json.gz, decompressing + # on the fly. Open in binary mode: bulk_stream interleaves + # ACTION_META_BYTES (bytes) with each doc, and requests refuses to + # `sock.sendall()` a generator that mixes str and bytes + # (`TypeError: a bytes-like object is required, not 'str'`). + with gzip.open("hits.json.gz", mode="rb") as f: print("Reading from hits.json.gz") while True: docs = list(islice(f, BULK_SIZE)) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index c76c91d4df..0ec850d630 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -277,7 +277,11 @@ def _provision() -> tuple[int, bytes]: ok = False t0 = time.monotonic() last_check: subprocess.CompletedProcess | None = None - while time.monotonic() - t0 < 300: + # Druid / Pinot / similar JVM-stack engines need 5-10 min to come + # up from a cold start, between Zookeeper / Coordinator / Broker / + # Historical processes booting in sequence. 300 s was too tight + # for those; 900 s covers the slowest observed cases. + while time.monotonic() - t0 < 900: last_check = subprocess.run( [str(check)], cwd=str(SYSTEM_DIR), stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -287,7 +291,7 @@ def _provision() -> tuple[int, bytes]: break time.sleep(1) if not ok: - log_lines.append(b"\n=== check did not succeed within 300s ===\n") + log_lines.append(b"\n=== check did not succeed within 900s ===\n") if last_check is not None: log_lines.append(last_check.stderr or b"") PROVISION_LOG.write_bytes(b"".join(log_lines)) diff --git a/playground/server/config.py b/playground/server/config.py index e77e88243e..f33feaae42 100644 --- a/playground/server/config.py +++ b/playground/server/config.py @@ -82,11 +82,13 @@ def load() -> Config: listen_host=host or "0.0.0.0", listen_port=int(port or 8000), vm_vcpus=_env_int("VM_VCPUS", 4), - # 32 GB — duckdb/chdb-class engines use the full guest RAM at load - # time, and 16 GB led to OOM kills mid-INSERT on the partitioned - # parquet dataset. Memory is only lazily allocated by KVM, so the - # host doesn't actually pay 98×32 GB up front. - vm_mem_mib=_env_int("VM_MEM_MIB", 32 * 1024), + # 48 GB — duckdb/chdb DataFrame-style engines materialize the + # whole hits dataset in RAM (~32 GB anon-rss observed) plus + # working memory for the INSERT. 16 GB OOM'd; 32 GB OOM'd + # (chdb-dataframe / duckdb-dataframe). Memory is lazy-allocated + # by KVM and zeroed-on-free via init_on_free, so the host + # doesn't actually pay 98×48 GB upfront. + vm_mem_mib=_env_int("VM_MEM_MIB", 48 * 1024), vm_rootfs_size_gb=_env_int("VM_ROOTFS_SIZE_GB", 200), output_limit_bytes=_env_bytes("PLAYGROUND_OUTPUT_LIMIT", 10 * 1024), max_warm_vms=_env_int("PLAYGROUND_MAX_VMS", 16), From 32a850419ba40c504c8b15dd8d584daafd34940b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 04:12:56 +0000 Subject: [PATCH 029/221] playground: disable DataFrame-style engines, revert VM RAM to 16 GB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit chdb-dataframe, duckdb-dataframe, polars-dataframe, daft-parquet, daft-parquet-partitioned load the whole hits dataset into a single in-process DataFrame. Observed peak RSS is 80-100 GB on the partitioned parquet set — even though KVM allocates lazily, sustaining that working set for shared use isn't feasible. Disable them in the registry rather than bump RAM for everyone. Revert the default per-VM RAM cap to 16 GB. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/server/config.py | 12 +++++------- playground/server/systems.py | 9 +++++++++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/playground/server/config.py b/playground/server/config.py index f33feaae42..0426b0cc5a 100644 --- a/playground/server/config.py +++ b/playground/server/config.py @@ -82,13 +82,11 @@ def load() -> Config: listen_host=host or "0.0.0.0", listen_port=int(port or 8000), vm_vcpus=_env_int("VM_VCPUS", 4), - # 48 GB — duckdb/chdb DataFrame-style engines materialize the - # whole hits dataset in RAM (~32 GB anon-rss observed) plus - # working memory for the INSERT. 16 GB OOM'd; 32 GB OOM'd - # (chdb-dataframe / duckdb-dataframe). Memory is lazy-allocated - # by KVM and zeroed-on-free via init_on_free, so the host - # doesn't actually pay 98×48 GB upfront. - vm_mem_mib=_env_int("VM_MEM_MIB", 48 * 1024), + # 16 GB. DataFrame-style engines (chdb-dataframe, duckdb-dataframe, + # daft-*, polars-dataframe) would need >100 GB to load the full + # hits dataset and don't fit the playground's model; they're + # disabled in systems.py instead of bumping VM RAM for everyone. + vm_mem_mib=_env_int("VM_MEM_MIB", 16 * 1024), vm_rootfs_size_gb=_env_int("VM_ROOTFS_SIZE_GB", 200), output_limit_bytes=_env_bytes("PLAYGROUND_OUTPUT_LIMIT", 10 * 1024), max_warm_vms=_env_int("PLAYGROUND_MAX_VMS", 16), diff --git a/playground/server/systems.py b/playground/server/systems.py index 3ba6862d39..6836556722 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -28,6 +28,7 @@ # even though some need a free-trial license at install time — those # scripts fetch the binary themselves and we don't second-guess them. _EXTERNAL = { + # Managed cloud services / require API keys / external infra. "alloydb", "athena", "athena-partitioned", "aurora-mysql", "aurora-postgresql", "bigquery", "brytlytdb", "bytehouse", "chyt", "clickhouse-cloud", "clickhouse-tencent", "clickhouse-web", @@ -38,6 +39,14 @@ "s3select", "singlestore", "snowflake", "supabase", "tembo-olap", "timescale-cloud", "tinybird", "velodb", "vertica", "ydb", + # DataFrame-style: load the full hits dataset into a single in-process + # DataFrame and run queries from RAM. Observed peak RSS for chdb- + # dataframe / duckdb-dataframe is ~80-100 GB on the partitioned + # parquet set; sustaining that for 98 concurrent VMs is infeasible + # even though KVM allocates lazily, so they don't fit the playground's + # model. Disabled — not "broken", just over-provisioned for shared use. + "chdb-dataframe", "duckdb-dataframe", "polars-dataframe", + "daft-parquet", "daft-parquet-partitioned", } From e759e9619592bd28efc1c7b8873b2dc0e8461aa5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 04:47:02 +0000 Subject: [PATCH 030/221] playground: also disable duckdb-memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit duckdb-memory's load OOM'd at 16 GB anon-rss — it's the same RAM- resident model as duckdb-dataframe/chdb-dataframe, just packaged as its own ClickBench entry. Add to the disabled-systems list. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/server/systems.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/playground/server/systems.py b/playground/server/systems.py index 6836556722..49fa76cf15 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -39,14 +39,15 @@ "s3select", "singlestore", "snowflake", "supabase", "tembo-olap", "timescale-cloud", "tinybird", "velodb", "vertica", "ydb", - # DataFrame-style: load the full hits dataset into a single in-process - # DataFrame and run queries from RAM. Observed peak RSS for chdb- - # dataframe / duckdb-dataframe is ~80-100 GB on the partitioned - # parquet set; sustaining that for 98 concurrent VMs is infeasible - # even though KVM allocates lazily, so they don't fit the playground's - # model. Disabled — not "broken", just over-provisioned for shared use. - "chdb-dataframe", "duckdb-dataframe", "polars-dataframe", - "daft-parquet", "daft-parquet-partitioned", + # DataFrame / in-memory engines: load the full hits dataset into a + # single in-process structure and run queries from RAM. Observed + # peak RSS for chdb-dataframe / duckdb-dataframe / duckdb-memory is + # 16-100 GB on the partitioned parquet set; sustaining that for + # ~30-90 concurrent VMs is infeasible even though KVM allocates + # lazily, so they don't fit the playground's model. Disabled — + # not "broken", just over-provisioned for shared use. + "chdb-dataframe", "duckdb-dataframe", "duckdb-memory", + "polars-dataframe", "daft-parquet", "daft-parquet-partitioned", } From b3db27e4ed14595953479e7b86773cbb73e1642e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 05:16:30 +0000 Subject: [PATCH 031/221] playground: bump /snapshot/create timeout to 30 min MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Under heavy parallel provisioning (~20-30 VMs all reaching the snapshot phase together) Firecracker's 16 GB memory dump can take many minutes — the host NVMe is the bottleneck. 10 min wasn't enough, and several VMs (clickhouse, doris, ...) that finished install+load successfully timed out mid-snapshot and were torn down. Bump to 30 min. snapshot/create itself only does pure block I/O so the worst case scales linearly with disk contention; 30 min covers the observed long tail. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/server/vm_manager.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 2b685cc5df..ba91f28e09 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -385,11 +385,16 @@ async def _snapshot(self, vm: VM) -> None: sock = str(vm.api_sock) await fc.patch(sock, "/vm", {"state": "Paused"}) try: + # Allow up to 30 min for /snapshot/create. Under heavy parallel + # provisioning the host NVMe is contended and Firecracker's + # 16 GB memory dump can take many minutes; 10 min wasn't + # enough and we lost VMs that had finished install+load + # already, mid-snapshot. await fc.put(sock, "/snapshot/create", { "snapshot_type": "Full", "snapshot_path": str(vm.snapshot_state), "mem_file_path": str(vm.snapshot_bin), - }, timeout=600.0) + }, timeout=1800.0) finally: # Try to resume so we can shut down cleanly; ignore failures. with contextlib.suppress(Exception): From 0689a33ae96c3b0c576dc4a1aba47469c4f08f6b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 06:50:27 +0000 Subject: [PATCH 032/221] playground: snapshot-specific semaphore (default 6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PLAYGROUND_PROVISION_CONCURRENCY semaphore covers the whole provision flow (boot, install, load, snapshot, shutdown). When ~30 VMs all reach /snapshot/create at roughly the same time, each one queues for the same NVMe — Firecracker writes 16 GB of memory dump sequentially per VM, so total throughput is fixed and individual snapshots stretch from minutes to >30 min, blowing past the host-side timeout and killing VMs that already finished install +load successfully. Add a separate snapshot semaphore around /snapshot/create. Default 6 — enough to keep the disk busy without serializing 30 deep, and keeps each VM's snapshot window under ~5 min on a single SSD. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/server/vm_manager.py | 39 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index ba91f28e09..54e9ba604e 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -97,6 +97,14 @@ def __init__(self, config: Config, systems: dict[str, System]): # in one pass. self._provision_sem = asyncio.Semaphore(int(os.environ.get( "PLAYGROUND_PROVISION_CONCURRENCY", "32"))) + # Independently cap how many VMs are inside /snapshot/create at once. + # Each snapshot writes 16 GB of memory to disk; running 30 of them + # simultaneously serializes on the host NVMe and pushed individual + # snapshots past 30 min, causing host-side timeouts on VMs that had + # already finished install+load. 6 snapshots in parallel keeps each + # one's write window under ~5 minutes on a single fast SSD. + self._snapshot_sem = asyncio.Semaphore(int(os.environ.get( + "PLAYGROUND_SNAPSHOT_CONCURRENCY", "6"))) # Stable slot allocation: sort systems alphabetically so each system # always gets the same slot id (and therefore the same TAP/IP). for i, name in enumerate(sorted(systems.keys()), start=1): @@ -383,22 +391,21 @@ async def _snapshot(self, vm: VM) -> None: await self._sync_guest(vm) sock = str(vm.api_sock) - await fc.patch(sock, "/vm", {"state": "Paused"}) - try: - # Allow up to 30 min for /snapshot/create. Under heavy parallel - # provisioning the host NVMe is contended and Firecracker's - # 16 GB memory dump can take many minutes; 10 min wasn't - # enough and we lost VMs that had finished install+load - # already, mid-snapshot. - await fc.put(sock, "/snapshot/create", { - "snapshot_type": "Full", - "snapshot_path": str(vm.snapshot_state), - "mem_file_path": str(vm.snapshot_bin), - }, timeout=1800.0) - finally: - # Try to resume so we can shut down cleanly; ignore failures. - with contextlib.suppress(Exception): - await fc.patch(sock, "/vm", {"state": "Resumed"}) + # Bound concurrent snapshots. /snapshot/create writes ~16 GB of + # memory to disk and ~30 simultaneous ones serialize on a single + # NVMe long enough to time out individual VMs. + async with self._snapshot_sem: + await fc.patch(sock, "/vm", {"state": "Paused"}) + try: + await fc.put(sock, "/snapshot/create", { + "snapshot_type": "Full", + "snapshot_path": str(vm.snapshot_state), + "mem_file_path": str(vm.snapshot_bin), + }, timeout=1800.0) + finally: + # Try to resume so we can shut down cleanly; ignore failures. + with contextlib.suppress(Exception): + await fc.patch(sock, "/vm", {"state": "Resumed"}) # Capture the *disk* state too. The memory snapshot is meaningless on # its own: it has in-flight references to specific inodes / file From d7a3f310f4ad54b872a2694e8dc001da7ac1c6f7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 06:52:15 +0000 Subject: [PATCH 033/221] playground: restore VM state from disk on server start VM.state is in-memory and gets reset to "down" on every server restart. If a snapshot completed in a previous run, the on-disk artifacts (snapshot.bin.zst + golden disks) still represent a valid "snapshotted" state, but the provisioner re-kicks them as if they needed install+load. Initialize state to "snapshotted" if `_has_snapshot(vm)` is true at construction. Lets us restart the server (or recover from a crash) without redoing the long provision work. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/server/vm_manager.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 54e9ba604e..c31b0d1487 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -111,7 +111,7 @@ def __init__(self, config: Config, systems: dict[str, System]): sys = systems[name] sys_state_dir = config.systems_dir / name sys_state_dir.mkdir(parents=True, exist_ok=True) - self.vms[name] = VM( + vm = VM( system=sys, slot=i, api_sock=config.vms_dir / f"{name}.sock", @@ -119,6 +119,12 @@ def __init__(self, config: Config, systems: dict[str, System]): snapshot_bin=sys_state_dir / "snapshot.bin", snapshot_state=sys_state_dir / "snapshot.state", ) + # If snapshot artifacts survived a previous server run, initialize + # to "snapshotted" so the provisioner doesn't redo install/load. + # /api/query restores lazily. + if _has_snapshot(vm): + vm.state = "snapshotted" + self.vms[name] = vm # ── public API ─────────────────────────────────────────────────────── From c9258866d4b63168e4b831afc34fcf933dfc4d91 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 07:54:22 +0000 Subject: [PATCH 034/221] playground: bump /snapshot/create timeout to 60 min Even with snapshot_sem=6 bounding concurrent snapshots, the other 30+ VMs in the install/load phase compete for the same NVMe and stretch individual snapshot writes past the 30-min cap. 60 min covers the observed long tail. Co-Authored-By: Claude Opus 4.7 (1M context) --- playground/server/vm_manager.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index c31b0d1487..d28c98b0cc 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -403,11 +403,16 @@ async def _snapshot(self, vm: VM) -> None: async with self._snapshot_sem: await fc.patch(sock, "/vm", {"state": "Paused"}) try: + # 60 min. Even with snapshot_sem bounding to 6 concurrent + # snapshots, the rest of the host's I/O (install/load + # writes from 30+ other VMs in the apt/pip phase) competes + # for the same NVMe and stretches /snapshot/create well + # past 30 min in the long tail. await fc.put(sock, "/snapshot/create", { "snapshot_type": "Full", "snapshot_path": str(vm.snapshot_state), "mem_file_path": str(vm.snapshot_bin), - }, timeout=1800.0) + }, timeout=3600.0) finally: # Try to resume so we can shut down cleanly; ignore failures. with contextlib.suppress(Exception): From 27db4c7a04de928bad0fc9f1c6034401d3453aa7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 21:09:00 +0000 Subject: [PATCH 035/221] playground: snapshot/restore overhaul + UI iteration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Snapshot pipeline: - /opt/clickbench-playground reformatted as XFS so cp --reflink=always can clone golden->working in milliseconds. - _snapshot_disks and _restore_disks switched to reflink (parallel, O(1) extent-list copies). - snapshot.bin no longer compressed; Firecracker mmaps it on restore, pages fault in lazily. - Snapshot is taken with the daemon running: pre-snapshot stop+fstrim +drop_caches is followed by start+check, so restore resumes a live daemon and the first query pays no cold-start cost. - _snapshot_disks runs while VM is paused, before resume. Without this the daemon's post-snapshot kernel writes (journal commits, atime) leaked into the golden disk and surfaced as ext4 EBADMSG on restore. Agent + host wiring: - New /ready endpoint on the in-VM agent; _restore_snapshot waits for /ready (up to 10 min) before reporting state="ready" so slow JVMs like Doris/Druid don't time out on the user's first query. - dockerd restart hook at agent boot — without it docker-using systems fail to launch containers after snapshot restore. - Output streamed and capped at OUTPUT_LIMIT+1 bytes (default 64 KB) with head-style early-kill; default query timeout 600 -> 60 s. - /api/query no longer triggers initial provisioning. Only restore. Initial provision requires explicit /api/admin/provision/. - /api/queries/ returns the system's example queries. - _call_agent_provision: no aiohttp idle timeout, 7-day total cap. - ClickHouse-family stays on the internet after snapshot (datalake variants need S3); rest stays offline. Catalog: - paradedb-partitioned (pg_lakehouse removed upstream) and pg_duckdb-motherduck (needs cloud creds) excluded. - ClickHouse + chdb variants emit Pretty format. - ClickBench: trino/presto-datalake javac classpath uses find for AWS SDK / Hadoop jars instead of pinning a stale jar filename. - ClickBench: cedardb/cedardb-parquet/mongodb start scripts hardened (systemctl restart docker, longer wait windows, better diagnostics). - ClickBench: duckdb start scripts scrub stale *.wal. - ClickBench: arc start broadened admin-token regex. UI: - Catalog rendered as horizontal slabs, colored by state. - Per-system result cache (output + timing) keyed by system name. - Example-query selector populated from /api/queries/. - Down systems swap the query pane for a "Last error" pane. - Stats row trimmed to time + truncated marker. - monospace font, no rounded corners, black selected outline. - Spellcheck / autocomplete / Grammarly opt-outs on the textarea. Bootstrap: - install-firecracker.sh: chown only the top-level state dirs, not recursively (a chown -R was descending into a base-rootfs build's loop mount and flipping /etc/sudoers to uid 1000). - install-firecracker.sh checks the state dir supports reflink and exits with an XFS-format hint if not. - download-datasets.sh fetches hits.json.gz (used by parseable). --- arc/start | 25 ++- cedardb-parquet/start | 30 ++- cedardb/start | 31 ++- chdb-parquet-partitioned/query | 2 +- chdb/query | 24 +-- clickhouse-datalake-partitioned/query | 2 +- clickhouse-datalake/query | 2 +- clickhouse-parquet-partitioned/query | 2 +- clickhouse-parquet/query | 2 +- clickhouse-tencent/query | 2 +- clickhouse-web/query | 2 +- clickhouse/query | 2 +- duckdb-datalake-partitioned/start | 7 + duckdb-datalake/start | 7 + duckdb-parquet-partitioned/start | 7 + duckdb-parquet/start | 8 +- duckdb-vortex-partitioned/start | 7 + duckdb-vortex/start | 7 + duckdb/start | 8 +- mongodb/start | 23 +- parseable/load | 13 +- playground/agent/agent.py | 243 ++++++++++++++++----- playground/docs/architecture.md | 36 +++- playground/scripts/download-datasets.sh | 11 + playground/scripts/install-firecracker.sh | 34 ++- playground/server/main.py | 43 +++- playground/server/systems.py | 22 ++ playground/server/vm_manager.py | 170 +++++++++++---- playground/web/app.js | 245 ++++++++++++++++------ playground/web/index.html | 50 +++-- playground/web/style.css | 87 ++++++-- presto-datalake-partitioned/install | 7 +- presto-datalake/install | 7 +- trino-datalake-partitioned/install | 7 +- trino-datalake/install | 7 +- 35 files changed, 928 insertions(+), 254 deletions(-) diff --git a/arc/start b/arc/start index d06f81cab1..51a27a9b89 100755 --- a/arc/start +++ b/arc/start @@ -14,7 +14,7 @@ fi sudo systemctl start arc # Wait for the HTTP endpoint to come up before we try to read the token. -for _ in $(seq 1 30); do +for _ in $(seq 1 60); do if curl -sf "$ARC_URL/health" >/dev/null 2>&1; then break fi @@ -22,13 +22,30 @@ for _ in $(seq 1 30); do done # On first start, Arc prints its admin token to its journal; capture it. +# The log line has drifted between releases ("Initial admin API token:", +# "Admin API token:", "API token:", ...) and journald can lag behind +# /health, so we retry with a broader regex over ~60 s. if [ ! -f arc_token.txt ] || \ ! curl -sf "$ARC_URL/health" -H "x-api-key: $(cat arc_token.txt)" >/dev/null 2>&1; then - TOKEN=$(sudo journalctl -u arc --no-pager \ - | grep -oP '(?:Initial admin API token|Admin API token): \K[^\s]+' \ - | head -1) + TOKEN="" + for _ in $(seq 1 60); do + sudo journalctl --sync >/dev/null 2>&1 || true + JOURNAL=$(sudo journalctl -u arc --no-pager 2>/dev/null || true) + TOKEN=$(printf '%s\n' "$JOURNAL" \ + | grep -oP '(?:[Ii]nitial[[:space:]]+)?[Aa]dmin[[:space:]]+(?:API[[:space:]]+)?[Tt]oken[[:space:]]*[:=][[:space:]]*\K[^[:space:],]+' \ + | head -1) + if [ -z "$TOKEN" ]; then + TOKEN=$(printf '%s\n' "$JOURNAL" \ + | grep -oP '(?:API[[:space:]]+)?[Tt]oken[[:space:]]*[:=][[:space:]]*\K[A-Za-z0-9_.\-]{16,}' \ + | head -1) + fi + if [ -n "$TOKEN" ]; then break; fi + sleep 1 + done if [ -z "$TOKEN" ]; then echo "Error: Could not extract Arc admin API token from journal" >&2 + echo "---journal tail---" >&2 + sudo journalctl -u arc --no-pager 2>&1 | tail -50 >&2 exit 1 fi echo "$TOKEN" > arc_token.txt diff --git a/cedardb-parquet/start b/cedardb-parquet/start index ad1d714394..981f23f221 100755 --- a/cedardb-parquet/start +++ b/cedardb-parquet/start @@ -5,15 +5,33 @@ if PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null 2>&1; exit 0 fi +# After a VM snapshot+restore, dockerd's in-memory networking/cgroup state +# is out of sync with the (also-restored) kernel-side resources, and the +# next `docker run` either fails or starts a container that can't be +# reached on its mapped port. Restarting dockerd reconciles it. +sudo systemctl restart docker +for _ in $(seq 1 30); do + sudo docker info >/dev/null 2>&1 && break + sleep 1 +done + sudo docker stop cedardb >/dev/null 2>&1 || true sudo docker rm cedardb >/dev/null 2>&1 || true -sudo docker run -d --rm -p 5432:5432 \ - -v "$(pwd)/data:/data" \ - -v "$(pwd)/db:/var/lib/cedardb/data" \ - -e CEDAR_PASSWORD=test \ - --name cedardb cedardb/cedardb:latest >/dev/null +if ! sudo docker run -d --rm -p 5432:5432 \ + -v "$(pwd)/data:/data" \ + -v "$(pwd)/db:/var/lib/cedardb/data" \ + -e CEDAR_PASSWORD=test \ + --name cedardb cedardb/cedardb:latest; then + echo "docker run failed; ps -a:" >&2 + sudo docker ps -a >&2 || true + exit 1 +fi -until pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1; do +for _ in $(seq 1 60); do + pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1 && exit 0 sleep 1 done +echo "cedardb did not become ready in 60 s; container logs:" >&2 +sudo docker logs cedardb 2>&1 | tail -40 >&2 || true +exit 1 diff --git a/cedardb/start b/cedardb/start index 0f4c8b56f6..b6c3bbfe07 100755 --- a/cedardb/start +++ b/cedardb/start @@ -5,16 +5,35 @@ if PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null 2>&1; exit 0 fi +# After a VM snapshot+restore, dockerd's in-memory networking/cgroup state +# is out of sync with the (also-restored) kernel-side resources, and the +# next `docker run` either fails or starts a container that can't be +# reached on its mapped port. Restarting dockerd reconciles it. No-op on +# initial provision (the daemon was just started). +sudo systemctl restart docker +for _ in $(seq 1 30); do + sudo docker info >/dev/null 2>&1 && break + sleep 1 +done + # `docker run --rm` cleans up container on exit; we run detached. sudo docker stop cedardb >/dev/null 2>&1 || true sudo docker rm cedardb >/dev/null 2>&1 || true -sudo docker run -d --rm -p 5432:5432 \ - -v "$(pwd)/data:/data" \ - -v "$(pwd)/db:/var/lib/cedardb/data" \ - -e CEDAR_PASSWORD=test \ - --name cedardb cedardb/cedardb:latest >/dev/null +if ! sudo docker run -d --rm -p 5432:5432 \ + -v "$(pwd)/data:/data" \ + -v "$(pwd)/db:/var/lib/cedardb/data" \ + -e CEDAR_PASSWORD=test \ + --name cedardb cedardb/cedardb:latest; then + echo "docker run failed; ps -a:" >&2 + sudo docker ps -a >&2 || true + exit 1 +fi -until pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1; do +for _ in $(seq 1 60); do + pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1 && exit 0 sleep 1 done +echo "cedardb did not become ready in 60 s; container logs:" >&2 +sudo docker logs cedardb 2>&1 | tail -40 >&2 || true +exit 1 diff --git a/chdb-parquet-partitioned/query b/chdb-parquet-partitioned/query index e32521b589..226baf39d9 100755 --- a/chdb-parquet-partitioned/query +++ b/chdb-parquet-partitioned/query @@ -26,7 +26,7 @@ conn = chdb.connect() start = timeit.default_timer() try: - res = conn.query(query, "CSV") + res = conn.query(query, "Pretty") out = str(res) end = timeit.default_timer() if out: diff --git a/chdb/query b/chdb/query index 25d2dc57fc..f2399c6656 100755 --- a/chdb/query +++ b/chdb/query @@ -17,26 +17,26 @@ cat > "$query_file" python3 - "$query_file" <<'PY' import sys import timeit -from chdb import dbapi +import chdb with open(sys.argv[1]) as f: query = f.read() -con = dbapi.connect(path=".clickbench") -cur = con.cursor() - +# The hits table is created inside the `clickbench` database (see +# create.sql + load), but the session defaults to `default`. Prepend +# USE so a bare `SELECT * FROM hits` resolves regardless. +sess = chdb.session.Session(".clickbench") start = timeit.default_timer() try: - cur._cursor.execute(query) - rows = cur.fetchall() if cur.description else [] + res = sess.query(f"USE clickbench; {query}", "Pretty") + out = str(res) end = timeit.default_timer() + if out: + sys.stdout.write(out) + if not out.endswith("\n"): + sys.stdout.write("\n") finally: - cur.close() - con.close() - -for row in rows: - print(row) + sess.close() -# Last line of stderr: fractional seconds. print(f"{end - start:.3f}", file=sys.stderr) PY diff --git a/clickhouse-datalake-partitioned/query b/clickhouse-datalake-partitioned/query index d491976e0e..2fd7b0e50f 100755 --- a/clickhouse-datalake-partitioned/query +++ b/clickhouse-datalake-partitioned/query @@ -7,4 +7,4 @@ set -e query=$(cat) ./clickhouse local --path . --time --use_page_cache_for_object_storage 1 \ - --query="$query" + --format=Pretty --query="$query" diff --git a/clickhouse-datalake/query b/clickhouse-datalake/query index d491976e0e..2fd7b0e50f 100755 --- a/clickhouse-datalake/query +++ b/clickhouse-datalake/query @@ -7,4 +7,4 @@ set -e query=$(cat) ./clickhouse local --path . --time --use_page_cache_for_object_storage 1 \ - --query="$query" + --format=Pretty --query="$query" diff --git a/clickhouse-parquet-partitioned/query b/clickhouse-parquet-partitioned/query index a157a84bf3..1a4ddd7a4d 100755 --- a/clickhouse-parquet-partitioned/query +++ b/clickhouse-parquet-partitioned/query @@ -5,4 +5,4 @@ set -e query=$(cat) -./clickhouse local --time --query="$(cat create.sql); ${query}" +./clickhouse local --time --format=Pretty --query="$(cat create.sql); ${query}" diff --git a/clickhouse-parquet/query b/clickhouse-parquet/query index a157a84bf3..1a4ddd7a4d 100755 --- a/clickhouse-parquet/query +++ b/clickhouse-parquet/query @@ -5,4 +5,4 @@ set -e query=$(cat) -./clickhouse local --time --query="$(cat create.sql); ${query}" +./clickhouse local --time --format=Pretty --query="$(cat create.sql); ${query}" diff --git a/clickhouse-tencent/query b/clickhouse-tencent/query index 9ef756b1f8..aae8e3b6c4 100755 --- a/clickhouse-tencent/query +++ b/clickhouse-tencent/query @@ -6,4 +6,4 @@ set -e query=$(cat) -clickhouse-client --time --query="$query" +clickhouse-client --time --format=Pretty --query="$query" diff --git a/clickhouse-web/query b/clickhouse-web/query index 72a6eda1e8..105302f569 100755 --- a/clickhouse-web/query +++ b/clickhouse-web/query @@ -9,4 +9,4 @@ set -e query=$(cat) clickhouse-client --query "SYSTEM DROP FILESYSTEM CACHE" >/dev/null -clickhouse-client --time --query="$query" +clickhouse-client --time --format=Pretty --query="$query" diff --git a/clickhouse/query b/clickhouse/query index c6abe5b818..6d5c47fb11 100755 --- a/clickhouse/query +++ b/clickhouse/query @@ -5,4 +5,4 @@ # Exit non-zero on error. set -e -clickhouse-client --time +clickhouse-client --time --format=Pretty diff --git a/duckdb-datalake-partitioned/start b/duckdb-datalake-partitioned/start index 06bd986563..e7a4fb1f74 100755 --- a/duckdb-datalake-partitioned/start +++ b/duckdb-datalake-partitioned/start @@ -1,2 +1,9 @@ #!/bin/bash +# duckdb is embedded — no daemon to start. We do clean any stale WAL +# files that may have been captured in the snapshot: a .wal whose footer +# wasn't fully flushed pre-snapshot can leave duckdb refusing to open +# the database ("IO Error: ... Bad message"). The .wal is regenerated +# on the next write, so dropping it is safe between query sessions. +shopt -s nullglob +rm -f ./*.wal ./*.db.wal exit 0 diff --git a/duckdb-datalake/start b/duckdb-datalake/start index 06bd986563..e7a4fb1f74 100755 --- a/duckdb-datalake/start +++ b/duckdb-datalake/start @@ -1,2 +1,9 @@ #!/bin/bash +# duckdb is embedded — no daemon to start. We do clean any stale WAL +# files that may have been captured in the snapshot: a .wal whose footer +# wasn't fully flushed pre-snapshot can leave duckdb refusing to open +# the database ("IO Error: ... Bad message"). The .wal is regenerated +# on the next write, so dropping it is safe between query sessions. +shopt -s nullglob +rm -f ./*.wal ./*.db.wal exit 0 diff --git a/duckdb-parquet-partitioned/start b/duckdb-parquet-partitioned/start index 06bd986563..e7a4fb1f74 100755 --- a/duckdb-parquet-partitioned/start +++ b/duckdb-parquet-partitioned/start @@ -1,2 +1,9 @@ #!/bin/bash +# duckdb is embedded — no daemon to start. We do clean any stale WAL +# files that may have been captured in the snapshot: a .wal whose footer +# wasn't fully flushed pre-snapshot can leave duckdb refusing to open +# the database ("IO Error: ... Bad message"). The .wal is regenerated +# on the next write, so dropping it is safe between query sessions. +shopt -s nullglob +rm -f ./*.wal ./*.db.wal exit 0 diff --git a/duckdb-parquet/start b/duckdb-parquet/start index c1d4b2fca8..e7a4fb1f74 100755 --- a/duckdb-parquet/start +++ b/duckdb-parquet/start @@ -1,3 +1,9 @@ #!/bin/bash -# duckdb is an embedded CLI tool — no daemon to start. +# duckdb is embedded — no daemon to start. We do clean any stale WAL +# files that may have been captured in the snapshot: a .wal whose footer +# wasn't fully flushed pre-snapshot can leave duckdb refusing to open +# the database ("IO Error: ... Bad message"). The .wal is regenerated +# on the next write, so dropping it is safe between query sessions. +shopt -s nullglob +rm -f ./*.wal ./*.db.wal exit 0 diff --git a/duckdb-vortex-partitioned/start b/duckdb-vortex-partitioned/start index 06bd986563..e7a4fb1f74 100755 --- a/duckdb-vortex-partitioned/start +++ b/duckdb-vortex-partitioned/start @@ -1,2 +1,9 @@ #!/bin/bash +# duckdb is embedded — no daemon to start. We do clean any stale WAL +# files that may have been captured in the snapshot: a .wal whose footer +# wasn't fully flushed pre-snapshot can leave duckdb refusing to open +# the database ("IO Error: ... Bad message"). The .wal is regenerated +# on the next write, so dropping it is safe between query sessions. +shopt -s nullglob +rm -f ./*.wal ./*.db.wal exit 0 diff --git a/duckdb-vortex/start b/duckdb-vortex/start index 06bd986563..e7a4fb1f74 100755 --- a/duckdb-vortex/start +++ b/duckdb-vortex/start @@ -1,2 +1,9 @@ #!/bin/bash +# duckdb is embedded — no daemon to start. We do clean any stale WAL +# files that may have been captured in the snapshot: a .wal whose footer +# wasn't fully flushed pre-snapshot can leave duckdb refusing to open +# the database ("IO Error: ... Bad message"). The .wal is regenerated +# on the next write, so dropping it is safe between query sessions. +shopt -s nullglob +rm -f ./*.wal ./*.db.wal exit 0 diff --git a/duckdb/start b/duckdb/start index c1d4b2fca8..e7a4fb1f74 100755 --- a/duckdb/start +++ b/duckdb/start @@ -1,3 +1,9 @@ #!/bin/bash -# duckdb is an embedded CLI tool — no daemon to start. +# duckdb is embedded — no daemon to start. We do clean any stale WAL +# files that may have been captured in the snapshot: a .wal whose footer +# wasn't fully flushed pre-snapshot can leave duckdb refusing to open +# the database ("IO Error: ... Bad message"). The .wal is regenerated +# on the next write, so dropping it is safe between query sessions. +shopt -s nullglob +rm -f ./*.wal ./*.db.wal exit 0 diff --git a/mongodb/start b/mongodb/start index 9e8bafc100..cbaa3c83ba 100755 --- a/mongodb/start +++ b/mongodb/start @@ -1,15 +1,28 @@ #!/bin/bash set -e -sudo systemctl start mongod +sudo systemctl start mongod || true -# Enable the planner option used by ClickBench (covered whole-index scans). -# This is a runtime parameter that resets on restart, so we re-apply on every -# start. Wait briefly for the server to accept connections first. -for _ in $(seq 1 60); do +# Wait up to ~3 minutes for mongod to accept connections. mongod can take a +# while on cold start (oplog init, etc.). If it never comes up, dump the +# unit status and log tail so the failure is actionable. +ok=0 +for _ in $(seq 1 180); do if mongosh --quiet --eval "db.runCommand('ping').ok" >/dev/null 2>&1; then + ok=1 break fi sleep 1 done + +if [ "$ok" != 1 ]; then + echo "mongod did not become reachable on 127.0.0.1:27017 after 180 s" >&2 + sudo systemctl status mongod --no-pager -l 2>&1 | tail -30 >&2 || true + echo "---mongod log tail---" >&2 + sudo tail -60 /var/log/mongodb/mongod.log 2>&1 >&2 || true + exit 1 +fi + +# Enable the planner option used by ClickBench (covered whole-index scans). +# Runtime parameter — resets on each restart so we re-apply. mongosh --quiet --eval 'db.adminCommand({setParameter: 1, internalQueryPlannerGenerateCoveredWholeIndexScans: true});' >/dev/null diff --git a/parseable/load b/parseable/load index 3f74150940..2f10233c25 100755 --- a/parseable/load +++ b/parseable/load @@ -3,11 +3,18 @@ set -eu NUM_CORES=$(nproc) -wget --continue --progress=dot:giga \ - 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' +# In the playground, the gzipped JSON dataset is shipped read-only at +# /opt/clickbench/datasets_ro/hits.json.gz; symlink it instead of wget'ing +# 4.6 GB on every parseable provision. +if [ -f /opt/clickbench/datasets_ro/hits.json.gz ]; then + ln -sf /opt/clickbench/datasets_ro/hits.json.gz hits.json.gz +else + wget --continue --progress=dot:giga \ + 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' +fi # Decompress with progress. -FILE_SIZE=$(stat -c %s hits.json.gz) +FILE_SIZE=$(stat -L -c %s hits.json.gz) pv -s "$FILE_SIZE" hits.json.gz | pigz -d > hits.json # Split into chunks wrapped in [ ... , ... ] arrays for parseable's ingest API. diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 0ec850d630..b5a2074b95 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -53,11 +53,15 @@ # any well-known ephemeral range too. 50080 keeps a vague "HTTP-ish" feel. LISTEN_PORT = int(os.environ.get("CLICKBENCH_AGENT_PORT", "50080")) # 10 KB cap, matching the spec. Configurable for testing. -OUTPUT_LIMIT = int(os.environ.get("CLICKBENCH_OUTPUT_LIMIT", "10240")) +OUTPUT_LIMIT = int(os.environ.get("CLICKBENCH_OUTPUT_LIMIT", "65536")) # Per-query wall-clock cap so a runaway query can't tie up a VM forever. -QUERY_TIMEOUT = int(os.environ.get("CLICKBENCH_QUERY_TIMEOUT", "600")) +QUERY_TIMEOUT = int(os.environ.get("CLICKBENCH_QUERY_TIMEOUT", "60")) # Provision (install/start/load) can legitimately take an hour for some systems. -PROVISION_TIMEOUT = int(os.environ.get("CLICKBENCH_PROVISION_TIMEOUT", "7200")) +# Per-step timeout for install/start/load. Some real-world systems load +# 100 M rows over many hours (postgres + indexes, cratedb, cockroachdb, +# yugabyte, etc.). 7 days covers anything reasonable without being +# unbounded. +PROVISION_TIMEOUT = int(os.environ.get("CLICKBENCH_PROVISION_TIMEOUT", str(7 * 86400))) STATE_DIR.mkdir(parents=True, exist_ok=True) PROVISION_DONE = STATE_DIR / "provisioned" @@ -188,9 +192,21 @@ def _run_query(sql: bytes) -> tuple[int, bytes, bytes, float]: stdout: result (whatever format the system uses) stderr: timing in fractional seconds on the LAST numeric line exit code: 0 on success + + Stops reading stdout once we've buffered OUTPUT_LIMIT+1 bytes (one + extra so _cap can detect the overflow) and kills the process group — + "SELECT * FROM hits" generates ~14 GB of output and we don't want + the agent to spin buffering it. Stderr is read on a background + thread so a chatty stderr can't deadlock the stdout pipe. """ + import select + import threading script = _system_script("query") t0 = time.monotonic() + deadline = t0 + QUERY_TIMEOUT + cap = OUTPUT_LIMIT + 1 # +1 byte so _cap() can detect overflow + stdout_buf = bytearray() + stderr_buf = bytearray() try: p = subprocess.Popen( [str(script)], @@ -200,18 +216,59 @@ def _run_query(sql: bytes) -> tuple[int, bytes, bytes, float]: cwd=str(SYSTEM_DIR), preexec_fn=os.setsid, ) - try: - stdout, stderr = p.communicate(input=sql, timeout=QUERY_TIMEOUT) - rc = p.returncode - except subprocess.TimeoutExpired: - # The system might still be inside its query; kill the whole group. - with contextlib.suppress(ProcessLookupError): - os.killpg(p.pid, signal.SIGKILL) - stdout, stderr = p.communicate() - rc = -9 except Exception as e: return 255, b"", f"agent: failed to invoke ./query: {e}\n".encode(), 0.0 - return rc, stdout, stderr, time.monotonic() - t0 + + def _drain_stderr() -> None: + for chunk in iter(lambda: p.stderr.read(8192), b""): + stderr_buf.extend(chunk) + err_thread = threading.Thread(target=_drain_stderr, daemon=True) + err_thread.start() + + try: + if sql: + p.stdin.write(sql) + p.stdin.close() + except BrokenPipeError: + pass + + killed_for = "" # "timeout", "cap", or "" + while True: + remaining = deadline - time.monotonic() + if remaining <= 0: + killed_for = "timeout" + break + if len(stdout_buf) >= cap: + killed_for = "cap" + break + r, _, _ = select.select([p.stdout], [], [], min(remaining, 0.5)) + if r: + chunk = p.stdout.read1(min(8192, cap - len(stdout_buf))) + if not chunk: + break # EOF + stdout_buf.extend(chunk) + elif p.poll() is not None: + break + + if killed_for: + with contextlib.suppress(ProcessLookupError): + os.killpg(p.pid, signal.SIGKILL) + + try: + rc = p.wait(timeout=5) + except subprocess.TimeoutExpired: + with contextlib.suppress(ProcessLookupError): + os.killpg(p.pid, signal.SIGKILL) + rc = -9 + + if killed_for == "timeout": + rc = -9 + err_thread.join(timeout=2) + with contextlib.suppress(Exception): + p.stdout.close() + with contextlib.suppress(Exception): + p.stderr.close() + return rc, bytes(stdout_buf), bytes(stderr_buf), time.monotonic() - t0 def _extract_script_timing(stderr: bytes) -> float | None: @@ -320,24 +377,22 @@ def _provision() -> tuple[int, bytes]: PROVISION_LOG.write_bytes(b"".join(log_lines)) return r.returncode, b"".join(log_lines) - # Pre-snapshot trim. The host /sync's the FS right before pausing - # the vcpus, so any on-disk data the daemon has already committed - # is durable. That means we're free to stop the daemon here: - # ClickHouse's MergeTree (and equivalent on-disk stores) never - # produce inconsistent on-disk state regardless of when the - # process exits — only an unflushed *filesystem* can. With the - # host-side /sync in place, we can shut the daemon down to evict - # its private heap (merge thread arenas, query cache, mark cache, - # uncompressed cache, parquet ingest buffers, …) and snapshot a - # mostly-zero RAM image. The agent's startup path - # (_kick_daemon_if_provisioned) brings it back up on every - # restore, so the first query in a restored VM pays a 1-2 s - # daemon-start cost instead of carrying 8-12 GB of memory in - # every snapshot. - # - # Skip for in-process / stateless tools where stop/start is a - # no-op AND the data lives in process memory; wiping it would - # defeat the point. Those systems can rely on drop_caches alone. + # Pre-snapshot housekeeping. Order: + # 1) ./stop — drop the daemon's heap (merge arenas, query cache, + # mark cache, parquet ingest buffers, ...) so we can fstrim + # and drop_caches against a quiet system. + # 2) sync + drop_caches — flush dirty pages, evict the page + # cache, so init_on_free=1 zeroes everything that was + # cache. Snapshot then sees a mostly-zero free pool. + # 3) fstrim — DISCARD free blocks on the per-VM disks so the + # sparse backing file punches holes for bytes the load + # script `mv`'d in and `rm`'d (14-75 GB of dataset). + # 4) ./start + ./check — bring the daemon back up *into* the + # snapshot. Restore then resumes a daemon that's already + # serving, paying zero cold-start cost. + # Skip stop/start for systems without a real daemon (chdb, + # polars, duckdb): they're in-process tools with no separate + # process to manage. stop = SYSTEM_DIR / "stop" start = SYSTEM_DIR / "start" check = SYSTEM_DIR / "check" @@ -363,33 +418,59 @@ def _provision() -> tuple[int, bytes]: time.sleep(0.5) log_lines.append(b"=== pre-snapshot stop done ===\n") - # Drop the page+dentry+inode cache. With init_on_free=1 set in the - # guest kernel cmdline (see vm_manager._kernel_cmdline), every page - # the kernel frees gets zero-filled before going back on the free - # list. After daemon stop + drop_caches, the entire free pool - # is genuinely zero-filled, and the snapshot's RAM dump compresses - # ~300:1 instead of the ~3:1 we got without init_on_free. + # Drop the page+dentry+inode cache. With init_on_free=1 set in + # the guest kernel cmdline (see vm_manager._kernel_cmdline), every + # page the kernel frees gets zero-filled before going back on the + # free list, so what we snapshot is mostly-zero. subprocess.run(["sync"], check=False) try: Path("/proc/sys/vm/drop_caches").write_text("3\n") except Exception: pass - # fstrim the per-VM disks. Load scripts typically do `mv hits.parquet - # /var/lib//user_files/` (which on overlay/cross-FS copies the - # 14-75 GB dataset into the writable per-VM disk) and then `rm` it - # after the INSERT. ext4 marks those blocks free but the underlying - # virtio-blk file still holds the bytes — the snapshot's golden disk - # then carries a full copy of the dataset that the load script - # already discarded. `fstrim` sends DISCARD for free blocks; the - # host loop driver responds by punching holes in the sparse backing - # file, so the golden ends up holding only the bytes the engine - # actually keeps (MergeTree parts, hits.db, etc.). + # fstrim the per-VM disks so transient dataset bytes from + # `mv hits.parquet ... ; rm` don't end up in the golden disk. for mnt in ("/opt/clickbench/sysdisk", "/"): subprocess.run(["fstrim", mnt], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=300, check=False) + # Restart the daemon so the snapshot captures it *running*. The + # restored VM then doesn't pay any cold-start cost; the daemon's + # process state, JIT/class-cache, connection pools, etc. all + # come back live. + if has_daemon: + log_lines.append(b"\n=== pre-snapshot start ===\n") + r = subprocess.run([str(start)], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + timeout=PROVISION_TIMEOUT, check=False) + log_lines.append(r.stdout or b"") + log_lines.append(b"start: rc=" + str(r.returncode).encode() + b"\n") + # Wait for ./check before snapshotting — we want the daemon + # actually accepting queries when the memory image is captured. + ok = False + t0 = time.monotonic() + while time.monotonic() - t0 < 900: + rc = subprocess.run([str(check)], cwd=str(SYSTEM_DIR), + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=10, check=False).returncode + if rc == 0: + ok = True + break + time.sleep(0.5) + if ok: + log_lines.append(b"=== pre-snapshot start ok ===\n") + _daemon_started.set() # the snapshot ships a running daemon + else: + log_lines.append(b"=== pre-snapshot start: check did not " + b"succeed in 900 s; snapshot will need a " + b"cold start on restore ===\n") + # Sync once more so any data the just-started daemon wrote + # (lock files, sockets, recovery markers) is on disk before + # the host snapshots the rootfs/sysdisk. + subprocess.run(["sync"], check=False) + PROVISION_DONE.write_text(f"ok {time.time()}\n") PROVISION_LOG.write_bytes(b"".join(log_lines)) return 0, b"".join(log_lines) @@ -419,6 +500,15 @@ def do_GET(self) -> None: self._send_json(200, {"ok": True, "system": SYSTEM_NAME, "provisioned": PROVISION_DONE.exists()}) return + if self.path == "/ready": + # True when the system's daemon is fully accepting queries. + # The host uses this at restore time to gate VM-state="ready" + # for slow daemons (Doris, Druid, Trino, etc.); without it + # the first user query arrives mid-start and times out. + ready = _daemon_started.is_set() + self._send_json(200 if ready else 503, + {"ready": ready, "system": SYSTEM_NAME}) + return if self.path == "/stats": self._send_json(200, _stats_snapshot()) return @@ -489,6 +579,40 @@ class ReusableServer(socketserver.ThreadingTCPServer): daemon_threads = True +def _reconcile_docker_after_restore() -> None: + """Restart dockerd if it's active, to recover from snapshot-restore + skew. + + Why: after a Firecracker memory snapshot+restore, dockerd is resumed + in userspace but the (also-restored) kernel-side networking and cgroup + state is in flux. Symptom: `docker run` either fails or starts a + container that's unreachable on its mapped port (cedardb, byconity, + trino, etc.). `systemctl restart docker` reconciles the daemon to the + current kernel state. No-op on systems that don't use docker, and a + cheap ~2 s on initial provision (docker was just started anyway). + """ + rc = subprocess.run( + ["systemctl", "is-active", "--quiet", "docker"], + check=False, + ).returncode + if rc != 0: + return # docker isn't installed / not active + try: + subprocess.run(["sudo", "systemctl", "restart", "docker"], + check=False, timeout=60) + # Wait for the daemon to come back. + for _ in range(30): + r = subprocess.run(["sudo", "docker", "info"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=False, timeout=5).returncode + if r == 0: + return + time.sleep(1) + except Exception as e: + sys.stderr.write(f"[agent] docker reconcile failed: {e}\n") + + def _kick_daemon_if_provisioned() -> None: """On every agent boot, if the system has been provisioned, make sure the daemon is also running. @@ -513,11 +637,33 @@ def _kick_daemon_if_provisioned() -> None: def _bg() -> None: try: + # Slow daemons (Doris, Druid, Trino) can take >5 min to come + # up. The host's /ready poll has its own deadline; here we + # only need a generous upper bound to prevent an infinite + # hang. subprocess.run([str(start)], cwd=str(SYSTEM_DIR), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - timeout=300, check=False) + timeout=900, check=False) + check = SYSTEM_DIR / "check" + if check.exists(): + # Poll ./check until it succeeds — that's the daemon's + # own definition of "ready", and the host probes /ready + # for this flag. + for _ in range(240): + rc = subprocess.run([str(check)], cwd=str(SYSTEM_DIR), + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=10, check=False).returncode + if rc == 0: + break + time.sleep(0.5) + _daemon_started.set() except Exception as e: sys.stderr.write(f"[agent] daemon-kick failed: {e}\n") + # Still mark started so /query is unblocked even if the + # daemon never comes up — the query will fail with a real + # error rather than hang waiting for /ready forever. + _daemon_started.set() threading.Thread(target=_bg, daemon=True, name="daemon-kick").start() @@ -526,6 +672,7 @@ def main() -> None: addr = ("0.0.0.0", LISTEN_PORT) print(f"agent: system={SYSTEM_NAME} listen={addr[0]}:{addr[1]} " f"dir={SYSTEM_DIR} data={DATASETS_DIR}", flush=True) + _reconcile_docker_after_restore() _kick_daemon_if_provisioned() with ReusableServer(addr, Handler) as srv: srv.serve_forever() diff --git a/playground/docs/architecture.md b/playground/docs/architecture.md index 0507740c41..dfbbab7c19 100644 --- a/playground/docs/architecture.md +++ b/playground/docs/architecture.md @@ -58,16 +58,30 @@ on-disk snapshot. ## Snapshots -Created the first time a system is requested. Two artifacts: - -- `/systems//snapshot.state` — Firecracker VM state metadata -- `/systems//snapshot.bin` — guest memory dump (16 GB in - size as configured, but sparse) - -The `rootfs.ext4` and `system.ext4` files persist across snapshots and are -re-attached at restore time. Drive paths in the snapshot are remapped to -their current host locations on restore so we don't have to re-snapshot if -the playground gets moved or rebooted. +Created the first time a system is requested. Three artifacts: + +- `/systems//snapshot.state` — Firecracker VM metadata +- `/systems//snapshot.bin` — guest memory dump + (mmap'd by Firecracker on restore — left uncompressed so restore is + O(1) host work; pages fault in lazily) +- `/systems//{rootfs,system}.golden.ext4` — frozen disk + state at snapshot time, reflink-cloned at restore + +The host filesystem at `` **must support reflinks** (XFS, or +ext4 with `shared_blocks`). `_snapshot_disks` and `_restore_disks` both +use `cp --reflink=always` so cloning the golden into a working disk is +a constant-time extent-list copy regardless of how much data the system +actually wrote. Without reflinks the playground still works, but every +restore pays a full sparse-cp of the working set. + +Snapshots are taken with the daemon **running** (`./start` is invoked +after the pre-snapshot `./stop` + `fstrim` + `drop_caches`), so a +restored VM resumes with the daemon already serving — no cold-start +cost on the first query. + +Drive paths in the snapshot are remapped to their current host locations +on restore so we don't have to re-snapshot if the playground gets moved +or rebooted. ## Networking @@ -88,7 +102,7 @@ deleted — outbound traffic is dropped, the host↔guest link remains. Truncation is applied **inside the agent**, before bytes leave the VM: - Stdout from the system's `./query` script is capped at - `CLICKBENCH_OUTPUT_LIMIT` bytes (default 10 KB). + `CLICKBENCH_OUTPUT_LIMIT` bytes (default 64 KB). - The agent's response sets `X-Output-Truncated: 1` and `X-Output-Bytes: ` so the client can show "this is a partial result of N bytes." diff --git a/playground/scripts/download-datasets.sh b/playground/scripts/download-datasets.sh index b30fff4473..ae0fc7f23c 100755 --- a/playground/scripts/download-datasets.sh +++ b/playground/scripts/download-datasets.sh @@ -60,5 +60,16 @@ else step " cached" fi +step "json.gz" +# Used by parseable. The full hits.json.gz is ~4.6 GB on +# datasets.clickhouse.com. +if [ ! -f "$DATASETS/hits.json.gz" ] || [ "$(stat -c%s "$DATASETS/hits.json.gz" 2>/dev/null || echo 0)" -lt 3500000000 ]; then + wget --continue --progress=dot:giga \ + -O "$DATASETS/hits.json.gz" \ + 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' +else + step " cached" +fi + step "done" du -sh "$DATASETS"/* diff --git a/playground/scripts/install-firecracker.sh b/playground/scripts/install-firecracker.sh index f2dfe9cd84..f5738511c2 100755 --- a/playground/scripts/install-firecracker.sh +++ b/playground/scripts/install-firecracker.sh @@ -9,7 +9,39 @@ FC_VERSION="${FIRECRACKER_VERSION:-v1.13.1}" KERNEL_URL="${GUEST_KERNEL_URL:-https://s3.amazonaws.com/spec.ccfc.min/firecracker-ci/v1.13/x86_64/vmlinux-6.1.141}" sudo mkdir -p "$STATE_DIR"/{bin,kernel,datasets,systems,vms,logs,run,snapshots,tmp,cache} -sudo chown -R "$(id -u):$(id -g)" "$STATE_DIR" +# Only chown the top-level subdirs we created. `chown -R` on $STATE_DIR +# would descend into any live mount underneath it — notably the loop- +# mounted rootfs that build-base-rootfs.sh keeps open under tmp/base-build +# while it's running — and flip /etc/sudoers inside the future VM image +# to uid 1000, breaking sudo on every subsequent provision. +sudo chown "$(id -u):$(id -g)" \ + "$STATE_DIR" \ + "$STATE_DIR"/{bin,kernel,datasets,systems,vms,logs,run,snapshots,tmp,cache} + +# The playground relies on reflink (cp --reflink=always) to clone +# 200 GB-apparent / multi-GB-real per-VM disks in milliseconds instead +# of seconds. ext4 ships reflink support behind the `shared_blocks` +# feature flag, but mke2fs in Ubuntu 22.04 / 24.04 doesn't expose it +# yet — so we format the playground volume as XFS, which has reflink +# enabled by default since mkfs.xfs 4.18 (2018). If you're staging the +# host yourself, set this up before running install-firecracker.sh: +# +# sudo mkfs.xfs -L cbplayground -f /dev/ +# echo 'LABEL=cbplayground /opt/clickbench-playground xfs \ +# defaults,noatime,discard,nofail 0 2' | sudo tee -a /etc/fstab +# sudo mount /opt/clickbench-playground +# +# Sanity-check at install time so a missing reflink is loud: +if ! ( cd "$STATE_DIR" && tmp1="$(mktemp -p .)" && \ + tmp2="$(mktemp -p . -u)" && \ + cp --reflink=always "$tmp1" "$tmp2" 2>/dev/null; rc=$? ; \ + rm -f "$tmp1" "$tmp2"; exit "$rc" ); then + echo "[install] ERROR: $STATE_DIR does not support reflink. The" >&2 + echo "playground needs cp --reflink=always to clone per-VM disks" >&2 + echo "fast. Reformat the volume as XFS (or ext4 with shared_blocks)" >&2 + echo "and re-run this script. See the comment block above." >&2 + exit 1 +fi if [ ! -x "$STATE_DIR/bin/firecracker" ]; then arch="$(uname -m)" diff --git a/playground/server/main.py b/playground/server/main.py index fe6cc86274..8ea74f2343 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -76,6 +76,34 @@ async def handle_system(self, req: web.Request) -> web.Response: "agent_url": self.vmm.agent_url(vm), }) + async def handle_queries(self, req: web.Request) -> web.Response: + """Return example queries for a system from its queries.sql. + + Splits on `;\n` so multi-line queries stay together. Truncates to + a sane upper bound — ClickBench has 43 per system, no need to + cap, but if a fork ships thousands we don't want to ship them + all to the browser. + """ + name = req.match_info["name"] + if name not in self.systems: + raise web.HTTPNotFound() + path = self.cfg.repo_dir / name / "queries.sql" + if not path.exists(): + return web.json_response([]) + text = path.read_text(errors="replace") + # Split on `;\n` then trim. Drop empties. + out = [] + for chunk in text.split(";\n"): + q = chunk.strip() + if not q: + continue + if not q.endswith(";"): + q += ";" + out.append(q) + if len(out) >= 200: + break + return web.json_response(out) + async def handle_provision_log(self, req: web.Request) -> web.Response: name = req.match_info["name"] if name not in self.systems: @@ -175,7 +203,7 @@ async def _dispatch_query(self, system_name: str, sql: bytes try: async with aiohttp.ClientSession() as s: async with s.post(url, data=sql, - timeout=aiohttp.ClientTimeout(total=600)) as r: + timeout=aiohttp.ClientTimeout(total=60)) as r: body = await r.read() headers = {k: r.headers[k] for k in r.headers if k.startswith("X-")} headers.setdefault("X-Output-Bytes", str(len(body))) @@ -203,6 +231,7 @@ def build_app() -> web.Application: app.router.add_get("/api/systems", obj.handle_systems) app.router.add_get("/api/state", obj.handle_state) app.router.add_get("/api/system/{name}", obj.handle_system) + app.router.add_get("/api/queries/{name}", obj.handle_queries) app.router.add_get("/api/provision-log/{name}", obj.handle_provision_log) app.router.add_post("/api/admin/provision/{name}", obj.handle_admin_provision) app.router.add_post("/api/query", obj.handle_query) @@ -214,8 +243,18 @@ async def root_redirect(_r: web.Request) -> web.Response: raise web.HTTPFound("/ui/") async def ui_index(_r: web.Request) -> web.FileResponse: - return web.FileResponse(web_dir / "index.html") + resp = web.FileResponse(web_dir / "index.html") + resp.headers["Cache-Control"] = "no-store" + return resp + + @web.middleware + async def no_cache_static(request: web.Request, handler): + resp = await handler(request) + if request.path.startswith("/ui/"): + resp.headers["Cache-Control"] = "no-store" + return resp + app.middlewares.append(no_cache_static) app.router.add_get("/", root_redirect) app.router.add_get("/ui/", ui_index) app.router.add_get("/ui", ui_index) diff --git a/playground/server/systems.py b/playground/server/systems.py index 49fa76cf15..347de92924 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -48,8 +48,30 @@ # not "broken", just over-provisioned for shared use. "chdb-dataframe", "duckdb-dataframe", "duckdb-memory", "polars-dataframe", "daft-parquet", "daft-parquet-partitioned", + # Upstream is broken or asks for credentials we don't have. + # - paradedb-partitioned: install script aborts ("pg_lakehouse was + # removed from ParadeDB after 0.10.x"); historical benchmark only. + # - pg_duckdb-motherduck: requires MOTHERDUCK_TOKEN (cloud creds). + "paradedb-partitioned", "pg_duckdb-motherduck", } +# Systems we trust to keep outbound internet access *after* the snapshot, +# i.e. at query time. Used by datalake-style benchmarks that read live S3 +# during the query; without internet they fail with a DNS error. Stays +# tight on purpose — adding a system here means user queries from that +# VM can reach the wider internet, so only put ClickHouse-family engines +# here (per request). +TRUSTED_INTERNET: frozenset[str] = frozenset({ + "clickhouse", + "clickhouse-datalake", + "clickhouse-datalake-partitioned", + "clickhouse-parquet", + "clickhouse-parquet-partitioned", + "chdb", + "chdb-parquet", + "chdb-parquet-partitioned", +}) + @dataclass(frozen=True) class System: diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index d28c98b0cc..b338c224d7 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -36,7 +36,7 @@ from . import firecracker as fc from . import net from .config import Config -from .systems import System +from .systems import System, TRUSTED_INTERNET log = logging.getLogger("vm_manager") @@ -69,6 +69,9 @@ class VM: # Provision metadata provisioned_at: Optional[float] = None last_used: float = 0.0 + # Set when state transitions to "ready" (after restore or initial + # provision). Reset on teardown. Used by the UI to show uptime. + ready_since: Optional[float] = None last_error: Optional[str] = None lock: asyncio.Lock = dataclasses.field(default_factory=asyncio.Lock) # Runtime stats refreshed by the monitor @@ -147,10 +150,14 @@ async def ensure_ready_for_query(self, system: str) -> VM: vm.state = "snapshotted" if vm.state == "down": if not _has_snapshot(vm): - # No snapshot (raw or compressed) yet — full provision. - await self._initial_provision(vm) - else: - await self._restore_snapshot(vm) + # No snapshot yet, and /query is not a provisioning + # trigger — the operator has to /api/admin/provision + # explicitly. Refuse here so a stray query doesn't + # spin up a 30-min initial install. + raise RuntimeError( + f"{system}: no snapshot — POST /api/admin/provision" + f"/{system} to build one") + await self._restore_snapshot(vm) elif vm.state == "snapshotted": await self._restore_snapshot(vm) elif vm.state == "provisioning": @@ -275,7 +282,8 @@ async def _initial_provision(self, vm: VM) -> None: await self._call_agent_provision(vm) await self._snapshot(vm) await self._shutdown(vm) - await net.disable_internet(vm.slot) + if vm.system.name not in TRUSTED_INTERNET: + await net.disable_internet(vm.slot) vm.state = "snapshotted" vm.provisioned_at = time.time() log.info("[%s] initial provision complete", vm.system.name) @@ -413,28 +421,28 @@ async def _snapshot(self, vm: VM) -> None: "snapshot_path": str(vm.snapshot_state), "mem_file_path": str(vm.snapshot_bin), }, timeout=3600.0) + # Capture the *disk* state while the VM is still paused — + # the memory snapshot has in-flight references to specific + # inodes / file positions / mmap'd ranges on the rootfs and + # system disks, and any post-pause writes (journal commits, + # atime updates, etc.) by Firecracker on resume torn the + # golden disk relative to the memory image and surface as + # ext4 EBADMSG on restore for whichever file's metadata + # got dirtied. Reflink-clone keeps the working disks live + # for the clean shutdown that follows. + await self._snapshot_disks(vm) finally: # Try to resume so we can shut down cleanly; ignore failures. with contextlib.suppress(Exception): await fc.patch(sock, "/vm", {"state": "Resumed"}) - # Capture the *disk* state too. The memory snapshot is meaningless on - # its own: it has in-flight references to specific inodes / file - # positions / mmap'd ranges on the rootfs and system disks, and if - # those move under it the restored process malfunctions. We sparse- - # copy the disks into a parallel "golden" path; every subsequent - # restore boots off a fresh copy of the golden, so background work - # the daemon does after restore (clickhouse merges, log writes, - # /tmp churn) never persists into the next session. - await self._snapshot_disks(vm) - - # Compress the memory dump with parallel zstd. Firecracker writes the - # *full* 16 GB of guest memory regardless of how much was actually - # used; zstd at -3 with -T0 turns that into ~10-12 GB in a few - # seconds (most of the savings come from the agent's drop_caches - # right before /snapshot — page cache zero-fills compress 50:1). - # snapshot.state stays as-is; it's tiny (~60 KB). - await self._compress_snapshot(vm) + # We no longer compress the memory dump. Firecracker mmaps + # snapshot.bin on restore, so leaving it uncompressed means a + # restore is O(1) for memory (the kernel page-faults pages in + # lazily). The cost is disk: ~16 GB nominal per system. Sparse- + # write + init_on_free=1 + pre-snapshot drop_caches+fstrim keep + # the actual on-disk size to ~5-10% of the apparent size for + # most systems. snapshot.state stays as-is; it's tiny (~60 KB). async def _compress_snapshot(self, vm: VM) -> None: bin_path = vm.snapshot_bin @@ -492,6 +500,15 @@ async def _decompress_snapshot(self, vm: VM) -> None: async def _restore_snapshot(self, vm: VM) -> None: log.info("[%s] restore from snapshot", vm.system.name) + # Restore is the only auto-recovery path from a user /query. If + # the on-disk snapshot is gone (manual wipe, half-built artifact, + # ...) we fail loudly here; the operator has to kick a fresh + # provision via /api/admin/provision/. + if not _has_snapshot(vm): + vm.state = "down" + raise RuntimeError( + f"[{vm.system.name}] snapshot on disk is missing; " + f"POST /api/admin/provision/{vm.system.name} to rebuild") # Always boot from a *fresh copy* of the golden disks captured at # snapshot time. Restore #N inherits zero state from restore #N-1, # which is what makes the playground safe to expose to arbitrary @@ -502,9 +519,17 @@ async def _restore_snapshot(self, vm: VM) -> None: # Firecracker tries to mmap it. await self._decompress_snapshot(vm) await net.ensure_tap(vm.slot) - # internet stays OFF post-snapshot + # Trusted systems (e.g. ClickHouse variants that read live S3 at + # query time) keep outbound internet after restore. Everything + # else stays offline. + if vm.system.name in TRUSTED_INTERNET: + await net.enable_internet(vm.slot) await self._boot(vm, restore_snapshot=True) await self._wait_for_agent(vm, timeout=60) + # Block here until the system's daemon reports ready, so the + # first user query doesn't time out mid-startup. Big upper bound + # for slow JVMs (Doris/Druid/Trino). + await self._wait_for_daemon_ready(vm, timeout=600) vm.state = "ready" def _golden_paths(self, vm: VM) -> tuple[Path, Path, Path, Path]: @@ -519,15 +544,31 @@ def _golden_paths(self, vm: VM) -> tuple[Path, Path, Path, Path]: async def _snapshot_disks(self, vm: VM) -> None: rootfs, sysdisk, rootfs_gold, sysdisk_gold = self._golden_paths(vm) - # Atomically swap: rename the working images into the golden slot. - # Both disks were sync'd via /sync before /snapshot/create, so - # what's on disk is consistent with what's in the memory snapshot. - # We'll re-create the working images by cloning from the golden - # on every restore (see _restore_disks). - for src, dst in ((rootfs, rootfs_gold), (sysdisk, sysdisk_gold)): + # Reflink-clone the working images into the golden slot. We can't + # rename: the working file stays bound to Firecracker's open + # virtio-blk fd through the post-snapshot resume + shutdown, and + # any writes during that window would leak into the golden (we + # observed restored systems hitting ext4 EBADMSG on small files + # like duckdb's hits.db.wal and a venv activate script). With + # reflink the snapshot is near-instant; the working file's + # post-snapshot writes diverge into its own extents and don't + # touch the golden. + async def _clone(src: Path, dst: Path) -> None: if dst.exists(): dst.unlink() - os.replace(src, dst) + proc = await asyncio.create_subprocess_exec( + "cp", "--reflink=always", str(src), str(dst), + stderr=asyncio.subprocess.PIPE, + ) + _, err = await proc.communicate() + if proc.returncode != 0: + raise RuntimeError( + f"reflink snapshot cp {src} -> {dst} failed: " + f"{err.decode(errors='replace')[-400:]}") + await asyncio.gather( + _clone(rootfs, rootfs_gold), + _clone(sysdisk, sysdisk_gold), + ) log.info("[%s] golden disks saved (%s, %s)", vm.system.name, _fmt_size(rootfs_gold.stat().st_size), _fmt_size(sysdisk_gold.stat().st_size)) @@ -537,19 +578,32 @@ async def _restore_disks(self, vm: VM) -> None: if not rootfs_gold.exists() or not sysdisk_gold.exists(): raise RuntimeError( f"[{vm.system.name}] missing golden disks; cannot restore") - # Clone the goldens into fresh working copies. `cp --sparse=always` - # only writes the non-zero blocks, so the cost is proportional to - # the actual data on each disk, not its apparent 200 GB. - for src, dst in ((rootfs_gold, rootfs), (sysdisk_gold, sysdisk)): + # Reflink-clone the goldens into fresh working copies. The host + # filesystem must be ext4 with the `reflink` feature enabled (or + # XFS / btrfs / any other CoW-capable fs) — see + # playground/scripts/install-firecracker.sh. Clones are O(1) + # extent-list copies; the real cost is paid lazily on first + # write to a shared block. With reflink, a restore goes from + # 5-30 s (full sparse-cp) to a few ms. + # Both clones can run concurrently; they touch disjoint files. + async def _clone(src: Path, dst: Path) -> None: if dst.exists(): dst.unlink() proc = await asyncio.create_subprocess_exec( - "cp", "--sparse=always", str(src), str(dst), + "cp", "--reflink=always", str(src), str(dst), + stderr=asyncio.subprocess.PIPE, ) - rc = await proc.wait() - if rc != 0: - raise RuntimeError(f"cp {src} -> {dst} failed rc={rc}") - log.info("[%s] working disks cloned from golden", vm.system.name) + _, err = await proc.communicate() + if proc.returncode != 0: + raise RuntimeError( + f"reflink cp {src} -> {dst} failed: " + f"{err.decode(errors='replace')[-400:]}") + await asyncio.gather( + _clone(rootfs_gold, rootfs), + _clone(sysdisk_gold, sysdisk), + ) + log.info("[%s] working disks reflink-cloned from golden", + vm.system.name) async def _shutdown(self, vm: VM) -> None: """Best-effort clean shutdown of the firecracker process. @@ -630,11 +684,43 @@ async def _wait_for_agent(self, vm: VM, *, timeout: float) -> None: await asyncio.sleep(0.5) raise RuntimeError(f"agent unreachable after {timeout}s: {last_err!r}") + async def _wait_for_daemon_ready(self, vm: VM, *, timeout: float) -> None: + """Wait for the system's daemon to start serving (post-restore). + + Slow JVM daemons (Doris, Druid, Trino) can take several minutes to + come up after a snapshot restore. The agent's daemon-kick thread + runs ./start + ./check in the background; /ready flips to 200 once + that completes. Without this gate, the first user query lands + mid-start and times out at the host's 60 s query budget. + """ + url = self.agent_url(vm) + "/ready" + t0 = time.monotonic() + async with aiohttp.ClientSession() as s: + while time.monotonic() - t0 < timeout: + try: + async with s.get(url, timeout=aiohttp.ClientTimeout(total=2)) as r: + if r.status == 200: + return + except Exception: + pass + await asyncio.sleep(1.0) + log.warning("[%s] daemon not ready after %s s; serving queries anyway", + vm.system.name, timeout) + async def _call_agent_provision(self, vm: VM) -> None: url = self.agent_url(vm) + "/provision" + # No fast idle check — /provision is a single POST that returns + # only when install+load is fully done. The TCP connection sits + # idle (no body streaming) for the entire run. Some systems take + # many hours to load 100 M rows; we just set a generous total + # deadline so a genuinely stuck call eventually breaks. async with aiohttp.ClientSession() as s: - # Provision can take a very long time (apt-get install jdk, etc.) - async with s.post(url, timeout=aiohttp.ClientTimeout(total=7200)) as r: + async with s.post( + url, + timeout=aiohttp.ClientTimeout( + total=7 * 86400, sock_connect=30, + ), + ) as r: body = await r.read() if r.status >= 300: raise RuntimeError(f"agent /provision failed: {r.status}: " diff --git a/playground/web/app.js b/playground/web/app.js index fb29eb0d7f..ca72780f53 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -1,114 +1,225 @@ // ClickBench Playground — minimal vanilla-JS client. // -// Talks to the host API. Three things happen here: -// 1. On load, fetch /api/systems and populate the system dropdown. Pre-select -// whatever's in the URL hash (e.g. #clickhouse) or the first one. -// 2. On selection change, poll /api/system/ every 2s and update the -// state pill so the user can see when provisioning finishes / a VM is -// restarted by the watchdog. -// 3. On "Run query", POST the SQL to /api/query?system=, parse the -// response headers for timing, render bytes as text (best-effort UTF-8). +// Talks to the host API. +// 1. On load, fetch /api/systems for the catalog and /api/state for live +// states. Render systems as a vertical list, colored by current state. +// 2. Re-poll /api/state every 2 s and re-color the list. The currently +// selected system also re-renders its status JSON blob below. +// 3. On click of a system row, select it. On "Run query", POST the SQL to +// /api/query?system= and render output as plain text in a
.
 
 const $ = (sel) => document.querySelector(sel);
 
-const sysSelect = $("#system");
+const listEl = $("#system-list");
 const queryEl = $("#query");
 const runBtn = $("#run");
-const statePill = $("#state-pill");
+const selectedEl = $("#selected-system");
 const outEl = $("#output");
+const outLabelEl = $("#output-label");
 const timeEl = $("#time");
-const wallEl = $("#wall");
-const bytesEl = $("#bytes");
-const truncEl = $("#truncated");
-const exitEl = $("#exit");
 const stateBlob = $("#state-blob");
+const lastErrorEl = $("#last-error");
+const exampleSel = $("#example");
+const uiActive = ["#ui-active", "#ui-query", "#ui-stats", "#ui-output"].map($);
+const uiDown = $("#ui-down");
 
+let catalog = [];          // [{name, display_name, data_format, ...}]
+let stateByName = {};      // {name: {state, ...}}
+let selected = null;       // selected system name
 let pollTimer = null;
-let knownSystems = [];
+let resultsByName = {};    // {name: {output, time, wall, bytes, truncated, exit}}
+let queriesByName = {};    // {name: [q1, q2, ...]}
 
-async function loadSystems() {
+async function loadCatalog() {
     const r = await fetch("/api/systems");
-    knownSystems = await r.json();
-    knownSystems.sort((a, b) => a.display_name.localeCompare(b.display_name));
-    sysSelect.innerHTML = "";
-    for (const s of knownSystems) {
+    catalog = await r.json();
+    catalog.sort((a, b) => a.display_name.localeCompare(b.display_name));
+    renderList();
+    const hash = (location.hash || "").slice(1);
+    if (hash && catalog.some(s => s.name === hash)) {
+        select(hash);
+    } else if (catalog.length) {
+        select(catalog[0].name);
+    }
+}
+
+function renderList() {
+    listEl.innerHTML = "";
+    for (const s of catalog) {
+        const st = (stateByName[s.name] && stateByName[s.name].state) || "down";
+        const row = document.createElement("div");
+        row.className = `system-item state-${st}` + (s.name === selected ? " selected" : "");
+        row.dataset.name = s.name;
+        row.textContent = s.display_name;
+        row.addEventListener("click", () => select(s.name));
+        listEl.appendChild(row);
+    }
+}
+
+function select(name) {
+    selected = name;
+    location.hash = name;
+    selectedEl.textContent = name;
+    for (const row of listEl.children) {
+        row.classList.toggle("selected", row.dataset.name === name);
+    }
+    if (stateByName[name]) {
+        stateBlob.textContent = JSON.stringify(stateByName[name], null, 2);
+    }
+    showResult(resultsByName[name]);
+    // If the user has typed something, keep it across system switches —
+    // they're likely composing one query against multiple systems. Only
+    // when the textarea is empty does loadExamples populate Q1.
+    loadExamples(name);
+    refreshDownUI();
+}
+
+async function loadExamples(name) {
+    let qs = queriesByName[name];
+    if (!qs) {
+        try {
+            const r = await fetch(`/api/queries/${encodeURIComponent(name)}`);
+            qs = r.ok ? await r.json() : [];
+        } catch (e) {
+            qs = [];
+        }
+        queriesByName[name] = qs;
+    }
+    if (selected !== name) return;  // user moved on
+    exampleSel.innerHTML = "";
+    if (!qs.length) {
         const o = document.createElement("option");
-        o.value = s.name;
-        o.textContent = `${s.display_name}  (${s.data_format})`;
-        sysSelect.appendChild(o);
+        o.textContent = "(no examples)";
+        o.disabled = true;
+        exampleSel.appendChild(o);
+    } else {
+        for (let i = 0; i < qs.length; i++) {
+            const o = document.createElement("option");
+            o.value = String(i);
+            // Single-line label: first 90 chars of the query.
+            const label = qs[i].replace(/\s+/g, " ").slice(0, 90);
+            o.textContent = `Q${i + 1}: ${label}`;
+            exampleSel.appendChild(o);
+        }
     }
-    // Allow #clickhouse style deep links
-    const hash = (location.hash || "").slice(1);
-    if (hash && knownSystems.some(s => s.name === hash)) {
-        sysSelect.value = hash;
+    // Only populate the first example if the textarea is empty —
+    // anything the user has typed stays put when switching systems.
+    if (!queryEl.value.trim() && qs.length) {
+        queryEl.value = qs[0];
     }
-    onSystemChange();
+}
+
+let lastDownShownName = null;
+
+function refreshDownUI() {
+    const s = stateByName[selected];
+    const isDown = s && s.state === "down";
+    for (const el of uiActive) {
+        if (el) el.style.display = isDown ? "none" : "";
+    }
+    uiDown.style.display = isDown ? "" : "none";
+    if (isDown) {
+        // Render the last error once per selection. If poll picks up a
+        // new last_error for the same system later, leave the UI alone
+        // — the user is reading the text, we shouldn't move it under
+        // their eyes.
+        if (lastDownShownName !== selected) {
+            const raw = (s && s.last_error) || "(no error recorded)";
+            lastErrorEl.textContent = raw
+                .replace(/\\n/g, "\n")
+                .replace(/\\t/g, "\t")
+                .replace(/\\r/g, "");
+            lastDownShownName = selected;
+        }
+    } else {
+        lastDownShownName = null;
+    }
+}
+
+function showResult(r) {
+    if (!r) {
+        outEl.textContent = "";
+        timeEl.textContent = "—";
+        outLabelEl.textContent = "Output";
+        return;
+    }
+    outEl.textContent = r.output;
+    timeEl.textContent = r.time;
+    outLabelEl.textContent = r.truncated === "yes" ? "Output (truncated)" : "Output";
 }
 
 async function pollState() {
-    const name = sysSelect.value;
-    if (!name) return;
     try {
-        const r = await fetch(`/api/system/${encodeURIComponent(name)}`);
+        const r = await fetch("/api/state");
         if (!r.ok) throw new Error(`HTTP ${r.status}`);
-        const j = await r.json();
-        statePill.textContent = j.state || "?";
-        statePill.className = `pill ${j.state || ""}`;
-        stateBlob.textContent = JSON.stringify(j, null, 2);
+        const arr = await r.json();
+        stateByName = {};
+        for (const s of arr) stateByName[s.name] = s;
+        // Update each row's color + state badge without rebuilding the DOM
+        for (const row of listEl.children) {
+            const s = stateByName[row.dataset.name];
+            const st = (s && s.state) || "down";
+            row.className = `system-item state-${st}` +
+                (row.dataset.name === selected ? " selected" : "");
+        }
+        if (selected && stateByName[selected]) {
+            stateBlob.textContent = JSON.stringify(stateByName[selected], null, 2);
+        }
+        refreshDownUI();
     } catch (e) {
-        statePill.textContent = "err";
-        statePill.className = "pill down";
         stateBlob.textContent = String(e);
     }
 }
 
-function onSystemChange() {
-    if (pollTimer) clearInterval(pollTimer);
-    location.hash = sysSelect.value;
-    pollState();
-    pollTimer = setInterval(pollState, 2000);
-}
-
 async function runQuery() {
-    const name = sysSelect.value;
+    if (!selected) return;
     const sql = queryEl.value;
     if (!sql.trim()) return;
     runBtn.disabled = true;
     outEl.textContent = "(running …)";
     timeEl.textContent = "…";
-    wallEl.textContent = "…";
-    bytesEl.textContent = "—";
-    truncEl.textContent = "—";
-    exitEl.textContent = "—";
+    outLabelEl.textContent = "Output";
 
+    const target = selected;  // capture in case the user switches mid-flight
     const t0 = performance.now();
+    let payload = null;
     try {
-        const r = await fetch(`/api/query?system=${encodeURIComponent(name)}`, {
+        const r = await fetch(`/api/query?system=${encodeURIComponent(target)}`, {
             method: "POST",
             body: sql,
             headers: {"Content-Type": "application/octet-stream"},
         });
         const body = await r.arrayBuffer();
-        const txt = bytesToText(body);
-        outEl.textContent = txt || "(no output)";
-
+        const txt = bytesToText(body) || "(no output)";
         const h = (k) => r.headers.get(k);
         const qt = h("X-Query-Time");
         const wt = h("X-Wall-Time");
-        timeEl.textContent = qt ? `${parseFloat(qt).toFixed(3)} s (script)` : "—";
-        wallEl.textContent = wt ? `${parseFloat(wt).toFixed(3)} s` : `${((performance.now() - t0) / 1000).toFixed(3)} s`;
-        bytesEl.textContent = h("X-Output-Bytes") || body.byteLength;
-        truncEl.textContent = h("X-Output-Truncated") === "1" ? "yes" : "no";
-        exitEl.textContent = h("X-Exit-Code") || r.status;
+        let output = txt;
         if (r.status >= 400) {
             const err = h("X-Error");
-            if (err) outEl.textContent = `(error)\n${err}\n\n` + outEl.textContent;
+            if (err) {
+                const trailer = `\n\n(error)\n${err}`;
+                output = (txt === "(no output)" ? "" : txt) + trailer;
+            }
         }
+        payload = {
+            output,
+            time: qt ? `${parseFloat(qt).toFixed(3)} s (script)` : "—",
+            wall: wt ? `${parseFloat(wt).toFixed(3)} s` : `${((performance.now() - t0) / 1000).toFixed(3)} s`,
+            bytes: h("X-Output-Bytes") || String(body.byteLength),
+            truncated: h("X-Output-Truncated") === "1" ? "yes" : "no",
+            exit: h("X-Exit-Code") || String(r.status),
+        };
     } catch (e) {
-        outEl.textContent = `(client error)\n${e}`;
+        payload = {
+            output: `(client error)\n${e}`,
+            time: "—", wall: "—", bytes: "—", truncated: "—", exit: "err",
+        };
     } finally {
         runBtn.disabled = false;
     }
+    resultsByName[target] = payload;
+    if (selected === target) showResult(payload);
 }
 
 function bytesToText(buf) {
@@ -119,10 +230,20 @@ function bytesToText(buf) {
     }
 }
 
-sysSelect.addEventListener("change", onSystemChange);
 runBtn.addEventListener("click", runQuery);
+exampleSel.addEventListener("change", () => {
+    const i = parseInt(exampleSel.value, 10);
+    const qs = queriesByName[selected];
+    if (qs && !isNaN(i) && i >= 0 && i < qs.length) {
+        queryEl.value = qs[i];
+    }
+});
 queryEl.addEventListener("keydown", (e) => {
     if ((e.metaKey || e.ctrlKey) && e.key === "Enter") runQuery();
 });
 
-loadSystems();
+(async function init() {
+    await loadCatalog();
+    await pollState();
+    pollTimer = setInterval(pollState, 2000);
+})();
diff --git a/playground/web/index.html b/playground/web/index.html
index e415a5ecff..8f09884572 100644
--- a/playground/web/index.html
+++ b/playground/web/index.html
@@ -4,7 +4,7 @@
 
 
 ClickBench Playground
-
+
 
 
 
@@ -13,50 +13,60 @@

ClickBench Playground

Run SQL against any of the database systems in ClickBench, each isolated in its own Firecracker microVM. The dataset is the - standard hits table — 100 M rows. + standard hits table — 100 M rows.

-
- - +
+ +
+
+ +
- + + +
-
+
- +
-
-
Time:
-
Wall:
-
Bytes:
-
Truncated:
-
Exit:
+
+
Time:
-
- +
+

     
+ +
System status -
loading…
+
loading…
- ClickBench · - output is capped at 10 KB · queries are bounded to 10 min · the host + ClickBench · + output is capped at 64 KB · queries are bounded to 60 s · the host keeps no per-user state.
- + diff --git a/playground/web/style.css b/playground/web/style.css index e630327026..96698811d9 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -6,16 +6,16 @@ --bg-alt: #f6f8fa; --accent: #fb1f00; --accent-fg: #ffffff; - --pill-bg: #e7eaef; - --pill-fg: #1f2328; --good: #1f883d; --bad: #cf222e; + --warn: #9a6700; + --info: #0969da; } * { box-sizing: border-box; } html, body { margin: 0; padding: 0; background: var(--bg); color: var(--fg); } body { font: 14px/1.5 -apple-system, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif; } -header, main, footer { max-width: 960px; margin: 0 auto; padding: 0 16px; } +header, main, footer { max-width: 100%; margin: 0; padding: 0 8px; } header { padding-top: 24px; padding-bottom: 12px; border-bottom: 1px solid var(--border); } header h1 { margin: 0 0 4px; font-size: 22px; font-weight: 600; } @@ -28,23 +28,51 @@ main > section { margin: 12px 0; } label { display: block; font-weight: 600; font-size: 12px; text-transform: uppercase; letter-spacing: 0.04em; color: var(--muted); margin-bottom: 4px; } -select, textarea, pre, input { - font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace; +textarea, pre, input { + font-family: monospace; font-size: 13px; border: 1px solid var(--border); background: var(--bg); color: var(--fg); - border-radius: 6px; + border-radius: 0; } -select { padding: 6px 8px; min-width: 280px; } textarea { width: 100%; padding: 10px; resize: vertical; } -pre { padding: 10px; background: var(--bg-alt); margin: 0; max-height: 360px; - overflow: auto; white-space: pre-wrap; word-break: break-word; } + +pre#output { + padding: 10px; + background: var(--bg-alt); + margin: 0; + max-height: 360px; + overflow: auto; + white-space: pre; + line-height: 1; +} + +pre#last-error { + padding: 10px; + background: var(--bg-alt); + margin: 0; + max-height: 360px; + overflow: auto; + white-space: pre-wrap; + word-break: break-word; + line-height: 1; +} + +pre#state-blob { + padding: 10px; + background: var(--bg-alt); + margin: 0; + max-height: 360px; + overflow: auto; + white-space: pre-wrap; + word-break: break-word; +} button { background: var(--accent); color: var(--accent-fg); - border: none; border-radius: 6px; padding: 6px 16px; + border: none; border-radius: 0; padding: 6px 16px; font-weight: 600; cursor: pointer; } button:disabled { opacity: 0.6; cursor: not-allowed; } @@ -52,18 +80,41 @@ button:hover:not(:disabled) { filter: brightness(0.95); } .row { display: flex; align-items: center; gap: 12px; flex-wrap: wrap; } .row label { margin: 0; } -.stats { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; +.stats { font-family: monospace; font-size: 12px; color: var(--muted); padding: 8px 0; border-top: 1px solid var(--border); border-bottom: 1px solid var(--border); } .stats span { color: var(--fg); } -.pill { display: inline-block; padding: 2px 8px; border-radius: 999px; - font-size: 11px; font-weight: 600; background: var(--pill-bg); color: var(--pill-fg); - text-transform: uppercase; letter-spacing: 0.04em; } -.pill.ready { background: #ddf4e4; color: var(--good); } -.pill.snapshotted { background: #fff4d1; color: #9a6700; } -.pill.provisioning { background: #ddf0ff; color: #0969da; } -.pill.down { background: #ffd7d6; color: var(--bad); } +/* System slabs — horizontal flex-wrap row of chiclets like the main + ClickBench results page. Each slab is a clickable button, background + colored by current state. */ +.system-list { + display: flex; + flex-wrap: wrap; + gap: 2px; +} +.system-item { + display: inline-block; + padding: 4px 8px; + cursor: pointer; + font-family: monospace; + font-size: 12px; + border: 1px solid var(--border); + background: var(--bg-alt); + color: var(--fg); + line-height: 1.2; + white-space: nowrap; +} +.system-item:hover { filter: brightness(0.95); } +.system-item.selected { + outline: 2px solid #000; + outline-offset: -2px; +} + +.system-item.state-snapshotted { background: #c8f0d4; color: var(--good); } +.system-item.state-ready { background: #a4e6b6; color: var(--good); font-weight: 600; } +.system-item.state-provisioning { background: #eaeef2; color: var(--muted); } +.system-item.state-down { background: #f6d1ce; color: var(--bad); } footer { color: var(--muted); padding-top: 16px; padding-bottom: 32px; border-top: 1px solid var(--border); font-size: 12px; } diff --git a/presto-datalake-partitioned/install b/presto-datalake-partitioned/install index 46bb615d52..d7cc9965e4 100755 --- a/presto-datalake-partitioned/install +++ b/presto-datalake-partitioned/install @@ -52,7 +52,12 @@ if [ ! -f shim/S3AnonymousProvider.jar ]; then --entrypoint sh trinodb/trino:latest -c ' set -e cd /shim - CP="/usr/lib/trino/plugin/hive/hdfs/com.amazonaws_aws-java-sdk-core-1.12.797.jar:/usr/lib/trino/plugin/hive/hdfs/io.trino.hadoop_hadoop-apache-3.3.5-3.jar" + CP=$(find /usr/lib/trino \( -name "*aws-java-sdk*.jar" -o -name "*hadoop-apache*.jar" \) 2>/dev/null | tr "\n" ":") + if [ -z "$CP" ]; then + # Last resort: every jar under trino. Order matters for classpath + # only in case of duplicate classes, which we do not have here. + CP=$(find /usr/lib/trino -name "*.jar" 2>/dev/null | tr "\n" ":") + fi javac --release 11 -cp "$CP" S3AnonymousProvider.java jar cf S3AnonymousProvider.jar S3AnonymousProvider.class ' diff --git a/presto-datalake/install b/presto-datalake/install index 46bb615d52..d7cc9965e4 100755 --- a/presto-datalake/install +++ b/presto-datalake/install @@ -52,7 +52,12 @@ if [ ! -f shim/S3AnonymousProvider.jar ]; then --entrypoint sh trinodb/trino:latest -c ' set -e cd /shim - CP="/usr/lib/trino/plugin/hive/hdfs/com.amazonaws_aws-java-sdk-core-1.12.797.jar:/usr/lib/trino/plugin/hive/hdfs/io.trino.hadoop_hadoop-apache-3.3.5-3.jar" + CP=$(find /usr/lib/trino \( -name "*aws-java-sdk*.jar" -o -name "*hadoop-apache*.jar" \) 2>/dev/null | tr "\n" ":") + if [ -z "$CP" ]; then + # Last resort: every jar under trino. Order matters for classpath + # only in case of duplicate classes, which we do not have here. + CP=$(find /usr/lib/trino -name "*.jar" 2>/dev/null | tr "\n" ":") + fi javac --release 11 -cp "$CP" S3AnonymousProvider.java jar cf S3AnonymousProvider.jar S3AnonymousProvider.class ' diff --git a/trino-datalake-partitioned/install b/trino-datalake-partitioned/install index cd6fd35403..10e0b96135 100755 --- a/trino-datalake-partitioned/install +++ b/trino-datalake-partitioned/install @@ -41,7 +41,12 @@ if [ ! -f shim/S3AnonymousProvider.jar ]; then --entrypoint sh trinodb/trino:latest -c ' set -e cd /shim - CP="/usr/lib/trino/plugin/hive/hdfs/com.amazonaws_aws-java-sdk-core-1.12.797.jar:/usr/lib/trino/plugin/hive/hdfs/io.trino.hadoop_hadoop-apache-3.3.5-3.jar" + CP=$(find /usr/lib/trino \( -name "*aws-java-sdk*.jar" -o -name "*hadoop-apache*.jar" \) 2>/dev/null | tr "\n" ":") + if [ -z "$CP" ]; then + # Last resort: every jar under trino. Order matters for classpath + # only in case of duplicate classes, which we do not have here. + CP=$(find /usr/lib/trino -name "*.jar" 2>/dev/null | tr "\n" ":") + fi javac --release 11 -cp "$CP" S3AnonymousProvider.java jar cf S3AnonymousProvider.jar S3AnonymousProvider.class ' diff --git a/trino-datalake/install b/trino-datalake/install index cd6fd35403..10e0b96135 100755 --- a/trino-datalake/install +++ b/trino-datalake/install @@ -41,7 +41,12 @@ if [ ! -f shim/S3AnonymousProvider.jar ]; then --entrypoint sh trinodb/trino:latest -c ' set -e cd /shim - CP="/usr/lib/trino/plugin/hive/hdfs/com.amazonaws_aws-java-sdk-core-1.12.797.jar:/usr/lib/trino/plugin/hive/hdfs/io.trino.hadoop_hadoop-apache-3.3.5-3.jar" + CP=$(find /usr/lib/trino \( -name "*aws-java-sdk*.jar" -o -name "*hadoop-apache*.jar" \) 2>/dev/null | tr "\n" ":") + if [ -z "$CP" ]; then + # Last resort: every jar under trino. Order matters for classpath + # only in case of duplicate classes, which we do not have here. + CP=$(find /usr/lib/trino -name "*.jar" 2>/dev/null | tr "\n" ":") + fi javac --release 11 -cp "$CP" S3AnonymousProvider.java jar cf S3AnonymousProvider.jar S3AnonymousProvider.class ' From cc76bc21a51978e18d1c4189920029d64b57bd3b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 21:10:10 +0000 Subject: [PATCH 036/221] playground: hover tooltip on system slabs with uptime/status - Track vm.ready_since (set when state -> ready, cleared on teardown). - Expose ready_since in /api/state and /api/system/. - UI: hover tooltip per slab: ready -> "up N seconds/minutes/hours/days" snapshotted -> "ready" provisioning -> "provisioning" down -> "failed" --- playground/server/main.py | 1 + playground/server/vm_manager.py | 3 +++ playground/web/app.js | 34 ++++++++++++++++++++++++++++++++- playground/web/index.html | 2 +- 4 files changed, 38 insertions(+), 2 deletions(-) diff --git a/playground/server/main.py b/playground/server/main.py index 8ea74f2343..4deeb3db3d 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -72,6 +72,7 @@ async def handle_system(self, req: web.Request) -> web.Response: "has_snapshot": vm.snapshot_bin.exists(), "provisioned_at": vm.provisioned_at, "last_used": vm.last_used, + "ready_since": vm.ready_since, "last_error": vm.last_error, "agent_url": self.vmm.agent_url(vm), }) diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index b338c224d7..80c02b6a24 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -184,6 +184,7 @@ def list_all(self) -> list[dict]: "agent_url": self.agent_url(vm), "provisioned_at": vm.provisioned_at, "last_used": vm.last_used, + "ready_since": vm.ready_since, "tags": list(vm.system.tags), "data_format": vm.system.data_format, "last_error": vm.last_error, @@ -531,6 +532,7 @@ async def _restore_snapshot(self, vm: VM) -> None: # for slow JVMs (Doris/Druid/Trino). await self._wait_for_daemon_ready(vm, timeout=600) vm.state = "ready" + vm.ready_since = time.time() def _golden_paths(self, vm: VM) -> tuple[Path, Path, Path, Path]: """(working rootfs, working sysdisk, golden rootfs, golden sysdisk).""" @@ -639,6 +641,7 @@ async def _teardown(self, vm: VM, reason: str) -> None: with contextlib.suppress(Exception): await self._shutdown(vm) vm.state = "snapshotted" if _has_snapshot(vm) else "down" + vm.ready_since = None # Drop the decompressed snapshot.bin if we still have the .zst — it's # ~16 GB of redundancy on disk. Keep .zst as the canonical artifact. zst = vm.snapshot_bin.with_suffix(".bin.zst") diff --git a/playground/web/app.js b/playground/web/app.js index ca72780f53..6d51f4ab29 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -46,16 +46,47 @@ async function loadCatalog() { function renderList() { listEl.innerHTML = ""; for (const s of catalog) { - const st = (stateByName[s.name] && stateByName[s.name].state) || "down"; + const sObj = stateByName[s.name]; + const st = (sObj && sObj.state) || "down"; const row = document.createElement("div"); row.className = `system-item state-${st}` + (s.name === selected ? " selected" : ""); row.dataset.name = s.name; row.textContent = s.display_name; + row.title = tooltipFor(sObj, st); row.addEventListener("click", () => select(s.name)); listEl.appendChild(row); } } +function tooltipFor(sObj, st) { + if (st === "ready") { + const since = sObj && sObj.ready_since; + if (since) { + const ago = Math.max(0, Math.floor(Date.now() / 1000 - since)); + return "up " + formatDuration(ago); + } + return "up"; + } + if (st === "snapshotted") return "ready"; + if (st === "provisioning") return "provisioning"; + if (st === "down") return "failed"; + return st; +} + +function formatDuration(secs) { + if (secs < 60) return `${secs} second${secs === 1 ? "" : "s"}`; + if (secs < 3600) { + const m = Math.floor(secs / 60); + return `${m} minute${m === 1 ? "" : "s"}`; + } + if (secs < 86400) { + const h = Math.floor(secs / 3600); + return `${h} hour${h === 1 ? "" : "s"}`; + } + const d = Math.floor(secs / 86400); + return `${d} day${d === 1 ? "" : "s"}`; +} + function select(name) { selected = name; location.hash = name; @@ -161,6 +192,7 @@ async function pollState() { const st = (s && s.state) || "down"; row.className = `system-item state-${st}` + (row.dataset.name === selected ? " selected" : ""); + row.title = tooltipFor(s, st); } if (selected && stateByName[selected]) { stateBlob.textContent = JSON.stringify(stateByName[selected], null, 2); diff --git a/playground/web/index.html b/playground/web/index.html index 8f09884572..82c8bfbea9 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -67,6 +67,6 @@

ClickBench Playground

keeps no per-user state. - + From 3c1014f94ec0d5665c1ba2579d8c0ff9ec6c8817 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 21:11:36 +0000 Subject: [PATCH 037/221] playground: CSS-only system slab tooltip Replace native title= attribute with a ::after pseudo-element. Shows instantly on :hover instead of waiting for the browser's ~1 s delay. --- playground/web/app.js | 4 ++-- playground/web/index.html | 4 ++-- playground/web/style.css | 22 ++++++++++++++++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index 6d51f4ab29..ec7509f58b 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -52,7 +52,7 @@ function renderList() { row.className = `system-item state-${st}` + (s.name === selected ? " selected" : ""); row.dataset.name = s.name; row.textContent = s.display_name; - row.title = tooltipFor(sObj, st); + row.dataset.tooltip = tooltipFor(sObj, st); row.addEventListener("click", () => select(s.name)); listEl.appendChild(row); } @@ -192,7 +192,7 @@ async function pollState() { const st = (s && s.state) || "down"; row.className = `system-item state-${st}` + (row.dataset.name === selected ? " selected" : ""); - row.title = tooltipFor(s, st); + row.dataset.tooltip = tooltipFor(s, st); } if (selected && stateByName[selected]) { stateBlob.textContent = JSON.stringify(stateByName[selected], null, 2); diff --git a/playground/web/index.html b/playground/web/index.html index 82c8bfbea9..71631a544c 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -4,7 +4,7 @@ ClickBench Playground - +
@@ -67,6 +67,6 @@

ClickBench Playground

keeps no per-user state. - + diff --git a/playground/web/style.css b/playground/web/style.css index 96698811d9..c3630c7929 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -104,6 +104,28 @@ button:hover:not(:disabled) { filter: brightness(0.95); } color: var(--fg); line-height: 1.2; white-space: nowrap; + position: relative; +} + +.system-item::after { + content: attr(data-tooltip); + position: absolute; + bottom: calc(100% + 4px); + left: 50%; + transform: translateX(-50%); + background: #000; + color: #fff; + padding: 3px 6px; + font-size: 11px; + line-height: 1.2; + white-space: nowrap; + pointer-events: none; + visibility: hidden; + z-index: 10; +} + +.system-item:hover::after { + visibility: visible; } .system-item:hover { filter: brightness(0.95); } .system-item.selected { From 2316c2a2eb4addf290681a12e21437b44489bbb2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 21:12:19 +0000 Subject: [PATCH 038/221] playground: triangle pin under tooltip --- playground/web/index.html | 2 +- playground/web/style.css | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/playground/web/index.html b/playground/web/index.html index 71631a544c..52e123c767 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -4,7 +4,7 @@ ClickBench Playground - +
diff --git a/playground/web/style.css b/playground/web/style.css index c3630c7929..08f6350150 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -110,7 +110,7 @@ button:hover:not(:disabled) { filter: brightness(0.95); } .system-item::after { content: attr(data-tooltip); position: absolute; - bottom: calc(100% + 4px); + bottom: calc(100% + 6px); left: 50%; transform: translateX(-50%); background: #000; @@ -124,7 +124,23 @@ button:hover:not(:disabled) { filter: brightness(0.95); } z-index: 10; } -.system-item:hover::after { +.system-item::before { + content: ""; + position: absolute; + bottom: 100%; + left: 50%; + transform: translateX(-50%); + /* Solid downward-pointing triangle: transparent left/right + bottom + borders carve out the wedge, top border becomes the visible pin. */ + border: 4px solid transparent; + border-top-color: #000; + pointer-events: none; + visibility: hidden; + z-index: 10; +} + +.system-item:hover::after, +.system-item:hover::before { visibility: visible; } .system-item:hover { filter: brightness(0.95); } From 428787d537df4956200c1cde50332aa37c9ecae3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 21:13:03 +0000 Subject: [PATCH 039/221] playground: enlarge tooltip pin so it spans slab to box --- playground/web/index.html | 2 +- playground/web/style.css | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/playground/web/index.html b/playground/web/index.html index 52e123c767..a8ea6c9bce 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -4,7 +4,7 @@ ClickBench Playground - +
diff --git a/playground/web/style.css b/playground/web/style.css index 08f6350150..e54d977993 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -110,7 +110,7 @@ button:hover:not(:disabled) { filter: brightness(0.95); } .system-item::after { content: attr(data-tooltip); position: absolute; - bottom: calc(100% + 6px); + bottom: calc(100% + 8px); left: 50%; transform: translateX(-50%); background: #000; @@ -127,13 +127,13 @@ button:hover:not(:disabled) { filter: brightness(0.95); } .system-item::before { content: ""; position: absolute; + /* Tip touches the slab; body fills the gap to the tooltip box. */ bottom: 100%; left: 50%; transform: translateX(-50%); - /* Solid downward-pointing triangle: transparent left/right + bottom - borders carve out the wedge, top border becomes the visible pin. */ - border: 4px solid transparent; + border: 8px solid transparent; border-top-color: #000; + border-bottom-width: 0; pointer-events: none; visibility: hidden; z-index: 10; From e953f344cda9691b0fe18d6f02aa9dd6149340a4 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 21:15:08 +0000 Subject: [PATCH 040/221] playground: no text selection on slabs; second click = run - user-select: none on system slabs so a fast double click doesn't highlight the label. - Clicking the already-selected slab now invokes runQuery() instead of being a no-op, as long as the system is in a queryable state (anything but down / provisioning). --- playground/web/app.js | 16 +++++++++++++++- playground/web/index.html | 4 ++-- playground/web/style.css | 2 ++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index ec7509f58b..4599051cce 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -53,7 +53,7 @@ function renderList() { row.dataset.name = s.name; row.textContent = s.display_name; row.dataset.tooltip = tooltipFor(sObj, st); - row.addEventListener("click", () => select(s.name)); + row.addEventListener("click", () => onSlabClick(s.name)); listEl.appendChild(row); } } @@ -87,6 +87,20 @@ function formatDuration(secs) { return `${d} day${d === 1 ? "" : "s"}`; } +function onSlabClick(name) { + // Click on the already-selected system = shortcut to run the + // current query, as long as that system is in a queryable state. + if (name === selected) { + const s = stateByName[name]; + const st = s && s.state; + if (st && st !== "down" && st !== "provisioning") { + runQuery(); + } + return; + } + select(name); +} + function select(name) { selected = name; location.hash = name; diff --git a/playground/web/index.html b/playground/web/index.html index a8ea6c9bce..6a307c77d9 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -4,7 +4,7 @@ ClickBench Playground - +
@@ -67,6 +67,6 @@

ClickBench Playground

keeps no per-user state. - + diff --git a/playground/web/style.css b/playground/web/style.css index e54d977993..5aa49ff8a6 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -105,6 +105,8 @@ button:hover:not(:disabled) { filter: brightness(0.95); } line-height: 1.2; white-space: nowrap; position: relative; + user-select: none; + -webkit-user-select: none; } .system-item::after { From 230ad1da160df658a0a9ba76451b7d2d89916240 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 21:17:28 +0000 Subject: [PATCH 041/221] playground: preserve example index across system switches --- playground/web/app.js | 20 ++++++++++++++------ playground/web/index.html | 2 +- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index 4599051cce..244b3d5040 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -131,6 +131,9 @@ async function loadExamples(name) { queriesByName[name] = qs; } if (selected !== name) return; // user moved on + // Preserve the example index across system switches: if the user + // had Q5 selected for system A, switching to B keeps Q5. + const prevIndex = parseInt(exampleSel.value, 10); exampleSel.innerHTML = ""; if (!qs.length) { const o = document.createElement("option"); @@ -141,16 +144,21 @@ async function loadExamples(name) { for (let i = 0; i < qs.length; i++) { const o = document.createElement("option"); o.value = String(i); - // Single-line label: first 90 chars of the query. const label = qs[i].replace(/\s+/g, " ").slice(0, 90); o.textContent = `Q${i + 1}: ${label}`; exampleSel.appendChild(o); } - } - // Only populate the first example if the textarea is empty — - // anything the user has typed stays put when switching systems. - if (!queryEl.value.trim() && qs.length) { - queryEl.value = qs[0]; + // Clamp prevIndex into range; default to 0. + let idx = 0; + if (!isNaN(prevIndex) && prevIndex >= 0 && prevIndex < qs.length) { + idx = prevIndex; + } + exampleSel.value = String(idx); + // Only populate the textarea if it's empty — anything the + // user has typed stays put when switching systems. + if (!queryEl.value.trim()) { + queryEl.value = qs[idx]; + } } } diff --git a/playground/web/index.html b/playground/web/index.html index 6a307c77d9..ef1d32ef07 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -67,6 +67,6 @@

ClickBench Playground

keeps no per-user state. - + From b9f7d90c1164902a0f9f37b334ee101c2d25910c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 21:18:48 +0000 Subject: [PATCH 042/221] playground: drop ' (script)' suffix from Time --- playground/web/app.js | 2 +- playground/web/index.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index 244b3d5040..f70515f808 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -258,7 +258,7 @@ async function runQuery() { } payload = { output, - time: qt ? `${parseFloat(qt).toFixed(3)} s (script)` : "—", + time: qt ? `${parseFloat(qt).toFixed(3)} s` : "—", wall: wt ? `${parseFloat(wt).toFixed(3)} s` : `${((performance.now() - t0) / 1000).toFixed(3)} s`, bytes: h("X-Output-Bytes") || String(body.byteLength), truncated: h("X-Output-Truncated") === "1" ? "yes" : "no", diff --git a/playground/web/index.html b/playground/web/index.html index ef1d32ef07..283ac4c7f8 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -67,6 +67,6 @@

ClickBench Playground

keeps no per-user state. - + From fd457a4f9d9e832af1634684a6d6ab853c1c6d17 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 21:20:50 +0000 Subject: [PATCH 043/221] playground: swap to new system's example only if textarea unedited Track the exact string of the last auto-populated example. When the user switches systems, replace the textarea only if its content still matches that string (or is empty). Anything edited by hand stays. --- playground/web/app.js | 20 +++++++++++++++++--- playground/web/index.html | 2 +- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index f70515f808..8eafbed94f 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -29,6 +29,10 @@ let selected = null; // selected system name let pollTimer = null; let resultsByName = {}; // {name: {output, time, wall, bytes, truncated, exit}} let queriesByName = {}; // {name: [q1, q2, ...]} +// The exact string we last auto-populated the textarea with (from an +// example). If the current textarea still equals it, the user hasn't +// edited it and we're free to swap in the next system's example. +let pristineQuery = ""; async function loadCatalog() { const r = await fetch("/api/systems"); @@ -154,10 +158,15 @@ async function loadExamples(name) { idx = prevIndex; } exampleSel.value = String(idx); - // Only populate the textarea if it's empty — anything the - // user has typed stays put when switching systems. - if (!queryEl.value.trim()) { + // Replace the textarea with this system's example at the same + // index, but only if the user hasn't edited the current text + // (i.e., it still matches whatever example we last set, or + // it's empty). + const isPristine = queryEl.value === pristineQuery + || !queryEl.value.trim(); + if (isPristine) { queryEl.value = qs[idx]; + pristineQuery = qs[idx]; } } } @@ -290,6 +299,7 @@ exampleSel.addEventListener("change", () => { const qs = queriesByName[selected]; if (qs && !isNaN(i) && i >= 0 && i < qs.length) { queryEl.value = qs[i]; + pristineQuery = qs[i]; } }); queryEl.addEventListener("keydown", (e) => { @@ -297,6 +307,10 @@ queryEl.addEventListener("keydown", (e) => { }); (async function init() { + // Treat the HTML default ("SELECT COUNT(*) FROM hits;") as pristine + // so first-system selection is free to swap it for the first + // example. + pristineQuery = queryEl.value; await loadCatalog(); await pollState(); pollTimer = setInterval(pollState, 2000); diff --git a/playground/web/index.html b/playground/web/index.html index 283ac4c7f8..6db4a82d52 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -67,6 +67,6 @@

ClickBench Playground

keeps no per-user state. - + From b33b8e1a31d427a994e9305eded168c353382cad Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 21:25:12 +0000 Subject: [PATCH 044/221] playground: warm up snapshotted VMs on select New POST /api/warmup/ triggers a fire-and-forget restore (no query). The UI hits it on system-select for snapshotted systems so the restore overlaps the time the user spends typing/picking an example, and the eventual Run query lands on a ready VM. Refuses to initial-provision; if no snapshot, returns 409 and the operator must use /api/admin/provision/. --- playground/server/main.py | 33 +++++++++++++++++++++++++++++++++ playground/web/app.js | 14 +++++++++++--- playground/web/index.html | 2 +- 3 files changed, 45 insertions(+), 4 deletions(-) diff --git a/playground/server/main.py b/playground/server/main.py index 4deeb3db3d..4d91f880e5 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -134,6 +134,38 @@ async def _provision_bg(self, name: str) -> None: log.exception("background provision failed for %s", name) self.sink.write_event(system=name, kind="provision-failed", detail=repr(e)) + async def handle_warmup(self, req: web.Request) -> web.Response: + """Trigger snapshot restore for a system without running a query. + + The UI calls this on system-select so the restore (~30 s for + cold ones, near-zero with reflink+live-daemon) overlaps the + time the user is typing their query, and Run query lands on a + VM that's already serving. Refuses to initial-provision; if no + snapshot exists, returns 409 and the user has to /admin/provision. + """ + name = req.match_info["name"] + if name not in self.systems: + raise web.HTTPNotFound() + vm = self.vmm.vms[name] + if vm.state == "ready": + return web.json_response({"already_ready": True, "system": name}) + if vm.state == "provisioning": + return web.json_response({"in_flight": True, "system": name}) + if not self.vmm.vms[name].snapshot_bin.exists() and \ + not (self.cfg.systems_dir / name / "rootfs.golden.ext4").exists(): + return web.json_response( + {"error": "no snapshot; POST /api/admin/provision first"}, + status=409, + ) + asyncio.create_task(self._warmup_bg(name)) + return web.json_response({"started": True, "system": name}) + + async def _warmup_bg(self, name: str) -> None: + try: + await self.vmm.ensure_ready_for_query(name) + except Exception as e: + log.warning("warmup failed for %s: %r", name, e) + async def handle_query(self, req: web.Request) -> web.StreamResponse: system_name = req.query.get("system", "") if system_name not in self.systems: @@ -235,6 +267,7 @@ def build_app() -> web.Application: app.router.add_get("/api/queries/{name}", obj.handle_queries) app.router.add_get("/api/provision-log/{name}", obj.handle_provision_log) app.router.add_post("/api/admin/provision/{name}", obj.handle_admin_provision) + app.router.add_post("/api/warmup/{name}", obj.handle_warmup) app.router.add_post("/api/query", obj.handle_query) # Static UI diff --git a/playground/web/app.js b/playground/web/app.js index 8eafbed94f..12097d8b45 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -116,11 +116,19 @@ function select(name) { stateBlob.textContent = JSON.stringify(stateByName[name], null, 2); } showResult(resultsByName[name]); - // If the user has typed something, keep it across system switches — - // they're likely composing one query against multiple systems. Only - // when the textarea is empty does loadExamples populate Q1. loadExamples(name); refreshDownUI(); + // Kick the restore in the background so the VM is hopefully ready + // by the time the user presses Run query. No-op if the system is + // already ready / provisioning / has no snapshot. + maybeWarmup(name); +} + +function maybeWarmup(name) { + const s = stateByName[name]; + if (!s || s.state !== "snapshotted") return; + fetch(`/api/warmup/${encodeURIComponent(name)}`, {method: "POST"}) + .catch(() => {}); // fire-and-forget } async function loadExamples(name) { diff --git a/playground/web/index.html b/playground/web/index.html index 6db4a82d52..896ee66ab9 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -67,6 +67,6 @@

ClickBench Playground

keeps no per-user state. - + From b502e4b2b7e135c2c3e3b68755fa1620ff6486fe Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 21:27:28 +0000 Subject: [PATCH 045/221] playground: /api/admin/provision actually triggers initial provision MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Earlier change had ensure_ready_for_query refuse to provision a down system without a snapshot, so /query couldn't accidentally start a 30-min install. But /api/admin/provision also went through ensure_ready_for_query and inherited the refusal — every admin kick returned "no snapshot, POST /api/admin/provision/X to build one", which is exactly the path the operator already took. Split into two entry points: - provision_now(): force a fresh _initial_provision regardless of current state. Called only by /api/admin/provision. - ensure_ready_for_query(): restore-only, refuses down-with-no-snapshot. Called by /query and /api/warmup. --- playground/server/main.py | 2 +- playground/server/vm_manager.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/playground/server/main.py b/playground/server/main.py index 4d91f880e5..9371d41ee7 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -129,7 +129,7 @@ async def handle_admin_provision(self, req: web.Request) -> web.Response: async def _provision_bg(self, name: str) -> None: try: - await self.vmm.ensure_ready_for_query(name) + await self.vmm.provision_now(name) except Exception as e: log.exception("background provision failed for %s", name) self.sink.write_event(system=name, kind="provision-failed", detail=repr(e)) diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 80c02b6a24..ab71ca6e87 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -131,6 +131,23 @@ def __init__(self, config: Config, systems: dict[str, System]): # ── public API ─────────────────────────────────────────────────────── + async def provision_now(self, system: str) -> None: + """Force a full initial provision. Only called by + /api/admin/provision; the /query path never lands here. + """ + if system not in self.vms: + raise KeyError(system) + vm = self.vms[system] + async with vm.lock: + if vm.state == "provisioning": + raise RuntimeError(f"{system}: provisioning already in flight") + # Bring everything down so _initial_provision starts fresh. + with contextlib.suppress(Exception): + await self._teardown(vm, "admin-provision") + vm.state = "down" + vm.last_error = None + await self._initial_provision(vm) + async def ensure_ready_for_query(self, system: str) -> VM: """Make sure system is up and responsive to /query. Boot/resume as needed. From e995418e9c8b6b8b1c2099dc0cabe21df9fbfaf2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 21:30:04 +0000 Subject: [PATCH 046/221] playground: web UI works when opened as file:// - app.js: when location.protocol is file://, route all fetches at http://localhost:8000 instead of relative paths. - server: CORS middleware on every /api/* response and a synthetic 204 for preflight OPTIONS. Exposes the X-* response headers the UI reads (X-Query-Time, X-Wall-Time, X-Output-*, X-Exit-Code, X-Error). --- playground/server/main.py | 26 ++++++++++++++++++++++++++ playground/web/app.js | 15 ++++++++++----- playground/web/index.html | 2 +- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/playground/server/main.py b/playground/server/main.py index 9371d41ee7..3424ca802b 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -288,7 +288,33 @@ async def no_cache_static(request: web.Request, handler): resp.headers["Cache-Control"] = "no-store" return resp + @web.middleware + async def cors(request: web.Request, handler): + # Permit the index.html opened from file:// (or any other origin) + # to call /api/* directly. The browser sends Origin: null in that + # case and refuses the response without ACAO. Reflecting the + # request's Origin keeps credentials-less CORS working in every + # browser. Preflight OPTIONS gets a synthetic 204 here. + origin = request.headers.get("Origin", "*") + if request.method == "OPTIONS": + return web.Response(status=204, headers={ + "Access-Control-Allow-Origin": origin, + "Access-Control-Allow-Methods": "GET, POST, OPTIONS", + "Access-Control-Allow-Headers": + request.headers.get("Access-Control-Request-Headers", "*"), + "Access-Control-Max-Age": "86400", + }) + resp = await handler(request) + resp.headers["Access-Control-Allow-Origin"] = origin + resp.headers["Access-Control-Expose-Headers"] = ( + "X-Query-Time, X-Wall-Time, X-Query-Wall-Time, " + "X-Output-Bytes, X-Output-Truncated, X-Exit-Code, " + "X-System, X-Error" + ) + return resp + app.middlewares.append(no_cache_static) + app.middlewares.append(cors) app.router.add_get("/", root_redirect) app.router.add_get("/ui/", ui_index) app.router.add_get("/ui", ui_index) diff --git a/playground/web/app.js b/playground/web/app.js index 12097d8b45..36f767303e 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -9,6 +9,11 @@ // /api/query?system= and render output as plain text in a
.
 
 const $ = (sel) => document.querySelector(sel);
+// When the page is served by the playground over HTTP, relative URLs
+// work. When it's opened from disk (file://), relative fetches resolve
+// against file:// and fail; rewrite to an absolute localhost URL.
+// CORS is handled by the server's middleware (Access-Control-Allow-Origin: *).
+const API = location.protocol === "file:" ? "http://localhost:8000" : "";
 
 const listEl = $("#system-list");
 const queryEl = $("#query");
@@ -35,7 +40,7 @@ let queriesByName = {};    // {name: [q1, q2, ...]}
 let pristineQuery = "";
 
 async function loadCatalog() {
-    const r = await fetch("/api/systems");
+    const r = await fetch(API + "/api/systems");
     catalog = await r.json();
     catalog.sort((a, b) => a.display_name.localeCompare(b.display_name));
     renderList();
@@ -127,7 +132,7 @@ function select(name) {
 function maybeWarmup(name) {
     const s = stateByName[name];
     if (!s || s.state !== "snapshotted") return;
-    fetch(`/api/warmup/${encodeURIComponent(name)}`, {method: "POST"})
+    fetch(`${API}/api/warmup/${encodeURIComponent(name)}`, {method: "POST"})
         .catch(() => {});  // fire-and-forget
 }
 
@@ -135,7 +140,7 @@ async function loadExamples(name) {
     let qs = queriesByName[name];
     if (!qs) {
         try {
-            const r = await fetch(`/api/queries/${encodeURIComponent(name)}`);
+            const r = await fetch(`${API}/api/queries/${encodeURIComponent(name)}`);
             qs = r.ok ? await r.json() : [];
         } catch (e) {
             qs = [];
@@ -220,7 +225,7 @@ function showResult(r) {
 
 async function pollState() {
     try {
-        const r = await fetch("/api/state");
+        const r = await fetch(API + "/api/state");
         if (!r.ok) throw new Error(`HTTP ${r.status}`);
         const arr = await r.json();
         stateByName = {};
@@ -255,7 +260,7 @@ async function runQuery() {
     const t0 = performance.now();
     let payload = null;
     try {
-        const r = await fetch(`/api/query?system=${encodeURIComponent(target)}`, {
+        const r = await fetch(`${API}/api/query?system=${encodeURIComponent(target)}`, {
             method: "POST",
             body: sql,
             headers: {"Content-Type": "application/octet-stream"},
diff --git a/playground/web/index.html b/playground/web/index.html
index 896ee66ab9..831fc2bb70 100644
--- a/playground/web/index.html
+++ b/playground/web/index.html
@@ -67,6 +67,6 @@ 

ClickBench Playground

keeps no per-user state. - + From f0c9cff212bdaf6f53e2069affda65ca5ef55f22 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 21:33:07 +0000 Subject: [PATCH 047/221] playground: default to clickhouse when no #hash selection --- playground/web/app.js | 2 ++ playground/web/index.html | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/playground/web/app.js b/playground/web/app.js index 36f767303e..dd563d11b3 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -47,6 +47,8 @@ async function loadCatalog() { const hash = (location.hash || "").slice(1); if (hash && catalog.some(s => s.name === hash)) { select(hash); + } else if (catalog.some(s => s.name === "clickhouse")) { + select("clickhouse"); } else if (catalog.length) { select(catalog[0].name); } diff --git a/playground/web/index.html b/playground/web/index.html index 831fc2bb70..6567b238ac 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -67,6 +67,6 @@

ClickBench Playground

keeps no per-user state. - + From 06ea5bfed8e14f86ccae6d83c1dce19eed84afc0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 00:07:31 +0200 Subject: [PATCH 048/221] Remove clutter --- playground/web/app.js | 2 -- playground/web/index.html | 19 ++------------ playground/web/style.css | 55 ++++++++++++++++++++------------------- 3 files changed, 30 insertions(+), 46 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index 36f767303e..1560bc7d6b 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -18,7 +18,6 @@ const API = location.protocol === "file:" ? "http://localhost:8000" : ""; const listEl = $("#system-list"); const queryEl = $("#query"); const runBtn = $("#run"); -const selectedEl = $("#selected-system"); const outEl = $("#output"); const outLabelEl = $("#output-label"); const timeEl = $("#time"); @@ -113,7 +112,6 @@ function onSlabClick(name) { function select(name) { selected = name; location.hash = name; - selectedEl.textContent = name; for (const row of listEl.children) { row.classList.toggle("selected", row.dataset.name === name); } diff --git a/playground/web/index.html b/playground/web/index.html index 831fc2bb70..06dc00b6f4 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -8,18 +8,11 @@
-

ClickBench Playground

-

- Run SQL against any of the database systems in - ClickBench, - each isolated in its own Firecracker microVM. The dataset is the - standard hits table — 100 M rows. -

+

ClickBench Playground — run SQL against 80+ databases

-
@@ -27,11 +20,9 @@

ClickBench Playground

-
-
-
+ @@ -52,6 +52,6 @@

ClickBench Playground — run SQL against 80+ databa

- + From d22b6b689ed50fa4fcdc39d54dcadf14ebe7782e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 May 2026 22:23:04 +0000 Subject: [PATCH 052/221] playground: re-apply example even when same option is re-picked A guarantees the textarea gets the current example applied any time the dropdown closes, including the re-select-same case. --- playground/web/app.js | 12 ++++++++++-- playground/web/index.html | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index f5d5ed1120..d4b5860147 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -335,9 +335,17 @@ function applyExampleIdx(i) { pristineQuery = qs[i]; } -exampleSel.addEventListener("change", () => { +function applyCurrentExample() { applyExampleIdx(parseInt(exampleSel.value, 10)); -}); +} + +// `change` fires only when the value actually changes, so re-picking +// the same option does nothing there. `input` is identical on +// on textarea edit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The earlier 'apply on blur + per-option click' approach re-fired the example even when the user manually edited the SQL and accidentally re-focused the select. Replace it with a more direct model: the . To re-apply on re-select, watch `blur` (the native -// dropdown closes and the select loses focus) and the per-option -// click handler in loadExamples. exampleSel.addEventListener("change", applyCurrentExample); -exampleSel.addEventListener("blur", applyCurrentExample); +// When the user types in the textarea, mark the select as +// "unselected" (the disabled placeholder option). That way a +// subsequent click on whatever they had picked before counts as a +// real change and re-applies the example — no more blur-listener +// hack. +queryEl.addEventListener("input", () => { + if (queryEl.value !== pristineQuery) { + exampleSel.value = ""; + } +}); queryEl.addEventListener("keydown", (e) => { if ((e.metaKey || e.ctrlKey) && e.key === "Enter") runQuery(); }); From 937486a59b63d1c272f18b6481531bcd7a01ea91 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 11:20:59 +0000 Subject: [PATCH 109/221] playground: agent: don't surface truncation as an error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the query script exceeds OUTPUT_LIMIT the agent SIGKILLs its process group. The script's rc is then -9 / 137 / SIGPIPE and its stderr ends with 'broken pipe' / 'head: write error' lines — which the playground UI then renders as a query error even though the result up to the truncation point is fine. If the cap fired (X-Output-Truncated=1), return 200 and omit the X-Error header. The truncation flag already lets the UI badge the output as partial. Co-Authored-By: Claude Opus 4.7 --- playground/agent/agent.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index c17ae3a1f7..72a7bf0b97 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -598,11 +598,17 @@ def do_POST(self) -> None: } if script_t is not None: headers["X-Query-Time"] = f"{script_t:.6f}" - if rc != 0: + # When _cap truncated the output the script was almost + # certainly killed mid-write — its rc is non-zero (SIGPIPE + # / SIGKILL) and stderr is full of "broken pipe"-style + # noise. That's not a real query failure, so don't surface + # it as an error: return 200 and let X-Output-Truncated=1 + # tell the UI to label the result accordingly. + if rc != 0 and not truncated: # Surface a snippet of stderr so the client sees *something*. err_snip = err[-1024:].decode("utf-8", errors="replace") headers["X-Error"] = err_snip.replace("\n", " | ")[:512] - self._send(200 if rc == 0 else 502, body, headers) + self._send(200 if (rc == 0 or truncated) else 502, body, headers) return self._send_json(404, {"error": "not found", "path": self.path}) From 73f56927eb9cedc2c0f3ee0ecae6a5d11790f1e3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 11:25:12 +0000 Subject: [PATCH 110/221] playground: cumulative CPU-seconds cap (default 1 hour, ready state only) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new watchdog: if a VM burns more than VM_CPU_TOTAL_SECONDS_CAP seconds of cumulative CPU (across all vCPUs) between restore and now, kick it. The snapshot stays intact; the next query restores from a fresh image. Baseline is captured at the 'ready' transition in vm_manager (so the cap excludes boot/resume cost), reset on teardown. Provisioning isn't billed against the cap — install/load can easily exceed an hour and shouldn't trip the watchdog. Default 3600s; override via VM_CPU_TOTAL_SECONDS_CAP env. Co-Authored-By: Claude Opus 4.7 --- playground/server/config.py | 6 ++++++ playground/server/monitor.py | 20 +++++++++++++++++++- playground/server/vm_manager.py | 24 ++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/playground/server/config.py b/playground/server/config.py index 0426b0cc5a..5e5c0e3511 100644 --- a/playground/server/config.py +++ b/playground/server/config.py @@ -45,6 +45,11 @@ class Config: # Watchdog thresholds. cpu_busy_window_sec: int cpu_busy_threshold: float + # Cumulative CPU-seconds (across all vCPUs) a VM may burn between + # restore and now. Anything past this is presumably a runaway and + # the watchdog kicks the VM. Counts only "ready" state — provision + # is allowed to use as much CPU as it wants. + vm_cpu_total_seconds_cap: int host_min_free_ram_gb: int host_min_free_disk_gb: int # Per-system disk full check. @@ -92,6 +97,7 @@ def load() -> Config: max_warm_vms=_env_int("PLAYGROUND_MAX_VMS", 16), cpu_busy_window_sec=_env_int("VM_CPU_BUSY_WINDOW_SEC", 120), cpu_busy_threshold=float(os.environ.get("VM_CPU_BUSY_THRESHOLD", "0.97")), + vm_cpu_total_seconds_cap=_env_int("VM_CPU_TOTAL_SECONDS_CAP", 3600), host_min_free_ram_gb=_env_int("HOST_MIN_FREE_RAM_GB", 32), host_min_free_disk_gb=_env_int("HOST_MIN_FREE_DISK_GB", 500), vm_disk_pct_kill_threshold=float(os.environ.get("VM_DISK_FULL_PCT", "0.97")), diff --git a/playground/server/monitor.py b/playground/server/monitor.py index 1bdefc6b82..dd551e7ae7 100644 --- a/playground/server/monitor.py +++ b/playground/server/monitor.py @@ -27,7 +27,7 @@ from .config import Config from .logging_sink import LoggingSink -from .vm_manager import VM, VMManager +from .vm_manager import VM, VMManager, _read_proc_jiffies log = logging.getLogger("monitor") @@ -129,6 +129,24 @@ async def _check_per_vm(self, vm: VM, cpu_pct: float | None) -> None: else: vm.cpu_busy_since = None + # Cumulative CPU-cap watchdog. Only checked once we've passed + # the post-provision boundary (vm.state == "ready"); the + # cpu_baseline_jiffies was captured at that transition, so the + # delta below isolates query-serving CPU from boot/restore. + if vm.state == "ready" and vm.pid is not None and vm.cpu_baseline_jiffies: + jiffies = _read_proc_jiffies(vm.pid) + if jiffies > 0: + clk = os.sysconf("SC_CLK_TCK") or 100 + delta_s = (jiffies - vm.cpu_baseline_jiffies) / clk + if delta_s >= self.cfg.vm_cpu_total_seconds_cap: + self.sink.write_event( + system=vm.system.name, kind="cpu-cap", + detail=f"cumulative CPU {delta_s:.0f}s >= " + f"{self.cfg.vm_cpu_total_seconds_cap}s", + ) + await self.vmm.kick(vm.system.name, "cpu-cap") + return + # Disk usage watchdog cap = self.cfg.vm_rootfs_size_gb * (1 << 30) if vm.rootfs_used_bytes and vm.rootfs_used_bytes / cap >= self.cfg.vm_disk_pct_kill_threshold: diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 698b057602..f10983fa5f 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -79,6 +79,12 @@ class VM: cpu_busy_since: Optional[float] = None rss_bytes: int = 0 rootfs_used_bytes: int = 0 + # Cumulative (utime+stime) jiffies of the firecracker process at + # the moment this VM transitioned to "ready" (after restore). The + # CPU-cap watchdog uses (current - baseline) / SC_CLK_TCK to bill + # only the time spent serving queries, not the boot/resume cost. + # Cleared on teardown. + cpu_baseline_jiffies: int = 0 class VMManager: @@ -606,6 +612,9 @@ async def _restore_snapshot(self, vm: VM) -> None: await self._wait_for_daemon_ready(vm, timeout=600) vm.state = "ready" vm.ready_since = time.time() + # Baseline the firecracker's current jiffy counter so the + # per-VM CPU-cap watchdog can bill only post-ready CPU time. + vm.cpu_baseline_jiffies = _read_proc_jiffies(vm.pid) if vm.pid else 0 def _golden_paths(self, vm: VM) -> tuple[Path, Path, Path, Path]: """(working rootfs, working sysdisk, golden rootfs, golden sysdisk).""" @@ -734,6 +743,7 @@ async def _teardown(self, vm: VM, reason: str) -> None: await self._shutdown(vm) vm.state = "snapshotted" if _has_snapshot(vm) else "down" vm.ready_since = None + vm.cpu_baseline_jiffies = 0 # Drop the decompressed snapshot.bin if we still have the .zst — it's # ~16 GB of redundancy on disk. Keep .zst as the canonical artifact. zst = vm.snapshot_bin.with_suffix(".bin.zst") @@ -874,3 +884,17 @@ def _pid_alive(pid: int) -> bool: return False except PermissionError: return True + + +def _read_proc_jiffies(pid: int) -> int: + """Return (utime+stime) for `pid` in jiffies, or 0 if unreadable.""" + try: + stat = Path(f"/proc/{pid}/stat").read_text() + except (FileNotFoundError, PermissionError): + return 0 + end = stat.rfind(")") + parts = stat[end + 2:].split() + try: + return int(parts[11]) + int(parts[12]) + except (IndexError, ValueError): + return 0 From f603ee983ed26a01175c7b5837e2e325c0c3115c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 11:27:16 +0000 Subject: [PATCH 111/221] playground: agent: pull dmesg OOM lines when query fails with no output/stderr A query that times out because its daemon was OOM-killed mid-flight loses both stdout (process dead before printing anything) and stderr (same), so the host's X-Error header was a blank string. The UI then showed '(error)' with no detail. When rc != 0 AND body AND err are all empty, scan dmesg for OOM lines ('killed process', 'out of memory', 'oom-killer') and put those in X-Error instead. Co-Authored-By: Claude Opus 4.7 --- playground/agent/agent.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 72a7bf0b97..50d019ec37 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -271,6 +271,26 @@ def _drain_stderr() -> None: return rc, bytes(stdout_buf), bytes(stderr_buf), time.monotonic() - t0 +def _recent_oom_messages() -> str: + """Return kernel OOM-killer messages from `dmesg`, or '' if nothing + relevant. Called when the query script exits non-zero with empty + stdout AND stderr — the daemon was likely OOM-killed and never + got a chance to write a real error message. + """ + try: + out = subprocess.run( + ["dmesg", "--ctime"], + capture_output=True, timeout=5, check=False, + ).stdout.decode(errors="replace") + except Exception: + return "" + needles = ("killed process", "out of memory", "oom-killer", + "invoked oom-killer") + lines = [ln for ln in out.splitlines() + if any(n in ln.lower() for n in needles)] + return "\n".join(lines[-20:]) + + def _extract_script_timing(stderr: bytes) -> float | None: """ Pull fractional-seconds timing from the last numeric line of stderr, @@ -607,6 +627,14 @@ def do_POST(self) -> None: if rc != 0 and not truncated: # Surface a snippet of stderr so the client sees *something*. err_snip = err[-1024:].decode("utf-8", errors="replace") + # Both stdout and stderr empty usually means the + # daemon was OOM-killed mid-query. Pull the recent + # OOM-killer lines from dmesg so the UI shows a real + # cause instead of a blank error. + if not body.strip() and not err_snip.strip(): + oom = _recent_oom_messages() + if oom: + err_snip = "kernel OOM-killer:\n" + oom headers["X-Error"] = err_snip.replace("\n", " | ")[:512] self._send(200 if (rc == 0 or truncated) else 502, body, headers) return From 8ed06690b0eae23c4d26ed1974e6bcfed742a87e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 11:32:51 +0000 Subject: [PATCH 112/221] playground: kick VM on any /query error, not just unhealthy-daemon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The playground accepts arbitrarily destructive SQL — DROP TABLE hits, TRUNCATE, etc. After any error response (status >= 400) we can't be sure the daemon's persisted state is still consistent, so always kick the VM and let the next request restore from snapshot. This also drops the old _daemon_unhealthy /check probe: it gated the kick on the daemon being demonstrably dead, but a 'Table not found' error against a healthy daemon could have left the user stuck with a broken state forever. Co-Authored-By: Claude Opus 4.7 --- playground/server/main.py | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/playground/server/main.py b/playground/server/main.py index 9bec3cdf3d..0a746ad8d1 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -251,20 +251,20 @@ async def _dispatch_query(self, system_name: str, sql: bytes body = await r.read() headers = {k: r.headers[k] for k in r.headers if k.startswith("X-")} headers.setdefault("X-Output-Bytes", str(len(body))) - if r.status >= 500 and await self._daemon_unhealthy(vm): - # Daemon died (not just a bad query). Tear - # the VM down so the next request restores - # from snapshot. Bubble the error up on - # the first attempt for retry. + if r.status >= 400: + # ANY error tears the VM down so the next + # request restores from snapshot. The + # playground accepts destructive SQL + # (DROP TABLE hits, TRUNCATE, ...) — once + # an error happens we can't be sure the + # daemon's state is still consistent, so + # the safe move is always to reset. self.sink.write_event( - system=system_name, kind="post-query-unhealthy", + system=system_name, kind="post-query-error", detail=f"attempt {attempt}: status={r.status}", ) await self.vmm.kick(system_name, - "post-query-check-failed") - if attempt == 1: - await asyncio.sleep(0.5) - continue + "post-query-error") return body, headers, r.status except Exception as e: last_exc = e @@ -279,20 +279,6 @@ async def _dispatch_query(self, system_name: str, sql: bytes # unreachable, but keep mypy happy raise RuntimeError(str(last_exc)) - async def _daemon_unhealthy(self, vm) -> bool: - """Hit the agent's /check endpoint. Returns True if ./check - reports the daemon is not serving (so the host should teardown - + restore). Returns False on transient transport errors so a - single dropped packet doesn't trigger a restore.""" - url = self.vmm.agent_url(vm) + "/check" - try: - async with aiohttp.ClientSession() as s: - async with s.get(url, timeout=aiohttp.ClientTimeout(total=15)) as r: - return r.status != 200 - except Exception: - return False - - def build_app() -> web.Application: obj = App() app = web.Application(client_max_size=4 * 1024 * 1024) From 14867315b8424b2fe2f3542e04cc4a8bd9f7c1d0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 11:35:48 +0000 Subject: [PATCH 113/221] playground: route presto+trino datalake variants through the SNI proxy presto-datalake / presto-datalake-partitioned / trino-datalake* hit S3 at query time (clickhouse-public-datasets.s3.eu-central-1.amazonaws.com). Without DATALAKE_FILTERED net.enable_filtered_internet wasn't being called on restore, so the post-snapshot network had no outbound route and the query came back 'Unable to execute HTTP request: clickhouse-public-datasets.s3.eu-central-1.amazonaws.com'. Co-Authored-By: Claude Opus 4.7 --- playground/server/systems.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/playground/server/systems.py b/playground/server/systems.py index 283a846ccf..74e3a6b547 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -84,6 +84,10 @@ "clickhouse-datalake-partitioned", "duckdb-datalake", "duckdb-datalake-partitioned", + "presto-datalake", + "presto-datalake-partitioned", + "trino-datalake", + "trino-datalake-partitioned", }) # DataFrame / in-process engines load the full 100M-row hits set into a From 58103eeddc4f9214d1e10f809bdb9f5fa019e155 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 11:38:56 +0000 Subject: [PATCH 114/221] playground: persist slot assignments so existing snapshots keep their TAP name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removing a system from the catalog (sirius -> _EXTERNAL) shifted every alphabetical neighbor down by one slot. Their snapshots had host_dev_name=fc-tap- baked into snapshot.state, and the restore failed with 'Open tap device failed: Operation not permitted (os error 1). Invalid TUN/TAP Backend provided by fc-tap-' Allocate slots once and persist /slot-assignments.json. On subsequent boots, existing entries keep their slot; newly-added systems fill in the lowest unused slot. Removing a system frees its slot for future reuse — but never reshuffles incumbents. Co-Authored-By: Claude Opus 4.7 --- playground/server/vm_manager.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index f10983fa5f..d8f32d9a25 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -115,15 +115,39 @@ def __init__(self, config: Config, systems: dict[str, System]): # one's write window under ~5 minutes on a single fast SSD. self._snapshot_sem = asyncio.Semaphore(int(os.environ.get( "PLAYGROUND_SNAPSHOT_CONCURRENCY", "6"))) - # Stable slot allocation: sort systems alphabetically so each system - # always gets the same slot id (and therefore the same TAP/IP). - for i, name in enumerate(sorted(systems.keys()), start=1): + # Stable slot allocation. Each system gets a slot id (used as + # the TAP name fc-tap- and the /24 IP block 10.200..0/24); + # snapshot.state has the TAP name baked in, so once a snapshot + # exists we MUST keep handing the same slot back to the same + # system or restore fails with + # "Open tap device failed: Operation not permitted (os error 1). + # Invalid TUN/TAP Backend provided by fc-tap-" + # Persist the map so removing a system (e.g. sirius from + # _EXTERNAL) doesn't shift every later alphabetical neighbor. + slot_map_path = config.state_dir / "slot-assignments.json" + slot_map: dict[str, int] = {} + if slot_map_path.exists(): + with contextlib.suppress(Exception): + slot_map = json.loads(slot_map_path.read_text()) + used = set(slot_map.values()) + next_slot = 1 + for name in sorted(systems.keys()): + if name in slot_map: + continue + while next_slot in used: + next_slot += 1 + slot_map[name] = next_slot + used.add(next_slot) + with contextlib.suppress(Exception): + slot_map_path.write_text(json.dumps(slot_map, indent=2, sort_keys=True)) + for name in sorted(systems.keys()): sys = systems[name] + slot = slot_map[name] sys_state_dir = config.systems_dir / name sys_state_dir.mkdir(parents=True, exist_ok=True) vm = VM( system=sys, - slot=i, + slot=slot, api_sock=config.vms_dir / f"{name}.sock", log_sock=config.vms_dir / f"{name}.log.sock", snapshot_bin=sys_state_dir / "snapshot.bin", From 2f17f9a478b72915b29206efc27778e272f2e201 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 11:41:50 +0000 Subject: [PATCH 115/221] playground: lower HOST_MIN_FREE_DISK_GB default 500->100 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 500 GB pressure threshold made sense before the 256 GB swap drives for dataframe systems landed — now duckdb-dataframe alone takes ~480 GB physical (working + golden), so the watchdog fires permanently and kicks every restore with 'host-disk-pressure', which surfaces as ServerDisconnectedError on /api/query. Drop to 100 GB so the disk-pressure watchdog only kicks in when the host is actually short of room rather than 'somewhat full'. Co-Authored-By: Claude Opus 4.7 --- playground/server/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playground/server/config.py b/playground/server/config.py index 5e5c0e3511..ed0e3c2f78 100644 --- a/playground/server/config.py +++ b/playground/server/config.py @@ -99,7 +99,7 @@ def load() -> Config: cpu_busy_threshold=float(os.environ.get("VM_CPU_BUSY_THRESHOLD", "0.97")), vm_cpu_total_seconds_cap=_env_int("VM_CPU_TOTAL_SECONDS_CAP", 3600), host_min_free_ram_gb=_env_int("HOST_MIN_FREE_RAM_GB", 32), - host_min_free_disk_gb=_env_int("HOST_MIN_FREE_DISK_GB", 500), + host_min_free_disk_gb=_env_int("HOST_MIN_FREE_DISK_GB", 100), vm_disk_pct_kill_threshold=float(os.environ.get("VM_DISK_FULL_PCT", "0.97")), ch_cloud_url=os.environ.get("CLICKHOUSE_CLOUD_URL", ""), ch_cloud_user=os.environ.get("CLICKHOUSE_CLOUD_USER", ""), From f669cc83a5b76ff02c6f074a0d4bd7bba8a37552 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 11:56:49 +0000 Subject: [PATCH 116/221] playground: agent: detect snapshot restore via /proc/stat btime + reconcile docker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drill (and any other docker-shelled system) hangs after restore because the resumed dockerd's view of cgroups / netfilter / netns no longer matches the (also-resumed) kernel state. Symptom: every 'docker run' from the query script produces empty output and exit 1, and the only stderr is the JVM's 'Picked up _JAVA_OPTIONS' info line. _reconcile_docker_after_restore already exists but was only run once at agent main() startup; restoring from snapshot re-uses the same agent process, so it never fired post-restore. Detect a restore by reading /proc/stat's btime — uptime is preserved inside the snapshot, wall-clock is not, so btime (= wall - uptime) shifts on each restore. Cache it on the first call, compare on every subsequent /query, and reconcile when it changes. Initialised in main() before snapshot to avoid a missed reconcile on the very first post-restore /query. Co-Authored-By: Claude Opus 4.7 --- playground/agent/agent.py | 48 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 50d019ec37..5d59b4ac0b 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -600,6 +600,10 @@ def do_POST(self) -> None: if not sql.strip(): self._send_json(400, {"error": "empty query"}) return + # If /proc/stat's btime has shifted since the last call + # the VM was snapshot-restored and any docker daemon needs + # to be reconciled before we run the query script. + _maybe_reconcile_for_restore() # First /query after a snapshot restore: start the daemon # (it was stopped pre-snapshot to keep snapshots small). # Subsequent calls are a near-instant no-op. @@ -646,6 +650,44 @@ class ReusableServer(socketserver.ThreadingTCPServer): daemon_threads = True +_last_seen_btime: int | None = None + + +def _proc_btime() -> int | None: + """Read /proc/stat btime (the Unix timestamp of the kernel's last + boot). Shifts on snapshot/restore because uptime is preserved + while wall-clock advances, so we use it to detect restores from + inside the (restored) agent process.""" + try: + for line in Path("/proc/stat").read_text().splitlines(): + if line.startswith("btime "): + return int(line.split()[1]) + except Exception: + return None + return None + + +def _maybe_reconcile_for_restore() -> None: + """Called on each /query: if /proc/stat btime has shifted since + the last call, the VM was snapshot-restored and any docker daemon + needs reconciling (the kernel-side cgroup/netfilter state diverged + from dockerd's restored view of it). The reconcile itself is a + no-op when docker isn't installed.""" + global _last_seen_btime + cur = _proc_btime() + if cur is None: + return + if _last_seen_btime is None: + _last_seen_btime = cur + return + if cur != _last_seen_btime: + sys.stderr.write( + f"[agent] btime shifted " + f"{_last_seen_btime} -> {cur}; reconciling docker\n") + _last_seen_btime = cur + _reconcile_docker_after_restore() + + def _reconcile_docker_after_restore() -> None: """Restart dockerd if it's active, to recover from snapshot-restore skew. @@ -783,6 +825,12 @@ def main() -> None: f"dir={SYSTEM_DIR} data={DATASETS_DIR}", flush=True) _activate_swap() _reconcile_docker_after_restore() + # Capture btime *now*, before snapshot is taken: the snapshot + # freezes this value into memory, and after restore the live + # /proc/stat btime will have shifted, so _maybe_reconcile_for_restore + # picks up the change on the first post-restore /query. + global _last_seen_btime + _last_seen_btime = _proc_btime() _kick_daemon_if_provisioned() with ReusableServer(addr, Handler) as srv: srv.serve_forever() From 3e831e131a137e0e7abe5710db60f5289103936a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 12:00:55 +0000 Subject: [PATCH 117/221] playground: load ClickHouse credentials from /clickhouse.conf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A repo-versioned template ships at playground/clickhouse.conf.example; the actual file ('clickhouse.conf') lives in state_dir so it's not overwritten by repo updates and stays out of git. Env vars take precedence so existing deployments are unaffected. The logging sink already reads cfg.ch_cloud_* — same path now picks up either source. The same connection will back the shared-query feature when that lands. Co-Authored-By: Claude Opus 4.7 --- playground/clickhouse.conf.example | 16 ++++++++++++ playground/server/config.py | 41 +++++++++++++++++++++++++++--- 2 files changed, 53 insertions(+), 4 deletions(-) create mode 100644 playground/clickhouse.conf.example diff --git a/playground/clickhouse.conf.example b/playground/clickhouse.conf.example new file mode 100644 index 0000000000..7e36d7d47a --- /dev/null +++ b/playground/clickhouse.conf.example @@ -0,0 +1,16 @@ +# ClickHouse credentials for the playground. +# +# Copy this file to /clickhouse.conf (default: +# /opt/clickbench-playground/clickhouse.conf +# ) and fill in your hostname / user / password. The playground server +# reads it on startup for both the request-logging sink and any +# shared-query feature that gets wired in later. +# +# Env vars (CLICKHOUSE_CLOUD_URL / _USER / _PASSWORD / _DB) take +# precedence over this file so existing deployments keep working. + +[clickhouse] +url = https://your-host.clickhouse.cloud:8443 +user = default +password = +db = playground diff --git a/playground/server/config.py b/playground/server/config.py index ed0e3c2f78..3338c4bd11 100644 --- a/playground/server/config.py +++ b/playground/server/config.py @@ -2,9 +2,15 @@ All knobs are read from environment variables so a single systemd unit can drop them in. Falls back to sensible defaults for local development. + +ClickHouse credentials (for the logging sink and any future shared-query +feature) can also be supplied via an INI file at +`/clickhouse.conf`. Env vars, if set, take precedence over the +file so existing deployments keep working unchanged. """ from __future__ import annotations +import configparser import os from dataclasses import dataclass from pathlib import Path @@ -76,11 +82,38 @@ def logs_dir(self) -> Path: return self.state_dir / "logs" def firecracker_bin(self) -> Path: return self.state_dir / "bin" / "firecracker" +def _load_clickhouse_conf(state_dir: Path) -> dict[str, str]: + """Parse /clickhouse.conf. Format is INI with a single + [clickhouse] section: + + [clickhouse] + url = https://your-host.clickhouse.cloud:8443 + user = default + password = ... + db = playground + + Missing file / parse errors return {} silently — the env-var path + still works and the logging sink just stays disabled. + """ + path = state_dir / "clickhouse.conf" + if not path.exists(): + return {} + parser = configparser.ConfigParser() + try: + parser.read(path) + except configparser.Error: + return {} + if "clickhouse" not in parser: + return {} + return {k: v for k, v in parser["clickhouse"].items()} + + def load() -> Config: state_dir = Path(os.environ.get("PLAYGROUND_STATE_DIR", "/opt/clickbench-playground")) repo_dir = Path(os.environ.get("PLAYGROUND_REPO_DIR", "/home/ubuntu/ClickBench")) listen = os.environ.get("PLAYGROUND_LISTEN", "0.0.0.0:8000") host, _, port = listen.rpartition(":") + ch_conf = _load_clickhouse_conf(state_dir) return Config( state_dir=state_dir, repo_dir=repo_dir, @@ -101,8 +134,8 @@ def load() -> Config: host_min_free_ram_gb=_env_int("HOST_MIN_FREE_RAM_GB", 32), host_min_free_disk_gb=_env_int("HOST_MIN_FREE_DISK_GB", 100), vm_disk_pct_kill_threshold=float(os.environ.get("VM_DISK_FULL_PCT", "0.97")), - ch_cloud_url=os.environ.get("CLICKHOUSE_CLOUD_URL", ""), - ch_cloud_user=os.environ.get("CLICKHOUSE_CLOUD_USER", ""), - ch_cloud_password=os.environ.get("CLICKHOUSE_CLOUD_PASSWORD", ""), - ch_cloud_db=os.environ.get("CLICKHOUSE_CLOUD_DB", "playground"), + ch_cloud_url=os.environ.get("CLICKHOUSE_CLOUD_URL", ch_conf.get("url", "")), + ch_cloud_user=os.environ.get("CLICKHOUSE_CLOUD_USER", ch_conf.get("user", "")), + ch_cloud_password=os.environ.get("CLICKHOUSE_CLOUD_PASSWORD", ch_conf.get("password", "")), + ch_cloud_db=os.environ.get("CLICKHOUSE_CLOUD_DB", ch_conf.get("db", "playground")), ) From 6a9aa7a42136feab439ac4d5d32ca3c0bf1d7bf5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 14:01:15 +0200 Subject: [PATCH 118/221] Minor edit --- playground/web/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playground/web/index.html b/playground/web/index.html index 81f20c025c..d1c3001de4 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -8,7 +8,7 @@
-

ClickBench Playground — run SQL against 80+ databases

+

ClickBench Playground — run SQL against 90+ databases

From 67d00d83026da40efc24c1210927ce56d6daa458 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 12:21:31 +0000 Subject: [PATCH 119/221] playground: clickhouse bootstrap + shared-query plumbing Saves DDL + user-creation SQL as versioned playground/server/clickhouse-bootstrap.sql so the schema lives next to the code that uses it. On startup the server connects with the default user (creds from clickhouse.conf), runs the file (DB + requests/events tables + parameterized request_by_id view with SQL SECURITY DEFINER), then creates two host-restricted users: playground_writer INSERT-only, host-pinned to our public IP playground_reader SELECT-only on request_by_id, public, with readonly=2 + max_execution_time=5 + max_rows_to_read=1M + max_threads=2 Auto-generated passwords land in /clickhouse-credentials.json so the same identities survive across restarts; bootstrap rotates both on every run (ALTER USER ... IDENTIFIED ...). The requests table now carries a 64-bit uid=1000(ubuntu) gid=1000(ubuntu) groups=1000(ubuntu),4(adm),24(cdrom),27(sudo),30(dip),102(lxd),991(kvm), the stored , and a PROJECTION by_id INDEX id TYPE basic (CH 26.1 syntax) so the saved-query lookup is point-fast even when the table grows. Every /api/query mints a random id, persists output+timing alongside it, and returns X-Query-Id (11-char URL-safe base64) so the browser can build permalinks; GET /api/saved/ reads it back via the parameterized view as the reader user. Co-Authored-By: Claude Opus 4.7 --- playground/server/clickhouse-bootstrap.sql | 61 +++++++ playground/server/clickhouse_bootstrap.py | 193 +++++++++++++++++++++ playground/server/logging_sink.py | 81 ++------- playground/server/main.py | 77 +++++++- 4 files changed, 345 insertions(+), 67 deletions(-) create mode 100644 playground/server/clickhouse-bootstrap.sql create mode 100644 playground/server/clickhouse_bootstrap.py diff --git a/playground/server/clickhouse-bootstrap.sql b/playground/server/clickhouse-bootstrap.sql new file mode 100644 index 0000000000..08fbd7f26b --- /dev/null +++ b/playground/server/clickhouse-bootstrap.sql @@ -0,0 +1,61 @@ +-- ClickHouse bootstrap for the playground. +-- +-- Run as the default user on every server startup. Idempotent: CREATE +-- IF NOT EXISTS / CREATE OR REPLACE / ALTER USER ... IDENTIFIED. +-- +-- Parameters (passed via HTTP ?param_db=... etc. or substituted in +-- Python for the user-creation statements where CH doesn't accept +-- query parameters): +-- {db:Identifier} target database name +-- {writer_pw:String} freshly-rotated password for the writer user +-- {writer_host:String} IP the writer must connect from (the playground +-- server's public IP, as seen by CH Cloud) +-- {reader_pw:String} freshly-rotated password for the reader user + +-- =========================================================================== +-- Schema +-- =========================================================================== + +CREATE DATABASE IF NOT EXISTS {db:Identifier}; + +-- Request log + shared queries (same table). +-- ORDER BY ts so recent rows cluster (chronological retention / TTL friendly). +-- The `id` is a random 64-bit handle the API returns to the client; an +-- INDEX projection on `id` (ClickHouse 26.1 syntax) gives a fast +-- equality lookup without sorting the main part by id. +CREATE TABLE IF NOT EXISTS {db:Identifier}.requests ( + id UInt64, + ts DateTime64(6) DEFAULT now64(6), + client_addr String, + user_agent String, + system LowCardinality(String), + query String, + output String, + output_bytes UInt64, + output_truncated UInt8, + query_time Nullable(Float64), + wall_time Float64, + status UInt16, + error String, + PROJECTION by_id INDEX id TYPE basic +) ENGINE = MergeTree ORDER BY ts; + +-- Operational events (vm boot, oom-kick, watchdog teardown, ...). +CREATE TABLE IF NOT EXISTS {db:Identifier}.events ( + ts DateTime64(6) DEFAULT now64(6), + system LowCardinality(String), + kind LowCardinality(String), + detail String +) ENGINE = MergeTree ORDER BY ts; + +-- Parameterized view for the read-only public user. SQL SECURITY DEFINER +-- runs the SELECT as the view's owner (the default user), so the reader +-- doesn't need a direct grant on `requests` — just SELECT on the view. +-- The id projection makes this an O(log n) lookup even when the table +-- has billions of rows. +CREATE OR REPLACE VIEW {db:Identifier}.request_by_id +DEFINER = default +SQL SECURITY DEFINER +AS SELECT * FROM {db:Identifier}.requests + WHERE id = {q_id:UInt64} + LIMIT 1; diff --git a/playground/server/clickhouse_bootstrap.py b/playground/server/clickhouse_bootstrap.py new file mode 100644 index 0000000000..56c2143110 --- /dev/null +++ b/playground/server/clickhouse_bootstrap.py @@ -0,0 +1,193 @@ +"""ClickHouse bootstrap: schema + writer/reader users. + +Runs on server startup using the default-user credentials supplied in +/clickhouse.conf (or env vars). Idempotent: + +* Schema DDL (DB + tables + parameterized view) lives in the sibling + clickhouse-bootstrap.sql file — that file is the canonical source + of truth for the request/event tables and the request_by_id view. +* The two human users are created here in Python because CREATE USER + doesn't accept HTTP query parameters for the password / host clauses + and rotating those at bootstrap time is convenient. + +Generated credentials persist to /clickhouse-credentials.json +so the writer/reader users keep the same password across restarts; if +the file is missing, fresh random passwords are generated and the +users' passwords are reset to match. +""" +from __future__ import annotations + +import contextlib +import json +import logging +import secrets +from pathlib import Path +from typing import NamedTuple +from urllib.parse import urlencode + +import aiohttp + +from .config import Config + +log = logging.getLogger("clickhouse_bootstrap") + +_SQL_FILE = Path(__file__).parent / "clickhouse-bootstrap.sql" + + +class Credentials(NamedTuple): + url: str + db: str + writer_user: str + writer_password: str + reader_user: str + reader_password: str + + +def _gen_pw(n: int = 32) -> str: + # URL-safe random string. Avoid characters that need escaping in + # SQL literals. + return secrets.token_urlsafe(n) + + +def _credentials_path(cfg: Config) -> Path: + return cfg.state_dir / "clickhouse-credentials.json" + + +def _load_or_make_credentials(cfg: Config) -> tuple[str, str]: + """Return (writer_password, reader_password). Persist on first run.""" + path = _credentials_path(cfg) + if path.exists(): + with contextlib.suppress(Exception): + data = json.loads(path.read_text()) + return data["writer_password"], data["reader_password"] + creds = { + "writer_password": _gen_pw(), + "reader_password": _gen_pw(), + } + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(creds, indent=2)) + path.chmod(0o600) + return creds["writer_password"], creds["reader_password"] + + +async def _ch_exec(session: aiohttp.ClientSession, + url: str, user: str, password: str, + sql: str, params: dict[str, str] | None = None) -> str: + """Run `sql` via HTTP and return the response body. Raises on + non-2xx.""" + qs = {f"param_{k}": v for k, v in (params or {}).items()} + full = url + ("?" + urlencode(qs) if qs else "") + async with session.post( + full, data=sql, + auth=aiohttp.BasicAuth(user, password), + timeout=aiohttp.ClientTimeout(total=60), + ) as r: + body = await r.text() + if r.status >= 300: + raise RuntimeError(f"CH bootstrap {r.status}: {body[:500]} (sql={sql[:200]})") + return body + + +async def bootstrap(cfg: Config) -> Credentials | None: + """Run the bootstrap. Returns the credentials the runtime should + use for the writer (logging sink) and the reader (saved-query + lookups). Returns None if the bootstrap config isn't present + (CH integration disabled).""" + if not (cfg.ch_cloud_url and cfg.ch_cloud_user and cfg.ch_cloud_password): + return None + db = cfg.ch_cloud_db or "playground" + writer_pw, reader_pw = _load_or_make_credentials(cfg) + async with aiohttp.ClientSession() as session: + # Find the IP CH Cloud sees us connecting from — that's the + # host the writer user is restricted to. + body = await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + "SELECT toString(remote_address())", + ) + # CH returns "ip:port\n"; strip the port. + writer_host = body.strip().split(":")[0] + + # Schema DDL from the .sql file. Each statement runs in its + # own request so server-side parameter substitution works. + sql_blob = _SQL_FILE.read_text() + # Strip line comments and split on `;` boundaries. + statements = _split_sql_statements(sql_blob) + for stmt in statements: + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + stmt, params={"db": db}, + ) + + # Users — passwords + host clause go inline; ALTER on every + # bootstrap rotates / re-pins them. + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"CREATE USER IF NOT EXISTS playground_writer " + f"IDENTIFIED WITH sha256_password BY '{writer_pw}' " + f"HOST IP '{writer_host}'", + ) + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"ALTER USER playground_writer " + f"IDENTIFIED WITH sha256_password BY '{writer_pw}' " + f"HOST IP '{writer_host}'", + ) + # Strict scope: revoke everything then re-grant only INSERT. + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"REVOKE ALL ON *.* FROM playground_writer", + ) + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"GRANT INSERT ON {db}.requests TO playground_writer", + ) + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"GRANT INSERT ON {db}.events TO playground_writer", + ) + + # Reader: public, SELECT-only on the parameterized view, with + # tight resource caps. Profile-style settings prevent anyone + # who somehow gets the password from using it as a foothold. + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"CREATE USER IF NOT EXISTS playground_reader " + f"IDENTIFIED WITH sha256_password BY '{reader_pw}' " + f"DEFAULT DATABASE {db} " + f"SETTINGS readonly = 2, " + f"max_execution_time = 5, " + f"max_memory_usage = 100000000, " + f"max_result_rows = 1, " + f"max_rows_to_read = 1048576, " + f"max_threads = 2", + ) + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"ALTER USER playground_reader " + f"IDENTIFIED WITH sha256_password BY '{reader_pw}'", + ) + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"REVOKE ALL ON *.* FROM playground_reader", + ) + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"GRANT SELECT ON {db}.request_by_id TO playground_reader", + ) + + log.info("ClickHouse bootstrap complete (writer host=%s)", writer_host) + return Credentials( + url=cfg.ch_cloud_url, db=db, + writer_user="playground_writer", writer_password=writer_pw, + reader_user="playground_reader", reader_password=reader_pw, + ) + + +def _split_sql_statements(blob: str) -> list[str]: + """Strip --line comments, split on top-level `;`. Naive — fine for + the bootstrap file which has no string literals or nested blocks.""" + stripped = "\n".join( + line for line in blob.splitlines() + if not line.lstrip().startswith("--") + ) + return [s.strip() for s in stripped.split(";") if s.strip()] diff --git a/playground/server/logging_sink.py b/playground/server/logging_sink.py index 6ba444c4e0..4f46b506f4 100644 --- a/playground/server/logging_sink.py +++ b/playground/server/logging_sink.py @@ -1,25 +1,11 @@ """Batched, async logger that writes events to ClickHouse Cloud over HTTPS. -Two tables (auto-created on first connect if writeable): - - playground.requests - ts DateTime64(6) - client_addr String - user_agent String - system String - query String - output_bytes UInt64 - output_truncated UInt8 - query_time Nullable(Float64) from agent X-Query-Time - wall_time Float64 host-side end-to-end - status UInt16 HTTP status returned to client - error String - - playground.events - ts DateTime64(6) - system String - kind String "restart" / "oom-kick" / "boot" / ... - detail String +Schema + users are bootstrapped on server startup by +`clickhouse_bootstrap.bootstrap()` — see clickhouse-bootstrap.sql for the +canonical DDL of the `requests` (request log + shared queries) and +`events` (operational events) tables and the `request_by_id` +parameterized view. This module only writes; it uses the writer user +issued by the bootstrap, NOT the default user. When CLICKHOUSE_CLOUD_URL is unset, both tables are mirrored to /opt/clickbench-playground/logs/requests.jsonl and events.jsonl so the @@ -38,44 +24,20 @@ import aiohttp from .config import Config +from .clickhouse_bootstrap import Credentials log = logging.getLogger("logging_sink") -_REQUESTS_DDL = """ -CREATE TABLE IF NOT EXISTS playground.requests ( - ts DateTime64(6) DEFAULT now64(6), - client_addr String, - user_agent String, - system String, - query String, - output_bytes UInt64, - output_truncated UInt8, - query_time Nullable(Float64), - wall_time Float64, - status UInt16, - error String -) ENGINE = MergeTree ORDER BY (system, ts) -""" - -_EVENTS_DDL = """ -CREATE TABLE IF NOT EXISTS playground.events ( - ts DateTime64(6) DEFAULT now64(6), - system String, - kind String, - detail String -) ENGINE = MergeTree ORDER BY (system, ts) -""" - - class LoggingSink: - def __init__(self, cfg: Config): + def __init__(self, cfg: Config, creds: Credentials | None): self.cfg = cfg + self._creds = creds self._queue: asyncio.Queue[tuple[str, dict]] = asyncio.Queue(maxsize=10000) self._task: asyncio.Task | None = None self._session: aiohttp.ClientSession | None = None self._local_files: dict[str, Path] = {} - self._enabled = bool(cfg.ch_cloud_url and cfg.ch_cloud_user and cfg.ch_cloud_password) + self._enabled = creds is not None async def start(self) -> None: self.cfg.logs_dir.mkdir(parents=True, exist_ok=True) @@ -84,12 +46,7 @@ async def start(self) -> None: "events": self.cfg.logs_dir / "events.jsonl", } if self._enabled: - try: - self._session = aiohttp.ClientSession() - await self._run_ddl() - except Exception as e: - log.warning("ClickHouse Cloud DDL failed (%r); falling back to JSONL only", e) - self._enabled = False + self._session = aiohttp.ClientSession() self._task = asyncio.create_task(self._flusher(), name="logging-sink") async def stop(self) -> None: @@ -119,17 +76,13 @@ def _enqueue(self, table: str, row: dict) -> None: except Exception: pass - async def _run_ddl(self) -> None: - await self._exec_ch(f"CREATE DATABASE IF NOT EXISTS {self.cfg.ch_cloud_db}") - await self._exec_ch(_REQUESTS_DDL.replace("playground.", f"{self.cfg.ch_cloud_db}.")) - await self._exec_ch(_EVENTS_DDL.replace("playground.", f"{self.cfg.ch_cloud_db}.")) - async def _exec_ch(self, sql: str) -> None: - assert self._session is not None + assert self._session is not None and self._creds is not None async with self._session.post( - self.cfg.ch_cloud_url, + self._creds.url, data=sql, - auth=aiohttp.BasicAuth(self.cfg.ch_cloud_user, self.cfg.ch_cloud_password), + auth=aiohttp.BasicAuth(self._creds.writer_user, + self._creds.writer_password), timeout=aiohttp.ClientTimeout(total=30), ) as r: if r.status >= 300: @@ -137,10 +90,10 @@ async def _exec_ch(self, sql: str) -> None: raise RuntimeError(f"CH error {r.status}: {txt[:500]}") async def _insert_ch(self, table: str, rows: list[dict]) -> None: - if not rows: + if not rows or self._creds is None: return body = "\n".join(json.dumps(r, default=str) for r in rows) - sql = f"INSERT INTO {self.cfg.ch_cloud_db}.{table} FORMAT JSONEachRow\n{body}" + sql = f"INSERT INTO {self._creds.db}.{table} FORMAT JSONEachRow\n{body}" await self._exec_ch(sql) async def _flusher(self) -> None: diff --git a/playground/server/main.py b/playground/server/main.py index 0a746ad8d1..875b47128c 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -20,10 +20,25 @@ from __future__ import annotations import asyncio +import base64 import contextlib import logging import signal import time + + +def _id_to_b64url(n: int) -> str: + """64-bit unsigned int -> 11-char URL-safe base64 (no padding). + Symmetric counterpart to _b64url_to_id. The same number can travel + as a UInt64 inside ClickHouse and as a tidy URL handle.""" + return base64.urlsafe_b64encode( + n.to_bytes(8, "big"), + ).rstrip(b"=").decode("ascii") + + +def _b64url_to_id(s: str) -> int: + pad = "=" * (-len(s) % 4) + return int.from_bytes(base64.urlsafe_b64decode(s + pad), "big") from pathlib import Path import aiohttp @@ -44,10 +59,24 @@ def __init__(self) -> None: self.cfg = config_mod.load() self.systems = systems_mod.discover(self.cfg.repo_dir) self.vmm = VMManager(self.cfg, self.systems) - self.sink = LoggingSink(self.cfg) + # CH credentials are populated by on_startup after the + # bootstrap runs. None means CH integration is disabled and + # the sink falls back to JSONL. + self.ch_creds = None + self.sink = LoggingSink(self.cfg, None) self.monitor = Monitor(self.cfg, self.vmm, self.sink) async def on_startup(self, _app: web.Application) -> None: + from . import clickhouse_bootstrap + try: + self.ch_creds = await clickhouse_bootstrap.bootstrap(self.cfg) + except Exception as e: + log.warning("ClickHouse bootstrap failed (%r); CH integration disabled", e) + self.ch_creds = None + # Replace the placeholder sink with one wired to the bootstrap's + # writer credentials. + self.sink = LoggingSink(self.cfg, self.ch_creds) + self.monitor.sink = self.sink await self.sink.start() await self.monitor.start() # SNI-allowlist proxy that mediates outbound HTTP/HTTPS for @@ -193,6 +222,12 @@ async def handle_query(self, req: web.Request) -> web.StreamResponse: body = b"" headers: dict[str, str] = {} err: str | None = None + # Random 64-bit handle returned to the client as a base64url + # string (X-Query-Id) AND persisted to the requests table. + # The same id is the key the browser uses to permalink the + # result via /api/saved/. + import secrets + query_id = secrets.randbits(64) try: body, headers, status = await self._dispatch_query(system_name, sql) except Exception as e: @@ -202,16 +237,18 @@ async def handle_query(self, req: web.Request) -> web.StreamResponse: wall = time.monotonic() - wall_t0 try: self.sink.write_request( + id=query_id, client_addr=client_addr, user_agent=ua, system=system_name, query=sql.decode("utf-8", errors="replace")[:65536], + output=body.decode("utf-8", errors="replace")[:1 << 20], output_bytes=int(headers.get("X-Output-Bytes", "0") or 0), output_truncated=int(headers.get("X-Output-Truncated", "0") or 0), query_time=(float(headers["X-Query-Time"]) if "X-Query-Time" in headers else None), wall_time=wall, status=status, - error=err or "", + error=err or headers.get("X-Error", ""), ) except Exception: log.exception("logging request failed") @@ -221,10 +258,43 @@ async def handle_query(self, req: web.Request) -> web.StreamResponse: for k, v in headers.items(): resp.headers[k] = v resp.headers["X-Wall-Time"] = f"{wall:.6f}" + resp.headers["X-Query-Id"] = _id_to_b64url(query_id) if err and "X-Error" not in resp.headers: resp.headers["X-Error"] = err[:512] return resp + async def handle_saved(self, req: web.Request) -> web.Response: + """Look up a previously-saved query+result by its base64url id. + Returns a JSON object with output, error, timing — the browser + replays it as if the query just ran. + """ + if self.ch_creds is None: + raise web.HTTPServiceUnavailable(reason="shared queries disabled (no CH)") + b64 = req.match_info["b64"] + try: + qid = _b64url_to_id(b64) + except Exception: + raise web.HTTPBadRequest(reason="malformed id") + # Read through the parameterized view as the reader user. The + # view has SQL SECURITY DEFINER so the reader doesn't need a + # direct grant on the requests table. + sql = (f"SELECT * FROM {self.ch_creds.db}.request_by_id(q_id={qid}) " + f"FORMAT JSONEachRow") + async with aiohttp.ClientSession() as s: + async with s.post( + self.ch_creds.url, data=sql, + auth=aiohttp.BasicAuth(self.ch_creds.reader_user, + self.ch_creds.reader_password), + timeout=aiohttp.ClientTimeout(total=10), + ) as r: + text = await r.text() + if r.status >= 300: + raise web.HTTPBadGateway(reason=f"ch {r.status}: {text[:300]}") + text = text.strip() + if not text: + raise web.HTTPNotFound(reason="no saved query with that id") + return web.Response(text=text, content_type="application/json") + async def _dispatch_query(self, system_name: str, sql: bytes ) -> tuple[bytes, dict[str, str], int]: """Run the query once. On low-level failure (VM unreachable, transport @@ -293,6 +363,7 @@ def build_app() -> web.Application: app.router.add_post("/api/admin/provision/{name}", obj.handle_admin_provision) app.router.add_post("/api/warmup/{name}", obj.handle_warmup) app.router.add_post("/api/query", obj.handle_query) + app.router.add_get("/api/saved/{b64}", obj.handle_saved) # Static UI web_dir = Path(__file__).resolve().parent.parent / "web" @@ -333,7 +404,7 @@ async def cors(request: web.Request, handler): resp.headers["Access-Control-Expose-Headers"] = ( "X-Query-Time, X-Wall-Time, X-Query-Wall-Time, " "X-Output-Bytes, X-Output-Truncated, X-Exit-Code, " - "X-System, X-Error" + "X-System, X-Error, X-Query-Id" ) return resp From b8a6cf8ff8129d3dded77dce3c8ec52f1a6e12e0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 12:23:06 +0000 Subject: [PATCH 120/221] playground: web: permalink via X-Query-Id + restore from ?q= After each /api/query the browser pulls X-Query-Id (the base64url form of the saved row's UInt64 id) and rewrites the URL bar to ?q=. On page load the same handle is read back: /api/saved/ returns the stored query+output+timing and the UI replays it as if the query just ran (no VM restore needed). Co-Authored-By: Claude Opus 4.7 --- playground/web/app.js | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/playground/web/app.js b/playground/web/app.js index 01228954db..08f93c9fe7 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -312,6 +312,14 @@ async function runQuery() { truncated: h("X-Output-Truncated") === "1" ? "yes" : "no", exit: h("X-Exit-Code") || String(r.status), }; + // Permalink: the server returns a base64url 64-bit id; drop + // it in the URL bar so reload/share keeps the result. + const qid = h("X-Query-Id"); + if (qid) { + const u = new URL(window.location.href); + u.searchParams.set("q", qid); + window.history.replaceState({}, "", u.toString()); + } } catch (e) { payload = { output: `(client error)\n${e}`, @@ -359,6 +367,42 @@ queryEl.addEventListener("keydown", (e) => { if ((e.metaKey || e.ctrlKey) && e.key === "Enter") runQuery(); }); +async function maybeLoadShared() { + // ?q= permalink — fetch the saved query+result and + // replay it as if we just ran it. + const u = new URL(window.location.href); + const qid = u.searchParams.get("q"); + if (!qid) return; + try { + const r = await fetch(`${API}/api/saved/${encodeURIComponent(qid)}`); + if (!r.ok) return; + const row = await r.json(); + // The CH parameterized view returns JSONEachRow → one object. + const sys = row.system; + if (sys && stateByName[sys]) { + select(sys); + } + queryEl.value = row.query || ""; + pristineQuery = queryEl.value; + const payload = { + output: row.output || "(no output)", + time: row.query_time != null ? `${row.query_time.toFixed(3)} s` : "—", + wall: row.wall_time != null ? `${row.wall_time.toFixed(3)} s` : "—", + bytes: String(row.output_bytes ?? ""), + truncated: row.output_truncated ? "yes" : "no", + exit: String(row.status ?? ""), + }; + if (row.error) { + payload.output = (payload.output === "(no output)" ? "" : payload.output) + + `\n\n(error)\n${row.error}`; + } + resultsByName[selected] = payload; + showResult(payload); + } catch (e) { + console.error("failed to load shared query", e); + } +} + (async function init() { // Treat the HTML default ("SELECT COUNT(*) FROM hits;") as pristine // so first-system selection is free to swap it for the first @@ -366,5 +410,6 @@ queryEl.addEventListener("keydown", (e) => { pristineQuery = queryEl.value; await loadCatalog(); await pollState(); + await maybeLoadShared(); pollTimer = setInterval(pollState, 2000); })(); From 7f0e8eea9247f32b6063b66ef983bc21d28e09c4 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 14:23:45 +0200 Subject: [PATCH 121/221] Minor edit --- playground/web/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playground/web/index.html b/playground/web/index.html index d1c3001de4..766dec3e6d 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -3,7 +3,7 @@ -ClickBench Playground +ClickBench Playground — run SQL against 90+ databases From c273b3590dce3fde2f8d63ee0e20165f822a306c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 12:23:52 +0000 Subject: [PATCH 122/221] playground: bootstrap: resolve writer host via ipify, not remote_address() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CH Cloud (26.2) doesn't expose a 'remote_address()' function on the HTTP interface, so the bootstrap was 404-ing on its very first step. Fetch the public IP from api.ipify.org instead — it's the same value we want anyway (the playground server's outbound NAT address). Co-Authored-By: Claude Opus 4.7 --- playground/server/clickhouse_bootstrap.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/playground/server/clickhouse_bootstrap.py b/playground/server/clickhouse_bootstrap.py index 56c2143110..30e21989ac 100644 --- a/playground/server/clickhouse_bootstrap.py +++ b/playground/server/clickhouse_bootstrap.py @@ -98,14 +98,18 @@ async def bootstrap(cfg: Config) -> Credentials | None: db = cfg.ch_cloud_db or "playground" writer_pw, reader_pw = _load_or_make_credentials(cfg) async with aiohttp.ClientSession() as session: - # Find the IP CH Cloud sees us connecting from — that's the - # host the writer user is restricted to. - body = await _ch_exec( - session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, - "SELECT toString(remote_address())", - ) - # CH returns "ip:port\n"; strip the port. - writer_host = body.strip().split(":")[0] + # The writer user is host-pinned to our public IP. Resolve it + # via api.ipify.org rather than asking CH (its various + # client-address functions vary by interface and version). + try: + async with session.get( + "https://api.ipify.org", + timeout=aiohttp.ClientTimeout(total=10), + ) as r: + writer_host = (await r.text()).strip() + except Exception: + writer_host = "0.0.0.0" # fallback: no IP restriction + log.info("writer host (public IP) = %s", writer_host) # Schema DDL from the .sql file. Each statement runs in its # own request so server-side parameter substitution works. From 913871e065fe22c932028f5991da3cd9150f3e0d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 12:28:27 +0000 Subject: [PATCH 123/221] playground: reader user has no password (sha256 of empty string) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleaner story for the public reader identity: it's an unauthenticated user that can only SELECT from request_by_id and runs under a strict readonly profile, so password-protecting it adds friction without buying anything. Use sha256_hash of '' (the well-known e3b0c442... constant) so the user is stored in the same shape as the writer but the operator-facing form is plainly empty. Credentials file drops reader_password — we only persist the writer's password, the reader is derived from a constant. Co-Authored-By: Claude Opus 4.7 --- playground/server/clickhouse_bootstrap.py | 62 ++++++++++++++--------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/playground/server/clickhouse_bootstrap.py b/playground/server/clickhouse_bootstrap.py index 30e21989ac..95a66942d9 100644 --- a/playground/server/clickhouse_bootstrap.py +++ b/playground/server/clickhouse_bootstrap.py @@ -44,30 +44,39 @@ class Credentials(NamedTuple): def _gen_pw(n: int = 32) -> str: - # URL-safe random string. Avoid characters that need escaping in - # SQL literals. - return secrets.token_urlsafe(n) + # URL-safe random string with at least one digit + one special char + # (CH Cloud's password policy requires both). + body = secrets.token_urlsafe(n) + return body + "!1" + + +# SHA-256 of the empty string. Lets us tell ClickHouse "no password" +# in a form the server understands (it stores sha256_hash users) while +# the operator-facing identity is plainly empty. +_SHA256_EMPTY = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" def _credentials_path(cfg: Config) -> Path: return cfg.state_dir / "clickhouse-credentials.json" -def _load_or_make_credentials(cfg: Config) -> tuple[str, str]: - """Return (writer_password, reader_password). Persist on first run.""" +def _load_or_make_credentials(cfg: Config) -> str: + """Return the writer's password. Persist on first run. + + The reader has no password — it's an unauthenticated public identity + that can only SELECT from the parameterized request_by_id view — + so we don't manage one here. + """ path = _credentials_path(cfg) if path.exists(): with contextlib.suppress(Exception): data = json.loads(path.read_text()) - return data["writer_password"], data["reader_password"] - creds = { - "writer_password": _gen_pw(), - "reader_password": _gen_pw(), - } + return data["writer_password"] + creds = {"writer_password": _gen_pw()} path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(creds, indent=2)) path.chmod(0o600) - return creds["writer_password"], creds["reader_password"] + return creds["writer_password"] async def _ch_exec(session: aiohttp.ClientSession, @@ -96,7 +105,7 @@ async def bootstrap(cfg: Config) -> Credentials | None: if not (cfg.ch_cloud_url and cfg.ch_cloud_user and cfg.ch_cloud_password): return None db = cfg.ch_cloud_db or "playground" - writer_pw, reader_pw = _load_or_make_credentials(cfg) + writer_pw = _load_or_make_credentials(cfg) async with aiohttp.ClientSession() as session: # The writer user is host-pinned to our public IP. Resolve it # via api.ipify.org rather than asking CH (its various @@ -111,15 +120,20 @@ async def bootstrap(cfg: Config) -> Credentials | None: writer_host = "0.0.0.0" # fallback: no IP restriction log.info("writer host (public IP) = %s", writer_host) - # Schema DDL from the .sql file. Each statement runs in its - # own request so server-side parameter substitution works. - sql_blob = _SQL_FILE.read_text() - # Strip line comments and split on `;` boundaries. + # Schema DDL from the .sql file. We substitute {db:Identifier} + # in Python rather than via HTTP params because the CREATE VIEW + # body contains a *view-time* parameter ({q_id:UInt64}) and + # ClickHouse skips HTTP param substitution for DDL when there + # are unbound placeholders — the result is the VIEW DDL going + # out with literal `{db:Identifier}` and the parser barking + # "Database `` does not exist". Python substitution is fine + # because the db name is our own (no SQL-injection vector). + sql_blob = _SQL_FILE.read_text().replace("{db:Identifier}", db) statements = _split_sql_statements(sql_blob) for stmt in statements: await _ch_exec( session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, - stmt, params={"db": db}, + stmt, ) # Users — passwords + host clause go inline; ALTER on every @@ -150,13 +164,15 @@ async def bootstrap(cfg: Config) -> Credentials | None: f"GRANT INSERT ON {db}.events TO playground_writer", ) - # Reader: public, SELECT-only on the parameterized view, with - # tight resource caps. Profile-style settings prevent anyone - # who somehow gets the password from using it as a foothold. + # Reader: public, no password — SELECT-only on the parameterized + # view, with tight resource caps. The empty password is + # expressed as sha256_hash of the empty string so CH stores it + # in the same shape as any other sha256 user but the + # operator-facing identity is plainly "no password". await _ch_exec( session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, f"CREATE USER IF NOT EXISTS playground_reader " - f"IDENTIFIED WITH sha256_password BY '{reader_pw}' " + f"IDENTIFIED WITH sha256_hash BY '{_SHA256_EMPTY}' " f"DEFAULT DATABASE {db} " f"SETTINGS readonly = 2, " f"max_execution_time = 5, " @@ -168,7 +184,7 @@ async def bootstrap(cfg: Config) -> Credentials | None: await _ch_exec( session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, f"ALTER USER playground_reader " - f"IDENTIFIED WITH sha256_password BY '{reader_pw}'", + f"IDENTIFIED WITH sha256_hash BY '{_SHA256_EMPTY}'", ) await _ch_exec( session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, @@ -183,7 +199,7 @@ async def bootstrap(cfg: Config) -> Credentials | None: return Credentials( url=cfg.ch_cloud_url, db=db, writer_user="playground_writer", writer_password=writer_pw, - reader_user="playground_reader", reader_password=reader_pw, + reader_user="playground_reader", reader_password="", ) From d002acd84aab8f05bd1416541d641026d2f5d466 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 12:30:50 +0000 Subject: [PATCH 124/221] playground: trino-datalake: switch to fs.native-s3 (the legacy hadoop S3 plugin is gone) trino:latest no longer ships the hadoop-S3 plugin, so the fs.hadoop.enabled=true + custom AWSCredentialsProvider shim path is broken: 'External location is not a valid file system URI: s3://...'. Switch hive.properties to fs.native-s3.enabled=true with region + endpoint set explicitly; the public bucket allows unauthenticated GETs and the AWS SDK falls through its default credentials chain to anonymous when no creds are configured. The shim + core-site.xml mounts in start stay around as no-ops for now. Co-Authored-By: Claude Opus 4.7 --- trino-datalake/install | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/trino-datalake/install b/trino-datalake/install index 10e0b96135..3381c55966 100755 --- a/trino-datalake/install +++ b/trino-datalake/install @@ -58,25 +58,14 @@ hive.metastore=file hive.metastore.catalog.dir=local:///meta local.location=/data fs.native-local.enabled=true -fs.hadoop.enabled=true -hive.config.resources=/etc/trino/core-site.xml +# trino:latest dropped the legacy hadoop-S3 plugin, which is why +# fs.hadoop.enabled=true + a custom AWSCredentialsProvider shim no +# longer worked: 'External location is not a valid file system URI: +# s3://...'. Switch to the native S3 file system; the bucket is +# public so we configure it to skip the AWS signing flow entirely. +fs.native-s3.enabled=true +s3.region=eu-central-1 +s3.endpoint=https://s3.eu-central-1.amazonaws.com +s3.path-style-access=false hive.non-managed-table-writes-enabled=true EOF - -cat > etc/core-site.xml <<'EOF' - - - - trino.s3.credentials-provider - S3AnonymousProvider - - - trino.s3.endpoint - https://s3.eu-central-1.amazonaws.com - - - trino.s3.region - eu-central-1 - - -EOF From 0eccf8d7105b31350ef58f1b2750069876eb80af Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 12:33:44 +0000 Subject: [PATCH 125/221] playground: trino: --output-format=ALIGNED so /api/query has output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original benchmark setup used --output-format=NULL because it measures timing only; under the playground that produces 200 OK with an empty body, which the UI faithfully shows as '(no output)'. Switch to ALIGNED — same human-readable table presto* uses — so the saved row + the UI both have something to display. Co-Authored-By: Claude Opus 4.7 --- trino-datalake-partitioned/query | 2 +- trino-datalake/query | 2 +- trino-partitioned/query | 2 +- trino/query | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/trino-datalake-partitioned/query b/trino-datalake-partitioned/query index 0d7e70cc23..041a144aad 100755 --- a/trino-datalake-partitioned/query +++ b/trino-datalake-partitioned/query @@ -10,7 +10,7 @@ query=$(cat) start=$(date +%s.%N) sudo docker exec -i trino trino --catalog hive --schema clickbench \ - --output-format=NULL --execute "$query" + --output-format=ALIGNED --execute "$query" end=$(date +%s.%N) awk -v s="$start" -v e="$end" 'BEGIN { printf "%.3f\n", e - s }' >&2 diff --git a/trino-datalake/query b/trino-datalake/query index 0d7e70cc23..041a144aad 100755 --- a/trino-datalake/query +++ b/trino-datalake/query @@ -10,7 +10,7 @@ query=$(cat) start=$(date +%s.%N) sudo docker exec -i trino trino --catalog hive --schema clickbench \ - --output-format=NULL --execute "$query" + --output-format=ALIGNED --execute "$query" end=$(date +%s.%N) awk -v s="$start" -v e="$end" 'BEGIN { printf "%.3f\n", e - s }' >&2 diff --git a/trino-partitioned/query b/trino-partitioned/query index 0d7e70cc23..041a144aad 100755 --- a/trino-partitioned/query +++ b/trino-partitioned/query @@ -10,7 +10,7 @@ query=$(cat) start=$(date +%s.%N) sudo docker exec -i trino trino --catalog hive --schema clickbench \ - --output-format=NULL --execute "$query" + --output-format=ALIGNED --execute "$query" end=$(date +%s.%N) awk -v s="$start" -v e="$end" 'BEGIN { printf "%.3f\n", e - s }' >&2 diff --git a/trino/query b/trino/query index 0d7e70cc23..041a144aad 100755 --- a/trino/query +++ b/trino/query @@ -10,7 +10,7 @@ query=$(cat) start=$(date +%s.%N) sudo docker exec -i trino trino --catalog hive --schema clickbench \ - --output-format=NULL --execute "$query" + --output-format=ALIGNED --execute "$query" end=$(date +%s.%N) awk -v s="$start" -v e="$end" 'BEGIN { printf "%.3f\n", e - s }' >&2 From 0f17abf66fb97f4df45f793e83f5a640a2da1324 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 12:35:38 +0000 Subject: [PATCH 126/221] playground: agent: clear _daemon_started after docker reconcile docker daemon restart kills containers that weren't pinned via restart=unless-stopped. byconity's compose stack falls into that bucket: post-reconcile the server container is dead and queries fail with 'Not connected to :30605'. Clearing the daemon-started gate after reconcile makes the very next /query's _ensure_daemon_started run ./start again, which is a no-op for healthy daemons and 'docker compose up -d' for the multi-container systems that need it. Co-Authored-By: Claude Opus 4.7 --- playground/agent/agent.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 5d59b4ac0b..a4707c551d 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -686,6 +686,12 @@ def _maybe_reconcile_for_restore() -> None: f"{_last_seen_btime} -> {cur}; reconciling docker\n") _last_seen_btime = cur _reconcile_docker_after_restore() + # docker daemon restart kills containers that aren't pinned + # via `restart: unless-stopped`; for compose-based systems + # like byconity that means the worker is dead until we re-run + # ./start. Clear the daemon-started gate so the very next + # _ensure_daemon_started() call brings the stack back up. + _daemon_started.clear() def _reconcile_docker_after_restore() -> None: From 03beb2b58fb1d4cc8706f2824d2485ba81ebbe82 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 12:40:54 +0000 Subject: [PATCH 127/221] playground: umbra: --memory=128g --memory-swap=-1 so cgroup sees the host swap umbra still OOMs at create.sql:109 with the 256 GB swap on the host: docker's default cgroup setup gives the container the host's full memory but Umbra's own allocator caps itself at the cgroup's 'available' figure, which lands near the 16 GB physical RAM. Pin the container to 128 GB with unlimited swap so Umbra's allocator sees enough room to load the table. Co-Authored-By: Claude Opus 4.7 --- umbra/start | 2 ++ 1 file changed, 2 insertions(+) diff --git a/umbra/start b/umbra/start index a7fc4dc58e..f0d5b30b75 100755 --- a/umbra/start +++ b/umbra/start @@ -14,6 +14,8 @@ sudo docker run -d --name umbradb \ -p 5432:5432 \ --ulimit nofile=1048576:1048576 \ --ulimit memlock=8388608:8388608 \ + --memory=128g \ + --memory-swap=-1 \ umbradb/umbra:latest >/dev/null # Container needs a moment before psql can connect. From 72fc9058f9f76a3a136ab67f2747c782ac1b3537 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 12:49:24 +0000 Subject: [PATCH 128/221] playground: drill: -XX:-UseContainerSupport to dodge the cgroup-v2 NPE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit apache/drill ships a JDK whose CgroupV2Subsystem.getInstance() NPEs when 'anyController is null' — happens on the playground VM where cgroup2 is mounted with no controllers visible to the container. The NPE killed RootAllocator init and every query returned 'Could not initialize class org.apache.drill.exec.memory.RootAllocator' with no other visible output beyond the JVM picking up the _JAVA_OPTIONS env line. Turning off the JVM's container-aware sysinfo path with -XX:-UseContainerSupport skips the broken code; SELECT now works end-to-end (verified: SELECT 1 -> '1 row selected (1.105 seconds)'). Co-Authored-By: Claude Opus 4.7 --- drill/query | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drill/query b/drill/query index 5fbfcc4220..78429e74a9 100755 --- a/drill/query +++ b/drill/query @@ -41,6 +41,16 @@ DRILL_OPENS=( -Dio.netty.tryReflectionSetAccessible=true -Darrow.memory.debug.allocator=false -Dorg.apache.drill.exec.memory.debug.allocator=false + # apache/drill image bundles a JVM whose CgroupV2Subsystem + # NPEs on the host's cgroup-v2 layout (the playground VM + # mounts only `cgroup2` with no controllers visible to the + # container). The NPE in jdk.internal.platform.cgroupv2 + # kills RootAllocator init and every query returns + # 'No current connection / Could not initialize class + # org.apache.drill.exec.memory.RootAllocator'. + # Turning off the JVM's container-aware sysinfo path skips + # the offending code entirely. + -XX:-UseContainerSupport ) OPENS="${DRILL_OPENS[*]}" From ca0abe5023a7e4e7383c680d5a4cf41a583ec521 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 12:59:42 +0000 Subject: [PATCH 129/221] playground: web: 'Run all' competition mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a faint secondary button next to the example selector that fires the SAME example index against every snapshotted/ready system in parallel, each using its OWN translation of that query (ClickHouse runs SELECT COUNT(*) FROM hits, pandas runs hits.count(), polars runs hits.select(pl.len())..., etc.). Results render in a dynamically re-sorted table: 1. completed — fastest first (winner row gets the accent bg) 2. failed — alphabetical, tooltip = X-Error 3. running — alphabetical The button hides when no example is picked, including after a textarea edit (which already resets the + + + +
@@ -52,6 +61,6 @@

ClickBench Playground — run SQL against 90+ databa

- + diff --git a/playground/web/style.css b/playground/web/style.css index 968c68667a..d258bfce46 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -78,6 +78,40 @@ button { button:disabled { opacity: 0.6; cursor: not-allowed; } button:hover:not(:disabled) { filter: brightness(0.95); } +/* "Run all" — faint secondary button next to the example selector. */ +button.run-all { + background: transparent; + color: var(--muted); + border-color: var(--muted); + font-weight: normal; + padding: 4px 10px; + margin-left: 8px; +} +button.run-all:hover:not(:disabled) { + background: var(--accent); + color: var(--accent-fg); + border-color: var(--border); + filter: none; +} + +/* Competition-mode table. */ +#runall-table { + width: 100%; + border-collapse: collapse; + border: 1px solid var(--border); + background: var(--bg-alt); +} +#runall-table td { + padding: 4px 8px; + border-bottom: 1px solid #eee; + font-family: monospace; +} +#runall-table td.time { text-align: right; } +#runall-table tr.done td.time { color: var(--good); font-weight: 600; } +#runall-table tr.failed td.time { color: var(--bad); } +#runall-table tr.running td.time { color: var(--muted); } +#runall-table tr.winner td { background: var(--accent); } + .row { display: flex; align-items: center; gap: 12px; flex-wrap: wrap; } .row label { margin: 0; } .stats { font-family: monospace; } From 0635c48ee084e9a72277bb847af7b0ca005396b4 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:02:41 +0000 Subject: [PATCH 130/221] playground: un-gate heavyai + oxla (both ship public docker images) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit heavyai uses the omnisci/core-os-cpu:v5.10.2 docker image — the last public release before the HeavyDB rename — which a straight docker pull from Docker Hub fetches. oxla pulls from public.ecr.aws/oxla so it sidesteps Docker Hub rate limits entirely; install/start are vanilla Postgres-protocol + docker. Both go behind the same per-system disk/swap/snapshot machinery the rest of the catalog uses; nothing else changes. Co-Authored-By: Claude Opus 4.7 --- playground/server/systems.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/playground/server/systems.py b/playground/server/systems.py index 74e3a6b547..c6a3e3d71c 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -34,8 +34,8 @@ "clickhouse-cloud", "clickhouse-tencent", "clickhouse-web", "crunchy-bridge-for-analytics", "databend", "databricks", "exasol", "firebolt", "firebolt-parquet", "firebolt-parquet-partitioned", - "gravitons", "heavyai", "hologres", "hydrolix", "kinetica", - "motherduck", "oxla", "pgpro_tam", "redshift", "redshift-serverless", + "gravitons", "hologres", "hydrolix", "kinetica", + "motherduck", "pgpro_tam", "redshift", "redshift-serverless", "s3select", "singlestore", "snowflake", "supabase", "tembo-olap", "timescale-cloud", "tinybird", "velodb", "vertica", "ydb", From 02f78edddd33281b4f46be1725127cc5126a6d3f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:06:42 +0000 Subject: [PATCH 131/221] playground: web: flash competition row yellow for 1s on state change Whenever a system transitions running -> done or running -> failed (or even completed timings drift between re-runs), the row's cells animate from --accent (yellow) to transparent via a 1s ease-out keyframe. Diffing against runAllLast keeps the flash off the rows that didn't actually change but just got reshuffled by re-sorting. Co-Authored-By: Claude Opus 4.7 --- playground/web/app.js | 27 ++++++++++++++++++++++++++- playground/web/index.html | 4 ++-- playground/web/style.css | 11 +++++++++++ 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index 624d0082aa..849b17ee91 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -464,6 +464,9 @@ async function runAll() { const status = {}; for (const t of targets) status[t.name] = {state: "running"}; + // Reset the flash-diff cache so the first render seeds 'running' + // for every row without animating them all at once. + runAllLast = {}; renderRunAll(status); await Promise.all(targets.map(async (t) => { @@ -496,6 +499,14 @@ async function runAll() { runAllBtn.disabled = false; } +let runAllLast = {}; + +function _runAllRowKey(s) { + if (s.state === "done") return `done:${s.time}`; + if (s.state === "failed") return `failed:${s.note || ""}`; + return "running"; +} + function renderRunAll(status) { const done = [], failed = [], running = []; for (const [name, s] of Object.entries(status)) { @@ -506,6 +517,17 @@ function renderRunAll(status) { done.sort((a, b) => a.time - b.time); failed.sort((a, b) => a.name.localeCompare(b.name)); running.sort((a, b) => a.name.localeCompare(b.name)); + // Diff against the previous render so only rows whose status + // string actually changed get the flash animation; otherwise the + // mere act of re-sorting would re-animate everyone every tick. + const changed = new Set(); + for (const [name, s] of Object.entries(status)) { + const key = _runAllRowKey(s); + if (runAllLast[name] !== undefined && runAllLast[name] !== key) { + changed.add(name); + } + runAllLast[name] = key; + } const tbody = runAllTable.querySelector("tbody"); tbody.innerHTML = ""; const fragment = document.createDocumentFragment(); @@ -513,7 +535,10 @@ function renderRunAll(status) { for (let i = 0; i < all.length; i++) { const row = all[i]; const tr = document.createElement("tr"); - tr.className = row.state + (i === 0 && row.state === "done" ? " winner" : ""); + const cls = [row.state]; + if (i === 0 && row.state === "done") cls.push("winner"); + if (changed.has(row.name)) cls.push("flash"); + tr.className = cls.join(" "); const td1 = document.createElement("td"); td1.textContent = row.name; const td2 = document.createElement("td"); diff --git a/playground/web/index.html b/playground/web/index.html index 0449dfbf8b..40a18bc337 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -4,7 +4,7 @@ ClickBench Playground — run SQL against 90+ databases - +
@@ -61,6 +61,6 @@

ClickBench Playground — run SQL against 90+ databa - + diff --git a/playground/web/style.css b/playground/web/style.css index d258bfce46..7f6c3cb70a 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -112,6 +112,17 @@ button.run-all:hover:not(:disabled) { #runall-table tr.running td.time { color: var(--muted); } #runall-table tr.winner td { background: var(--accent); } +/* Flash on every state change in competition mode: yellow that + fades out over a second. .flash is added by JS only to rows whose + stringified status differs from the previous render. */ +@keyframes runall-flash { + from { background-color: var(--accent); } + to { background-color: transparent; } +} +#runall-table tr.flash td { + animation: runall-flash 1s ease-out; +} + .row { display: flex; align-items: center; gap: 12px; flex-wrap: wrap; } .row label { margin: 0; } .stats { font-family: monospace; } From 1e970919db219e06b1d071d9aab65d2367461983 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:08:53 +0000 Subject: [PATCH 132/221] playground: web: competition panel becomes a left rail with clickable rows Splits the area below the buttons into a 2-column grid (CSS :has) when the competition panel is visible: 320 px left rail for the results table, the rest of the row for the query textarea + output. When the panel is hidden the grid falls back to one column so the single-system flow looks the same as before. Each row is now a button: click it and the page swaps to that system (state list highlight, system status pane, query textarea, output pane all re-target). The URL bar updates with the system's X-Query-Id (the same handle a fresh /api/query would have minted) so reload + share keep the picked system's result. Co-Authored-By: Claude Opus 4.7 --- playground/web/app.js | 73 +++++++++++++++++++++++++++++++++++---- playground/web/index.html | 48 +++++++++++++------------ playground/web/style.css | 25 ++++++++++++++ 3 files changed, 118 insertions(+), 28 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index 849b17ee91..6e757187cd 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -467,6 +467,8 @@ async function runAll() { // Reset the flash-diff cache so the first render seeds 'running' // for every row without animating them all at once. runAllLast = {}; + runAllSelected = null; + runAllStatus = status; renderRunAll(status); await Promise.all(targets.map(async (t) => { @@ -477,28 +479,84 @@ async function runAll() { body: t.query, headers: {"Content-Type": "application/octet-stream"}, }); - // Drain the body so the connection closes promptly; we - // only look at headers + status. - await r.arrayBuffer(); + // Read the body so we can show it when the user clicks + // the row to pull this system's result into the main pane. + const body = await r.arrayBuffer(); + const txt = bytesToText(body) || "(no output)"; const h = (k) => r.headers.get(k); + const qid = h("X-Query-Id"); if (r.status >= 400) { - status[t.name] = {state: "failed", note: h("X-Error") || `HTTP ${r.status}`}; + const err = h("X-Error") || `HTTP ${r.status}`; + status[t.name] = { + state: "failed", note: err, + payload: { + output: `(error)\n${err}`, + time: "—", wall: "—", bytes: "—", + truncated: "—", exit: String(r.status), + }, + qid, + query: t.query, + }; } else { const qt = h("X-Query-Time"); const wt = h("X-Wall-Time"); const tsec = qt != null && qt !== "" ? parseFloat(qt) : (wt != null && wt !== "" ? parseFloat(wt) : (performance.now() - t0) / 1000); - status[t.name] = {state: "done", time: tsec}; + status[t.name] = { + state: "done", time: tsec, + payload: { + output: txt, + time: qt ? `${parseFloat(qt).toFixed(3)} s` : "—", + wall: wt ? `${parseFloat(wt).toFixed(3)} s` : `${tsec.toFixed(3)} s`, + bytes: h("X-Output-Bytes") || String(body.byteLength), + truncated: h("X-Output-Truncated") === "1" ? "yes" : "no", + exit: h("X-Exit-Code") || String(r.status), + }, + qid, + query: t.query, + }; } } catch (e) { - status[t.name] = {state: "failed", note: String(e)}; + status[t.name] = {state: "failed", note: String(e), query: t.query}; } + runAllStatus = status; renderRunAll(status); })); runAllBtn.disabled = false; } +let runAllStatus = {}; +let runAllSelected = null; + +function pickFromRunAll(name) { + const entry = runAllStatus[name]; + if (!entry) return; + runAllSelected = name; + // Switch the system list highlight + state panel to this system. + if (stateByName[name]) select(name); + // Rewrite the query textarea + result pane to this system's run. + if (entry.query) { + queryEl.value = entry.query; + pristineQuery = entry.query; + } + if (entry.payload) { + resultsByName[name] = entry.payload; + showResult(entry.payload); + } + // Update URL: prefer the X-Query-Id for sharability, fall back + // to a system-scoped permalink so reload at least reopens the + // right system. + const u = new URL(window.location.href); + if (entry.qid) { + u.searchParams.set("q", entry.qid); + } else { + u.searchParams.delete("q"); + } + window.history.replaceState({}, "", u.toString()); + renderRunAll(runAllStatus); // re-paint to highlight the selected row +} + let runAllLast = {}; function _runAllRowKey(s) { @@ -538,7 +596,10 @@ function renderRunAll(status) { const cls = [row.state]; if (i === 0 && row.state === "done") cls.push("winner"); if (changed.has(row.name)) cls.push("flash"); + if (runAllSelected === row.name) cls.push("selected"); tr.className = cls.join(" "); + tr.dataset.name = row.name; + tr.addEventListener("click", () => pickFromRunAll(row.name)); const td1 = document.createElement("td"); td1.textContent = row.name; const td2 = document.createElement("td"); diff --git a/playground/web/index.html b/playground/web/index.html index 40a18bc337..8404965ba3 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -4,7 +4,7 @@ ClickBench Playground — run SQL against 90+ databases - +
@@ -24,29 +24,33 @@

ClickBench Playground — run SQL against 90+ databa title="Run this example's equivalent on every snapshotted system in parallel">Run all - +
+ -
- -
+
+
+ +
- + - + +
+
- + diff --git a/playground/web/style.css b/playground/web/style.css index 7f6c3cb70a..0f7480b86a 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -111,6 +111,31 @@ button.run-all:hover:not(:disabled) { #runall-table tr.failed td.time { color: var(--bad); } #runall-table tr.running td.time { color: var(--muted); } #runall-table tr.winner td { background: var(--accent); } +#runall-table tr { cursor: pointer; } +#runall-table tr:hover td { background: #f5f5f5; } +#runall-table tr.winner:hover td { background: var(--accent); filter: brightness(0.95); } +#runall-table tr.selected td { outline: 2px solid var(--info); outline-offset: -2px; } + +/* Two-column layout when the competition panel is visible. The + panel is the left rail (~300px); the right column gets the + query textarea and the output. The aside collapses cleanly when + ui-runall is display:none — the grid falls back to 1 column. */ +#ui-split { + display: grid; + grid-template-columns: minmax(0, 1fr); + gap: 16px; + margin: 12px 0; +} +#ui-split:has(aside#ui-runall:not([style*="display: none"])) { + grid-template-columns: 320px minmax(0, 1fr); +} +#ui-main > section:first-child { margin-top: 0; } +aside#ui-runall { margin: 0; } +@media (max-width: 800px) { + #ui-split:has(aside#ui-runall:not([style*="display: none"])) { + grid-template-columns: 1fr; + } +} /* Flash on every state change in competition mode: yellow that fades out over a second. .flash is added by JS only to rows whose From f6224158a22aaaf959c484afa0102b142b739f16 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:11:11 +0000 Subject: [PATCH 133/221] playground: web: left rail = max-content, right pane = remaining Switches the visible-competition column template from '320px ...' to 'max-content minmax(0, 1fr)' so the rail shrinks to the widest system name + timing and the query/output pane claims the rest of the row. When the panel is hidden the grid stays one column and the right pane takes 100% width as before. Co-Authored-By: Claude Opus 4.7 --- playground/web/index.html | 2 +- playground/web/style.css | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/playground/web/index.html b/playground/web/index.html index 8404965ba3..457d266e44 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -4,7 +4,7 @@ ClickBench Playground — run SQL against 90+ databases - +
diff --git a/playground/web/style.css b/playground/web/style.css index 0f7480b86a..7f36c386cb 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -117,20 +117,24 @@ button.run-all:hover:not(:disabled) { #runall-table tr.selected td { outline: 2px solid var(--info); outline-offset: -2px; } /* Two-column layout when the competition panel is visible. The - panel is the left rail (~300px); the right column gets the - query textarea and the output. The aside collapses cleanly when - ui-runall is display:none — the grid falls back to 1 column. */ + left rail shrinks to its content (minimal width — the longest + "Q9.999 s" cell); the right column claims the rest. When the + aside is display:none the grid is one column and the right pane + gets 100 % width. */ #ui-split { display: grid; grid-template-columns: minmax(0, 1fr); gap: 16px; margin: 12px 0; + align-items: start; } #ui-split:has(aside#ui-runall:not([style*="display: none"])) { - grid-template-columns: 320px minmax(0, 1fr); + grid-template-columns: max-content minmax(0, 1fr); } +#ui-main { min-width: 0; } #ui-main > section:first-child { margin-top: 0; } aside#ui-runall { margin: 0; } +aside#ui-runall #runall-table { width: auto; } @media (max-width: 800px) { #ui-split:has(aside#ui-runall:not([style*="display: none"])) { grid-template-columns: 1fr; From 7eb6dd5dbdae430c85566993a0ccafb9207f851f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:15:43 +0000 Subject: [PATCH 134/221] playground: web: only switch to 2-col grid when competition is actually open The :has()-based selector misread the empty-aside case (its style attribute was 'display:none' without a space, while the selector matched 'display: none' with one) and applied the 2-column grid even when the panel was hidden. The first track resolved to max-content of the textarea (~200 px) and the right pane shrank to nothing. Drop the :has() machinery and have JS add a .split class to #ui-split when entering competition mode. Default view stays block-flow with the textarea + output at full width. Co-Authored-By: Claude Opus 4.7 --- playground/web/app.js | 2 ++ playground/web/index.html | 4 ++-- playground/web/style.css | 22 ++++++++-------------- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index 6e757187cd..712b350d41 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -446,11 +446,13 @@ async function ensureQueriesLoaded(name) { return queriesByName[name]; } +const uiSplit = $("#ui-split"); async function runAll() { const idx = parseInt(exampleSel.value, 10); if (isNaN(idx)) return; runAllBtn.disabled = true; runAllSection.style.display = ""; + uiSplit.classList.add("split"); // Collect snapshotted/ready systems with an example at this index. const candidates = Object.values(stateByName) diff --git a/playground/web/index.html b/playground/web/index.html index 457d266e44..5769d79333 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -4,7 +4,7 @@ ClickBench Playground — run SQL against 90+ databases - +
@@ -65,6 +65,6 @@

ClickBench Playground — run SQL against 90+ databa - + diff --git a/playground/web/style.css b/playground/web/style.css index 7f36c386cb..bcac2a8e9a 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -116,29 +116,23 @@ button.run-all:hover:not(:disabled) { #runall-table tr.winner:hover td { background: var(--accent); filter: brightness(0.95); } #runall-table tr.selected td { outline: 2px solid var(--info); outline-offset: -2px; } -/* Two-column layout when the competition panel is visible. The - left rail shrinks to its content (minimal width — the longest - "Q9.999 s" cell); the right column claims the rest. When the - aside is display:none the grid is one column and the right pane - gets 100 % width. */ -#ui-split { +/* Default view: #ui-main fills the row, the (hidden) aside takes no + space. The .split modifier is added by JS only when competition + mode is active; that's when the grid actually splits into rail + + main and the rail shrinks to max-content. */ +#ui-split { margin: 12px 0; } +#ui-split.split { display: grid; - grid-template-columns: minmax(0, 1fr); + grid-template-columns: max-content minmax(0, 1fr); gap: 16px; - margin: 12px 0; align-items: start; } -#ui-split:has(aside#ui-runall:not([style*="display: none"])) { - grid-template-columns: max-content minmax(0, 1fr); -} #ui-main { min-width: 0; } #ui-main > section:first-child { margin-top: 0; } aside#ui-runall { margin: 0; } aside#ui-runall #runall-table { width: auto; } @media (max-width: 800px) { - #ui-split:has(aside#ui-runall:not([style*="display: none"])) { - grid-template-columns: 1fr; - } + #ui-split.split { grid-template-columns: 1fr; } } /* Flash on every state change in competition mode: yellow that From 28f4372d8e23427d354acce14379e0a60086342e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:18:22 +0000 Subject: [PATCH 135/221] playground: web: competition rail tweaks * Drop the 'Competition' label so the table sits flush with the textarea top. * No more 'winner' highlight on the fastest row. * Selected row shows as white-background + bold instead of an outline. * Rail is now its own scroll viewport (overflow-y: auto) at 100 vh minus its top offset, measured once on competition start; the query/output pane keeps the same. * Keyboard arrows on the (focused) rail walk the current sort order and pick a row; the selection scrolls into view. Co-Authored-By: Claude Opus 4.7 --- playground/web/app.js | 33 ++++++++++++++++++++++++++++++++- playground/web/index.html | 7 +++---- playground/web/style.css | 29 +++++++++++++++++++++-------- 3 files changed, 56 insertions(+), 13 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index 712b350d41..3d9d290b30 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -447,12 +447,25 @@ async function ensureQueriesLoaded(name) { } const uiSplit = $("#ui-split"); + +function _measureSplitOffset() { + // Pin the split row's height so that aside scrolls inside its + // own track. Read the distance from the page top to #ui-split + // and store it in a CSS var; the layout rule subtracts it from + // 100 vh. + const top = uiSplit.getBoundingClientRect().top + window.scrollY; + document.documentElement.style.setProperty("--ui-split-offset", `${top + 20}px`); +} +window.addEventListener("resize", _measureSplitOffset); + async function runAll() { const idx = parseInt(exampleSel.value, 10); if (isNaN(idx)) return; runAllBtn.disabled = true; runAllSection.style.display = ""; uiSplit.classList.add("split"); + _measureSplitOffset(); + runAllSection.focus(); // Collect snapshotted/ready systems with an example at this index. const candidates = Object.values(stateByName) @@ -596,7 +609,6 @@ function renderRunAll(status) { const row = all[i]; const tr = document.createElement("tr"); const cls = [row.state]; - if (i === 0 && row.state === "done") cls.push("winner"); if (changed.has(row.name)) cls.push("flash"); if (runAllSelected === row.name) cls.push("selected"); tr.className = cls.join(" "); @@ -618,6 +630,25 @@ function renderRunAll(status) { fragment.appendChild(tr); } tbody.appendChild(fragment); + runAllOrder = all.map(r => r.name); } +// Up/down navigation through the rail. The aside is focusable +// (tabindex=0) so the user can tab into it; arrow keys then walk +// the current sort order and pick the next/prev row. +let runAllOrder = []; +runAllSection.addEventListener("keydown", (e) => { + if (e.key !== "ArrowDown" && e.key !== "ArrowUp") return; + if (!runAllOrder.length) return; + e.preventDefault(); + let i = runAllOrder.indexOf(runAllSelected); + if (i === -1) i = e.key === "ArrowDown" ? -1 : runAllOrder.length; + const step = e.key === "ArrowDown" ? 1 : -1; + const next = runAllOrder[Math.max(0, Math.min(runAllOrder.length - 1, i + step))]; + pickFromRunAll(next); + // Keep the picked row in view inside the scrollable rail. + const sel = runAllTable.querySelector("tr.selected"); + if (sel) sel.scrollIntoView({block: "nearest"}); +}); + runAllBtn.addEventListener("click", runAll); diff --git a/playground/web/index.html b/playground/web/index.html index 5769d79333..d7fa3e03b9 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -4,7 +4,7 @@ ClickBench Playground — run SQL against 90+ databases - +
@@ -25,8 +25,7 @@

ClickBench Playground — run SQL against 90+ databa
-
- - + -
-
- System status -
loading…
-
-
+
+
+ System status +
loading…
+
+
+ + - + From 45ab7ca807a26466124860d56dcd771e3e69540e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:21:01 +0000 Subject: [PATCH 137/221] playground: gizmosql: .mode box (was .mode trash, discarded all rows) The original runner used '.mode trash' to keep timing parsing clean by throwing away result rows. Under the playground that yielded an empty result body even when the query succeeded. '.mode box' renders a readable table; the 'Run Time:' line still matches the timing regex. Co-Authored-By: Claude Opus 4.7 --- gizmosql/query | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gizmosql/query b/gizmosql/query index 937e567bb0..f1445e767e 100755 --- a/gizmosql/query +++ b/gizmosql/query @@ -11,8 +11,11 @@ set -e query=$(cat) # .timer on: emit "Run Time: s" per statement. -# .mode trash: discard result rows so timing parsing isn't polluted. -script=$(printf '.timer on\n.mode trash\n%s\n' "$query") +# `.mode box` keeps the result readable in the playground UI; the +# original ClickBench runner used `.mode trash` to discard rows +# for clean timing, but the user wants to see output here. The +# Run Time line still parses fine. +script=$(printf '.timer on\n.mode box\n%s\n' "$query") raw=$(printf '%s' "$script" | gizmosql_client 2>&1) && exit_code=0 || exit_code=$? From 0e1adccfb1865dc0ecbcd2d9526c2b79f0a98b69 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:22:49 +0000 Subject: [PATCH 138/221] =?UTF-8?q?playground:=20parseable:=20inline=20the?= =?UTF-8?q?=20ingest=20command=20=E2=80=94=20exported=20bash=20fn=20invisi?= =?UTF-8?q?ble=20to=20parallel/sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GNU parallel runs each job under /bin/sh (not bash) by default, and 'export -f ingest_chunk' only carries the function into bash children. The chunks were silently routed into a non-existent command name, parallel exited 0, the load took 4700+ s, and the table came back with 0 rows. Inline the awk + curl pipeline as parallel's literal command string so it's interpreted directly by /bin/sh. Add curl --fail --show-error so an HTTP error from /api/v1/ingest now propagates to the load script's exit code. Co-Authored-By: Claude Opus 4.7 --- parseable/load | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/parseable/load b/parseable/load index 1deef6f7bd..763fe364af 100755 --- a/parseable/load +++ b/parseable/load @@ -28,30 +28,22 @@ curl --silent --location --request PUT 'http://localhost:8000/api/v1/logstream/h --data-binary @static_schema.json >/dev/null # Wrap each block of LINES_PER_CHUNK NDJSON lines in [ ... ] and POST -# directly to /api/v1/ingest. The previous implementation wrote each -# chunk as a separate file under ./partitioned/ first — at 2500 lines -# per chunk that produced ~40000 files totaling >200 GB, filling up -# the VM's sparse rootfs (`No space left on device` in load.log). -# Streaming the chunks straight to curl keeps the on-disk footprint -# at zero. -ingest_chunk() { - awk 'BEGIN{print "["} { - if (NR>1) print prev "," - prev=$0 - } END{ if (prev) print prev; print "]" }' | - curl --silent --location \ - -H 'Content-Type: application/json' \ - -H 'X-P-Stream: hits' \ - -k -XPOST -u "admin:admin" \ - 'http://localhost:8000/api/v1/ingest' \ - --data-binary @- >/dev/null -} -export -f ingest_chunk - +# directly to /api/v1/ingest. Inlined into parallel's command string +# because parallel runs jobs via /bin/sh by default, and a bash +# `export -f`'d function isn't visible in that shell — the previous +# version silently no-op'd every chunk and load wrote 0 rows. LINES_PER_CHUNK=2500 INGEST_JOBS=6 pv hits.json | parallel --pipe -N$LINES_PER_CHUNK --block 10M \ - --jobs "$INGEST_JOBS" ingest_chunk + --jobs "$INGEST_JOBS" --halt-on-error 0 ' + awk "BEGIN{print \"[\"} NR>1{print prev \",\"} {prev=\$0} END{if (prev) print prev; print \"]\"}" | + curl --silent --show-error --fail \ + -H "Content-Type: application/json" \ + -H "X-P-Stream: hits" \ + -k -XPOST -u "admin:admin" \ + "http://localhost:8000/api/v1/ingest" \ + --data-binary @- >/dev/null + ' # Drop the symlink to the RO dataset — no chunk files to clean up. rm -f hits.json hits.json.gz From ae98b6cb9520bbea7f9b1224d34d1f1c8e2854e3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:25:09 +0000 Subject: [PATCH 139/221] playground: web: competition runs 3 rounds, fires in shuffled order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fire round 1 in a Fisher–Yates-shuffled sequence so no single system gets a deterministic head start. The displayed sort is still by best timing. * Each system that survives round 1 runs the same query two more times. The table grows to three timing columns (best-of-three is the sort key). * Per-run state is rendered explicitly: '—' for not-yet-started, 'running', the seconds for done, 'failed' (with tooltip) when any run hits 4xx/5xx. A failure in any round marks the row failed and stops further rounds for that system. Co-Authored-By: Claude Opus 4.7 --- playground/web/app.js | 148 +++++++++++++++++++++++++++----------- playground/web/index.html | 4 +- playground/web/style.css | 1 + 3 files changed, 109 insertions(+), 44 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index 3d9d290b30..754d29c7ed 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -475,10 +475,15 @@ async function runAll() { const qs = await ensureQueriesLoaded(s.name); if (qs && idx < qs.length) targets.push({name: s.name, query: qs[idx]}); } - targets.sort((a, b) => a.name.localeCompare(b.name)); const status = {}; - for (const t of targets) status[t.name] = {state: "running"}; + for (const t of targets) { + status[t.name] = { + state: "running", + runs: [{state: "running"}, {state: "pending"}, {state: "pending"}], + query: t.query, + }; + } // Reset the flash-diff cache so the first render seeds 'running' // for every row without animating them all at once. runAllLast = {}; @@ -486,7 +491,16 @@ async function runAll() { runAllStatus = status; renderRunAll(status); - await Promise.all(targets.map(async (t) => { + // Random order: keep the table sorted but fire requests in a + // shuffled sequence so no single system gets a systematic head + // start. + const shuffled = targets.slice(); + for (let i = shuffled.length - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]]; + } + + async function _runOne(t) { const t0 = performance.now(); try { const r = await fetch(`${API}/api/query?system=${encodeURIComponent(t.name)}`, { @@ -494,49 +508,86 @@ async function runAll() { body: t.query, headers: {"Content-Type": "application/octet-stream"}, }); - // Read the body so we can show it when the user clicks - // the row to pull this system's result into the main pane. const body = await r.arrayBuffer(); const txt = bytesToText(body) || "(no output)"; const h = (k) => r.headers.get(k); const qid = h("X-Query-Id"); if (r.status >= 400) { const err = h("X-Error") || `HTTP ${r.status}`; - status[t.name] = { - state: "failed", note: err, + return { + ok: false, note: err, qid, payload: { output: `(error)\n${err}`, time: "—", wall: "—", bytes: "—", truncated: "—", exit: String(r.status), }, - qid, - query: t.query, - }; - } else { - const qt = h("X-Query-Time"); - const wt = h("X-Wall-Time"); - const tsec = qt != null && qt !== "" - ? parseFloat(qt) - : (wt != null && wt !== "" ? parseFloat(wt) : (performance.now() - t0) / 1000); - status[t.name] = { - state: "done", time: tsec, - payload: { - output: txt, - time: qt ? `${parseFloat(qt).toFixed(3)} s` : "—", - wall: wt ? `${parseFloat(wt).toFixed(3)} s` : `${tsec.toFixed(3)} s`, - bytes: h("X-Output-Bytes") || String(body.byteLength), - truncated: h("X-Output-Truncated") === "1" ? "yes" : "no", - exit: h("X-Exit-Code") || String(r.status), - }, - qid, - query: t.query, }; } + const qt = h("X-Query-Time"); + const wt = h("X-Wall-Time"); + const tsec = qt != null && qt !== "" + ? parseFloat(qt) + : (wt != null && wt !== "" ? parseFloat(wt) : (performance.now() - t0) / 1000); + return { + ok: true, time: tsec, qid, + payload: { + output: txt, + time: qt ? `${parseFloat(qt).toFixed(3)} s` : "—", + wall: wt ? `${parseFloat(wt).toFixed(3)} s` : `${tsec.toFixed(3)} s`, + bytes: h("X-Output-Bytes") || String(body.byteLength), + truncated: h("X-Output-Truncated") === "1" ? "yes" : "no", + exit: h("X-Exit-Code") || String(r.status), + }, + }; } catch (e) { - status[t.name] = {state: "failed", note: String(e), query: t.query}; + return {ok: false, note: String(e)}; + } + } + + function _recordRun(name, idx, res, query) { + const s = status[name]; + if (!s) return; + s.runs[idx] = res.ok + ? {state: "done", time: res.time} + : {state: "failed", note: res.note}; + // Cache the first successful run's payload for click-to-show. + if (res.ok && !s.payload) { + s.payload = res.payload; + s.qid = res.qid; + s.query = query; + } else if (!res.ok && !s.payload && idx === 0) { + s.payload = res.payload || { + output: `(error)\n${res.note || ""}`, + time: "—", wall: "—", bytes: "—", truncated: "—", exit: "err", + }; + s.query = query; + } + // Overall: failed if any run failed; done when all 3 are done; + // running otherwise. + if (s.runs.some(r => r.state === "failed")) { + s.state = "failed"; + s.note = s.runs.find(r => r.state === "failed").note; + } else if (s.runs.every(r => r.state === "done")) { + s.state = "done"; + s.time = Math.min(...s.runs.map(r => r.time)); + } else { + s.state = "running"; } runAllStatus = status; renderRunAll(status); + } + + // Run all three rounds in the shuffled order; rounds 2 and 3 only + // fire for systems whose round 1 succeeded. + await Promise.all(shuffled.map(async (t) => { + const r1 = await _runOne(t); + _recordRun(t.name, 0, r1, t.query); + if (!r1.ok) return; + const r2 = await _runOne(t); + _recordRun(t.name, 1, r2, t.query); + if (!r2.ok) return; + const r3 = await _runOne(t); + _recordRun(t.name, 2, r3, t.query); })); runAllBtn.disabled = false; } @@ -575,9 +626,14 @@ function pickFromRunAll(name) { let runAllLast = {}; function _runAllRowKey(s) { - if (s.state === "done") return `done:${s.time}`; - if (s.state === "failed") return `failed:${s.note || ""}`; - return "running"; + const runs = (s.runs || []).map(r => { + if (!r) return "-"; + if (r.state === "done") return `d:${r.time}`; + if (r.state === "failed") return `f`; + if (r.state === "pending") return `p`; + return r.state; + }).join("|"); + return `${s.state}|${runs}`; } function renderRunAll(status) { @@ -616,17 +672,25 @@ function renderRunAll(status) { tr.addEventListener("click", () => pickFromRunAll(row.name)); const td1 = document.createElement("td"); td1.textContent = row.name; - const td2 = document.createElement("td"); - td2.className = "time"; - if (row.state === "done") { - td2.textContent = `${row.time.toFixed(3)} s`; - } else if (row.state === "failed") { - td2.textContent = "failed"; - td2.title = row.note || ""; - } else { - td2.textContent = "running"; + tr.appendChild(td1); + const runs = row.runs || [{state: row.state, time: row.time, note: row.note}]; + for (let k = 0; k < 3; k++) { + const td = document.createElement("td"); + td.className = "time"; + const r = runs[k]; + if (!r || r.state === "pending") { + td.textContent = "—"; + td.classList.add("pending"); + } else if (r.state === "done") { + td.textContent = `${r.time.toFixed(3)} s`; + } else if (r.state === "failed") { + td.textContent = "failed"; + td.title = r.note || ""; + } else { + td.textContent = "running"; + } + tr.appendChild(td); } - tr.appendChild(td1); tr.appendChild(td2); fragment.appendChild(tr); } tbody.appendChild(fragment); diff --git a/playground/web/index.html b/playground/web/index.html index b7ba89c585..081667119e 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -4,7 +4,7 @@ ClickBench Playground — run SQL against 90+ databases - +
@@ -64,6 +64,6 @@

ClickBench Playground — run SQL against 90+ databa - + diff --git a/playground/web/style.css b/playground/web/style.css index 520808375c..960b994ce3 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -114,6 +114,7 @@ button.run-all:hover:not(:disabled) { #runall-table tr:hover td { background: #f5f5f5; } #runall-table tr.selected td { background: white; color: var(--fg); font-weight: 600; } #runall-table tr.selected:hover td { background: white; } +#runall-table td.time.pending { color: var(--muted); } /* Default view: #ui-main fills the row, the (hidden) aside takes no space. The .split modifier is added by JS only when competition From 54f28e8084ba18cede36667617f298ef1928aa65 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:26:45 +0000 Subject: [PATCH 140/221] playground: quickwit: detect 'message' errors + queries.json -> queries.sql The error-detection branch in quickwit/query only looked at .error / .status. Quickwit returns the JSON-parse failure as {"message": "expected value at line 1 column 1"} which the old check missed, so the playground recorded the failed query as a success. Add .message + a 'no .took' fallback so any shape of malformed response surfaces as exit 1. Also rename the workload file from queries.json to queries.sql (removing the cosmetic SQL one that was sitting alongside) so the playground UI picks it up via the standard handle_queries path. Quickwit consumes Elasticsearch DSL JSON; the .sql name is just the cross-system convention for the file the playground reads. Co-Authored-By: Claude Opus 4.7 --- quickwit/queries.json | 43 ---------------------- quickwit/queries.sql | 86 +++++++++++++++++++++---------------------- quickwit/query | 16 +++++++- 3 files changed, 57 insertions(+), 88 deletions(-) delete mode 100644 quickwit/queries.json diff --git a/quickwit/queries.json b/quickwit/queries.json deleted file mode 100644 index b7b298d699..0000000000 --- a/quickwit/queries.json +++ /dev/null @@ -1,43 +0,0 @@ -{"size":0,"track_total_hits":true,"query":{"match_all":{}}} -{"size":0,"track_total_hits":true,"query":{"bool":{"must_not":[{"term":{"AdvEngineID":0}}]}}} -{"size":0,"track_total_hits":true,"aggs":{"sum_adv":{"sum":{"field":"AdvEngineID"}},"avg_res":{"avg":{"field":"ResolutionWidth"}}}} -{"size":0,"aggs":{"avg_user":{"avg":{"field":"UserID"}}}} -{"size":0,"aggs":{"u":{"cardinality":{"field":"UserID"}}}} -{"size":0,"aggs":{"u":{"cardinality":{"field":"SearchPhrase"}}}} -{"size":0,"aggs":{"min_date":{"min":{"field":"EventDate"}},"max_date":{"max":{"field":"EventDate"}}}} -{"size":0,"query":{"bool":{"must_not":[{"term":{"AdvEngineID":0}}]}},"aggs":{"by_adv":{"terms":{"field":"AdvEngineID","size":1000,"order":{"_count":"desc"}}}}} -{"size":0,"aggs":{"r":{"terms":{"field":"RegionID","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}} -{"size":0,"aggs":{"r":{"terms":{"field":"RegionID","size":10,"order":{"_count":"desc"}},"aggs":{"sumadv":{"sum":{"field":"AdvEngineID"}},"avgres":{"avg":{"field":"ResolutionWidth"}},"u":{"cardinality":{"field":"UserID"}}}}}} -{"size":0,"query":{"bool":{"must_not":[{"term":{"MobilePhoneModel":""}}]}},"aggs":{"m":{"terms":{"field":"MobilePhoneModel","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}} -{"size":0,"query":{"bool":{"must_not":[{"term":{"MobilePhoneModel":""}}]}},"aggs":{"p":{"terms":{"field":"MobilePhone","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}},"m":{"terms":{"field":"MobilePhoneModel","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}}}} -{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}}}}} -{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}} -{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"se":{"terms":{"field":"SearchEngineID","size":10,"order":{"_count":"desc"}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}}}}}}} -{"size":0,"aggs":{"u":{"terms":{"field":"UserID","size":10,"order":{"_count":"desc"}}}}} -{"size":0,"aggs":{"u":{"terms":{"field":"UserID","size":10,"order":{"_count":"desc"}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}}}}}}} -{"size":0,"aggs":{"u":{"terms":{"field":"UserID","size":10},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10}}}}}} -null -{"size":10,"query":{"term":{"UserID":435090932899640449}}} -{"size":0,"track_total_hits":true,"query":{"wildcard":{"URL":"*google*"}}} -{"size":0,"query":{"bool":{"filter":[{"wildcard":{"URL":"*google*"}}],"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}}}}} -{"size":0,"query":{"bool":{"filter":[{"wildcard":{"Title":"*Google*"}}],"must_not":[{"wildcard":{"URL":"*.google.*"}},{"term":{"SearchPhrase":""}}]}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}} -{"size":10,"query":{"wildcard":{"URL":"*google*"}},"sort":[{"EventTime":"asc"}]} -{"size":10,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"sort":[{"EventTime":"asc"}]} -null -null -null -null -null -{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"se":{"terms":{"field":"SearchEngineID","size":10,"order":{"_count":"desc"}},"aggs":{"ip":{"terms":{"field":"ClientIP","size":10,"order":{"_count":"desc"}},"aggs":{"sumref":{"sum":{"field":"IsRefresh"}},"avgres":{"avg":{"field":"ResolutionWidth"}}}}}}}} -{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"w":{"terms":{"field":"WatchID","size":10,"order":{"_count":"desc"}},"aggs":{"ip":{"terms":{"field":"ClientIP","size":10,"order":{"_count":"desc"}},"aggs":{"sumref":{"sum":{"field":"IsRefresh"}},"avgres":{"avg":{"field":"ResolutionWidth"}}}}}}}} -{"size":0,"aggs":{"w":{"terms":{"field":"WatchID","size":10,"order":{"_count":"desc"}},"aggs":{"ip":{"terms":{"field":"ClientIP","size":10,"order":{"_count":"desc"}},"aggs":{"sumref":{"sum":{"field":"IsRefresh"}},"avgres":{"avg":{"field":"ResolutionWidth"}}}}}}}} -{"size":0,"aggs":{"u":{"terms":{"field":"URL","size":10,"order":{"_count":"desc"}}}}} -{"size":0,"aggs":{"u":{"terms":{"field":"URL","size":10,"order":{"_count":"desc"}}}}} -null -{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"DontCountHits":0}},{"term":{"IsRefresh":0}}],"must_not":[{"term":{"URL":""}}]}},"aggs":{"u":{"terms":{"field":"URL","size":10,"order":{"_count":"desc"}}}}} -{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"DontCountHits":0}},{"term":{"IsRefresh":0}}],"must_not":[{"term":{"Title":""}}]}},"aggs":{"t":{"terms":{"field":"Title","size":10,"order":{"_count":"desc"}}}}} -{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"IsRefresh":0}},{"term":{"IsDownload":0}}],"must_not":[{"term":{"IsLink":0}}]}},"aggs":{"u":{"terms":{"field":"URL","size":1010,"order":{"_count":"desc"}}}}} -null -{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"IsRefresh":0}},{"bool":{"should":[{"term":{"TraficSourceID":-1}},{"term":{"TraficSourceID":6}}]}},{"term":{"RefererHash":3594120000172545465}}]}},"aggs":{"uh":{"terms":{"field":"URLHash","size":110,"order":{"_count":"desc"}},"aggs":{"ed":{"terms":{"field":"EventDate","size":110,"order":{"_count":"desc"}}}}}}} -{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"IsRefresh":0}},{"term":{"DontCountHits":0}},{"term":{"URLHash":2868770270353813622}}]}},"aggs":{"w":{"terms":{"field":"WindowClientWidth","size":10010,"order":{"_count":"desc"}},"aggs":{"h":{"terms":{"field":"WindowClientHeight","size":10010,"order":{"_count":"desc"}}}}}}} -{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-14","lte":"2013-07-15"}}},{"term":{"IsRefresh":0}},{"term":{"DontCountHits":0}}]}},"aggs":{"dt":{"date_histogram":{"field":"EventTime","fixed_interval":"1m"}}}} diff --git a/quickwit/queries.sql b/quickwit/queries.sql index 7d093d057d..b7b298d699 100644 --- a/quickwit/queries.sql +++ b/quickwit/queries.sql @@ -1,43 +1,43 @@ -SELECT COUNT(*) FROM hits; -SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; -SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; -SELECT AVG(UserID) FROM hits; -SELECT COUNT(DISTINCT UserID) FROM hits; -SELECT COUNT(DISTINCT SearchPhrase) FROM hits; -SELECT MIN(EventDate), MAX(EventDate) FROM hits; -SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; -SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; -SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; -SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; -SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; -SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; -SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; -SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; -SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; -SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; -SELECT UserID FROM hits WHERE UserID = 435090932899640449; -SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; -SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; -SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; -SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; -SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; -SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; -SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; -SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; -SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; -SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; -SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; -SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; -SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; -SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; -SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 1010; -SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1010; -SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 110; -SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10010; -SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 1010; +{"size":0,"track_total_hits":true,"query":{"match_all":{}}} +{"size":0,"track_total_hits":true,"query":{"bool":{"must_not":[{"term":{"AdvEngineID":0}}]}}} +{"size":0,"track_total_hits":true,"aggs":{"sum_adv":{"sum":{"field":"AdvEngineID"}},"avg_res":{"avg":{"field":"ResolutionWidth"}}}} +{"size":0,"aggs":{"avg_user":{"avg":{"field":"UserID"}}}} +{"size":0,"aggs":{"u":{"cardinality":{"field":"UserID"}}}} +{"size":0,"aggs":{"u":{"cardinality":{"field":"SearchPhrase"}}}} +{"size":0,"aggs":{"min_date":{"min":{"field":"EventDate"}},"max_date":{"max":{"field":"EventDate"}}}} +{"size":0,"query":{"bool":{"must_not":[{"term":{"AdvEngineID":0}}]}},"aggs":{"by_adv":{"terms":{"field":"AdvEngineID","size":1000,"order":{"_count":"desc"}}}}} +{"size":0,"aggs":{"r":{"terms":{"field":"RegionID","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}} +{"size":0,"aggs":{"r":{"terms":{"field":"RegionID","size":10,"order":{"_count":"desc"}},"aggs":{"sumadv":{"sum":{"field":"AdvEngineID"}},"avgres":{"avg":{"field":"ResolutionWidth"}},"u":{"cardinality":{"field":"UserID"}}}}}} +{"size":0,"query":{"bool":{"must_not":[{"term":{"MobilePhoneModel":""}}]}},"aggs":{"m":{"terms":{"field":"MobilePhoneModel","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}} +{"size":0,"query":{"bool":{"must_not":[{"term":{"MobilePhoneModel":""}}]}},"aggs":{"p":{"terms":{"field":"MobilePhone","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}},"m":{"terms":{"field":"MobilePhoneModel","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}}}} +{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}}}}} +{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"u":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}} +{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"se":{"terms":{"field":"SearchEngineID","size":10,"order":{"_count":"desc"}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}}}}}}} +{"size":0,"aggs":{"u":{"terms":{"field":"UserID","size":10,"order":{"_count":"desc"}}}}} +{"size":0,"aggs":{"u":{"terms":{"field":"UserID","size":10,"order":{"_count":"desc"}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}}}}}}} +{"size":0,"aggs":{"u":{"terms":{"field":"UserID","size":10},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10}}}}}} +null +{"size":10,"query":{"term":{"UserID":435090932899640449}}} +{"size":0,"track_total_hits":true,"query":{"wildcard":{"URL":"*google*"}}} +{"size":0,"query":{"bool":{"filter":[{"wildcard":{"URL":"*google*"}}],"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}}}}} +{"size":0,"query":{"bool":{"filter":[{"wildcard":{"Title":"*Google*"}}],"must_not":[{"wildcard":{"URL":"*.google.*"}},{"term":{"SearchPhrase":""}}]}},"aggs":{"sp":{"terms":{"field":"SearchPhrase","size":10,"order":{"_count":"desc"}},"aggs":{"u":{"cardinality":{"field":"UserID"}}}}}} +{"size":10,"query":{"wildcard":{"URL":"*google*"}},"sort":[{"EventTime":"asc"}]} +{"size":10,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"sort":[{"EventTime":"asc"}]} +null +null +null +null +null +{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"se":{"terms":{"field":"SearchEngineID","size":10,"order":{"_count":"desc"}},"aggs":{"ip":{"terms":{"field":"ClientIP","size":10,"order":{"_count":"desc"}},"aggs":{"sumref":{"sum":{"field":"IsRefresh"}},"avgres":{"avg":{"field":"ResolutionWidth"}}}}}}}} +{"size":0,"query":{"bool":{"must_not":[{"term":{"SearchPhrase":""}}]}},"aggs":{"w":{"terms":{"field":"WatchID","size":10,"order":{"_count":"desc"}},"aggs":{"ip":{"terms":{"field":"ClientIP","size":10,"order":{"_count":"desc"}},"aggs":{"sumref":{"sum":{"field":"IsRefresh"}},"avgres":{"avg":{"field":"ResolutionWidth"}}}}}}}} +{"size":0,"aggs":{"w":{"terms":{"field":"WatchID","size":10,"order":{"_count":"desc"}},"aggs":{"ip":{"terms":{"field":"ClientIP","size":10,"order":{"_count":"desc"}},"aggs":{"sumref":{"sum":{"field":"IsRefresh"}},"avgres":{"avg":{"field":"ResolutionWidth"}}}}}}}} +{"size":0,"aggs":{"u":{"terms":{"field":"URL","size":10,"order":{"_count":"desc"}}}}} +{"size":0,"aggs":{"u":{"terms":{"field":"URL","size":10,"order":{"_count":"desc"}}}}} +null +{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"DontCountHits":0}},{"term":{"IsRefresh":0}}],"must_not":[{"term":{"URL":""}}]}},"aggs":{"u":{"terms":{"field":"URL","size":10,"order":{"_count":"desc"}}}}} +{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"DontCountHits":0}},{"term":{"IsRefresh":0}}],"must_not":[{"term":{"Title":""}}]}},"aggs":{"t":{"terms":{"field":"Title","size":10,"order":{"_count":"desc"}}}}} +{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"IsRefresh":0}},{"term":{"IsDownload":0}}],"must_not":[{"term":{"IsLink":0}}]}},"aggs":{"u":{"terms":{"field":"URL","size":1010,"order":{"_count":"desc"}}}}} +null +{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"IsRefresh":0}},{"bool":{"should":[{"term":{"TraficSourceID":-1}},{"term":{"TraficSourceID":6}}]}},{"term":{"RefererHash":3594120000172545465}}]}},"aggs":{"uh":{"terms":{"field":"URLHash","size":110,"order":{"_count":"desc"}},"aggs":{"ed":{"terms":{"field":"EventDate","size":110,"order":{"_count":"desc"}}}}}}} +{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-01","lte":"2013-07-31"}}},{"term":{"IsRefresh":0}},{"term":{"DontCountHits":0}},{"term":{"URLHash":2868770270353813622}}]}},"aggs":{"w":{"terms":{"field":"WindowClientWidth","size":10010,"order":{"_count":"desc"}},"aggs":{"h":{"terms":{"field":"WindowClientHeight","size":10010,"order":{"_count":"desc"}}}}}}} +{"size":0,"query":{"bool":{"filter":[{"term":{"CounterID":62}},{"range":{"EventDate":{"gte":"2013-07-14","lte":"2013-07-15"}}},{"term":{"IsRefresh":0}},{"term":{"DontCountHits":0}}]}},"aggs":{"dt":{"date_histogram":{"field":"EventTime","fixed_interval":"1m"}}}} diff --git a/quickwit/query b/quickwit/query index 0a7eb7d05e..8674401de0 100755 --- a/quickwit/query +++ b/quickwit/query @@ -19,11 +19,23 @@ resp=$(curl -sS -X POST \ -d "$query" \ http://localhost:7280/api/v1/_elastic/hits/_search) -took=$(printf '%s' "$resp" | jq -r 'if has("error") or has("status") then empty else (.took | tostring) end') -if [ -z "$took" ]; then +# Treat the response as an error if it carries any of the known +# error keys (.error / .status / .message) OR if it has no .took +# at all. The previous check looked only at .error/.status — a +# malformed-body response like +# {"message":"expected value at line 1 column 1"} +# slipped through as success. +err=$(printf '%s' "$resp" | jq -r ' + if has("error") then "error: " + (.error|tostring) + elif has("status") then "status: " + (.status|tostring) + elif has("message") then "message: " + (.message|tostring) + elif (.took // null) == null then "no .took in response" + else empty end') +if [ -n "$err" ]; then printf '%s\n' "$resp" >&2 exit 1 fi +took=$(printf '%s' "$resp" | jq -r '.took') printf '%s\n' "$resp" From d53d960e5d2a0968a92c61183c70fe4505a0832b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:28:15 +0000 Subject: [PATCH 141/221] playground: web: sort by best-so-far time, not only when all 3 rounds are in Previously a system stayed in the 'running' bucket (alphabetical order) until all three rounds completed. Track s.bestTime as min(completed runs) and place any row with at least one done run into the timed-and-sorted group; the row keeps its overall state ('running' until round 3 lands) but its position reflects the fastest run we've seen so far. Co-Authored-By: Claude Opus 4.7 --- playground/web/app.js | 20 +++++++++++++++----- playground/web/index.html | 2 +- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index 754d29c7ed..a275e2e279 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -563,13 +563,20 @@ async function runAll() { s.query = query; } // Overall: failed if any run failed; done when all 3 are done; - // running otherwise. + // running otherwise. bestTime tracks the min of whatever runs + // have completed so far so partial-done systems can sort into + // the done group instead of sitting in running until the last + // round lands. + const doneRuns = s.runs.filter(r => r.state === "done"); + s.bestTime = doneRuns.length + ? Math.min(...doneRuns.map(r => r.time)) + : undefined; if (s.runs.some(r => r.state === "failed")) { s.state = "failed"; s.note = s.runs.find(r => r.state === "failed").note; } else if (s.runs.every(r => r.state === "done")) { s.state = "done"; - s.time = Math.min(...s.runs.map(r => r.time)); + s.time = s.bestTime; } else { s.state = "running"; } @@ -639,11 +646,14 @@ function _runAllRowKey(s) { function renderRunAll(status) { const done = [], failed = [], running = []; for (const [name, s] of Object.entries(status)) { - if (s.state === "done") done.push({name, ...s}); - else if (s.state === "failed") failed.push({name, ...s}); + if (s.state === "failed") failed.push({name, ...s}); + // Include partial-done systems in the timed group so the + // table sorts as soon as the first run lands rather than + // waiting for all three rounds. + else if (s.bestTime != null) done.push({name, ...s}); else running.push({name, ...s}); } - done.sort((a, b) => a.time - b.time); + done.sort((a, b) => a.bestTime - b.bestTime); failed.sort((a, b) => a.name.localeCompare(b.name)); running.sort((a, b) => a.name.localeCompare(b.name)); // Diff against the previous render so only rows whose status diff --git a/playground/web/index.html b/playground/web/index.html index 081667119e..675cdfbf8e 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -64,6 +64,6 @@

ClickBench Playground — run SQL against 90+ databa - + From 487a41bf3d7e7965d94a305a591b470052b9d111 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:29:25 +0000 Subject: [PATCH 142/221] playground: tidb: --tag clickbench so the data dir persists across restarts tiup playground generates a fresh data dir per invocation if no --tag is given. Pre-snapshot stop killed the loaded cluster and the subsequent pre-snapshot start spun up a brand new one; the snapshot captured the empty replacement and queries against the restored VM returned 'Table test.hits doesn't exist'. Pin --tag clickbench so the load survives stop/start. Co-Authored-By: Claude Opus 4.7 --- tidb/start | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tidb/start b/tidb/start index d0465e97ca..a77860232c 100755 --- a/tidb/start +++ b/tidb/start @@ -22,7 +22,8 @@ else NUM_TIFLASH_INSTANCES=1 fi -nohup tiup playground "$TIDBVERSION" --db 1 --pd 1 --kv 1 \ +nohup tiup playground "$TIDBVERSION" --tag clickbench \ + --db 1 --pd 1 --kv 1 \ --tiflash $NUM_TIFLASH_INSTANCES \ --db.config "$DB_CONFIG_FILE" \ --without-monitor > tiup-cluster.out 2>&1 & From 597d7cea5248fdd7a12ca80ef56132e205ae2748 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:36:38 +0000 Subject: [PATCH 143/221] playground: web: 'Run all' also works for custom textarea queries Previously the button only showed when an example index was picked, and refused to fire otherwise. Now it shows whenever either condition holds: example picked -> each system runs its OWN translation of that example (apples-to-apples ClickBench format) custom textarea -> the exact string runs against every system; incompatible engines just land in the failed bucket. The textarea input handler refreshes button visibility on every keystroke so the button hides cleanly once you delete the query. Co-Authored-By: Claude Opus 4.7 --- playground/web/app.js | 29 ++++++++++++++++++++++------- playground/web/index.html | 4 ++-- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index a275e2e279..e3b65a12f2 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -365,8 +365,8 @@ exampleSel.addEventListener("change", () => { queryEl.addEventListener("input", () => { if (queryEl.value !== pristineQuery) { exampleSel.value = ""; - refreshRunAllVisibility(); } + refreshRunAllVisibility(); }); queryEl.addEventListener("keydown", (e) => { if ((e.metaKey || e.ctrlKey) && e.key === "Enter") runQuery(); @@ -431,8 +431,12 @@ const runAllSection = $("#ui-runall"); const runAllTable = $("#runall-table"); function refreshRunAllVisibility() { - const v = exampleSel.value; - runAllBtn.style.display = (v === "" || isNaN(parseInt(v, 10))) ? "none" : ""; + // Always available when there's *something* to run: a picked + // example index OR a non-empty custom query in the textarea. + const haveExample = exampleSel.value !== "" + && !isNaN(parseInt(exampleSel.value, 10)); + const haveCustom = queryEl.value.trim() !== ""; + runAllBtn.style.display = (haveExample || haveCustom) ? "" : "none"; } async function ensureQueriesLoaded(name) { @@ -460,20 +464,31 @@ window.addEventListener("resize", _measureSplitOffset); async function runAll() { const idx = parseInt(exampleSel.value, 10); - if (isNaN(idx)) return; + const useExampleIndex = !isNaN(idx); + const customQuery = queryEl.value; + if (!useExampleIndex && !customQuery.trim()) return; runAllBtn.disabled = true; runAllSection.style.display = ""; uiSplit.classList.add("split"); _measureSplitOffset(); runAllSection.focus(); - // Collect snapshotted/ready systems with an example at this index. + // Collect candidate systems. With an example picked, each system + // runs its OWN translation of the example at the same index + // (the apples-to-apples ClickBench format). With a custom query + // in the textarea, every system runs the exact same string — + // the systems whose query language doesn't accept it will just + // show up in the failed bucket. const candidates = Object.values(stateByName) .filter(s => s.state === "snapshotted" || s.state === "ready"); const targets = []; for (const s of candidates) { - const qs = await ensureQueriesLoaded(s.name); - if (qs && idx < qs.length) targets.push({name: s.name, query: qs[idx]}); + if (useExampleIndex) { + const qs = await ensureQueriesLoaded(s.name); + if (qs && idx < qs.length) targets.push({name: s.name, query: qs[idx]}); + } else { + targets.push({name: s.name, query: customQuery}); + } } const status = {}; diff --git a/playground/web/index.html b/playground/web/index.html index 675cdfbf8e..41b767ed56 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -21,7 +21,7 @@

ClickBench Playground — run SQL against 90+ databa + title="Run this query on every snapshotted system in parallel — per-system translation if an example is picked, exact text otherwise">Run all
@@ -64,6 +64,6 @@

ClickBench Playground — run SQL against 90+ databa

- + From 370f7e15783e5f2a02fc66618efae535d03f997c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:39:46 +0000 Subject: [PATCH 144/221] elasticsearch: wait for shards to recover before declaring ready Post-snapshot-restore, ES responds on :9200 but shards are still recovering. Queries land before allocation completes and fail with no_shard_available_action_exception (status 503). Make start/check poll _cluster/health/hits and require active shards before returning success. Co-Authored-By: Claude Opus 4.7 --- elasticsearch/check | 12 ++++++++++++ elasticsearch/start | 45 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/elasticsearch/check b/elasticsearch/check index ca1249471d..fe2d8029fe 100755 --- a/elasticsearch/check +++ b/elasticsearch/check @@ -1,4 +1,16 @@ #!/bin/bash +# Ready iff HTTP is up AND the `hits` index has at least one active +# shard. The previous version only checked HTTP, which let post-restore +# queries hit a still-recovering cluster and return +# no_shard_available_action_exception (status 503). set -e curl -sSf 'http://localhost:9200' >/dev/null + +# `hits` may not exist yet on a freshly provisioned VM (load hasn't +# run). In that case HTTP up is enough. +if curl -sSf 'http://localhost:9200/hits' >/dev/null 2>&1; then + body=$(curl -sS 'http://localhost:9200/_cluster/health/hits' 2>/dev/null) + printf '%s' "$body" | grep -Eq '"status":"(yellow|green)"' + printf '%s' "$body" | grep -Eq '"active_shards":[1-9]' +fi diff --git a/elasticsearch/start b/elasticsearch/start index 8fa3183341..f9c7633921 100755 --- a/elasticsearch/start +++ b/elasticsearch/start @@ -1,7 +1,46 @@ #!/bin/bash +# Idempotent. Returns only when: +# 1) HTTP is up on :9200, +# 2) the cluster has reached at least yellow, +# 3) the `hits` index (if it exists) reports active shards. +# +# Post-snapshot-restore symptom we're guarding against: +# /_search returns 503 no_shard_available_action_exception while ES is +# still recovering shards. We poll _cluster/health and refuse to return +# success until shards are actually assigned. set -eu -if curl -sSf 'http://localhost:9200' >/dev/null 2>&1; then - exit 0 +if ! curl -sSf 'http://localhost:9200' >/dev/null 2>&1; then + sudo systemctl start elasticsearch.service +fi + +# Wait for HTTP up. +for _ in $(seq 1 60); do + curl -sSf 'http://localhost:9200' >/dev/null 2>&1 && break + sleep 2 +done + +# Wait for cluster to reach at least yellow. +# wait_for_status blocks server-side up to timeout; loop in case it +# returns timed_out=true. +for _ in $(seq 1 30); do + if curl -sSf \ + 'http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=30s' \ + 2>/dev/null | grep -Eq '"status":"(yellow|green)"'; then + break + fi + sleep 2 +done + +# If `hits` exists, wait until its primary shards are active. +if curl -sSf 'http://localhost:9200/hits' >/dev/null 2>&1; then + for _ in $(seq 1 60); do + body=$(curl -sS 'http://localhost:9200/_cluster/health/hits?wait_for_status=yellow&wait_for_active_shards=1&timeout=10s' 2>/dev/null || true) + if printf '%s' "$body" | grep -Eq '"status":"(yellow|green)"'; then + if printf '%s' "$body" | grep -Eq '"active_shards":[1-9]'; then + break + fi + fi + sleep 2 + done fi -sudo systemctl start elasticsearch.service From c06013732173397c083e20a72a897e48fae5da94 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:42:47 +0000 Subject: [PATCH 145/221] playground: web: right-align 'Run all' in the buttons row margin-left:auto pushes it opposite Run/Example instead of sitting 8px to the right of the select. Co-Authored-By: Claude Opus 4.7 --- playground/web/style.css | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/playground/web/style.css b/playground/web/style.css index 960b994ce3..ba1dbcf0bc 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -78,14 +78,16 @@ button { button:disabled { opacity: 0.6; cursor: not-allowed; } button:hover:not(:disabled) { filter: brightness(0.95); } -/* "Run all" — faint secondary button next to the example selector. */ +/* "Run all" — faint secondary button, pushed to the right end of the + buttons row via margin-left:auto so it sits opposite the primary + Run/Example controls. */ button.run-all { background: transparent; color: var(--muted); border-color: var(--muted); font-weight: normal; padding: 4px 10px; - margin-left: 8px; + margin-left: auto; } button.run-all:hover:not(:disabled) { background: var(--accent); From bbae7fc2b9a7ae802079f2418a9eb37cf61e78b7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 13:54:57 +0000 Subject: [PATCH 146/221] playground: web: hide competition panel on example pick / Run / edit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Once the user pivots back to single-system work — picking a new example, pressing Run query, or editing the textarea — the competition rail should collapse so the right pane retakes the full width. They can re-open it with Run all. Co-Authored-By: Claude Opus 4.7 --- playground/web/app.js | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/playground/web/app.js b/playground/web/app.js index e3b65a12f2..13ac829e9e 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -276,6 +276,8 @@ async function runQuery() { if (!selected) return; const sql = queryEl.value; if (!sql.trim()) return; + // Running a single query takes us out of competition view. + hideRunAll(); runBtn.disabled = true; outEl.textContent = "(running …)"; timeEl.textContent = "…"; @@ -356,6 +358,7 @@ function applyCurrentExample() { exampleSel.addEventListener("change", () => { applyCurrentExample(); refreshRunAllVisibility(); + hideRunAll(); }); // When the user types in the textarea, mark the select as // "unselected" (the disabled placeholder option). That way a @@ -367,6 +370,7 @@ queryEl.addEventListener("input", () => { exampleSel.value = ""; } refreshRunAllVisibility(); + hideRunAll(); }); queryEl.addEventListener("keydown", (e) => { if ((e.metaKey || e.ctrlKey) && e.key === "Enter") runQuery(); @@ -439,6 +443,16 @@ function refreshRunAllVisibility() { runAllBtn.style.display = (haveExample || haveCustom) ? "" : "none"; } +function hideRunAll() { + // Picking a different example / editing the query / pressing Run + // are all signals that the user's attention has moved off the + // competition rail. Collapse it so the right pane retakes the + // full width. + if (runAllSection.style.display === "none") return; + runAllSection.style.display = "none"; + uiSplit.classList.remove("split"); +} + async function ensureQueriesLoaded(name) { if (queriesByName[name]) return queriesByName[name]; try { From cffa226c38420a9b99d77e111f1eca4aed726fff Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 14:06:57 +0000 Subject: [PATCH 147/221] gizmosql: disable non-TTY truncation so rows are printed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gizmosql_client is a DuckDB-cli fork. With stdout piped, DuckDB-cli truncates any table taller than the default page to a " rows ( columns)" summary — even under .mode box. Setting .maxrows -1 and .maxwidth 0 disables both axes of truncation so the user sees the actual rows. Co-Authored-By: Claude Opus 4.7 --- gizmosql/query | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gizmosql/query b/gizmosql/query index f1445e767e..9dc83e0a7c 100755 --- a/gizmosql/query +++ b/gizmosql/query @@ -15,7 +15,13 @@ query=$(cat) # original ClickBench runner used `.mode trash` to discard rows # for clean timing, but the user wants to see output here. The # Run Time line still parses fine. -script=$(printf '.timer on\n.mode box\n%s\n' "$query") +# +# .maxrows -1 / .maxwidth 0: gizmosql_client is a DuckDB-cli fork +# and inherits its non-TTY truncation behavior, which collapses any +# table taller than the default page to a " rows ( columns)" +# summary line. Disable both row and column truncation so the user +# sees the whole result. +script=$(printf '.timer on\n.maxrows -1\n.maxwidth 0\n.mode box\n%s\n' "$query") raw=$(printf '%s' "$script" | gizmosql_client 2>&1) && exit_code=0 || exit_code=$? From b14e2cc028ecbce33aeed35ab5988b6a1224d841 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 14:21:57 +0000 Subject: [PATCH 148/221] playground: per-system VM RAM override; bump umbra to 96 GiB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even with 256 GiB of attached swap, Umbra keeps tripping on 'psql:create.sql:109: ERROR: unable to allocate memory' partway through the COPY — the thrash returns ENOMEM from inside the container before the guest kernel can page out. The host has 1 TiB, so giving Umbra a 96 GiB VM keeps the working set resident. Adds MEM_OVERRIDES_MIB so future engines with similar needs are a one-line edit. Co-Authored-By: Claude Opus 4.7 --- playground/server/systems.py | 12 ++++++++++++ playground/server/vm_manager.py | 8 ++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/playground/server/systems.py b/playground/server/systems.py index c6a3e3d71c..7428e4e366 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -115,6 +115,18 @@ # partitioned-parquet set. SWAP_SIZE_GB: int = 256 +# Per-system VM RAM override (MiB). The default cfg.vm_mem_mib is 16 GiB, +# which is enough for almost every engine when paired with the swap +# drive above — but Umbra keeps tripping on `psql:create.sql:109: +# ERROR: unable to allocate memory` partway through the COPY even with +# 256 GiB of swap attached. The thrash is so heavy that allocations +# from inside the docker container return ENOMEM before the guest +# kernel can swap pages out. Giving Umbra a bigger RAM allotment keeps +# the working set resident; the host has 1 TiB so a 96 GiB VM is cheap. +MEM_OVERRIDES_MIB: dict[str, int] = { + "umbra": 96 * 1024, +} + @dataclass(frozen=True) class System: diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index d8f32d9a25..b060f5f959 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -35,7 +35,7 @@ from . import firecracker as fc from . import net -from .systems import NEEDS_SWAP, SWAP_SIZE_GB +from .systems import MEM_OVERRIDES_MIB, NEEDS_SWAP, SWAP_SIZE_GB from .config import Config from .systems import System, TRUSTED_INTERNET, DATALAKE_FILTERED @@ -492,7 +492,11 @@ async def _configure_boot(self, vm: VM, *, restore_snapshot: bool) -> None: }) await fc.put(sock, "/machine-config", { "vcpu_count": self.cfg.vm_vcpus, - "mem_size_mib": self.cfg.vm_mem_mib, + # Per-system override for memory-hungry engines (Umbra) that + # ENOMEM out even with the per-VM swap drive. See + # MEM_OVERRIDES_MIB in systems.py. + "mem_size_mib": MEM_OVERRIDES_MIB.get( + vm.system.name, self.cfg.vm_mem_mib), "smt": False, }) await fc.put(sock, "/actions", {"action_type": "InstanceStart"}) From 67def65d3d16f9aff4e6e5649e44dbe7629d2e66 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 14:25:19 +0000 Subject: [PATCH 149/221] trino-datalake{,-partitioned}: pin trino:455, restore hadoop-S3 + anonymous shim MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit trino:latest no longer ships the legacy hadoop-S3 plugin (removed in v461). The replacement native-S3 filesystem has no anonymous-creds mode, so it can't read the public clickhouse-public-datasets bucket even with region/endpoint set — the URI is rejected outright with 'External location is not a valid file system URI: s3://...'. Pin trino:455 (last release with hadoop-S3) and restore the fs.hadoop.enabled=true + S3AnonymousProvider shim path that was working until the recent :latest bump. Co-Authored-By: Claude Opus 4.7 --- trino-datalake-partitioned/install | 8 ++++-- trino-datalake-partitioned/start | 2 +- trino-datalake/install | 39 +++++++++++++++++++++--------- trino-datalake/start | 2 +- 4 files changed, 36 insertions(+), 15 deletions(-) diff --git a/trino-datalake-partitioned/install b/trino-datalake-partitioned/install index 10e0b96135..0a83f90478 100755 --- a/trino-datalake-partitioned/install +++ b/trino-datalake-partitioned/install @@ -14,7 +14,7 @@ if ! command -v docker >/dev/null 2>&1; then fi sudo apt-get install -y bc -sudo docker pull trinodb/trino:latest +sudo docker pull trinodb/trino:455 mkdir -p data/meta etc/catalog shim sudo chown 1000:1000 data/meta shim @@ -38,7 +38,7 @@ EOF if [ ! -f shim/S3AnonymousProvider.jar ]; then sudo docker run --rm \ -v "$PWD/shim:/shim" \ - --entrypoint sh trinodb/trino:latest -c ' + --entrypoint sh trinodb/trino:455 -c ' set -e cd /shim CP=$(find /usr/lib/trino \( -name "*aws-java-sdk*.jar" -o -name "*hadoop-apache*.jar" \) 2>/dev/null | tr "\n" ":") @@ -58,6 +58,10 @@ hive.metastore=file hive.metastore.catalog.dir=local:///meta local.location=/data fs.native-local.enabled=true +# Pin to trino:455 (last release shipping the legacy hadoop-S3 plugin +# with a custom AWSCredentialsProvider). 461+ removed it, and the new +# fs.native-s3 has no anonymous-creds mode so we can't read the public +# bucket without it. fs.hadoop.enabled=true hive.config.resources=/etc/trino/core-site.xml hive.non-managed-table-writes-enabled=true diff --git a/trino-datalake-partitioned/start b/trino-datalake-partitioned/start index 78c6d033f7..5e66109ab9 100755 --- a/trino-datalake-partitioned/start +++ b/trino-datalake-partitioned/start @@ -15,4 +15,4 @@ sudo docker run -d --name trino \ -v "$PWD/etc/core-site.xml:/etc/trino/core-site.xml:ro" \ -v "$PWD/data/meta:/data/meta" \ -v "$PWD/shim/S3AnonymousProvider.jar:/usr/lib/trino/plugin/hive/hdfs/S3AnonymousProvider.jar:ro" \ - trinodb/trino:latest + trinodb/trino:455 diff --git a/trino-datalake/install b/trino-datalake/install index 3381c55966..d28cd9255a 100755 --- a/trino-datalake/install +++ b/trino-datalake/install @@ -14,7 +14,7 @@ if ! command -v docker >/dev/null 2>&1; then fi sudo apt-get install -y bc -sudo docker pull trinodb/trino:latest +sudo docker pull trinodb/trino:455 mkdir -p data/meta etc/catalog shim sudo chown 1000:1000 data/meta shim @@ -38,7 +38,7 @@ EOF if [ ! -f shim/S3AnonymousProvider.jar ]; then sudo docker run --rm \ -v "$PWD/shim:/shim" \ - --entrypoint sh trinodb/trino:latest -c ' + --entrypoint sh trinodb/trino:455 -c ' set -e cd /shim CP=$(find /usr/lib/trino \( -name "*aws-java-sdk*.jar" -o -name "*hadoop-apache*.jar" \) 2>/dev/null | tr "\n" ":") @@ -58,14 +58,31 @@ hive.metastore=file hive.metastore.catalog.dir=local:///meta local.location=/data fs.native-local.enabled=true -# trino:latest dropped the legacy hadoop-S3 plugin, which is why -# fs.hadoop.enabled=true + a custom AWSCredentialsProvider shim no -# longer worked: 'External location is not a valid file system URI: -# s3://...'. Switch to the native S3 file system; the bucket is -# public so we configure it to skip the AWS signing flow entirely. -fs.native-s3.enabled=true -s3.region=eu-central-1 -s3.endpoint=https://s3.eu-central-1.amazonaws.com -s3.path-style-access=false +# trino:455 still ships the legacy hadoop-S3 file system. Newer +# trino releases (461+) dropped it and fs.native-s3.enabled=true +# doesn't have an anonymous-credentials mode, so we'd be unable to +# read the public clickhouse-public-datasets bucket. Pin trino:455 +# in install/start until upstream re-adds anonymous-S3 to the native +# filesystem. +fs.hadoop.enabled=true +hive.config.resources=/etc/trino/core-site.xml hive.non-managed-table-writes-enabled=true EOF + +cat > etc/core-site.xml <<'EOF' + + + + trino.s3.credentials-provider + S3AnonymousProvider + + + trino.s3.endpoint + https://s3.eu-central-1.amazonaws.com + + + trino.s3.region + eu-central-1 + + +EOF diff --git a/trino-datalake/start b/trino-datalake/start index 78c6d033f7..5e66109ab9 100755 --- a/trino-datalake/start +++ b/trino-datalake/start @@ -15,4 +15,4 @@ sudo docker run -d --name trino \ -v "$PWD/etc/core-site.xml:/etc/trino/core-site.xml:ro" \ -v "$PWD/data/meta:/data/meta" \ -v "$PWD/shim/S3AnonymousProvider.jar:/usr/lib/trino/plugin/hive/hdfs/S3AnonymousProvider.jar:ro" \ - trinodb/trino:latest + trinodb/trino:455 From 1d1c476814e737ef06303b57299450783244be05 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 14:34:24 +0000 Subject: [PATCH 150/221] umbra: lean on the swap disk instead of a privileged RAM allotment The recent 16 -> 96 GiB override was unfair to every other engine. Revert it; do what we did for the dataframe systems instead: - drop the docker --memory=128g cap (raise it to --memory=256g to allow swap-backed growth), keep --memory-swap=-1, add --memory-swappiness=100 so the cgroup pages out anon memory aggressively the moment we exceed physical RAM - flip the guest's vm.overcommit_memory to 1 and vm.swappiness to 100 inside ./start so the kernel stops refusing the large mmap requests Umbra issues during COPY Removes MEM_OVERRIDES_MIB and the vm_manager plumbing for it. Co-Authored-By: Claude Opus 4.7 --- playground/server/systems.py | 12 ------------ playground/server/vm_manager.py | 8 ++------ umbra/start | 15 ++++++++++++++- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/playground/server/systems.py b/playground/server/systems.py index 7428e4e366..c6a3e3d71c 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -115,18 +115,6 @@ # partitioned-parquet set. SWAP_SIZE_GB: int = 256 -# Per-system VM RAM override (MiB). The default cfg.vm_mem_mib is 16 GiB, -# which is enough for almost every engine when paired with the swap -# drive above — but Umbra keeps tripping on `psql:create.sql:109: -# ERROR: unable to allocate memory` partway through the COPY even with -# 256 GiB of swap attached. The thrash is so heavy that allocations -# from inside the docker container return ENOMEM before the guest -# kernel can swap pages out. Giving Umbra a bigger RAM allotment keeps -# the working set resident; the host has 1 TiB so a 96 GiB VM is cheap. -MEM_OVERRIDES_MIB: dict[str, int] = { - "umbra": 96 * 1024, -} - @dataclass(frozen=True) class System: diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index b060f5f959..d8f32d9a25 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -35,7 +35,7 @@ from . import firecracker as fc from . import net -from .systems import MEM_OVERRIDES_MIB, NEEDS_SWAP, SWAP_SIZE_GB +from .systems import NEEDS_SWAP, SWAP_SIZE_GB from .config import Config from .systems import System, TRUSTED_INTERNET, DATALAKE_FILTERED @@ -492,11 +492,7 @@ async def _configure_boot(self, vm: VM, *, restore_snapshot: bool) -> None: }) await fc.put(sock, "/machine-config", { "vcpu_count": self.cfg.vm_vcpus, - # Per-system override for memory-hungry engines (Umbra) that - # ENOMEM out even with the per-VM swap drive. See - # MEM_OVERRIDES_MIB in systems.py. - "mem_size_mib": MEM_OVERRIDES_MIB.get( - vm.system.name, self.cfg.vm_mem_mib), + "mem_size_mib": self.cfg.vm_mem_mib, "smt": False, }) await fc.put(sock, "/actions", {"action_type": "InstanceStart"}) diff --git a/umbra/start b/umbra/start index f0d5b30b75..a15989df25 100755 --- a/umbra/start +++ b/umbra/start @@ -8,14 +8,27 @@ fi sudo docker stop umbradb >/dev/null 2>&1 || true sudo docker rm umbradb >/dev/null 2>&1 || true +# Umbra's working set during the ClickBench COPY blows well past the +# guest VM's 16 GiB RAM. The agent has already mkswap'd + swapon'd a +# 256 GiB swap.raw block device — we just need to make sure (a) the +# guest kernel is willing to overcommit anonymous mappings well past +# physical RAM (overcommit_memory=1; default 0 is heuristic and +# refuses sufficiently large mmaps), and (b) the docker cgroup +# doesn't sit between Umbra and the swap. No --memory / --memory-swap +# cap so the container can grow into the available RAM+swap, and +# --memory-swappiness=100 so the kernel pages out anonymous memory +# aggressively the moment we exceed physical RAM. +sudo sysctl -wq vm.overcommit_memory=1 vm.swappiness=100 || true + sudo docker run -d --name umbradb \ -v "$(pwd)/db:/var/db" \ -v "$(pwd)/data:/data" \ -p 5432:5432 \ --ulimit nofile=1048576:1048576 \ --ulimit memlock=8388608:8388608 \ - --memory=128g \ + --memory=256g \ --memory-swap=-1 \ + --memory-swappiness=100 \ umbradb/umbra:latest >/dev/null # Container needs a moment before psql can connect. From 9b1414977045bbc7d4865b5c864429df1e43bda4 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 14:35:13 +0000 Subject: [PATCH 151/221] trino-datalake{,-partitioned}: widen shim classpath after :455 pin The AWS-SDK + Hadoop-jar name-glob no longer matches anything in trino:455 (the dependency tree shifted between releases), so the S3AnonymousProvider compile dies with 'package com.amazonaws.auth does not exist'. Always use the full /usr/lib/trino/**/*.jar classpath; the shim has no class-name collisions to worry about. Co-Authored-By: Claude Opus 4.7 --- trino-datalake-partitioned/install | 14 ++++++-------- trino-datalake/install | 14 ++++++-------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/trino-datalake-partitioned/install b/trino-datalake-partitioned/install index 0a83f90478..bd0798b1e4 100755 --- a/trino-datalake-partitioned/install +++ b/trino-datalake-partitioned/install @@ -33,20 +33,18 @@ public class S3AnonymousProvider implements AWSCredentialsProvider { } EOF -# Compile the shim against the AWS SDK + Hadoop jars bundled in the trino -# image. Target Java 11 bytecode for portability. Skip if already built. +# Compile the shim against every jar under /usr/lib/trino. The narrower +# AWS-SDK + Hadoop name-glob we used to rely on stopped matching after +# pinning trino:455 (the dependency layout shifted between releases), +# producing "package com.amazonaws.auth does not exist" errors. Casting +# the classpath net at every jar avoids guessing at upstream renames. if [ ! -f shim/S3AnonymousProvider.jar ]; then sudo docker run --rm \ -v "$PWD/shim:/shim" \ --entrypoint sh trinodb/trino:455 -c ' set -e cd /shim - CP=$(find /usr/lib/trino \( -name "*aws-java-sdk*.jar" -o -name "*hadoop-apache*.jar" \) 2>/dev/null | tr "\n" ":") - if [ -z "$CP" ]; then - # Last resort: every jar under trino. Order matters for classpath - # only in case of duplicate classes, which we do not have here. - CP=$(find /usr/lib/trino -name "*.jar" 2>/dev/null | tr "\n" ":") - fi + CP=$(find /usr/lib/trino -name "*.jar" 2>/dev/null | tr "\n" ":") javac --release 11 -cp "$CP" S3AnonymousProvider.java jar cf S3AnonymousProvider.jar S3AnonymousProvider.class ' diff --git a/trino-datalake/install b/trino-datalake/install index d28cd9255a..9f615c477d 100755 --- a/trino-datalake/install +++ b/trino-datalake/install @@ -33,20 +33,18 @@ public class S3AnonymousProvider implements AWSCredentialsProvider { } EOF -# Compile the shim against the AWS SDK + Hadoop jars bundled in the trino -# image. Target Java 11 bytecode for portability. Skip if already built. +# Compile the shim against every jar under /usr/lib/trino. The narrower +# AWS-SDK + Hadoop name-glob we used to rely on stopped matching after +# pinning trino:455 (the dependency layout shifted between releases), +# producing "package com.amazonaws.auth does not exist" errors. Casting +# the classpath net at every jar avoids guessing at upstream renames. if [ ! -f shim/S3AnonymousProvider.jar ]; then sudo docker run --rm \ -v "$PWD/shim:/shim" \ --entrypoint sh trinodb/trino:455 -c ' set -e cd /shim - CP=$(find /usr/lib/trino \( -name "*aws-java-sdk*.jar" -o -name "*hadoop-apache*.jar" \) 2>/dev/null | tr "\n" ":") - if [ -z "$CP" ]; then - # Last resort: every jar under trino. Order matters for classpath - # only in case of duplicate classes, which we do not have here. - CP=$(find /usr/lib/trino -name "*.jar" 2>/dev/null | tr "\n" ":") - fi + CP=$(find /usr/lib/trino -name "*.jar" 2>/dev/null | tr "\n" ":") javac --release 11 -cp "$CP" S3AnonymousProvider.java jar cf S3AnonymousProvider.jar S3AnonymousProvider.class ' From 6f06afa40d4fa9ca130510e78c423132b49833f9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 14:43:27 +0000 Subject: [PATCH 152/221] playground: web: keep competition rail open while editing the query Editing the textarea no longer collapses the rail; only picking another example or pressing Run does. The user wants to be able to tweak the query in place and still see the per-system leaderboard. Co-Authored-By: Claude Opus 4.7 --- playground/web/app.js | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index 13ac829e9e..cbac2968a4 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -370,7 +370,6 @@ queryEl.addEventListener("input", () => { exampleSel.value = ""; } refreshRunAllVisibility(); - hideRunAll(); }); queryEl.addEventListener("keydown", (e) => { if ((e.metaKey || e.ctrlKey) && e.key === "Enter") runQuery(); @@ -444,10 +443,12 @@ function refreshRunAllVisibility() { } function hideRunAll() { - // Picking a different example / editing the query / pressing Run - // are all signals that the user's attention has moved off the - // competition rail. Collapse it so the right pane retakes the - // full width. + // Picking a different example or pressing Run are signals that + // the user's attention has moved off the competition rail. + // Collapse it so the right pane retakes the full width. Editing + // the textarea does *not* trigger this — the user may want to + // tweak the query, see the results refresh, and still compare + // against the rail. if (runAllSection.style.display === "none") return; runAllSection.style.display = "none"; uiSplit.classList.remove("split"); From 2f05785cb90fe351f206428bcd15c9f689d7e788 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 14:48:41 +0000 Subject: [PATCH 153/221] playground: URL-encode X-Error so error newlines survive end-to-end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent.py was flattening \n to ' | ' before stuffing the stderr snippet into X-Error (HTTP headers can't carry raw \n). The pipes leaked into the UI's error pane. URL-encode in the agent + main.py, decode via decodeURIComponent in app.js. Old agents not yet redeployed still work — _decodeXError falls back to the raw value if decoding throws. Also fixes a competition-mode bug where clicking through systems in the rail clobbered an edited query: pickFromRunAll was overwriting pristineQuery with entry.query, which made the next loadExamples call see the textarea as 'pristine' and replace it with the new system's example. Stop setting pristineQuery — loadExamples already maintains it via its own pristine check. Co-Authored-By: Claude Opus 4.7 --- playground/agent/agent.py | 6 +++++- playground/server/main.py | 12 ++++++++++-- playground/web/app.js | 19 ++++++++++++++++--- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index a4707c551d..437e61c6d3 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -37,6 +37,7 @@ import sys import threading import time +import urllib.parse from pathlib import Path SYSTEM_DIR = Path(os.environ.get("CLICKBENCH_SYSTEM_DIR", "/opt/clickbench/system")) @@ -639,7 +640,10 @@ def do_POST(self) -> None: oom = _recent_oom_messages() if oom: err_snip = "kernel OOM-killer:\n" + oom - headers["X-Error"] = err_snip.replace("\n", " | ")[:512] + # HTTP headers can't carry raw newlines, so URL-encode + # the (truncated) snippet. The UI decodes via + # decodeURIComponent so real \n survives end-to-end. + headers["X-Error"] = urllib.parse.quote(err_snip[-512:]) self._send(200 if (rc == 0 or truncated) else 502, body, headers) return self._send_json(404, {"error": "not found", "path": self.path}) diff --git a/playground/server/main.py b/playground/server/main.py index 875b47128c..dbc38258a3 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -25,6 +25,7 @@ import logging import signal import time +import urllib.parse def _id_to_b64url(n: int) -> str: @@ -248,7 +249,11 @@ async def handle_query(self, req: web.Request) -> web.StreamResponse: if "X-Query-Time" in headers else None), wall_time=wall, status=status, - error=err or headers.get("X-Error", ""), + # The agent URL-encodes X-Error so newlines survive + # the HTTP header. Decode before logging so the + # ClickHouse log stores the raw multi-line error. + error=err or urllib.parse.unquote( + headers.get("X-Error", "")), ) except Exception: log.exception("logging request failed") @@ -260,7 +265,10 @@ async def handle_query(self, req: web.Request) -> web.StreamResponse: resp.headers["X-Wall-Time"] = f"{wall:.6f}" resp.headers["X-Query-Id"] = _id_to_b64url(query_id) if err and "X-Error" not in resp.headers: - resp.headers["X-Error"] = err[:512] + # URL-encode to match the agent path so the client always + # decodes uniformly. err is usually a one-liner, so this + # is a no-op in practice; but keeps the contract simple. + resp.headers["X-Error"] = urllib.parse.quote(err[:512]) return resp async def handle_saved(self, req: web.Request) -> web.Response: diff --git a/playground/web/app.js b/playground/web/app.js index cbac2968a4..6cdeea8180 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -272,6 +272,15 @@ async function pollState() { } } +// X-Error is URL-encoded on the wire because HTTP headers can't carry +// raw \n. Decode here so real newlines survive end-to-end. Falls back +// to the raw value if the header isn't actually encoded (older agents). +function _decodeXError(s) { + if (!s) return s; + try { return decodeURIComponent(s); } + catch { return s; } +} + async function runQuery() { if (!selected) return; const sql = queryEl.value; @@ -301,7 +310,7 @@ async function runQuery() { const wt = h("X-Wall-Time"); let output = txt; if (r.status >= 400) { - const err = h("X-Error"); + const err = _decodeXError(h("X-Error")); if (err) { const trailer = `\n\n(error)\n${err}`; output = (txt === "(no output)" ? "" : txt) + trailer; @@ -543,7 +552,7 @@ async function runAll() { const h = (k) => r.headers.get(k); const qid = h("X-Query-Id"); if (r.status >= 400) { - const err = h("X-Error") || `HTTP ${r.status}`; + const err = _decodeXError(h("X-Error")) || `HTTP ${r.status}`; return { ok: false, note: err, qid, payload: { @@ -639,9 +648,13 @@ function pickFromRunAll(name) { // Switch the system list highlight + state panel to this system. if (stateByName[name]) select(name); // Rewrite the query textarea + result pane to this system's run. + // Do NOT update pristineQuery: loadExamples already set it to the + // new system's example inside select(), and overwriting it with + // entry.query causes the next rail click's loadExamples to think + // the textarea is pristine and clobber the user's edited query + // with that system's example. if (entry.query) { queryEl.value = entry.query; - pristineQuery = entry.query; } if (entry.payload) { resultsByName[name] = entry.payload; From 8487726fed728515e23c8bfdcb621481e09cbd94 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 14:50:28 +0000 Subject: [PATCH 154/221] playground: web: re-baseline pristineQuery on rail clicks in example-mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track which mode runAll started in. On rail clicks: - example-mode: entry.query IS the new system's example translation, so update pristineQuery to it. The next click can then see "pristine" and let loadExamples swap in the next system's translation. - custom-mode: entry.query is the user's edited string. Don't touch pristineQuery — loadExamples must not treat the edit as pristine and replace it with the system's example. Co-Authored-By: Claude Opus 4.7 --- playground/web/app.js | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index 6cdeea8180..d44371e921 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -491,6 +491,15 @@ async function runAll() { const useExampleIndex = !isNaN(idx); const customQuery = queryEl.value; if (!useExampleIndex && !customQuery.trim()) return; + // Track which mode the competition was launched in. pickFromRunAll + // uses this to decide whether to update pristineQuery on rail + // clicks: in example-mode, each row click should re-baseline + // pristineQuery to the new system's example translation so it + // tracks the visible textarea cleanly; in custom-mode the + // original pristineQuery is intentionally stale (different from + // the textarea) so the user's edit isn't treated as pristine and + // clobbered by loadExamples. + runAllExampleMode = useExampleIndex; runAllBtn.disabled = true; runAllSection.style.display = ""; uiSplit.classList.add("split"); @@ -640,6 +649,7 @@ async function runAll() { let runAllStatus = {}; let runAllSelected = null; +let runAllExampleMode = false; function pickFromRunAll(name) { const entry = runAllStatus[name]; @@ -648,13 +658,17 @@ function pickFromRunAll(name) { // Switch the system list highlight + state panel to this system. if (stateByName[name]) select(name); // Rewrite the query textarea + result pane to this system's run. - // Do NOT update pristineQuery: loadExamples already set it to the - // new system's example inside select(), and overwriting it with - // entry.query causes the next rail click's loadExamples to think - // the textarea is pristine and clobber the user's edited query - // with that system's example. + // pristineQuery handling depends on which mode the competition is + // in. In example-mode, entry.query IS the new system's example + // translation, so re-baseline pristineQuery so subsequent rail + // clicks see the textarea as pristine and let loadExamples swap + // in the next system's translation cleanly. In custom-mode, + // entry.query is the user's edited string; leaving pristineQuery + // alone keeps loadExamples from thinking the edit is pristine + // and replacing it with that system's example. if (entry.query) { queryEl.value = entry.query; + if (runAllExampleMode) pristineQuery = entry.query; } if (entry.payload) { resultsByName[name] = entry.payload; From ef25906d3903bbd02baf1b997e785e6768f68e9e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 15:02:03 +0000 Subject: [PATCH 155/221] playground: druid post-restore recovery + agent btime watcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Druid's JVMs survive a snapshot restore but the SQL stack stays dead for 10+ minutes — likely ZK session skew across the snapshot boundary. The old druid/start only checked /status (router up fast), so it returned 'idempotent: nothing to do' and queries kept landing on the broken SQL endpoint. - druid/start: probe SELECT 1 with a 5s curl, and on failure pkill -KILL every druid JVM and cold-start the stack. - druid/check already uses SELECT 1 so it's the right gate. Independently, even with start fixed, /ready was reporting ready=true throughout the post-restore window because _daemon_started.is_set() is restored from the snapshot's Python memory. The host's _wait_for_daemon_ready passed instantly, /query landed mid-rebuild, and the 60s host budget fired. Fix: - add a btime watcher thread that calls _maybe_reconcile_for_restore every second, so the moment the VM resumes the watcher clears _daemon_started and spawns _ensure_daemon_started off-thread. - /ready also calls _maybe_reconcile_for_restore so a host probe can't beat the watcher. - _maybe_reconcile_for_restore now kicks _ensure_daemon_started in a thread itself (it was previously synchronous-only from /query; the watcher must not block). - bump _ensure_daemon_started's check loop from 60s to 10 min so slow daemons (Druid, Doris, Pinot) actually reach pass before /ready flips. Co-Authored-By: Claude Opus 4.7 --- druid/start | 28 +++++++++++++++++--- playground/agent/agent.py | 55 ++++++++++++++++++++++++++++++++++----- 2 files changed, 72 insertions(+), 11 deletions(-) diff --git a/druid/start b/druid/start index 8fbf4ae3d6..e04f769986 100755 --- a/druid/start +++ b/druid/start @@ -4,13 +4,33 @@ set -e VERSION=37.0.0 DRUID_DIR="apache-druid-${VERSION}" -# Idempotent: if router is responsive, do nothing. -if curl -sf -o /dev/null http://localhost:8888/status 2>/dev/null; then +# Health check: SQL endpoint must accept queries, not just /status. The +# router's /status comes up immediately while the broker / historical +# wiring is still recovering. Post-snapshot-restore in particular the +# JVMs are alive but the SQL stack stays unhealthy for 10+ minutes — +# ZK session skew across the snapshot boundary, most likely. The old +# "if /status: exit" gate let queries land in that window and time out. +sql_healthy() { + curl -sf -o /dev/null --max-time 5 \ + -XPOST -H 'Content-Type: application/json' \ + http://localhost:8888/druid/v2/sql/ \ + -d '{"query": "SELECT 1"}' 2>/dev/null +} + +if sql_healthy; then exit 0 fi -# Druid launcher does not start Druid as a daemon. Run it in background, with -# its own session so the start script can exit and leave Druid running. +# Otherwise: clean slate. Kill any JVMs left over from a snapshot +# restore, then cold-start Druid. The launcher spawns one Java process +# per role (coordinator, broker, historical, ...) and they all share +# the org.apache.druid.cli.Main entrypoint. +pkill -KILL -f org.apache.druid.cli.Main 2>/dev/null || true +sleep 2 + +# Druid launcher does not start Druid as a daemon. Run it in background, +# with its own session so the start script can exit and leave Druid +# running. nohup "./${DRUID_DIR}/bin/start-single-server-medium" \ >> druid.log 2>&1 < /dev/null & disown diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 437e61c6d3..369ae88e58 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -171,11 +171,14 @@ def _ensure_daemon_started() -> None: return subprocess.run([str(start)], cwd=str(SYSTEM_DIR), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - timeout=300, check=False) + timeout=PROVISION_TIMEOUT, check=False) # Wait for ./check to confirm before unblocking the /query. + # 10 min covers cold-starting Druid + the other JVM stacks + # we ship (Doris, Pinot, Trino). On a fast daemon this loop + # exits in well under a second. check = SYSTEM_DIR / "check" if check.exists(): - for _ in range(120): + for _ in range(1200): rc = subprocess.run([str(check)], cwd=str(SYSTEM_DIR), stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, @@ -540,6 +543,12 @@ def do_GET(self) -> None: # The host uses this at restore time to gate VM-state="ready" # for slow daemons (Doris, Druid, Trino, etc.); without it # the first user query arrives mid-start and times out. + # + # Check btime here too. The Python process state — including + # _daemon_started — survives a snapshot restore, so without + # this call /ready would happily report ready=true throughout + # a 5–10 minute post-restore daemon-rebuild window. + _maybe_reconcile_for_restore() ready = _daemon_started.is_set() self._send_json(200 if ready else 503, {"ready": ready, "system": SYSTEM_NAME}) @@ -672,11 +681,22 @@ def _proc_btime() -> int | None: def _maybe_reconcile_for_restore() -> None: - """Called on each /query: if /proc/stat btime has shifted since - the last call, the VM was snapshot-restored and any docker daemon - needs reconciling (the kernel-side cgroup/netfilter state diverged - from dockerd's restored view of it). The reconcile itself is a - no-op when docker isn't installed.""" + """If /proc/stat btime has shifted since the last call, the VM was + snapshot-restored. Reconcile docker (the kernel-side cgroup / + netfilter state diverged from dockerd's restored view of it) and + clear _daemon_started so /ready reflects the truth: the daemon may + be technically running post-restore but is often broken (Druid's + SQL endpoint, byconity's compose stack, ...), and we need to + rebuild it before serving queries. + + Idempotent: subsequent calls see the same btime and return cheaply. + + Called from BOTH the /ready handler and the btime-watcher thread + so the readiness probe accurately returns 503 throughout the + post-restore rebuild — without this, the agent's Python-process + state (including _daemon_started) survives the snapshot, /ready + returns 200 immediately, and the host sends /query right into the + middle of a 5–10 minute daemon recovery.""" global _last_seen_btime cur = _proc_btime() if cur is None: @@ -696,6 +716,11 @@ def _maybe_reconcile_for_restore() -> None: # ./start. Clear the daemon-started gate so the very next # _ensure_daemon_started() call brings the stack back up. _daemon_started.clear() + # Kick off the rebuild asynchronously. /ready (or whoever + # called us) returns promptly; the host's /ready poll then + # waits for _daemon_started to flip back to True. + threading.Thread(target=_ensure_daemon_started, daemon=True, + name="daemon-restart").start() def _reconcile_docker_after_restore() -> None: @@ -829,6 +854,20 @@ def _activate_swap() -> None: print(f"agent: swapon {target} rc={rc}", flush=True) +def _btime_watcher() -> None: + """Background thread that polls btime and triggers reconcile the + moment a snapshot restore is detected — independent of whether any + /ready or /query has arrived yet. Without it, restore detection + is gated on a request landing, and the first /ready after restore + reports stale-true _daemon_started until that request lands.""" + while True: + try: + _maybe_reconcile_for_restore() + except Exception as e: + sys.stderr.write(f"[agent] btime watcher error: {e!r}\n") + time.sleep(1) + + def main() -> None: addr = ("0.0.0.0", LISTEN_PORT) print(f"agent: system={SYSTEM_NAME} listen={addr[0]}:{addr[1]} " @@ -842,6 +881,8 @@ def main() -> None: global _last_seen_btime _last_seen_btime = _proc_btime() _kick_daemon_if_provisioned() + threading.Thread(target=_btime_watcher, daemon=True, + name="btime-watcher").start() with ReusableServer(addr, Handler) as srv: srv.serve_forever() From 6494e8e9918b6698bca4b3c467c1baa6f83cb25c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 15:05:32 +0000 Subject: [PATCH 156/221] parseable: use a wide time-window in queries, not today MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parseable filters every query by [startTime, endTime] against the row's ingest timestamp. The benchmark script used today's calendar day, which is fine in a one-shot run-on-the-day-you-loaded-it benchmark — but in the playground we load during provisioning, snapshot the result, and then queries run hours-to-days later. Every row falls outside today's window and the result is always zero. Use [2000, 2099] so any plausible load + query date is included. Co-Authored-By: Claude Opus 4.7 --- parseable/query | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/parseable/query b/parseable/query index 7603f63f0d..34df5b8085 100755 --- a/parseable/query +++ b/parseable/query @@ -7,9 +7,16 @@ set -e query=$(cat) -CURRENT_DATE=$(date +%Y-%m-%d) -START_TIME="${CURRENT_DATE}T00:00:00.000Z" -END_TIME="${CURRENT_DATE}T23:59:00.000Z" +# Parseable filters every query by [startTime, endTime] against the +# row's ingest timestamp. The original benchmark used today's calendar +# day, which only works if the dataset was loaded today. In the +# playground the data is ingested during initial provisioning and +# then frozen into a snapshot, so "today" from the query script's +# point of view drifts past the load day and every row falls outside +# the window → all queries return zero. Use a wide window that +# covers any plausible load + query date. +START_TIME="2000-01-01T00:00:00.000Z" +END_TIME="2099-12-31T23:59:00.000Z" # JSON-escape quotes inside the query. escaped=$(printf '%s' "$query" | sed 's/"/\\"/g') From 874f100f0eff3a22b3f28ad596daee27f71c3fac Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 15:16:03 +0000 Subject: [PATCH 157/221] heavyai: trailing ';' in check; oxla: drop from catalog heavyai/check sent 'SELECT 1' over omnisql stdin. omnisql 5.10.2 parses that as incomplete, exits with 'Missing semicolon at end of SQL command.' without ever contacting the daemon, and the agent's check loop spins for the full 900 s. Add the ';'. oxla's only public docker image (public.ecr.aws/oxla/release) was de-listed; the repo no longer surfaces in the ECR public gallery and there's no replacement on Docker Hub or GitHub Releases. Drop it from the catalog (alongside sirius). Co-Authored-By: Claude Opus 4.7 --- heavyai/check | 7 ++++++- playground/server/systems.py | 5 +++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/heavyai/check b/heavyai/check index 6f05c4695c..39f8987efa 100755 --- a/heavyai/check +++ b/heavyai/check @@ -5,5 +5,10 @@ set -e # default `omnisci` is implicit. Some 5.10.2 builds of omnisql treat the # trailing positional arg as a script path rather than a db name. CONTAINER_NAME=${CONTAINER_NAME:-heavyai} +# omnisql 5.10.2 needs a trailing ';' on every statement it reads from +# stdin — without it the input is parsed as incomplete and the tool +# exits with "Missing semicolon at end of SQL command." while never +# actually contacting the daemon. The agent's check loop then spins +# for the full timeout. sudo docker exec -i "$CONTAINER_NAME" /omnisci/bin/omnisql \ - -p HyperInteractive -q -t <<< 'SELECT 1' >/dev/null 2>&1 + -p HyperInteractive -q -t <<< 'SELECT 1;' >/dev/null 2>&1 diff --git a/playground/server/systems.py b/playground/server/systems.py index c6a3e3d71c..a972c0c22e 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -49,6 +49,11 @@ # (~35 min from source) but ./check times out because the daemon # can't initialize a CUDA context. Disabled — we'd need GPU passthrough. "sirius", + # oxla's only public docker image, public.ecr.aws/oxla/release, + # was de-listed (the ECR public gallery no longer surfaces the + # repository at all). No replacement on Docker Hub or GitHub + # Releases. Drop until upstream publishes a new image source. + "oxla", # Upstream is broken, asks for credentials we don't have, or # the engine can't survive a 16 GB cap. # - paradedb-partitioned: install script aborts ("pg_lakehouse was From 8ebfabf9b7b872420db2687a4a945b9e1b83afb6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 15:22:39 +0000 Subject: [PATCH 158/221] playground: orioledb sysdisk bump + slab-hover rail highlight MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit postgresql-orioledb: - Park PGDATA on the per-VM sysdisk instead of the container's overlay layer (which lives on the 200 GiB rootfs). The orioledb undo log doubles the write footprint of the base table and blew up at line ~70M of hits.tsv. - Bump the sysdisk for this engine to 400 GiB via a new SYSDISK_OVERRIDES_GB hook in systems.py. The image is sparse so physical cost is what postgres actually writes. - Rootfs is left at 200 GiB — build-system-rootfs.sh clones the base via sparse-cp with no resize2fs, so a rootfs override would need a deeper change. Moving PGDATA to sysdisk sidesteps that. UI: - Hovering a slab in the top system picker now highlights the matching row in the competition leaderboard, so the user can scan from picker to result without losing context. New .slab-hover CSS class toggled via mouseenter/mouseleave. Co-Authored-By: Claude Opus 4.7 --- playground/server/systems.py | 18 ++++++++++++++++++ playground/server/vm_manager.py | 11 +++++++++-- playground/web/app.js | 16 ++++++++++++++++ playground/web/style.css | 5 +++++ postgresql-orioledb/install | 14 ++++++++++++++ 5 files changed, 62 insertions(+), 2 deletions(-) diff --git a/playground/server/systems.py b/playground/server/systems.py index a972c0c22e..a68480a1c2 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -120,6 +120,24 @@ # partitioned-parquet set. SWAP_SIZE_GB: int = 256 +# Per-system sysdisk-size override (apparent size, in GiB). Default is +# 200 GiB as set by build-system-rootfs.sh. The image is sparse so the +# apparent size doesn't cost physical bytes upfront — only what the +# guest actually writes. Rootfs is intentionally not overridable here: +# the build script clones the base ext4 image via sparse-cp without +# resize2fs, so a bigger rootfs would require a deeper change. +SYSDISK_OVERRIDES_GB: dict[str, int] = { + # postgresql-orioledb's COPY blew through 200 GiB before reaching + # the end of hits.tsv: + # PANIC: could not write buffer to file orioledb_undo/0000000319page: + # No space left on device (line 69,533,798 of hits.tsv) + # The orioledb extension keeps a per-statement undo log inside + # PGDATA that roughly doubles the write footprint of the base + # table. The install script parks PGDATA on the sysdisk + # specifically so this override actually helps. + "postgresql-orioledb": 400, +} + @dataclass(frozen=True) class System: diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index d8f32d9a25..2fe2ca8cee 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -35,7 +35,7 @@ from . import firecracker as fc from . import net -from .systems import NEEDS_SWAP, SWAP_SIZE_GB +from .systems import NEEDS_SWAP, SWAP_SIZE_GB, SYSDISK_OVERRIDES_GB from .config import Config from .systems import System, TRUSTED_INTERNET, DATALAKE_FILTERED @@ -398,10 +398,17 @@ async def _build_images_if_needed(self, vm: VM) -> None: return log.info("[%s] building rootfs + system disk", vm.system.name) script = self.cfg.repo_dir / "playground" / "images" / "build-system-rootfs.sh" + build_env = {**os.environ, + "PLAYGROUND_STATE_DIR": str(self.cfg.state_dir)} + # Per-system sysdisk-size override for engines whose load blew + # through the 200 GiB default (postgresql-orioledb, ...). + sysdisk_override = SYSDISK_OVERRIDES_GB.get(vm.system.name) + if sysdisk_override: + build_env["VM_SYSDISK_SIZE_GB"] = str(sysdisk_override) p = await asyncio.create_subprocess_exec( "bash", str(script), vm.system.name, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT, - env={**os.environ, "PLAYGROUND_STATE_DIR": str(self.cfg.state_dir)}, + env=build_env, ) out, _ = await p.communicate() if p.returncode != 0: diff --git a/playground/web/app.js b/playground/web/app.js index d44371e921..1dbc8e3a1b 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -69,6 +69,11 @@ function renderList() { row.textContent = s.display_name; row.dataset.tooltip = tooltipFor(sObj, st); row.addEventListener("click", () => onSlabClick(s.name)); + // In competition mode, hovering a slab highlights its row in + // the leaderboard so the user can scan from picker to result + // without losing context. + row.addEventListener("mouseenter", () => _setRailHover(s.name)); + row.addEventListener("mouseleave", () => _setRailHover(null)); listEl.appendChild(row); } } @@ -451,6 +456,17 @@ function refreshRunAllVisibility() { runAllBtn.style.display = (haveExample || haveCustom) ? "" : "none"; } +function _setRailHover(name) { + // Toggle a `.slab-hover` class on the matching tr in the + // competition table. No-op when the rail isn't visible. + if (runAllSection.style.display === "none") return; + const tbody = runAllTable.querySelector("tbody"); + if (!tbody) return; + for (const tr of tbody.children) { + tr.classList.toggle("slab-hover", tr.dataset.name === name); + } +} + function hideRunAll() { // Picking a different example or pressing Run are signals that // the user's attention has moved off the competition rail. diff --git a/playground/web/style.css b/playground/web/style.css index ba1dbcf0bc..db827c0bc4 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -116,6 +116,11 @@ button.run-all:hover:not(:disabled) { #runall-table tr:hover td { background: #f5f5f5; } #runall-table tr.selected td { background: white; color: var(--fg); font-weight: 600; } #runall-table tr.selected:hover td { background: white; } +/* Hovering the matching slab in the system picker highlights the + competition row, so the user can move between picker and result + without losing track of which system they're looking at. */ +#runall-table tr.slab-hover td { background: #fff4d6; } +#runall-table tr.selected.slab-hover td { background: #fff4d6; } #runall-table td.time.pending { color: var(--muted); } /* Default view: #ui-main fills the row, the (hidden) aside takes no diff --git a/postgresql-orioledb/install b/postgresql-orioledb/install index 88d2cca689..1862e0502d 100755 --- a/postgresql-orioledb/install +++ b/postgresql-orioledb/install @@ -12,6 +12,19 @@ sudo docker pull "orioledb/orioledb:$VERSION" mkdir -p /tmp/data +# Park PGDATA on the per-VM system disk (mounted at /opt/clickbench/sysdisk). +# The default /var/lib/postgresql/data lives in the container's writable +# layer, which sits on the guest's 200 GiB rootfs — orioledb's per-COPY +# undo log roughly doubles the write footprint of the base table and +# blew through that: +# PANIC: could not write buffer to file orioledb_undo/... +# No space left on device (line 69,533,798 of hits.tsv) +# The sysdisk is sized via DISK_OVERRIDES_GB in systems.py (400 GiB +# for this engine), so PGDATA fits there comfortably. +sudo mkdir -p /opt/clickbench/sysdisk/pgdata +# The orioledb image runs postgres as uid 999. +sudo chown 999:999 /opt/clickbench/sysdisk/pgdata + # (Re)create container with our config. Remove any existing one first. if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then sudo docker rm -f "$CONTAINER_NAME" >/dev/null @@ -22,6 +35,7 @@ SHM_SIZE=$(echo "$MEM_SIZE/2/1024" | bc) sudo docker run --name "$CONTAINER_NAME" \ -v /tmp/data:/tmp/data \ + -v /opt/clickbench/sysdisk/pgdata:/var/lib/postgresql/data \ --shm-size="${SHM_SIZE}m" \ -p 5432:5432 \ -e POSTGRES_HOST_AUTH_METHOD=trust \ From c2f986eec21c25ea73fea387881053e65290489a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 15:27:37 +0000 Subject: [PATCH 159/221] drill: accept sqlline's current 'N row(s) selected (X.YYY seconds)' footer The image's sqlline used to print '(N rows in X.YYY seconds)' below the result rows; current builds print 'N row(s) selected (X.YYY seconds)' instead. Our grep matched only the old form, so the result body kept the summary line and the timing extractor returned empty, failing every query with 'no marker in drill output'. Match either form for stripping, and pull the timing from any '(X.YYY seconds)' suffix. Co-Authored-By: Claude Opus 4.7 --- drill/query | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drill/query b/drill/query index 78429e74a9..1cb78d935b 100755 --- a/drill/query +++ b/drill/query @@ -74,18 +74,23 @@ if [ "$status" -ne 0 ] || printf '%s\n' "$out" | \ exit 1 fi -# sqlline echoes "1/1 SELECT ..." (the script prelude) and -# "(N rows in X.YYY seconds)" alongside the result rows; strip both -# for the result body, then pull the last "(... seconds)" for the timing. +# sqlline echoes "1/1 SELECT ..." (the script prelude) plus a +# footer summarizing the result rows. Older sqlline printed +# (N rows in X.YYY seconds) +# but the current apache/drill image prints +# N row(s) selected (X.YYY seconds) +# Strip either footer from the result body and pull the last +# "(X.YYY seconds)" for the timing. printf '%s\n' "$out" \ - | grep -vE '^[0-9]+/[0-9]+ |\([0-9]+ rows? in [0-9.]+ seconds?\)|^Apache Drill|^"' \ + | grep -vE '^[0-9]+/[0-9]+ |^[0-9]+ rows? selected \([0-9.]+ seconds?\)|\([0-9]+ rows? in [0-9.]+ seconds?\)|^Apache Drill|^"' \ || true -secs=$(printf '%s\n' "$out" | grep -oE '\([0-9]+ rows? in [0-9.]+ seconds?\)' \ - | grep -oE '[0-9.]+ seconds?' | grep -oE '[0-9.]+' | tail -n1) +secs=$(printf '%s\n' "$out" \ + | grep -oE '\([0-9.]+ seconds?\)' \ + | grep -oE '[0-9.]+' | tail -n1) if [ -z "$secs" ]; then - echo "no '(N rows in X.YYY seconds)' marker in drill output" >&2 + echo "no '(X.YYY seconds)' marker in drill output" >&2 exit 1 fi From 58a02c5ff52be747d083fd55556426072aaa53a7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 15:29:29 +0000 Subject: [PATCH 160/221] tidb: mark .preserve-state so the snapshot keeps the loaded table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tiup playground does not reuse the data dir across restarts even with --tag — each invocation initialises a fresh cluster, drops PD metadata about previously-stored TiKV regions, and the test.hits table becomes invisible. The agent's normal pre-snapshot stop-then-start cycle therefore destroys the data tidb-lightning just spent an hour loading. Mark .preserve-state so the snapshot captures TiDB running as-is (no stop/start cycle around the snapshot), and the restored VM resumes with the table intact. The post-restore btime watcher still re-runs ./start, which is idempotent (returns early when MySQL on :4000 already responds), so this remains compatible with the docker-reconcile path. Co-Authored-By: Claude Opus 4.7 --- tidb/.preserve-state | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tidb/.preserve-state diff --git a/tidb/.preserve-state b/tidb/.preserve-state new file mode 100644 index 0000000000..e69de29bb2 From 1110d9fea147644180876ac8b4b51b85c1ac6601 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 15:32:21 +0000 Subject: [PATCH 161/221] mongodb: emit timing from bash, not console.error inside mongosh mongosh routes console.error() through its own log formatter rather than to process.stderr the way Node REPL does, so the elapsed time the eval block was printing never reached the agent's _extract_script_timing(stderr) parser. The UI's Time: column was empty for every mongo query. Wrap the mongosh invocation in shell-side date arithmetic and emit the seconds to stderr ourselves. Co-Authored-By: Claude Opus 4.7 --- mongodb/query | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mongodb/query b/mongodb/query index 4c3f7e6946..b49c431e28 100755 --- a/mongodb/query +++ b/mongodb/query @@ -12,11 +12,17 @@ set -e pipeline=$(cat) +# mongosh routes console.error() through its own log formatter rather +# than to process.stderr the way Node REPL does, so the elapsed time +# we used to emit there never reached the agent's +# _extract_script_timing(stderr) parser — and the UI's "Time:" +# column stayed at —. Time the mongosh invocation from bash and +# emit the seconds to stderr ourselves. +t1=$(date +%s.%N) PIPELINE_JSON="$pipeline" mongosh --quiet test --eval ' -const start = new Date(); const pipeline = EJSON.parse(process.env.PIPELINE_JSON); const result = db.hits.aggregate(pipeline, {allowDiskUse: true}).toArray(); -const elapsed = (new Date() - start) / 1000; print(EJSON.stringify(result)); -console.error(elapsed.toFixed(3)); ' +t2=$(date +%s.%N) +awk -v a="$t1" -v b="$t2" 'BEGIN { printf "%.6f\n", b - a }' >&2 From 7377cac4420eb482a16ad0f964f9d1d1295a0e20 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 15:38:28 +0000 Subject: [PATCH 162/221] umbra: drop docker memory cgroup; raise vm.max_map_count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous attempt set --memory=256g --memory-swap=-1 --memory-swappiness=100, but on cgroup v2 the swappiness flag is silently discarded and any --memory cap creates a hard cgroup ceiling that the kernel will OOM on regardless of swap. Let Umbra run with no docker memory cgroup and rely on the host kernel + 256 GiB swap drive. Also raise vm.max_map_count to 1048576 — Umbra issues many small mmaps for its memory-mapped storage and a 100M-row COPY blows past the 65530 default well before any OOM-killer fires. Co-Authored-By: Claude Opus 4.7 --- umbra/start | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/umbra/start b/umbra/start index a15989df25..eafb1ebcb4 100755 --- a/umbra/start +++ b/umbra/start @@ -10,15 +10,22 @@ sudo docker rm umbradb >/dev/null 2>&1 || true # Umbra's working set during the ClickBench COPY blows well past the # guest VM's 16 GiB RAM. The agent has already mkswap'd + swapon'd a -# 256 GiB swap.raw block device — we just need to make sure (a) the -# guest kernel is willing to overcommit anonymous mappings well past -# physical RAM (overcommit_memory=1; default 0 is heuristic and -# refuses sufficiently large mmaps), and (b) the docker cgroup -# doesn't sit between Umbra and the swap. No --memory / --memory-swap -# cap so the container can grow into the available RAM+swap, and -# --memory-swappiness=100 so the kernel pages out anonymous memory -# aggressively the moment we exceed physical RAM. -sudo sysctl -wq vm.overcommit_memory=1 vm.swappiness=100 || true +# 256 GiB swap.raw block device, so what we need is: +# - vm.overcommit_memory=1 so the kernel doesn't refuse a single +# huge mmap (default heuristic mode rejects allocations that +# would exceed physical RAM + swap by a wide margin). +# - vm.swappiness=100 to bias the kernel toward paging anonymous +# memory out as soon as we exceed physical RAM (default 60 is +# too conservative — Umbra ENOMEMs before the kernel reclaims +# enough). +# - vm.max_map_count raised. Umbra issues a large number of small +# mmaps; the 65530 default is easy to hit on a 100 M-row COPY. +# - NO docker memory cgroup. cgroup v2 silently discards +# --memory-swappiness, and any --memory cap creates a hard +# ceiling that the kernel will OOM on regardless of how much +# swap is available. Let the host kernel manage memory. +sudo sysctl -wq vm.overcommit_memory=1 vm.swappiness=100 \ + vm.max_map_count=1048576 || true sudo docker run -d --name umbradb \ -v "$(pwd)/db:/var/db" \ @@ -26,9 +33,6 @@ sudo docker run -d --name umbradb \ -p 5432:5432 \ --ulimit nofile=1048576:1048576 \ --ulimit memlock=8388608:8388608 \ - --memory=256g \ - --memory-swap=-1 \ - --memory-swappiness=100 \ umbradb/umbra:latest >/dev/null # Container needs a moment before psql can connect. From a0e189335bcbb029fc81b7372e1618c1e3d7dbd7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 17:35:12 +0000 Subject: [PATCH 163/221] =?UTF-8?q?trino-datalake:=20shim=20classpath=20vi?= =?UTF-8?q?a=20shell=20glob=20=E2=80=94=20trino:455=20has=20no=20find=20bi?= =?UTF-8?q?nary?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The trino:455 image ships no /usr/bin/find, so the previous 'find /usr/lib/trino -name "*.jar"' classpath collector silently returned empty and javac failed with 'package com.amazonaws.auth does not exist'. Use a brace-glob over the two specific HDFS-plugin jars (aws-java-sdk-core and hadoop-apache) and match either the legacy 'com.amazonaws_' / 'io.trino.hadoop_' name prefix used by older Trino builds or the bare modern name. Tested: javac produces S3AnonymousProvider.class against /usr/lib/trino/plugin/hive/hdfs/aws-java-sdk-core-1.12.770.jar /usr/lib/trino/plugin/hive/hdfs/hadoop-apache-3.3.5-3.jar Co-Authored-By: Claude Opus 4.7 --- trino-datalake-partitioned/install | 17 +++++++++++------ trino-datalake/install | 17 +++++++++++------ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/trino-datalake-partitioned/install b/trino-datalake-partitioned/install index bd0798b1e4..ef12e95f3e 100755 --- a/trino-datalake-partitioned/install +++ b/trino-datalake-partitioned/install @@ -33,18 +33,23 @@ public class S3AnonymousProvider implements AWSCredentialsProvider { } EOF -# Compile the shim against every jar under /usr/lib/trino. The narrower -# AWS-SDK + Hadoop name-glob we used to rely on stopped matching after -# pinning trino:455 (the dependency layout shifted between releases), -# producing "package com.amazonaws.auth does not exist" errors. Casting -# the classpath net at every jar avoids guessing at upstream renames. +# Compile the shim against the AWS-SDK and Hadoop jars bundled in the +# image. The trino:455 image does NOT ship a `find` binary, so the +# previous approach (find /usr/lib/trino -name '*.jar') silently +# returned empty and javac complained that com.amazonaws.auth doesn't +# exist. Use a shell glob instead. The legacy file names had a +# `com.amazonaws_` / `io.trino.hadoop_` prefix in earlier Trino +# builds; current :455 drops both prefixes — match either. if [ ! -f shim/S3AnonymousProvider.jar ]; then sudo docker run --rm \ -v "$PWD/shim:/shim" \ --entrypoint sh trinodb/trino:455 -c ' set -e cd /shim - CP=$(find /usr/lib/trino -name "*.jar" 2>/dev/null | tr "\n" ":") + HDFS=/usr/lib/trino/plugin/hive/hdfs + CP=$(ls "$HDFS"/{com.amazonaws_,}aws-java-sdk-core-*.jar \ + "$HDFS"/{io.trino.hadoop_,}hadoop-apache-*.jar \ + 2>/dev/null | tr "\n" ":") javac --release 11 -cp "$CP" S3AnonymousProvider.java jar cf S3AnonymousProvider.jar S3AnonymousProvider.class ' diff --git a/trino-datalake/install b/trino-datalake/install index 9f615c477d..77f75be8c5 100755 --- a/trino-datalake/install +++ b/trino-datalake/install @@ -33,18 +33,23 @@ public class S3AnonymousProvider implements AWSCredentialsProvider { } EOF -# Compile the shim against every jar under /usr/lib/trino. The narrower -# AWS-SDK + Hadoop name-glob we used to rely on stopped matching after -# pinning trino:455 (the dependency layout shifted between releases), -# producing "package com.amazonaws.auth does not exist" errors. Casting -# the classpath net at every jar avoids guessing at upstream renames. +# Compile the shim against the AWS-SDK and Hadoop jars bundled in the +# image. The trino:455 image does NOT ship a `find` binary, so the +# previous approach (find /usr/lib/trino -name '*.jar') silently +# returned empty and javac complained that com.amazonaws.auth doesn't +# exist. Use a shell glob instead. The legacy file names had a +# `com.amazonaws_` / `io.trino.hadoop_` prefix in earlier Trino +# builds; current :455 drops both prefixes — match either. if [ ! -f shim/S3AnonymousProvider.jar ]; then sudo docker run --rm \ -v "$PWD/shim:/shim" \ --entrypoint sh trinodb/trino:455 -c ' set -e cd /shim - CP=$(find /usr/lib/trino -name "*.jar" 2>/dev/null | tr "\n" ":") + HDFS=/usr/lib/trino/plugin/hive/hdfs + CP=$(ls "$HDFS"/{com.amazonaws_,}aws-java-sdk-core-*.jar \ + "$HDFS"/{io.trino.hadoop_,}hadoop-apache-*.jar \ + 2>/dev/null | tr "\n" ":") javac --release 11 -cp "$CP" S3AnonymousProvider.java jar cf S3AnonymousProvider.jar S3AnonymousProvider.class ' From 13b4100c5afbc11daa5558ab590ee294fd46297b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 18:34:37 +0000 Subject: [PATCH 164/221] heavyai: allowlist /tmp for COPY FROM via omnisci.conf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit omnisci/core-os-cpu:v5.10.2 ships with an empty allowed-import-paths, so the load script's COPY hits FROM '/tmp/hits.csv' fails with 'File or directory path "/tmp/hits.csv" is not whitelisted.' Drop an omnisci.conf with [/tmp/] on the allowlist into heavyai-storage before launching the container — the startomnisci wrapper picks it up automatically. Co-Authored-By: Claude Opus 4.7 --- heavyai/install | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/heavyai/install b/heavyai/install index 740e8db9ac..ccae99cff9 100755 --- a/heavyai/install +++ b/heavyai/install @@ -23,6 +23,17 @@ if sudo docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then fi mkdir -p heavyai-storage +# OmniSciDB v5.10 tightened CSV ingestion: paths passed to COPY FROM +# must match the `allowed-import-paths` allowlist or the server +# refuses the load with +# File or directory path "/tmp/hits.csv" is not whitelisted. +# The default in this image is empty. The load script stages the +# dataset at /tmp/hits.csv inside the container, so allow /tmp via +# the config file the startomnisci wrapper reads on launch. +cat > heavyai-storage/omnisci.conf <<'EOF' +allowed-import-paths = ["/tmp/"] +EOF + sudo docker run -d \ --name "$CONTAINER_NAME" \ -p 6274:6274 \ From 74239fb7cca99ee99c6b0a78c5246f01973603ba Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 19:25:34 +0000 Subject: [PATCH 165/221] turso: enable RUST_BACKTRACE=1 in load and query tursodb has been panicking partway through .import: thread 'main' panicked at core/storage/sqlite3_ondisk.rs:818:5: assertion failed: !*syncing.borrow() note: run with `RUST_BACKTRACE=1` environment variable ... The note speaks for itself. Set RUST_BACKTRACE=1 so the panic line in the provision log (and any UI-facing panic from /query) ships with a call stack for the upstream bug report. Co-Authored-By: Claude Opus 4.7 --- turso/load | 8 ++++++++ turso/query | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/turso/load b/turso/load index c7c3fb85a3..f4128f8166 100755 --- a/turso/load +++ b/turso/load @@ -5,6 +5,14 @@ export HOME=${HOME:=~} # shellcheck disable=SC1091 source "$HOME/.turso/env" +# Turso is alpha software and tursodb has been panicking partway +# through the CSV import: +# thread 'main' panicked at core/storage/sqlite3_ondisk.rs:818:5: +# assertion failed: !*syncing.borrow() +# Ask the rust runtime for a full backtrace so the provision log +# captures the call stack instead of just the panic line. +export RUST_BACKTRACE=1 + # Idempotent: blow away any prior DB. rm -f mydb diff --git a/turso/query b/turso/query index 2dcdf26cf2..82cb385f82 100755 --- a/turso/query +++ b/turso/query @@ -8,6 +8,12 @@ export HOME=${HOME:=~} # shellcheck disable=SC1091 source "$HOME/.turso/env" +# Turso is alpha software and tursodb panics at the slightest +# provocation. The default panic message hides the call stack; enable +# rust backtraces so the panic the user sees in the playground UI +# carries enough context to file an upstream bug. +export RUST_BACKTRACE=1 + query=$(cat) TIMEFORMAT='%R' From d8db15354f8f1112628db4db628eb356f8333fb0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 19:50:37 +0000 Subject: [PATCH 166/221] playground: web: pretty-print JSON output bodies Engines like Elasticsearch, Quickwit, Parseable, Druid return raw JSON for every query, which currently lands in the output pane as a single 200-char unwrapped line. If the body is a parseable JSON object or array, re-emit it with 2-space indentation. Cheap pre-filter (first non-whitespace byte must be '{' or '[') keeps us from feeding 14 GB count(*) results through JSON.parse. Co-Authored-By: Claude Opus 4.7 --- playground/web/app.js | 26 +++++++++++++++++++++++++- playground/web/index.html | 2 +- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/playground/web/app.js b/playground/web/app.js index 1dbc8e3a1b..dc75c54396 100644 --- a/playground/web/app.js +++ b/playground/web/app.js @@ -237,6 +237,30 @@ function refreshDownUI() { } } +function _maybePrettyJson(s) { + // If the body parses as JSON (entire string), re-emit it with + // 2-space indentation. Otherwise return the original — most + // engines print plain tables and we shouldn't touch them. + if (!s || s.length < 2) return s; + const first = s.charCodeAt(0); + // Cheap pre-filter: only attempt JSON.parse when the first + // non-whitespace byte looks like '{' or '['. Avoids parsing + // every 14 GB count(*) row through a try/catch. + let i = 0; + while (i < s.length && (s.charCodeAt(i) <= 32)) i++; + const c = s.charCodeAt(i); + if (c !== 123 /* { */ && c !== 91 /* [ */) return s; + try { + const parsed = JSON.parse(s); + // Only pretty-print structured values; bare numbers/strings/ + // booleans shouldn't be re-serialized. + if (parsed === null || typeof parsed !== "object") return s; + return JSON.stringify(parsed, null, 2); + } catch { + return s; + } +} + function showResult(r) { if (!r) { outEl.textContent = ""; @@ -246,7 +270,7 @@ function showResult(r) { uiStats.style.display = "none"; return; } - outEl.textContent = r.output; + outEl.textContent = _maybePrettyJson(r.output); timeEl.textContent = r.time; outLabelEl.textContent = r.truncated === "yes" ? "Output (truncated)" : "Output"; uiOutput.style.display = ""; diff --git a/playground/web/index.html b/playground/web/index.html index 41b767ed56..ea7f2b6332 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -64,6 +64,6 @@

ClickBench Playground — run SQL against 90+ databa - + From 18e73d815a71f5654671b495e137c626fe0a85d5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 20:21:27 +0000 Subject: [PATCH 167/221] starrocks: backend_alive check column 9 (Alive), not column 10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SHOW BACKENDS TSV columns are 1 BackendId 2 IP 3 HeartbeatPort 4 BePort 5 HttpPort 6 BrpcPort 7 LastStartTime 8 LastHeartbeat 9 Alive ... We were inspecting column 10 (SystemDecommissioned), which is always "false" once the BE is registered — so the wait loop in ./start timed out even when the backend was alive and serving. Co-Authored-By: Claude Opus 4.7 --- starrocks/start | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/starrocks/start b/starrocks/start index bf358ee2e5..b4cbeb0bb6 100755 --- a/starrocks/start +++ b/starrocks/start @@ -7,9 +7,16 @@ export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture) export PATH=$JAVA_HOME/bin:$PATH # Idempotent: if FE is up AND at least one BE is alive, we're done. +# +# `SHOW BACKENDS` TSV columns are: +# 1 BackendId 2 IP 3 HeartbeatPort 4 BePort 5 HttpPort +# 6 BrpcPort 7 LastStartTime 8 LastHeartbeat 9 Alive ... +# An earlier version of this script checked column 10 +# (SystemDecommissioned) instead of 9 (Alive), so the wait loop +# timed out even when SHOW BACKENDS reported the BE as alive. backend_alive() { mysql -h127.0.0.1 -P9030 -uroot -B -N -e 'SHOW BACKENDS' 2>/dev/null \ - | awk -F'\t' 'tolower($10)=="true" { found=1 } END { exit !found }' + | awk -F'\t' 'tolower($9)=="true" { found=1 } END { exit !found }' } if mysql -h127.0.0.1 -P9030 -uroot -e 'SELECT 1' >/dev/null 2>&1 \ From f8f8840b1bf6784c9c94ab7adc1e790c6a6e7c1b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 20:44:42 +0000 Subject: [PATCH 168/221] playground: drop stale 10 KB output-cap references The output cap was raised to 256 KB (CLICKBENCH_OUTPUT_LIMIT, enforced inside the in-VM agent), but README.md and build-progress.md still named '10 KB' and the host-side config still carried an unused output_limit_bytes field with a 10 * 1024 default. Align the docs to reality and remove the dead config field (plus the _env_bytes helper that only fed it). Co-Authored-By: Claude Opus 4.7 --- playground/README.md | 2 +- playground/docs/build-progress.md | 2 +- playground/server/config.py | 7 ------- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/playground/README.md b/playground/README.md index 841c57cf24..2a743757a4 100644 --- a/playground/README.md +++ b/playground/README.md @@ -72,7 +72,7 @@ Environment variables (read by `server/config.py`): | `PLAYGROUND_STATE_DIR` | defaults to `/opt/clickbench-playground` | | `PLAYGROUND_LISTEN` | defaults to `0.0.0.0:8000` | | `PLAYGROUND_MAX_VMS` | concurrent live VMs cap (default 16) | -| `PLAYGROUND_OUTPUT_LIMIT` | response body cap in bytes (default 10240) | +| `CLICKBENCH_OUTPUT_LIMIT` | per-query response body cap in bytes, enforced inside the in-VM agent (default 262144 = 256 KB) | ## Lifecycle of a request diff --git a/playground/docs/build-progress.md b/playground/docs/build-progress.md index 384a348f20..70e267fda6 100644 --- a/playground/docs/build-progress.md +++ b/playground/docs/build-progress.md @@ -52,7 +52,7 @@ into the first cold query. - GET /health, /stats, /provision-log - POST /provision (install → start → check → load → stop → drop_caches) - POST /sync (guest fsync just before host snapshot) - - POST /query (10 KB output cap, fractional-second timing in headers) + - POST /query (256 KB output cap, fractional-second timing in headers) - `playground/images/` — `build-base-rootfs.sh` (Ubuntu 24.04 → flat 8 GB ext4 with agent pre-installed), `build-system-rootfs.sh` (per-system 200 GB sparse rootfs + sized system disk with pre-staged dataset). diff --git a/playground/server/config.py b/playground/server/config.py index 3338c4bd11..8699eb1dee 100644 --- a/playground/server/config.py +++ b/playground/server/config.py @@ -26,10 +26,6 @@ def _env_int(name: str, default: int) -> int: return default -def _env_bytes(name: str, default: int) -> int: - return _env_int(name, default) - - @dataclass(frozen=True) class Config: # Where on the host disk we keep VM artifacts and dataset images. @@ -43,8 +39,6 @@ class Config: vm_vcpus: int vm_mem_mib: int vm_rootfs_size_gb: int - # Output cap applied at the host edge (the agent enforces a per-VM cap too). - output_limit_bytes: int # Max number of VMs we'll keep "warm" (resumed from snapshot, ready to # answer) concurrently. max_warm_vms: int @@ -126,7 +120,6 @@ def load() -> Config: # disabled in systems.py instead of bumping VM RAM for everyone. vm_mem_mib=_env_int("VM_MEM_MIB", 16 * 1024), vm_rootfs_size_gb=_env_int("VM_ROOTFS_SIZE_GB", 200), - output_limit_bytes=_env_bytes("PLAYGROUND_OUTPUT_LIMIT", 10 * 1024), max_warm_vms=_env_int("PLAYGROUND_MAX_VMS", 16), cpu_busy_window_sec=_env_int("VM_CPU_BUSY_WINDOW_SEC", 120), cpu_busy_threshold=float(os.environ.get("VM_CPU_BUSY_THRESHOLD", "0.97")), From c314486e5e9df004c51022b11c07336b9ed3fa91 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 20:49:41 +0000 Subject: [PATCH 169/221] playground: agent: allow concurrent /query in a single VM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the per-VM _query_lock. Per-system ./query scripts are already careful with scratch state (use \$\$ / mktemp; redirect to sockets the daemon owns) and a quick audit shows no remaining fixed /tmp/ paths. Engines whose runtime client takes an exclusive file lock (embedded DuckDB on hits.db, ...) will fail one of two concurrent requests with their normal lock error — that's visible to the user, and the right answer at the engine level is server-mode or per-connection databases. /provision keeps its own lock. Co-Authored-By: Claude Opus 4.7 --- playground/agent/agent.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 369ae88e58..11ed91e662 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -68,9 +68,13 @@ PROVISION_DONE = STATE_DIR / "provisioned" PROVISION_LOG = STATE_DIR / "provision.log" -# Single-writer lock; the agent serializes queries per VM. Two ClickBench -# scripts hitting the same socket/temp file concurrently would not be safe. -_query_lock = threading.Lock() +# Concurrency policy: /query is *not* serialized at the agent level — +# we let the host fan multiple requests at the same VM in parallel. +# Per-system ./query scripts are expected to handle this (use $$ / +# mktemp for scratch state, never a fixed PID file). Engines that +# fundamentally don't support concurrent queries (e.g. embedded +# DuckDB with its file-level exclusive lock) will fail one of the +# concurrent requests; that's acceptable and visible to the user. _provision_lock = threading.Lock() # Tracks whether we've successfully run ./start since this agent process # came up. After a snapshot restore the daemon doesn't exist in the @@ -618,8 +622,7 @@ def do_POST(self) -> None: # (it was stopped pre-snapshot to keep snapshots small). # Subsequent calls are a near-instant no-op. _ensure_daemon_started() - with _query_lock: - rc, out, err, wall = _run_query(sql) + rc, out, err, wall = _run_query(sql) script_t = _extract_script_timing(err) body, truncated = _cap(out) headers = { From 5e3db62293da1bbf6808bc95ebadba9c53685812 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 20:57:59 +0000 Subject: [PATCH 170/221] playground: drop stale build-progress.md The file was a snapshot of what was wired up early in the playground bring-up. The real source of truth is the code + README; everything else here has drifted. Co-Authored-By: Claude Opus 4.7 --- playground/docs/build-progress.md | 103 ------------------------------ 1 file changed, 103 deletions(-) delete mode 100644 playground/docs/build-progress.md diff --git a/playground/docs/build-progress.md b/playground/docs/build-progress.md deleted file mode 100644 index 70e267fda6..0000000000 --- a/playground/docs/build-progress.md +++ /dev/null @@ -1,103 +0,0 @@ -# Playground build progress — checkpoint 2026-05-12 ~21:30 UTC - -## Status: ClickHouse end-to-end works - -``` -$ printf 'SELECT COUNT(*) FROM hits' | curl -sS -X POST --data-binary @- \ - 'http://127.0.0.1:8000/api/query?system=clickhouse' -D - -HTTP/1.1 200 OK -X-Query-Wall-Time: 0.122721 -X-Output-Bytes: 9 -X-Output-Truncated: 0 -X-Query-Time: 0.003000 -X-Wall-Time: 10.112950 -Content-Length: 9 - -99997497 -``` - -Cold path (snapshot restore + daemon start): ~10 s. -Warm path (live VM): subsecond on COUNT / MIN-MAX, ~24 s on top-of-URL. -Output truncation: 244 KB result correctly capped to 10 KB with -`X-Output-Truncated: 1` set. - -## Snapshot footprint - -`snapshot.bin.zst` for ClickHouse: **35 MB** (down from 16 GB raw RAM dump, -~470× compression). The combination that gets us there: - - 1. Agent stops the daemon at the end of /provision (clickhouse stop). - 2. Agent drops the page+dentry+inode cache. - 3. Guest kernel runs with `init_on_free=1` — every freed page is - zero-filled before going back on the free list, so the resulting - RAM is genuinely compressible (not just "freed-but-stale" stale - bytes that look random to zstd). - 4. Host calls a /sync endpoint on the agent immediately before - /vm Paused, so ext4 writeback completes before KVM freezes the - vcpus — no half-flushed pages in the snapshot. - 5. `zstd -T0 -3 --long=27` for parallel compression with a 128 MB - match window (helps with repetitive zero patterns). - -On restore the agent's first /query brings the daemon back up via -`_ensure_daemon_started`. That's ~3-5 s of clickhouse startup amortized -into the first cold query. - -## Components shipped - -- `playground/server/` — aiohttp API (UI + /api/{systems,system,query, - state,admin/provision,provision-log}), per-system Firecracker - lifecycle, monitor watchdog, batched ClickHouse-Cloud logging sink - with JSONL fallback. -- `playground/agent/` — stdlib HTTP agent. Endpoints: - - GET /health, /stats, /provision-log - - POST /provision (install → start → check → load → stop → drop_caches) - - POST /sync (guest fsync just before host snapshot) - - POST /query (256 KB output cap, fractional-second timing in headers) -- `playground/images/` — `build-base-rootfs.sh` (Ubuntu 24.04 → flat 8 GB - ext4 with agent pre-installed), `build-system-rootfs.sh` (per-system - 200 GB sparse rootfs + sized system disk with pre-staged dataset). -- `playground/web/` — vanilla-JS SPA with system picker, query box, - timing display, truncation indicator. - -## Host state - -``` -/opt/clickbench-playground/ -├── bin/firecracker, bin/jailer firecracker v1.13.1 -├── kernel/vmlinux Linux 6.1.141 -├── base-rootfs.ext4 2.6 GB physical / 8 GB apparent -├── datasets/ -│ ├── hits.parquet 14.7 GB -│ ├── hits_partitioned/ 14 GB (100 files) -│ ├── hits.tsv 74 GB -│ ├── hits.csv partial (kill-stopped); .gz intact -└── systems/clickhouse/ - ├── rootfs.ext4 sparse 200 GB - ├── system.ext4 16 GB (parquet + scripts) - ├── snapshot.bin.zst 35 MB - └── snapshot.state 58 KB -``` - -## What's left - -- Build system disks for the remaining 96 systems (template is ready; - each requires its own provision pass — most should "just work" with - the same flow). -- Tighten the External-only exclusion list in `systems.py` once we've - validated which local-only systems actually run. -- Wire ClickHouse Cloud credentials for the logging sink (currently - falling back to JSONL under `/opt/clickbench-playground/logs/`). -- Optional: jailer integration for tighter isolation if the host is - ever multi-tenant. - -## Known sharp edges - -- The `chroot` in `build-base-rootfs.sh` previously tore down the host's - `/dev/pts` via mount propagation, breaking sshd PTY allocation. Fixed - with `mount --make-rslave` (committed); if you see "PTY allocation - request failed on channel 0" after a rebuild, `sudo mount -t devpts - devpts /dev/pts -o gid=5,mode=620,ptmxmode=000` brings it back. -- KVM permissions: a udev rule at `/etc/udev/rules.d/65-kvm.rules` keeps - `/dev/kvm` group=kvm mode=666 so the playground user can open it. -- `vm.dirty_writeback_centisecs=10` on the host (down from 500); revert - if it causes problems elsewhere. From fb9e740c86983a5af7492b66be7dccb78d5a0d19 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 20:58:59 +0000 Subject: [PATCH 171/221] playground: drop stale parallel-provisioning-report.md A point-in-time write-up of the first parallel-provision run; the playground has moved on (snapshot/restore overhaul, per-VM swap, btime-watcher agent, sysdisk overrides, ...) and the report is no longer accurate. Co-Authored-By: Claude Opus 4.7 --- .../docs/parallel-provisioning-report.md | 84 ------------------- 1 file changed, 84 deletions(-) delete mode 100644 playground/docs/parallel-provisioning-report.md diff --git a/playground/docs/parallel-provisioning-report.md b/playground/docs/parallel-provisioning-report.md deleted file mode 100644 index 85950e35fe..0000000000 --- a/playground/docs/parallel-provisioning-report.md +++ /dev/null @@ -1,84 +0,0 @@ -# Parallel-provisioning report — 98 ClickBench systems - -## What works end-to-end - -- **Firecracker microVM lifecycle**: cold boot, agent provision (install → - start → check → load), graceful shutdown, snapshot, restore. Snapshots - compress 16 GiB of guest RAM down to 35-100 MB via init_on_free=1 + - daemon stop + zstd -T0. -- **Shared read-only datasets disk** (datasets.ext4, 173 GB, mounted to - every VM). No per-VM dataset copies — overlay-merged at - `/opt/clickbench/system` along with the system's scripts. -- **Per-restore disk hygiene**: working `rootfs.ext4` / `system.ext4` are - sparse copies of golden images; every restore starts fresh. -- **fstrim before snapshot** — freed dataset bytes don't linger in the - golden disk. -- **Ubuntu generic kernel** (7.0.0-15-generic) with its `linux-modules` - deb unpacked into the rootfs via `dpkg-deb -x`. Boots fine under - Firecracker, supports overlay/veth/br_netfilter/iptable_nat so Docker - can actually run. - -## Bug fixes pushed during the run - -- Port collision: agent moved from 8080 → 50080 so cockroach/spark/trino - can keep using 8080 themselves. -- `mv hits.parquet + chown` → `ln -s + chown -h` across 8 ClickBench - systems. Avoids a 14-75 GB copy per provision. -- `lib/download-hits-*` stubs at `/opt/clickbench/lib` — the few systems - that call `../lib/download-hits-...` get instant symlinks instead of - wget. -- Build-time semaphores: 24 disk builds in parallel, 98 provisions. - Without bounding the disk-heavy phase the NVMe was the bottleneck. -- Per-clone e2fsck / resize2fs removed: base is built directly at 200 GB - sparse, clones are `cp --sparse=always` (1 s each). -- Redundant `sudo sync` removed: `umount` syncs the FS being unmounted - and the global sync was blocking everyone else's writeback. -- `clickbench-net.service`: parses `ip=` from `/proc/cmdline` and applies - it to eth0 — the Ubuntu generic kernel lacks `CONFIG_IP_PNP` so the - kernel boot-arg is a no-op there. -- Module preload: `/etc/modules-load.d/clickbench.conf` ensures - overlay/veth/br_netfilter/iptable_nat/nf_conntrack are loaded at boot. -- TIDB-class sizing: per-VM writable disk bumped to 200 GB sparse so - systems that produce 50-137 GB of data (tidb, postgres-indexed, druid) - don't hit ENOSPC mid-load. - -## Latest run snapshot - -After ~30 min of soaking (current run still in flight): - -| State | Count | -|-------|-------| -| snapshotted (success) | 1 (duckdb-parquet) | -| down (failed) | 10 | -| provisioning (in flight) | 87 | - -The provision-time bottleneck is now apt/pip/cargo downloading -gigabytes of dependencies per VM in parallel. With 30-40 VMs actively -downloading from Ubuntu/PyPI/crates.io we're rate-limited by the -mirrors, not local I/O. Each install takes 5-15 min; the catalog -will need ~60-90 min wall to fully drain. - -## Failure categories (so far) - -| Category | Count | Notes | -|---|---|---| -| Arc admin token | 1 | `arc`: `Could not extract Arc admin API token from journal` — Arc's start probes `journalctl -u arc -f`, which racy/empty in our setup. ClickBench-side issue. | -| ByConity TSO | 1 | `byconity`: load fails with `Can't get process TSO request`. Docker now starts (kernel modules fix); next bug is byconity's internal init. | -| chdb / duckdb agent disconnect | 3 | `chdb-dataframe`, `duckdb`, `duckdb-dataframe`: agent crashed mid-provision (Python OOM during pip install or load). | -| Timeouts | 4 | `clickhouse-parquet`, `drill`, `duckdb-datalake*`: provision exceeded host-side 2-hour timeout. Build + provision were still running. | -| gizmosql server crash | 1 | `gizmosql_server (PID 988) exited before opening port` — system-specific bug in gizmosql's start path. | - -## Docker now works - -Previous run (with firecracker-ci kernel): 6 Docker systems failed -with `Job for docker.service failed because the control process -exited with error code`. This run: zero Docker daemon failures. -`byconity` is the only Docker-based system that failed and it got -past the daemon to its own application logic. - -## What's left - -The 87 in-flight provisions will continue draining over the next -~30-60 minutes. Most should succeed; the long-tail failures are -mostly per-system quirks (Arc journal, gizmosql start path) rather -than infrastructure problems. From c0d57e882cfa0844cf9125bae2682e61eccb5b15 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 21:04:43 +0000 Subject: [PATCH 172/221] playground: drop stale writer/reader-password parameters from bootstrap.sql The SQL file used to take freshly-rotated writer/reader passwords + the writer's IP as substitution parameters, but those statements were moved into clickhouse_bootstrap.py (which generates the passwords from a state file). The header comment in the SQL still listed the three parameters; only {db:Identifier} is left. Co-Authored-By: Claude Opus 4.7 --- playground/server/clickhouse-bootstrap.sql | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/playground/server/clickhouse-bootstrap.sql b/playground/server/clickhouse-bootstrap.sql index 08fbd7f26b..e410270b47 100644 --- a/playground/server/clickhouse-bootstrap.sql +++ b/playground/server/clickhouse-bootstrap.sql @@ -3,14 +3,10 @@ -- Run as the default user on every server startup. Idempotent: CREATE -- IF NOT EXISTS / CREATE OR REPLACE / ALTER USER ... IDENTIFIED. -- --- Parameters (passed via HTTP ?param_db=... etc. or substituted in --- Python for the user-creation statements where CH doesn't accept --- query parameters): --- {db:Identifier} target database name --- {writer_pw:String} freshly-rotated password for the writer user --- {writer_host:String} IP the writer must connect from (the playground --- server's public IP, as seen by CH Cloud) --- {reader_pw:String} freshly-rotated password for the reader user +-- Parameter: +-- {db:Identifier} target database name (substituted in Python +-- before submit — CH doesn't substitute +-- Identifier params inside CREATE VIEW) -- =========================================================================== -- Schema From 57da6824397fdd095e70318445e77cc95a5d8fab Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 21:09:09 +0000 Subject: [PATCH 173/221] =?UTF-8?q?playground:=20drop=20reader=5Fpassword?= =?UTF-8?q?=20field=20=E2=80=94=20it=20is=20always=20empty?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reader user is created in ClickHouse with sha256_hash of the empty string, so clients authenticate with just the username and no password. The Credentials.reader_password field was a permanent empty string fed straight into aiohttp.BasicAuth(_, "") which is equivalent to BasicAuth(_). Remove the field; pass only the user. Co-Authored-By: Claude Opus 4.7 --- playground/server/clickhouse_bootstrap.py | 6 ++++-- playground/server/main.py | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/playground/server/clickhouse_bootstrap.py b/playground/server/clickhouse_bootstrap.py index 95a66942d9..3c90b386f7 100644 --- a/playground/server/clickhouse_bootstrap.py +++ b/playground/server/clickhouse_bootstrap.py @@ -39,8 +39,10 @@ class Credentials(NamedTuple): db: str writer_user: str writer_password: str + # The reader's password is *always* empty — the user is created + # in CH with sha256_hash(""), and clients just pass their name + # with no password — so we don't keep it as a field. reader_user: str - reader_password: str def _gen_pw(n: int = 32) -> str: @@ -199,7 +201,7 @@ async def bootstrap(cfg: Config) -> Credentials | None: return Credentials( url=cfg.ch_cloud_url, db=db, writer_user="playground_writer", writer_password=writer_pw, - reader_user="playground_reader", reader_password="", + reader_user="playground_reader", ) diff --git a/playground/server/main.py b/playground/server/main.py index dbc38258a3..aac8073f23 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -291,8 +291,7 @@ async def handle_saved(self, req: web.Request) -> web.Response: async with aiohttp.ClientSession() as s: async with s.post( self.ch_creds.url, data=sql, - auth=aiohttp.BasicAuth(self.ch_creds.reader_user, - self.ch_creds.reader_password), + auth=aiohttp.BasicAuth(self.ch_creds.reader_user), timeout=aiohttp.ClientTimeout(total=10), ) as r: text = await r.text() From e756e52cf23673a427f5602476d29a9bff92127c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 21:13:48 +0000 Subject: [PATCH 174/221] playground: add clickhouse-web to the catalog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit clickhouse-web ATTACHes the hits table to a remote web disk pointed at https://clickhouse-public-datasets.s3.amazonaws.com/web/ — nothing is downloaded during ./load, parts stream on demand at query time, with /dev/shm/clickhouse/ as a local cache. Drop it from the _EXTERNAL exclusion and grant DATALAKE_FILTERED so the SNI-restricted proxy lets the S3 calls through post-snapshot. Co-Authored-By: Claude Opus 4.7 --- playground/server/systems.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/playground/server/systems.py b/playground/server/systems.py index a68480a1c2..c28dd0b780 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -31,7 +31,7 @@ # Managed cloud services / require API keys / external infra. "alloydb", "athena", "athena-partitioned", "aurora-mysql", "aurora-postgresql", "bigquery", "brytlytdb", "bytehouse", "chyt", - "clickhouse-cloud", "clickhouse-tencent", "clickhouse-web", + "clickhouse-cloud", "clickhouse-tencent", "crunchy-bridge-for-analytics", "databend", "databricks", "exasol", "firebolt", "firebolt-parquet", "firebolt-parquet-partitioned", "gravitons", "hologres", "hydrolix", "kinetica", @@ -87,6 +87,10 @@ DATALAKE_FILTERED: frozenset[str] = frozenset({ "clickhouse-datalake", "clickhouse-datalake-partitioned", + # clickhouse-web ATTACHes the table to a remote web disk pointed at + # https://clickhouse-public-datasets.s3.amazonaws.com/web/ — every + # query pulls parts on demand, so it needs post-snapshot S3 access. + "clickhouse-web", "duckdb-datalake", "duckdb-datalake-partitioned", "presto-datalake", From 397fa6724c38571031bf56a9b14e43103e65779a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 21:18:31 +0000 Subject: [PATCH 175/221] playground: enable databend, firebolt{,-parquet,-parquet-partitioned}, kinetica databend, kinetica already have install/start/check/load/query/stop; just drop them from the _EXTERNAL exclusion. Both run as self-hosted binaries / docker images. firebolt + parquet variants only had run.sh + benchmark.sh (the monolithic format), so add per-step scripts wrapping the ghcr.io/firebolt-db/firebolt-core:preview-rc docker image: install/ docker pull start/ docker run with memlock 8 GiB + seccomp unconfined; loop on SELECT 'firebolt-ready' until the engine returns the sentinel (firebolt-core's HTTP port answers immediately but returns 'Cluster not yet healthy' at HTTP 200 until the engine threads have warmed) check/ SELECT 1 load/ drop+create database clickbench, POST create.sql (variant-specific: firebolt INSERTs into a managed table, firebolt-parquet keeps the external table, firebolt-parquet-partitioned uses the parquet glob) query/ POST query to /?database=clickbench&output_format=JSON_Compact; parse .statistics.elapsed for X-Query-Time stop/ docker container stop Each benchmark.sh now exports BENCH_DOWNLOAD_SCRIPT so build-system-rootfs.sh stages hits.parquet (firebolt, firebolt-parquet) or hits_*.parquet (firebolt-parquet-partitioned) on the system disk. Co-Authored-By: Claude Opus 4.7 --- firebolt-parquet-partitioned/benchmark.sh | 4 +++ firebolt-parquet-partitioned/check | 7 +++++ firebolt-parquet-partitioned/install | 6 ++++ firebolt-parquet-partitioned/load | 20 ++++++++++++ firebolt-parquet-partitioned/query | 28 +++++++++++++++++ firebolt-parquet-partitioned/start | 38 +++++++++++++++++++++++ firebolt-parquet-partitioned/stop | 5 +++ firebolt-parquet/benchmark.sh | 4 +++ firebolt-parquet/check | 7 +++++ firebolt-parquet/install | 6 ++++ firebolt-parquet/load | 17 ++++++++++ firebolt-parquet/query | 28 +++++++++++++++++ firebolt-parquet/start | 38 +++++++++++++++++++++++ firebolt-parquet/stop | 5 +++ firebolt/benchmark.sh | 4 +++ firebolt/check | 7 +++++ firebolt/install | 6 ++++ firebolt/load | 20 ++++++++++++ firebolt/query | 28 +++++++++++++++++ firebolt/start | 38 +++++++++++++++++++++++ firebolt/stop | 5 +++ playground/server/systems.py | 5 ++- 22 files changed, 323 insertions(+), 3 deletions(-) create mode 100755 firebolt-parquet-partitioned/check create mode 100755 firebolt-parquet-partitioned/install create mode 100755 firebolt-parquet-partitioned/load create mode 100755 firebolt-parquet-partitioned/query create mode 100755 firebolt-parquet-partitioned/start create mode 100755 firebolt-parquet-partitioned/stop create mode 100755 firebolt-parquet/check create mode 100755 firebolt-parquet/install create mode 100755 firebolt-parquet/load create mode 100755 firebolt-parquet/query create mode 100755 firebolt-parquet/start create mode 100755 firebolt-parquet/stop create mode 100755 firebolt/check create mode 100755 firebolt/install create mode 100755 firebolt/load create mode 100755 firebolt/query create mode 100755 firebolt/start create mode 100755 firebolt/stop diff --git a/firebolt-parquet-partitioned/benchmark.sh b/firebolt-parquet-partitioned/benchmark.sh index 0e6a62ae64..d55d945a07 100755 --- a/firebolt-parquet-partitioned/benchmark.sh +++ b/firebolt-parquet-partitioned/benchmark.sh @@ -1,5 +1,9 @@ #!/bin/bash +# Playground reads this line to pre-stage the dataset on the per-VM +# system disk; the rest of benchmark.sh is unchanged. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" + # Download the partitioned hits parquet files echo "Downloading dataset..." rm -rf data diff --git a/firebolt-parquet-partitioned/check b/firebolt-parquet-partitioned/check new file mode 100755 index 0000000000..862722f602 --- /dev/null +++ b/firebolt-parquet-partitioned/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# Firebolt-core's HTTP port answers immediately but may return a +# cluster-not-ready JSON error at HTTP 200. Test for an actual result. +curl -sSf --max-time 5 'http://localhost:3473/' \ + --data-binary 'SELECT 1;' 2>/dev/null | grep -q '^1' diff --git a/firebolt-parquet-partitioned/install b/firebolt-parquet-partitioned/install new file mode 100755 index 0000000000..38799727d9 --- /dev/null +++ b/firebolt-parquet-partitioned/install @@ -0,0 +1,6 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io jq +sudo docker pull ghcr.io/firebolt-db/firebolt-core:preview-rc diff --git a/firebolt-parquet-partitioned/load b/firebolt-parquet-partitioned/load new file mode 100755 index 0000000000..e309c8968f --- /dev/null +++ b/firebolt-parquet-partitioned/load @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +# Partitioned-parquet variant: stage hits_*.parquet under ./data so +# the container sees them at /firebolt-core/clickbench/*.parquet; +# create.sql declares an external table with FROM PATTERN that +# matches the glob. +mkdir -p data +shopt -s nullglob +for f in hits_*.parquet; do + mv -f "$f" "data/$f" +done +shopt -u nullglob + +curl -sSf 'http://localhost:3473/?enable_multi_query_requests=true' \ + --data-binary 'DROP DATABASE IF EXISTS clickbench;CREATE DATABASE clickbench;' +curl -sSf 'http://localhost:3473/?database=clickbench&enable_multi_query_requests=true' \ + --data-binary @create.sql + +sync diff --git a/firebolt-parquet-partitioned/query b/firebolt-parquet-partitioned/query new file mode 100755 index 0000000000..910591e6b8 --- /dev/null +++ b/firebolt-parquet-partitioned/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it against the firebolt-core +# container via /?database=clickbench. +# Stdout: query result (firebolt's JSON_Compact format). +# Stderr: query runtime in fractional seconds on the last line, +# pulled from the response's `.statistics.elapsed`. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Result + sub-result caches off so timings are real; output_format +# matches what firebolt's run.sh uses for the public benchmark. +PARAMS='database=clickbench&enable_result_cache=false&enable_subresult_cache=false&enable_scan_cache=false&output_format=JSON_Compact' + +resp=$(curl -sS --max-time 600 "http://localhost:3473/?${PARAMS}" \ + --data-binary "$query") + +# Firebolt returns a JSON object whether the query succeeded or not. +# A failed query has an "errors" key; a successful one carries +# "data" + "statistics". +if printf '%s' "$resp" | jq -e '.errors' >/dev/null 2>&1; then + printf '%s\n' "$resp" >&2 + exit 1 +fi + +printf '%s\n' "$resp" +printf '%s\n' "$resp" | jq -r '.statistics.elapsed' >&2 diff --git a/firebolt-parquet-partitioned/start b/firebolt-parquet-partitioned/start new file mode 100755 index 0000000000..552c7c5014 --- /dev/null +++ b/firebolt-parquet-partitioned/start @@ -0,0 +1,38 @@ +#!/bin/bash +set -eu + +# Idempotent: if firebolt-core already answers SELECT 1, do nothing. +if curl -sS --max-time 5 'http://localhost:3473/' \ + --data-binary 'SELECT 1;' 2>/dev/null | grep -q '^1'; then + exit 0 +fi + +mkdir -p data + +# `firebolt-core` is the public self-hosted image. Container needs +# memlock 8 GiB and seccomp unconfined per upstream's run docs. The +# data dir maps to /firebolt-core/clickbench inside the container so +# create.sql's FROM PATTERN can read the parquet files. +sudo docker run -dit --name firebolt-core --rm \ + --ulimit memlock=8589934592:8589934592 \ + --security-opt seccomp=unconfined \ + -p 127.0.0.1:3473:3473 \ + -v /firebolt-core/volume \ + -v "$(pwd)/data:/firebolt-core/clickbench" \ + ghcr.io/firebolt-db/firebolt-core:preview-rc >/dev/null + +# Wait for the cluster to be "actually" ready. firebolt-core's HTTP +# port comes up immediately but returns +# {"errors":[{"description":"Cluster not yet healthy: ..."}]} +# at HTTP 200 until the engine threads have warmed; bench against a +# sentinel string instead of HTTP status to avoid that trap. +for _ in $(seq 1 600); do + if curl -sS --max-time 5 'http://localhost:3473/' \ + --data-binary "SELECT 'firebolt-ready';" 2>/dev/null \ + | grep -q 'firebolt-ready'; then + exit 0 + fi + sleep 1 +done +echo "firebolt-core did not become healthy in 10 min" >&2 +exit 1 diff --git a/firebolt-parquet-partitioned/stop b/firebolt-parquet-partitioned/stop new file mode 100755 index 0000000000..6860cd226f --- /dev/null +++ b/firebolt-parquet-partitioned/stop @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# firebolt-core was started with --rm; stop is enough to clean up. +sudo docker container stop firebolt-core >/dev/null 2>&1 || true diff --git a/firebolt-parquet/benchmark.sh b/firebolt-parquet/benchmark.sh index 737a3ca865..4517ed36b1 100755 --- a/firebolt-parquet/benchmark.sh +++ b/firebolt-parquet/benchmark.sh @@ -1,5 +1,9 @@ #!/bin/bash +# Playground reads this line to pre-stage the dataset on the per-VM +# system disk; the rest of benchmark.sh is unchanged. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" + # Download the hits.parquet file echo "Downloading dataset..." rm -rf data diff --git a/firebolt-parquet/check b/firebolt-parquet/check new file mode 100755 index 0000000000..862722f602 --- /dev/null +++ b/firebolt-parquet/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# Firebolt-core's HTTP port answers immediately but may return a +# cluster-not-ready JSON error at HTTP 200. Test for an actual result. +curl -sSf --max-time 5 'http://localhost:3473/' \ + --data-binary 'SELECT 1;' 2>/dev/null | grep -q '^1' diff --git a/firebolt-parquet/install b/firebolt-parquet/install new file mode 100755 index 0000000000..38799727d9 --- /dev/null +++ b/firebolt-parquet/install @@ -0,0 +1,6 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io jq +sudo docker pull ghcr.io/firebolt-db/firebolt-core:preview-rc diff --git a/firebolt-parquet/load b/firebolt-parquet/load new file mode 100755 index 0000000000..8d3886ee9d --- /dev/null +++ b/firebolt-parquet/load @@ -0,0 +1,17 @@ +#!/bin/bash +set -eu + +# Parquet variant: data stays in ./data (mounted as +# /firebolt-core/clickbench in the container), create.sql declares +# an external table that reads it on every query. +mkdir -p data +if [ -f hits.parquet ]; then + mv -f hits.parquet data/hits.parquet +fi + +curl -sSf 'http://localhost:3473/?enable_multi_query_requests=true' \ + --data-binary 'DROP DATABASE IF EXISTS clickbench;CREATE DATABASE clickbench;' +curl -sSf 'http://localhost:3473/?database=clickbench&enable_multi_query_requests=true' \ + --data-binary @create.sql + +sync diff --git a/firebolt-parquet/query b/firebolt-parquet/query new file mode 100755 index 0000000000..910591e6b8 --- /dev/null +++ b/firebolt-parquet/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it against the firebolt-core +# container via /?database=clickbench. +# Stdout: query result (firebolt's JSON_Compact format). +# Stderr: query runtime in fractional seconds on the last line, +# pulled from the response's `.statistics.elapsed`. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Result + sub-result caches off so timings are real; output_format +# matches what firebolt's run.sh uses for the public benchmark. +PARAMS='database=clickbench&enable_result_cache=false&enable_subresult_cache=false&enable_scan_cache=false&output_format=JSON_Compact' + +resp=$(curl -sS --max-time 600 "http://localhost:3473/?${PARAMS}" \ + --data-binary "$query") + +# Firebolt returns a JSON object whether the query succeeded or not. +# A failed query has an "errors" key; a successful one carries +# "data" + "statistics". +if printf '%s' "$resp" | jq -e '.errors' >/dev/null 2>&1; then + printf '%s\n' "$resp" >&2 + exit 1 +fi + +printf '%s\n' "$resp" +printf '%s\n' "$resp" | jq -r '.statistics.elapsed' >&2 diff --git a/firebolt-parquet/start b/firebolt-parquet/start new file mode 100755 index 0000000000..552c7c5014 --- /dev/null +++ b/firebolt-parquet/start @@ -0,0 +1,38 @@ +#!/bin/bash +set -eu + +# Idempotent: if firebolt-core already answers SELECT 1, do nothing. +if curl -sS --max-time 5 'http://localhost:3473/' \ + --data-binary 'SELECT 1;' 2>/dev/null | grep -q '^1'; then + exit 0 +fi + +mkdir -p data + +# `firebolt-core` is the public self-hosted image. Container needs +# memlock 8 GiB and seccomp unconfined per upstream's run docs. The +# data dir maps to /firebolt-core/clickbench inside the container so +# create.sql's FROM PATTERN can read the parquet files. +sudo docker run -dit --name firebolt-core --rm \ + --ulimit memlock=8589934592:8589934592 \ + --security-opt seccomp=unconfined \ + -p 127.0.0.1:3473:3473 \ + -v /firebolt-core/volume \ + -v "$(pwd)/data:/firebolt-core/clickbench" \ + ghcr.io/firebolt-db/firebolt-core:preview-rc >/dev/null + +# Wait for the cluster to be "actually" ready. firebolt-core's HTTP +# port comes up immediately but returns +# {"errors":[{"description":"Cluster not yet healthy: ..."}]} +# at HTTP 200 until the engine threads have warmed; bench against a +# sentinel string instead of HTTP status to avoid that trap. +for _ in $(seq 1 600); do + if curl -sS --max-time 5 'http://localhost:3473/' \ + --data-binary "SELECT 'firebolt-ready';" 2>/dev/null \ + | grep -q 'firebolt-ready'; then + exit 0 + fi + sleep 1 +done +echo "firebolt-core did not become healthy in 10 min" >&2 +exit 1 diff --git a/firebolt-parquet/stop b/firebolt-parquet/stop new file mode 100755 index 0000000000..6860cd226f --- /dev/null +++ b/firebolt-parquet/stop @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# firebolt-core was started with --rm; stop is enough to clean up. +sudo docker container stop firebolt-core >/dev/null 2>&1 || true diff --git a/firebolt/benchmark.sh b/firebolt/benchmark.sh index f27bc92f71..e6ca77e1e2 100755 --- a/firebolt/benchmark.sh +++ b/firebolt/benchmark.sh @@ -1,5 +1,9 @@ #!/bin/bash +# Playground reads this line to pre-stage the dataset on the per-VM +# system disk; the rest of benchmark.sh is unchanged. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" + # Download the hits.parquet file echo "Downloading dataset..." rm -rf data diff --git a/firebolt/check b/firebolt/check new file mode 100755 index 0000000000..862722f602 --- /dev/null +++ b/firebolt/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# Firebolt-core's HTTP port answers immediately but may return a +# cluster-not-ready JSON error at HTTP 200. Test for an actual result. +curl -sSf --max-time 5 'http://localhost:3473/' \ + --data-binary 'SELECT 1;' 2>/dev/null | grep -q '^1' diff --git a/firebolt/install b/firebolt/install new file mode 100755 index 0000000000..38799727d9 --- /dev/null +++ b/firebolt/install @@ -0,0 +1,6 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io jq +sudo docker pull ghcr.io/firebolt-db/firebolt-core:preview-rc diff --git a/firebolt/load b/firebolt/load new file mode 100755 index 0000000000..ba2864fe32 --- /dev/null +++ b/firebolt/load @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +# Stage hits.parquet where the container can see it (./data is +# bind-mounted as /firebolt-core/clickbench). +mkdir -p data +if [ -f hits.parquet ]; then + mv -f hits.parquet data/hits.parquet +fi + +# create.sql CREATEs hits_external pointing at the parquet file, then +# INSERTs into the managed `hits` table — the ingested-to-Firebolt +# variant of the benchmark. +curl -sSf 'http://localhost:3473/?enable_multi_query_requests=true' \ + --data-binary 'DROP DATABASE IF EXISTS clickbench;CREATE DATABASE clickbench;' +curl -sSf 'http://localhost:3473/?database=clickbench&enable_multi_query_requests=true' \ + --data-binary @create.sql + +rm -f data/hits.parquet +sync diff --git a/firebolt/query b/firebolt/query new file mode 100755 index 0000000000..910591e6b8 --- /dev/null +++ b/firebolt/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it against the firebolt-core +# container via /?database=clickbench. +# Stdout: query result (firebolt's JSON_Compact format). +# Stderr: query runtime in fractional seconds on the last line, +# pulled from the response's `.statistics.elapsed`. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Result + sub-result caches off so timings are real; output_format +# matches what firebolt's run.sh uses for the public benchmark. +PARAMS='database=clickbench&enable_result_cache=false&enable_subresult_cache=false&enable_scan_cache=false&output_format=JSON_Compact' + +resp=$(curl -sS --max-time 600 "http://localhost:3473/?${PARAMS}" \ + --data-binary "$query") + +# Firebolt returns a JSON object whether the query succeeded or not. +# A failed query has an "errors" key; a successful one carries +# "data" + "statistics". +if printf '%s' "$resp" | jq -e '.errors' >/dev/null 2>&1; then + printf '%s\n' "$resp" >&2 + exit 1 +fi + +printf '%s\n' "$resp" +printf '%s\n' "$resp" | jq -r '.statistics.elapsed' >&2 diff --git a/firebolt/start b/firebolt/start new file mode 100755 index 0000000000..552c7c5014 --- /dev/null +++ b/firebolt/start @@ -0,0 +1,38 @@ +#!/bin/bash +set -eu + +# Idempotent: if firebolt-core already answers SELECT 1, do nothing. +if curl -sS --max-time 5 'http://localhost:3473/' \ + --data-binary 'SELECT 1;' 2>/dev/null | grep -q '^1'; then + exit 0 +fi + +mkdir -p data + +# `firebolt-core` is the public self-hosted image. Container needs +# memlock 8 GiB and seccomp unconfined per upstream's run docs. The +# data dir maps to /firebolt-core/clickbench inside the container so +# create.sql's FROM PATTERN can read the parquet files. +sudo docker run -dit --name firebolt-core --rm \ + --ulimit memlock=8589934592:8589934592 \ + --security-opt seccomp=unconfined \ + -p 127.0.0.1:3473:3473 \ + -v /firebolt-core/volume \ + -v "$(pwd)/data:/firebolt-core/clickbench" \ + ghcr.io/firebolt-db/firebolt-core:preview-rc >/dev/null + +# Wait for the cluster to be "actually" ready. firebolt-core's HTTP +# port comes up immediately but returns +# {"errors":[{"description":"Cluster not yet healthy: ..."}]} +# at HTTP 200 until the engine threads have warmed; bench against a +# sentinel string instead of HTTP status to avoid that trap. +for _ in $(seq 1 600); do + if curl -sS --max-time 5 'http://localhost:3473/' \ + --data-binary "SELECT 'firebolt-ready';" 2>/dev/null \ + | grep -q 'firebolt-ready'; then + exit 0 + fi + sleep 1 +done +echo "firebolt-core did not become healthy in 10 min" >&2 +exit 1 diff --git a/firebolt/stop b/firebolt/stop new file mode 100755 index 0000000000..6860cd226f --- /dev/null +++ b/firebolt/stop @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# firebolt-core was started with --rm; stop is enough to clean up. +sudo docker container stop firebolt-core >/dev/null 2>&1 || true diff --git a/playground/server/systems.py b/playground/server/systems.py index c28dd0b780..60dd26c3f7 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -32,9 +32,8 @@ "alloydb", "athena", "athena-partitioned", "aurora-mysql", "aurora-postgresql", "bigquery", "brytlytdb", "bytehouse", "chyt", "clickhouse-cloud", "clickhouse-tencent", - "crunchy-bridge-for-analytics", "databend", "databricks", "exasol", - "firebolt", "firebolt-parquet", "firebolt-parquet-partitioned", - "gravitons", "hologres", "hydrolix", "kinetica", + "crunchy-bridge-for-analytics", "databricks", "exasol", + "gravitons", "hologres", "hydrolix", "motherduck", "pgpro_tam", "redshift", "redshift-serverless", "s3select", "singlestore", "snowflake", "supabase", "tembo-olap", "timescale-cloud", "tinybird", "velodb", From 8d3d4bf7dc198bd349c7181344ac08a4fb49fe51 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 21:28:07 +0000 Subject: [PATCH 176/221] =?UTF-8?q?playground:=20security=20hardening=20?= =?UTF-8?q?=E2=80=94=20aiohttp=20symlink,=20trusted=20internet,=20proxy=20?= =?UTF-8?q?+=20DNS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the security advisories from the review pass: 1. aiohttp static handler: drop follow_symlinks=True. GHSA-5h86-8mv2-jq9f was a path-traversal in the static handler reachable only when symlinks were followed. The repo's web/ tree has no symlinks anyway, so this is pure attack-surface reduction. 2. TRUSTED_INTERNET set removed. clickhouse{,-parquet,-parquet-partitioned} and chdb{,-parquet,-parquet-partitioned} no longer get unrestricted internet at query time — they all run through the SNI-allowlist proxy now. A user SQL that asked clickhouse-client to fetch http://169.254.169.254/... can no longer reach the EC2 metadata service or any RFC1918 destination; only the S3 hosts in sni_proxy.DEFAULT_ALLOW survive. 3. SNI proxy / local DNS resolver bound to internal traffic only. New net.setup_host_firewall() installs INPUT rules accepting 8443/8080/53 only from the 10.200.0.0/16 TAP CIDR and loopback, then DROP for anything else. Called once at server startup. Without these rules the proxy was an open, unauthenticated S3 allowlist relay reachable from the public internet. 4. DNS via local resolver, UDP only. enable_filtered_internet now REDIRECTs the VM's UDP/53 to the host's local resolver and DROPs TCP/53 outright (no big-payload exfiltration channel via port 53). The previous ACCEPT-and-forward path is gone; the POSTROUTING MASQUERADE that supported it is no longer needed either since the SNI proxy opens its own outbound socket. 5. /api/admin/provision/{name} restricted to loopback callers. It re-runs install/start/load — can take hours per system — so anonymous internet callers triggering it would be a trivial DoS and lateral-movement risk. peer-IP check; behind a reverse proxy the proxy itself is the peer, which is fine (the proxy is part of the admin trust boundary). Co-Authored-By: Claude Opus 4.7 --- playground/server/main.py | 23 ++++++- playground/server/net.py | 118 ++++++++++++++++++++++---------- playground/server/systems.py | 31 ++++----- playground/server/vm_manager.py | 11 ++- 4 files changed, 122 insertions(+), 61 deletions(-) diff --git a/playground/server/main.py b/playground/server/main.py index aac8073f23..2b187febfb 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -87,6 +87,11 @@ async def on_startup(self, _app: web.Application) -> None: https_port=net.PROXY_HTTPS_PORT, http_port=net.PROXY_HTTP_PORT, ) + # Lock the proxy + local DNS resolver to internal/VM traffic + # only. The proxy binds 0.0.0.0 so iptables PREROUTING REDIRECT + # from each TAP can find it; without these INPUT rules it + # would be an open S3 allowlist relay on the public internet. + await net.setup_host_firewall() async def on_cleanup(self, _app: web.Application) -> None: await self.monitor.stop() @@ -161,6 +166,19 @@ async def handle_provision_log(self, req: web.Request) -> web.Response: return web.Response(body=data, content_type="text/plain") async def handle_admin_provision(self, req: web.Request) -> web.Response: + # Heavy operation (re-runs install/start/load, can take hours on + # the postgres-indexed-class systems): only callable from the + # host itself. The public UI must never be able to trigger this. + # Trust the TCP peer address — we don't honor X-Forwarded-For + # here, because the server is meant to listen on the same host + # the operator drives the curls from. If you put it behind a + # reverse proxy, the proxy itself becomes the peer and the + # check still passes (which is fine: the proxy is part of the + # admin trust boundary). + peer = req.transport.get_extra_info("peername") if req.transport else None + peer_ip = peer[0] if peer else "" + if peer_ip not in ("127.0.0.1", "::1"): + raise web.HTTPForbidden(reason="admin endpoint, loopback only") name = req.match_info["name"] if name not in self.systems: raise web.HTTPNotFound() @@ -420,7 +438,10 @@ async def cors(request: web.Request, handler): app.router.add_get("/", root_redirect) app.router.add_get("/ui/", ui_index) app.router.add_get("/ui", ui_index) - app.router.add_static("/ui/", path=str(web_dir), show_index=False, follow_symlinks=True) + # follow_symlinks=False — GHSA-5h86-8mv2-jq9f covers a path-traversal + # in aiohttp's static handler that's only reachable when symlinks are + # followed. The repo's web/ tree has no symlinks anyway. + app.router.add_static("/ui/", path=str(web_dir), show_index=False) return app diff --git a/playground/server/net.py b/playground/server/net.py index 746e789cf2..82c6e7d474 100644 --- a/playground/server/net.py +++ b/playground/server/net.py @@ -118,15 +118,56 @@ async def enable_internet(slot: int) -> None: PROXY_HTTPS_PORT = 8443 PROXY_HTTP_PORT = 8080 +# /16 we hand TAP addresses out of — used to scope INPUT firewall rules. +_INTERNAL_CIDR = f"{_BASE}.0.0/16" + + +async def setup_host_firewall() -> None: + """Install INPUT rules so the SNI proxy + local DNS resolver are + only reachable from the per-VM TAPs (10.200.0.0/16) and loopback. + Run once at server startup. + + Why this matters: sni_proxy.py binds 0.0.0.0:{8443,8080} so the + iptables PREROUTING REDIRECT from the VM's TAP can find it + regardless of which TAP IP the kernel routes the redirected + packet to. Without these INPUT rules the proxy would be an + open, unauthenticated S3 allowlist relay reachable from the + public internet. Same logic for the host's UDP/53 resolver. + """ + # (proto, dport) + ports = ( + ("tcp", str(PROXY_HTTPS_PORT)), + ("tcp", str(PROXY_HTTP_PORT)), + ("tcp", "53"), + ("udp", "53"), + ) + for proto, dport in ports: + for src in (_INTERNAL_CIDR, "127.0.0.0/8"): + allow = ("-p", proto, "--dport", dport, "-s", src, "-j", "ACCEPT") + rc, _, _ = await _run("sudo", "iptables", "-C", "INPUT", + *allow, check=False) + if rc != 0: + # Insert at the top so we override any permissive default. + await _run("sudo", "iptables", "-I", "INPUT", "1", *allow) + drop = ("-p", proto, "--dport", dport, "-j", "DROP") + rc, _, _ = await _run("sudo", "iptables", "-C", "INPUT", + *drop, check=False) + if rc != 0: + await _run("sudo", "iptables", "-A", "INPUT", *drop) + async def enable_filtered_internet(slot: int) -> None: """Allow the VM to reach the *allowlisted* outside world only. - Redirects all TCP 80/443 from the VM's TAP to the host's - SNI-filtering proxy (see sni_proxy.py). DNS (UDP+TCP 53) is - permitted untouched so the VM can still resolve hostnames; the - proxy itself uses the host's resolver to open the upstream socket. - Every other outbound port from the VM is DROPped. + PREROUTING REDIRECTs: + - TCP 443/80 → the host's SNI-filtering proxy. + - UDP 53 → the host's local DNS resolver (operator must run + a UDP-only resolver on the host — see + playground/scripts/install-firecracker.sh). + TCP 53 is dropped entirely (no big-payload DNS, the classic + exfiltration channel — see GHSA / RFC1918 advisories cited in + the security review). Every other outbound port from the VM is + DROPped at FORWARD. """ # Clear any prior `enable_internet` ACCEPT — its blanket allow # rule would otherwise take precedence over the DROP we'll add @@ -135,25 +176,35 @@ async def enable_filtered_internet(slot: int) -> None: await disable_internet(slot) tap = tap_name(slot) iface = await _host_default_iface() - _, _, cidr = addr_for(slot) - # NAT redirects for the http/https ports. - for dport, to_port in ((443, PROXY_HTTPS_PORT), (80, PROXY_HTTP_PORT)): - match = ("-i", tap, "-p", "tcp", "--dport", str(dport), - "-j", "REDIRECT", "--to-ports", str(to_port)) + # NAT redirects: + # TCP 443/80 -> SNI proxy + # UDP 53 -> host's local DNS resolver on port 53 + nat_rules = ( + ("-i", tap, "-p", "tcp", "--dport", "443", + "-j", "REDIRECT", "--to-ports", str(PROXY_HTTPS_PORT)), + ("-i", tap, "-p", "tcp", "--dport", "80", + "-j", "REDIRECT", "--to-ports", str(PROXY_HTTP_PORT)), + ("-i", tap, "-p", "udp", "--dport", "53", + "-j", "REDIRECT", "--to-ports", "53"), + ) + for match in nat_rules: rc, _, _ = await _run("sudo", "iptables", "-t", "nat", "-C", "PREROUTING", *match, check=False) if rc != 0: await _run("sudo", "iptables", "-t", "nat", "-A", "PREROUTING", *match) - # FORWARD: allow DNS, allow established replies, drop everything else. + # FORWARD: drop TCP/53 (DNS tunneling), drop UDP/53 too as a + # belt-and-braces (the REDIRECT above already short-circuits it, + # but if the resolver is down we don't want fall-through to + # upstream). Allow established replies for the SNI proxy's + # outbound to upstream. Catchall DROP at the end. forward_rules = ( - ("-i", tap, "-p", "udp", "--dport", "53", "-j", "ACCEPT"), - ("-i", tap, "-p", "tcp", "--dport", "53", "-j", "ACCEPT"), + ("-i", tap, "-p", "udp", "--dport", "53", "-j", "DROP"), + ("-i", tap, "-p", "tcp", "--dport", "53", "-j", "DROP"), ("-i", iface, "-o", tap, "-m", "state", "--state", "RELATED,ESTABLISHED", "-j", "ACCEPT"), - # Catchall drop. Must come last. ("-i", tap, "-j", "DROP"), ) for rule in forward_rules: @@ -161,33 +212,37 @@ async def enable_filtered_internet(slot: int) -> None: check=False) if rc != 0: await _run("sudo", "iptables", "-A", "FORWARD", *rule) - - # MASQUERADE so the DNS replies (and the host's outbound to the proxy - # target) get NAT'd properly back to the VM. - rc, out, _ = await _run("sudo", "iptables", "-t", "nat", "-S", "POSTROUTING") - if f"-s {cidr}" not in out.decode(errors="replace"): - await _run("sudo", "iptables", "-t", "nat", "-A", "POSTROUTING", - "-s", cidr, "-o", iface, "-j", "MASQUERADE") + # No POSTROUTING MASQUERADE here: the SNI proxy on the host opens + # its OWN outbound socket to the allowlisted upstream, so the + # host's normal egress path handles the source rewrite. The VM's + # only legitimate outbound traffic now goes via REDIRECT to a + # local listener; nothing on the VM's CIDR ever reaches the + # outside interface directly. async def disable_filtered_internet(slot: int) -> None: """Drop the rules added by enable_filtered_internet. Idempotent.""" tap = tap_name(slot) iface = await _host_default_iface() - _, _, cidr = addr_for(slot) - for dport, to_port in ((443, PROXY_HTTPS_PORT), (80, PROXY_HTTP_PORT)): + nat_rules = ( + ("-i", tap, "-p", "tcp", "--dport", "443", + "-j", "REDIRECT", "--to-ports", str(PROXY_HTTPS_PORT)), + ("-i", tap, "-p", "tcp", "--dport", "80", + "-j", "REDIRECT", "--to-ports", str(PROXY_HTTP_PORT)), + ("-i", tap, "-p", "udp", "--dport", "53", + "-j", "REDIRECT", "--to-ports", "53"), + ) + for match in nat_rules: while True: rc, _, _ = await _run("sudo", "iptables", "-t", "nat", "-D", - "PREROUTING", "-i", tap, "-p", "tcp", - "--dport", str(dport), "-j", "REDIRECT", - "--to-ports", str(to_port), check=False) + "PREROUTING", *match, check=False) if rc != 0: break forward_rules = ( - ("-i", tap, "-p", "udp", "--dport", "53", "-j", "ACCEPT"), - ("-i", tap, "-p", "tcp", "--dport", "53", "-j", "ACCEPT"), + ("-i", tap, "-p", "udp", "--dport", "53", "-j", "DROP"), + ("-i", tap, "-p", "tcp", "--dport", "53", "-j", "DROP"), ("-i", iface, "-o", tap, "-m", "state", "--state", "RELATED,ESTABLISHED", "-j", "ACCEPT"), ("-i", tap, "-j", "DROP"), @@ -199,13 +254,6 @@ async def disable_filtered_internet(slot: int) -> None: if rc != 0: break - while True: - rc, _, _ = await _run("sudo", "iptables", "-t", "nat", "-D", - "POSTROUTING", "-s", cidr, "-o", iface, - "-j", "MASQUERADE", check=False) - if rc != 0: - break - async def disable_internet(slot: int) -> None: """Drop the masquerade + forward rules added by enable_internet.""" diff --git a/playground/server/systems.py b/playground/server/systems.py index 60dd26c3f7..e7c1743f58 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -63,29 +63,24 @@ "paradedb", "paradedb-partitioned", "pg_duckdb-motherduck", } -# Systems we trust to keep outbound internet access *after* the snapshot, -# i.e. at query time. Used by datalake-style benchmarks that read live S3 -# during the query; without internet they fail with a DNS error. Stays -# tight on purpose — adding a system here means user queries from that -# VM can reach the wider internet, so only put ClickHouse-family engines -# here (per request). -TRUSTED_INTERNET: frozenset[str] = frozenset({ - "clickhouse", - "clickhouse-parquet", - "clickhouse-parquet-partitioned", +# Systems that need outbound access at query time get routed through +# the SNI-allowlist proxy on the host (see sni_proxy.py + +# net.enable_filtered_internet). Only HTTPS to the S3 hosts in +# sni_proxy.DEFAULT_ALLOW survives; everything else is dropped. The +# ClickHouse-family engines used to live in a separate +# `TRUSTED_INTERNET` set that gave them unrestricted egress (so an +# arbitrary user SQL could `url('http://169.254.169.254/...')` or +# reach internal hosts) — that set is gone; they all now use this +# filtered path too. +DATALAKE_FILTERED: frozenset[str] = frozenset({ "chdb", "chdb-parquet", "chdb-parquet-partitioned", -}) - -# Systems that need outbound access only to s3.amazonaws.com (or -# regional S3 hostnames). Their post-snapshot internet is routed -# through an SNI-allowlist proxy on the host (see sni_proxy.py + -# net.enable_filtered_internet). HTTPS works end-to-end; everything -# else is dropped. -DATALAKE_FILTERED: frozenset[str] = frozenset({ + "clickhouse", "clickhouse-datalake", "clickhouse-datalake-partitioned", + "clickhouse-parquet", + "clickhouse-parquet-partitioned", # clickhouse-web ATTACHes the table to a remote web disk pointed at # https://clickhouse-public-datasets.s3.amazonaws.com/web/ — every # query pulls parts on demand, so it needs post-snapshot S3 access. diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 2fe2ca8cee..41cb997ea1 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -37,7 +37,7 @@ from . import net from .systems import NEEDS_SWAP, SWAP_SIZE_GB, SYSDISK_OVERRIDES_GB from .config import Config -from .systems import System, TRUSTED_INTERNET, DATALAKE_FILTERED +from .systems import System, DATALAKE_FILTERED log = logging.getLogger("vm_manager") @@ -351,7 +351,7 @@ async def _initial_provision(self, vm: VM) -> None: await self._shutdown(vm) if vm.system.name in DATALAKE_FILTERED: await net.enable_filtered_internet(vm.slot) - elif vm.system.name not in TRUSTED_INTERNET: + else: await net.disable_internet(vm.slot) vm.state = "snapshotted" vm.provisioned_at = time.time() @@ -628,13 +628,10 @@ async def _restore_snapshot(self, vm: VM) -> None: # Firecracker tries to mmap it. await self._decompress_snapshot(vm) await net.ensure_tap(vm.slot) - # Trusted systems (e.g. ClickHouse variants that read live S3 at - # query time) keep outbound internet after restore. Everything - # else stays offline. + # Systems that read live S3 at query time get the SNI-allowlist + # proxy. Everything else stays fully offline post-snapshot. if vm.system.name in DATALAKE_FILTERED: await net.enable_filtered_internet(vm.slot) - elif vm.system.name in TRUSTED_INTERNET: - await net.enable_internet(vm.slot) await self._boot(vm, restore_snapshot=True) await self._wait_for_agent(vm, timeout=60) # Block here until the system's daemon reports ready, so the From ed1bbbb128d012a7319e9d8ccfc442dadc4aa783 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 21:34:44 +0000 Subject: [PATCH 177/221] playground: drop unused PLAYGROUND_MAX_VMS / max_warm_vms The Config field was advertised as a "concurrent live VMs cap" but nothing in vm_manager / monitor / main ever read it. Drop the dataclass field, the _env_int default, and the README row. Co-Authored-By: Claude Opus 4.7 --- playground/README.md | 1 - playground/server/config.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/playground/README.md b/playground/README.md index 2a743757a4..95b0a85a6d 100644 --- a/playground/README.md +++ b/playground/README.md @@ -71,7 +71,6 @@ Environment variables (read by `server/config.py`): | `CLICKHOUSE_CLOUD_PASSWORD` | password | | `PLAYGROUND_STATE_DIR` | defaults to `/opt/clickbench-playground` | | `PLAYGROUND_LISTEN` | defaults to `0.0.0.0:8000` | -| `PLAYGROUND_MAX_VMS` | concurrent live VMs cap (default 16) | | `CLICKBENCH_OUTPUT_LIMIT` | per-query response body cap in bytes, enforced inside the in-VM agent (default 262144 = 256 KB) | ## Lifecycle of a request diff --git a/playground/server/config.py b/playground/server/config.py index 8699eb1dee..1fbc215955 100644 --- a/playground/server/config.py +++ b/playground/server/config.py @@ -39,9 +39,6 @@ class Config: vm_vcpus: int vm_mem_mib: int vm_rootfs_size_gb: int - # Max number of VMs we'll keep "warm" (resumed from snapshot, ready to - # answer) concurrently. - max_warm_vms: int # Watchdog thresholds. cpu_busy_window_sec: int cpu_busy_threshold: float @@ -120,7 +117,6 @@ def load() -> Config: # disabled in systems.py instead of bumping VM RAM for everyone. vm_mem_mib=_env_int("VM_MEM_MIB", 16 * 1024), vm_rootfs_size_gb=_env_int("VM_ROOTFS_SIZE_GB", 200), - max_warm_vms=_env_int("PLAYGROUND_MAX_VMS", 16), cpu_busy_window_sec=_env_int("VM_CPU_BUSY_WINDOW_SEC", 120), cpu_busy_threshold=float(os.environ.get("VM_CPU_BUSY_THRESHOLD", "0.97")), vm_cpu_total_seconds_cap=_env_int("VM_CPU_TOTAL_SECONDS_CAP", 3600), From 8147d664e2ca79cbab20cef5c9bb662ac52615aa Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 21:39:19 +0000 Subject: [PATCH 178/221] playground: aiohttp pin, systemd hardening, per-IP rate limits aiohttp: - Add startup assertion: aiohttp >= 3.10 (covers GHSA-5h86-8mv2-jq9f static-handler path traversal fixed in 3.9.2, and the request- smuggling fixes in 3.9.4 / 3.10.x). Already true on the running host (Ubuntu's python3-aiohttp ships 3.13.3), but the assertion catches a future install on a stale image. - Add playground/requirements.txt with the pin for pip-based setups. systemd unit: - Drop in ProtectSystem=full, ProtectHome=read-only, ProtectKernelTunables/Modules/ControlGroups/Clock, PrivateTmp, RestrictAddressFamilies, LockPersonality, RestrictRealtime, RestrictNamespaces. - Explicit ReadWritePaths to /opt/clickbench-playground + ~/.cache (Python bytecode). - Comments explain what we DON'T set (NoNewPrivileges / RestrictSUIDSGID would break sudo, ProtectSystem=strict would break the privileged children, PrivateNetwork / PrivateDevices would break TAP + /dev/kvm). Rate limiting: - In-memory per-source-IP sliding-window counters on /api/query and /api/warmup: 200 req/min and 3000 req/hour. Returns 429 with Retry-After when exceeded. Both endpoints are unauthenticated; bound the damage a single bad actor can do (snapshot-restore spam, heavy-query loops). X-Forwarded-For honored for the leftmost hop if a reverse proxy is in front. Co-Authored-By: Claude Opus 4.7 --- playground/clickbench-playground.service | 34 +++++++++ playground/requirements.txt | 10 +++ playground/server/main.py | 95 ++++++++++++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 playground/requirements.txt diff --git a/playground/clickbench-playground.service b/playground/clickbench-playground.service index 979d31867b..dee071d1d3 100644 --- a/playground/clickbench-playground.service +++ b/playground/clickbench-playground.service @@ -12,5 +12,39 @@ ExecStart=/usr/bin/python3 -m playground.server.main Restart=on-failure RestartSec=3 +# --- Hardening ----------------------------------------------------- +# +# The server runs as the unprivileged `ubuntu` user; privileged work +# (iptables, ip tuntap, mount, firecracker) is delegated to sudo with +# an operator-managed sudoers allowlist. These directives keep the +# systemd unit from regaining capabilities or filesystem write access +# if the python process is compromised. +# +# What we deliberately do NOT set: +# - NoNewPrivileges / RestrictSUIDSGID — both would break sudo, +# which the server uses to invoke iptables / ip tuntap / mount. +# The narrower defence is the constrained sudoers file. +# - ProtectSystem=strict — the sudo'd children (iptables-restore, +# mkfs.ext4, mount, ...) need to touch /etc, /run, etc. +# ProtectSystem=full is the practical maximum. +# - PrivateNetwork — the server needs the host network namespace +# to manage TAPs and the SNI proxy. +# - PrivateDevices — we use /dev/kvm, /dev/loop*, /dev/net/tun. +# +ProtectSystem=full +ProtectHome=read-only +ProtectKernelTunables=yes +ProtectKernelModules=yes +ProtectControlGroups=yes +ProtectClock=yes +PrivateTmp=yes +RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX AF_NETLINK AF_PACKET +LockPersonality=yes +RestrictRealtime=yes +RestrictNamespaces=yes +# Explicit write allow-list: state dir for VM artifacts + Python's +# bytecode cache. Everything else under ProtectSystem=full is RO. +ReadWritePaths=/opt/clickbench-playground /home/ubuntu/.cache + [Install] WantedBy=multi-user.target diff --git a/playground/requirements.txt b/playground/requirements.txt new file mode 100644 index 0000000000..fc3fbd65b4 --- /dev/null +++ b/playground/requirements.txt @@ -0,0 +1,10 @@ +# Runtime dependencies for the playground host server. +# +# aiohttp >= 3.10 covers: +# GHSA-5h86-8mv2-jq9f static handler symlink path traversal (3.9.2) +# GHSA-q3qx-c6g2-7pw2 request smuggling (3.9.4) +# GHSA-pjjw-qhg8-p2p9 follow_symlinks default tightening (3.10.x) +# +# main.py asserts this minimum at startup; the pin here is for the +# pip-based install path. +aiohttp>=3.10 diff --git a/playground/server/main.py b/playground/server/main.py index 2b187febfb..9ddd4aa4e7 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -42,9 +42,100 @@ def _b64url_to_id(s: str) -> int: return int.from_bytes(base64.urlsafe_b64decode(s + pad), "big") from pathlib import Path +import collections +import threading + import aiohttp from aiohttp import web +# --- Per-IP rate limiting ------------------------------------------ +# +# /api/query and /api/warmup are unauthenticated; a single bad actor +# can wedge the playground by spamming restores or kicking heavy +# queries against snapshotted systems. Bound the damage with two +# rolling windows per source IP: +# 200 requests / minute +# 3000 requests / hour +# In-memory, since restarts are infrequent and per-IP state across +# restarts isn't load-bearing for this use case. +_RATE_PER_MINUTE = 200 +_RATE_PER_HOUR = 3000 +_rate_lock = threading.Lock() +_rate_hits: dict[str, collections.deque[float]] = {} + + +def _client_ip(req: web.Request) -> str: + """First hop in X-Forwarded-For if present (we sit behind nothing + by default; a reverse proxy operator can wire one in), else the + socket peer.""" + xff = req.headers.get("X-Forwarded-For") + if xff: + return xff.split(",")[0].strip() or (req.remote or "?") + return req.remote or "?" + + +def _rate_check(req: web.Request) -> web.Response | None: + """Return a 429 Response if the caller has exceeded either window, + else None. Increments the counter on a pass.""" + ip = _client_ip(req) + now = time.monotonic() + hour_ago = now - 3600 + minute_ago = now - 60 + with _rate_lock: + dq = _rate_hits.get(ip) + if dq is None: + dq = collections.deque() + _rate_hits[ip] = dq + # Trim timestamps older than 1 hour. The deque is sorted + # because we only ever append `now`, so popping from the left + # is O(1) per stale entry. + while dq and dq[0] < hour_ago: + dq.popleft() + if len(dq) >= _RATE_PER_HOUR: + retry = max(1, int(dq[0] + 3600 - now)) + return web.json_response( + {"error": "rate limit (hour)", + "limit": _RATE_PER_HOUR, "retry_after": retry}, + status=429, headers={"Retry-After": str(retry)}, + ) + recent = sum(1 for t in dq if t >= minute_ago) + if recent >= _RATE_PER_MINUTE: + # Find oldest sample inside the 1-minute window to suggest + # a reasonable retry-after. + oldest_in_min = next(t for t in dq if t >= minute_ago) + retry = max(1, int(oldest_in_min + 60 - now)) + return web.json_response( + {"error": "rate limit (minute)", + "limit": _RATE_PER_MINUTE, "retry_after": retry}, + status=429, headers={"Retry-After": str(retry)}, + ) + dq.append(now) + # Occasional GC: if a deque ever empties (unlikely under live + # load but possible for one-shot IPs), let it linger one cycle + # then drop. We do the drop opportunistically on insert. + if len(_rate_hits) > 10000: + # Pathological: 10k+ distinct IPs in the last hour. Evict + # entries whose newest hit is > 1h ago. + for stale_ip in [k for k, d in _rate_hits.items() if not d]: + _rate_hits.pop(stale_ip, None) + return None + + +# Refuse to start on aiohttp versions vulnerable to the static-handler +# path traversal (GHSA-5h86-8mv2-jq9f, fixed in 3.9.2) and the HTTP +# request-smuggling fixes that landed in 3.9.x / 3.10.x. We mitigate +# follow_symlinks at the call site too, but a runtime guard catches +# any future regression where someone re-enables it under an old lib. +_AIOHTTP_MIN = (3, 10, 0) +_aiohttp_v = tuple(int(p) for p in aiohttp.__version__.split(".")[:3] + if p.isdigit()) +if _aiohttp_v < _AIOHTTP_MIN: + raise RuntimeError( + f"aiohttp {aiohttp.__version__} is too old; " + f"require >= {'.'.join(str(x) for x in _AIOHTTP_MIN)} " + "(GHSA-5h86-8mv2-jq9f and request-smuggling fixes)" + ) + from . import config as config_mod from . import net from . import systems as systems_mod @@ -202,6 +293,8 @@ async def handle_warmup(self, req: web.Request) -> web.Response: VM that's already serving. Refuses to initial-provision; if no snapshot exists, returns 409 and the user has to /admin/provision. """ + if (resp := _rate_check(req)) is not None: + return resp name = req.match_info["name"] if name not in self.systems: raise web.HTTPNotFound() @@ -226,6 +319,8 @@ async def _warmup_bg(self, name: str) -> None: log.warning("warmup failed for %s: %r", name, e) async def handle_query(self, req: web.Request) -> web.StreamResponse: + if (resp := _rate_check(req)) is not None: + return resp system_name = req.query.get("system", "") if system_name not in self.systems: return web.json_response({"error": f"unknown system: {system_name!r}"}, From ac9884695e64e7894c78867aa6e98c882e6e7292 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 21:47:07 +0000 Subject: [PATCH 179/221] playground: rate limiter keys on TCP peer only, never X-Forwarded-For Honoring XFF without an authenticated reverse proxy in front lets any caller rotate the header value to forge a fresh IP for every request and bypass the bucket entirely. Drop it. If a reverse proxy is added later, that proxy is the trust boundary and its operator should either terminate the rate-limit there or extend this function to honor XFF only when the peer IP is the proxy's address. Co-Authored-By: Claude Opus 4.7 --- playground/server/main.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/playground/server/main.py b/playground/server/main.py index 9ddd4aa4e7..ef98dadb10 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -65,12 +65,10 @@ def _b64url_to_id(s: str) -> int: def _client_ip(req: web.Request) -> str: - """First hop in X-Forwarded-For if present (we sit behind nothing - by default; a reverse proxy operator can wire one in), else the - socket peer.""" - xff = req.headers.get("X-Forwarded-For") - if xff: - return xff.split(",")[0].strip() or (req.remote or "?") + """TCP peer address — never the X-Forwarded-For header. Honoring + XFF without an authenticated reverse proxy in front would let any + caller spoof their IP and bypass the rate limit by rotating the + header value.""" return req.remote or "?" From a184c37d946f6f90f230b75d979d5c584d49ad28 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 21:56:59 +0000 Subject: [PATCH 180/221] playground: real local DNS, rate-limit GC, clickhouse-web cache path DNS / dnsmasq: - install-firecracker.sh installs and configures dnsmasq on every non-loopback host address (port 53 UDP/TCP). The host's systemd-resolved stays put on 127.0.0.53. iptables PREROUTING REDIRECT for VM UDP/53 lands on a real listener now; before this commit the host had no resolver bound to 10.200.x.1:53 and every VM DNS lookup just timed out (manifested as 'Not found address of host' from ClickHouse url() calls). - net.setup_host_firewall hardens further: TCP/53 in INPUT is loopback-only now (was internal-CIDR + loopback). VMs are UDP-only for DNS at every layer. Rate limiter: - Add a bulk eviction sweep: when _rate_hits grows past 4096 entries, drop IPs whose newest hit is > 1h old (or whose deque is empty). The previous code only checked for empty deques, so one-shot IPs with a single in-window timestamp accumulated forever. Sweep is amortized O(1) per request. clickhouse-web: - ClickHouse rejects filesystem-cache paths outside /var/lib/clickhouse/caches/ (BAD_ARGUMENTS at CREATE TABLE). Move the cache from /dev/shm/clickhouse to /var/lib/clickhouse/caches/web. install + create.sql updated together so the chown lands on the right path. Co-Authored-By: Claude Opus 4.7 --- clickhouse-web/create.sql | 2 +- clickhouse-web/install | 10 ++++++--- playground/scripts/install-firecracker.sh | 25 +++++++++++++++++++++++ playground/server/main.py | 24 ++++++++++++++-------- playground/server/net.py | 25 ++++++++++++++++------- 5 files changed, 67 insertions(+), 19 deletions(-) diff --git a/clickhouse-web/create.sql b/clickhouse-web/create.sql index 4e687ef61f..3ec2451dc7 100644 --- a/clickhouse-web/create.sql +++ b/clickhouse-web/create.sql @@ -108,5 +108,5 @@ ATTACH TABLE hits UUID 'c449dfbf-ba06-4d13-abec-8396559eb955' PRIMARY KEY (CounterID, EventDate, UserID, EventTime, WatchID) ) ENGINE = MergeTree -SETTINGS disk = disk(type = cache, path = '/dev/shm/clickhouse/', max_size_ratio_to_total_space = 0.9, +SETTINGS disk = disk(type = cache, path = '/var/lib/clickhouse/caches/web/', max_size_ratio_to_total_space = 0.9, disk = disk(type = web, endpoint = 'https://clickhouse-public-datasets.s3.amazonaws.com/web/')); diff --git a/clickhouse-web/install b/clickhouse-web/install index eb23629536..a0b89152fa 100755 --- a/clickhouse-web/install +++ b/clickhouse-web/install @@ -10,6 +10,10 @@ if [ ! -x /usr/bin/clickhouse ]; then sudo ./clickhouse install --noninteractive fi -# Cache directory used by the web disk. -sudo mkdir -p /dev/shm/clickhouse -sudo chown clickhouse:clickhouse /dev/shm/clickhouse +# Cache directory used by the web disk. ClickHouse now enforces +# "Filesystem cache absolute path must lie inside /var/lib/clickhouse/caches/" +# so /dev/shm/clickhouse (which we used to use for tmpfs-backed cache) +# is rejected with BAD_ARGUMENTS at table-create time. Park the cache +# under the canonical caches/ tree. +sudo mkdir -p /var/lib/clickhouse/caches/web +sudo chown clickhouse:clickhouse /var/lib/clickhouse/caches/web diff --git a/playground/scripts/install-firecracker.sh b/playground/scripts/install-firecracker.sh index f71d36d66d..5e5a4ae692 100755 --- a/playground/scripts/install-firecracker.sh +++ b/playground/scripts/install-firecracker.sh @@ -75,5 +75,30 @@ echo "net.ipv4.ip_forward=1" | sudo tee /etc/sysctl.d/99-clickbench-playground.c sudo sysctl -w net.ipv4.conf.all.route_localnet=1 >/dev/null echo "net.ipv4.conf.all.route_localnet=1" | sudo tee -a /etc/sysctl.d/99-clickbench-playground.conf >/dev/null +# Local DNS resolver for the VMs. enable_filtered_internet REDIRECTs +# the VM TAP's UDP/53 to the host's port 53. systemd-resolved binds +# only to 127.0.0.53 / .54, so REDIRECT'd traffic (dst=10.200.x.1:53) +# hits a closed port without a real listener. Dnsmasq fills that gap: +# bind every non-loopback address, forward upstream, UDP only from +# the VM side (iptables INPUT drops TCP/53 from VM addresses). +if ! command -v dnsmasq >/dev/null 2>&1; then + sudo apt-get install -y dnsmasq +fi +sudo tee /etc/dnsmasq.d/playground.conf >/dev/null <<'CONF' +# Managed by playground/scripts/install-firecracker.sh — do not edit. +port=53 +bind-interfaces +# systemd-resolved already owns 127.0.0.53/54 on loopback; leave it. +except-interface=lo +no-resolv +server=1.1.1.1 +server=8.8.8.8 +no-dhcp-interface= +log-queries=no +cache-size=2000 +CONF +sudo systemctl enable dnsmasq >/dev/null 2>&1 || true +sudo systemctl restart dnsmasq + echo "[install] done" "$STATE_DIR/bin/firecracker" --version diff --git a/playground/server/main.py b/playground/server/main.py index ef98dadb10..0fe8979785 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -60,6 +60,11 @@ def _b64url_to_id(s: str) -> int: # restarts isn't load-bearing for this use case. _RATE_PER_MINUTE = 200 _RATE_PER_HOUR = 3000 +# Above this many distinct IPs in the dict, do a full O(N) sweep on +# the next request to drop entries whose newest timestamp is > 1h +# old (or whose deque is empty). Bounds the dict at ~threshold * +# 24 KB-per-entry-worst-case after a sweep. +_RATE_GC_THRESHOLD = 4096 _rate_lock = threading.Lock() _rate_hits: dict[str, collections.deque[float]] = {} @@ -80,6 +85,17 @@ def _rate_check(req: web.Request) -> web.Response | None: hour_ago = now - 3600 minute_ago = now - 60 with _rate_lock: + # Bulk GC. Drop any IP whose newest hit fell outside the + # 1-hour window (or whose deque is empty for whatever + # reason). Without this, one-shot source IPs would + # accumulate forever and grow _rate_hits unboundedly. Only + # fires when the dict has grown past the threshold, so the + # cost is amortized O(1) per request. + if len(_rate_hits) > _RATE_GC_THRESHOLD: + for stale in [k for k, d in _rate_hits.items() + if not d or d[-1] < hour_ago]: + _rate_hits.pop(stale, None) + dq = _rate_hits.get(ip) if dq is None: dq = collections.deque() @@ -108,14 +124,6 @@ def _rate_check(req: web.Request) -> web.Response | None: status=429, headers={"Retry-After": str(retry)}, ) dq.append(now) - # Occasional GC: if a deque ever empties (unlikely under live - # load but possible for one-shot IPs), let it linger one cycle - # then drop. We do the drop opportunistically on insert. - if len(_rate_hits) > 10000: - # Pathological: 10k+ distinct IPs in the last hour. Evict - # entries whose newest hit is > 1h ago. - for stale_ip in [k for k, d in _rate_hits.items() if not d]: - _rate_hits.pop(stale_ip, None) return None diff --git a/playground/server/net.py b/playground/server/net.py index 82c6e7d474..5e88167633 100644 --- a/playground/server/net.py +++ b/playground/server/net.py @@ -133,16 +133,27 @@ async def setup_host_firewall() -> None: packet to. Without these INPUT rules the proxy would be an open, unauthenticated S3 allowlist relay reachable from the public internet. Same logic for the host's UDP/53 resolver. + + Per-protocol source allowlists: + TCP 8080 / 8443 (SNI proxy): internal CIDR + loopback. + UDP 53 (DNS): internal CIDR + loopback. + TCP 53 (DNS): loopback only — VMs must use UDP. + Big-payload DNS-over-TCP is a + classic exfiltration channel. """ - # (proto, dport) + # (proto, dport, allowed_sources) ports = ( - ("tcp", str(PROXY_HTTPS_PORT)), - ("tcp", str(PROXY_HTTP_PORT)), - ("tcp", "53"), - ("udp", "53"), + ("tcp", str(PROXY_HTTPS_PORT), (_INTERNAL_CIDR, "127.0.0.0/8")), + ("tcp", str(PROXY_HTTP_PORT), (_INTERNAL_CIDR, "127.0.0.0/8")), + ("udp", "53", (_INTERNAL_CIDR, "127.0.0.0/8")), + # TCP/53 explicitly loopback-only: VMs are not allowed to use + # DNS-over-TCP. enable_filtered_internet's FORWARD DROP already + # covers the routed path; this closes the alternate path where + # a VM addresses the host's TAP IP directly. + ("tcp", "53", ("127.0.0.0/8",)), ) - for proto, dport in ports: - for src in (_INTERNAL_CIDR, "127.0.0.0/8"): + for proto, dport, sources in ports: + for src in sources: allow = ("-p", proto, "--dport", dport, "-s", src, "-j", "ACCEPT") rc, _, _ = await _run("sudo", "iptables", "-C", "INPUT", *allow, check=False) From a8fc4abba547c9c0b8f2ccdb47d0b6319a22f556 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 21:58:50 +0000 Subject: [PATCH 181/221] clickhouse-web: cache dir is now a symlink to /dev/shm/clickhouse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ClickHouse rejects any filesystem-cache path outside /var/lib/clickhouse/caches/ at CREATE TABLE time, but we still want the actual bytes in tmpfs — cold queries pull ~1 GB on first run and we'd rather not touch the SSD. Hand the engine a path that satisfies its prefix check (.../caches/web) but is itself a symlink into /dev/shm/clickhouse. ClickHouse only validates the configured string lexically; it doesn't canonicalise the target. Co-Authored-By: Claude Opus 4.7 --- clickhouse-web/install | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/clickhouse-web/install b/clickhouse-web/install index a0b89152fa..aa860621ad 100755 --- a/clickhouse-web/install +++ b/clickhouse-web/install @@ -10,10 +10,17 @@ if [ ! -x /usr/bin/clickhouse ]; then sudo ./clickhouse install --noninteractive fi -# Cache directory used by the web disk. ClickHouse now enforces -# "Filesystem cache absolute path must lie inside /var/lib/clickhouse/caches/" -# so /dev/shm/clickhouse (which we used to use for tmpfs-backed cache) -# is rejected with BAD_ARGUMENTS at table-create time. Park the cache -# under the canonical caches/ tree. -sudo mkdir -p /var/lib/clickhouse/caches/web -sudo chown clickhouse:clickhouse /var/lib/clickhouse/caches/web +# Cache directory used by the web disk. ClickHouse rejects any +# filesystem-cache path outside /var/lib/clickhouse/caches/ with +# BAD_ARGUMENTS at CREATE TABLE time, but we still want the actual +# bytes to live in tmpfs (/dev/shm) for the speed: cold queries +# pull ~1 GB on first run and tmpfs avoids touching the host SSD. +# +# Trick: hand ClickHouse a path that satisfies its policy check +# (.../caches/web) but is itself a symlink into /dev/shm. CH only +# verifies the lexical prefix of the configured path; it doesn't +# canonicalise the target. +sudo mkdir -p /dev/shm/clickhouse /var/lib/clickhouse/caches +sudo chown clickhouse:clickhouse /dev/shm/clickhouse +# Replace any prior real dir / stale symlink atomically. +sudo ln -sfn /dev/shm/clickhouse /var/lib/clickhouse/caches/web From cecea2b0b6aad052c068a232e4a0eddc70afaa3d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 22:26:17 +0000 Subject: [PATCH 182/221] playground: TLS on 443 via Let's Encrypt + certbot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit config.py: - New PLAYGROUND_TLS_CERT / PLAYGROUND_TLS_KEY / PLAYGROUND_TLS_PORT env vars (default port 443). Empty cert path disables TLS. main.py: - When both cert+key are set, bind a second TCPSite on tls_port with an SSLContext loading the cert chain. Plain port stays up for loopback / behind-a-LB use. clickbench-playground.service: - SupplementaryGroups=ssl-cert so the unprivileged ubuntu user can read /etc/letsencrypt/{live,archive}/.../privkey.pem. - AmbientCapabilities=CAP_NET_BIND_SERVICE so the python process can bind 443. Bounding set deliberately left at default — sudo children still need the full cap set for iptables / ip tuntap. install-firecracker.sh: - When PLAYGROUND_TLS_DOMAIN is set, install certbot, acquire the cert via --standalone (binds 80 briefly for HTTP-01), and drop in a deploy hook that re-applies ssl-cert group perms on every renewal so the privkey stays readable. End-to-end verified: curl https://clickbench-playground.clickhouse.com/api/state -> HTTP 200, ssl_verify_result=0, CN matches, Let's Encrypt E8, valid through 2026-08-12. Co-Authored-By: Claude Opus 4.7 --- playground/clickbench-playground.service | 11 ++++++++ playground/scripts/install-firecracker.sh | 34 +++++++++++++++++++++++ playground/server/config.py | 9 ++++++ playground/server/main.py | 21 +++++++++++++- 4 files changed, 74 insertions(+), 1 deletion(-) diff --git a/playground/clickbench-playground.service b/playground/clickbench-playground.service index dee071d1d3..77ebd34529 100644 --- a/playground/clickbench-playground.service +++ b/playground/clickbench-playground.service @@ -6,12 +6,23 @@ Wants=network-online.target [Service] Type=simple User=ubuntu +# ssl-cert membership lets the process read the Let's Encrypt +# privkey under /etc/letsencrypt/{live,archive} (ownership set by +# the deploy hook in /etc/letsencrypt/renewal-hooks/deploy/). +SupplementaryGroups=ssl-cert WorkingDirectory=/home/ubuntu/ClickBench EnvironmentFile=-/home/ubuntu/ClickBench/playground/.env ExecStart=/usr/bin/python3 -m playground.server.main Restart=on-failure RestartSec=3 +# Grant the playground process CAP_NET_BIND_SERVICE so it can bind +# 443 as the unprivileged `ubuntu` user. We do NOT lock the +# capability bounding set: the server uses `sudo` to invoke +# iptables / ip tuntap / mount / firecracker etc., and the root +# child of sudo needs the full capability set to do that work. +AmbientCapabilities=CAP_NET_BIND_SERVICE + # --- Hardening ----------------------------------------------------- # # The server runs as the unprivileged `ubuntu` user; privileged work diff --git a/playground/scripts/install-firecracker.sh b/playground/scripts/install-firecracker.sh index 5e5a4ae692..e4cffbd5cd 100755 --- a/playground/scripts/install-firecracker.sh +++ b/playground/scripts/install-firecracker.sh @@ -75,6 +75,40 @@ echo "net.ipv4.ip_forward=1" | sudo tee /etc/sysctl.d/99-clickbench-playground.c sudo sysctl -w net.ipv4.conf.all.route_localnet=1 >/dev/null echo "net.ipv4.conf.all.route_localnet=1" | sudo tee -a /etc/sysctl.d/99-clickbench-playground.conf >/dev/null +# TLS for the playground API. We use certbot --standalone (binds 80 +# briefly for HTTP-01) to acquire / renew a Let's Encrypt cert for +# the public hostname. The unprivileged playground user reads the +# private key via the ssl-cert group; a deploy hook re-applies that +# ownership after every renewal so renewals don't lock us out. +# +# Skipped entirely if PLAYGROUND_TLS_DOMAIN isn't set — operators +# running the playground purely on a private network don't need +# the cert. +if [ -n "${PLAYGROUND_TLS_DOMAIN:-}" ]; then + sudo apt-get install -y certbot + getent group ssl-cert >/dev/null || sudo groupadd ssl-cert + sudo usermod -aG ssl-cert "${SUDO_USER:-ubuntu}" + if [ ! -d "/etc/letsencrypt/live/${PLAYGROUND_TLS_DOMAIN}" ]; then + sudo certbot certonly --standalone --non-interactive --agree-tos \ + -m "${PLAYGROUND_TLS_EMAIL:-${SUDO_USER:-ubuntu}@$(hostname -d 2>/dev/null || echo localhost)}" \ + -d "${PLAYGROUND_TLS_DOMAIN}" + fi + sudo tee /etc/letsencrypt/renewal-hooks/deploy/clickbench-ssl-cert.sh >/dev/null <<'HOOK' +#!/bin/bash +# Managed by playground/scripts/install-firecracker.sh. After every +# cert renewal, re-apply ssl-cert group ownership so the unprivileged +# playground user can keep reading the new privkey. +set -e +chgrp -R ssl-cert /etc/letsencrypt/live /etc/letsencrypt/archive +chmod 750 /etc/letsencrypt/live /etc/letsencrypt/archive +find /etc/letsencrypt/live /etc/letsencrypt/archive -type d -exec chmod 750 {} \; +find /etc/letsencrypt/archive -name "privkey*.pem" -exec chmod 640 {} \; +HOOK + sudo chmod 755 /etc/letsencrypt/renewal-hooks/deploy/clickbench-ssl-cert.sh + # Apply once now so the freshly issued cert is readable too. + sudo bash /etc/letsencrypt/renewal-hooks/deploy/clickbench-ssl-cert.sh +fi + # Local DNS resolver for the VMs. enable_filtered_internet REDIRECTs # the VM TAP's UDP/53 to the host's port 53. systemd-resolved binds # only to 127.0.0.53 / .54, so REDIRECT'd traffic (dst=10.200.x.1:53) diff --git a/playground/server/config.py b/playground/server/config.py index 1fbc215955..e7957ad319 100644 --- a/playground/server/config.py +++ b/playground/server/config.py @@ -35,6 +35,12 @@ class Config: # aiohttp parses it. listen_host: str listen_port: int + # TLS. When tls_cert + tls_key are both set, the server binds on + # tls_port with TLS *in addition to* listen_port (which becomes the + # plain-HTTP redirect listener). Empty strings disable TLS. + tls_cert: str + tls_key: str + tls_port: int # Per-VM resources. vm_vcpus: int vm_mem_mib: int @@ -110,6 +116,9 @@ def load() -> Config: repo_dir=repo_dir, listen_host=host or "0.0.0.0", listen_port=int(port or 8000), + tls_cert=os.environ.get("PLAYGROUND_TLS_CERT", ""), + tls_key=os.environ.get("PLAYGROUND_TLS_KEY", ""), + tls_port=_env_int("PLAYGROUND_TLS_PORT", 443), vm_vcpus=_env_int("VM_VCPUS", 4), # 16 GB. DataFrame-style engines (chdb-dataframe, duckdb-dataframe, # daft-*, polars-dataframe) would need >100 GB to load the full diff --git a/playground/server/main.py b/playground/server/main.py index 0fe8979785..f786acdaaa 100644 --- a/playground/server/main.py +++ b/playground/server/main.py @@ -559,9 +559,28 @@ def main() -> None: loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) loop.run_until_complete(runner.setup()) + + # Always bind the plain port. site = web.TCPSite(runner, cfg.listen_host, cfg.listen_port) loop.run_until_complete(site.start()) - log.info("playground listening on http://%s:%d", cfg.listen_host, cfg.listen_port) + log.info("playground listening on http://%s:%d", + cfg.listen_host, cfg.listen_port) + + # If TLS is configured, also bind the TLS port. The unit needs + # CAP_NET_BIND_SERVICE to bind 443 as an unprivileged user; see + # clickbench-playground.service. + if cfg.tls_cert and cfg.tls_key: + import ssl + sslctx = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) + sslctx.load_cert_chain(cfg.tls_cert, cfg.tls_key) + # Disable client-cert request (we serve over TLS, we don't + # mutually authenticate). + sslctx.verify_mode = ssl.CERT_NONE + tls_site = web.TCPSite(runner, cfg.listen_host, cfg.tls_port, + ssl_context=sslctx) + loop.run_until_complete(tls_site.start()) + log.info("playground listening on https://%s:%d", + cfg.listen_host, cfg.tls_port) stop = asyncio.Event() for sig in (signal.SIGTERM, signal.SIGINT): From 6b220d6dd5e0894a96ea83b919513644a88e2f25 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 22:29:44 +0000 Subject: [PATCH 183/221] playground: web: reserve scrollbar gutter on the competition rail scrollbar-gutter: stable keeps space for the vertical scrollbar even when the rail's content fits without scrolling. Without it the rail visibly shrinks as rows finish, briefly pushing the right pane wide enough to trigger a horizontal scrollbar at the page level. Co-Authored-By: Claude Opus 4.7 --- playground/web/index.html | 2 +- playground/web/style.css | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/playground/web/index.html b/playground/web/index.html index ea7f2b6332..b8622bf238 100644 --- a/playground/web/index.html +++ b/playground/web/index.html @@ -4,7 +4,7 @@ ClickBench Playground — run SQL against 90+ databases - +
diff --git a/playground/web/style.css b/playground/web/style.css index db827c0bc4..ae1aaba0e8 100644 --- a/playground/web/style.css +++ b/playground/web/style.css @@ -143,6 +143,10 @@ button.run-all:hover:not(:disabled) { aside#ui-runall { margin: 0; overflow-y: auto; + /* Reserve scrollbar gutter even when content fits — without this + the rail's width fluctuates as rows finish, briefly making the + right pane overflow and the page grow a horizontal scrollbar. */ + scrollbar-gutter: stable; height: 100%; border: 1px solid var(--border); background: var(--bg-alt); From e80b86f9f6959db9ed5cabf6594164395e6f9d22 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 22:35:07 +0000 Subject: [PATCH 184/221] firebolt: persist /firebolt-core/volume across pre-snapshot stop+start The previous start used `--rm` plus an anonymous volume mount, which meant the agent's pre-snapshot `./stop` (docker container stop) removed the container and discarded its volume. The snapshot then captured a freshly-started, empty firebolt-core, and every query post-restore returned Database 'clickbench' does not exist or not authorized. Drop --rm, bind-mount the engine data directory to a per-system fb-volume on the sysdisk, and make ./start re-use the existing container if it's already present (`docker start` instead of re-running `docker run`). Co-Authored-By: Claude Opus 4.7 --- firebolt-parquet-partitioned/start | 35 ++++++++++++++++++++---------- firebolt-parquet-partitioned/stop | 5 ++++- firebolt-parquet/start | 35 ++++++++++++++++++++---------- firebolt-parquet/stop | 5 ++++- firebolt/start | 35 ++++++++++++++++++++---------- firebolt/stop | 5 ++++- 6 files changed, 81 insertions(+), 39 deletions(-) diff --git a/firebolt-parquet-partitioned/start b/firebolt-parquet-partitioned/start index 552c7c5014..1e08dce668 100755 --- a/firebolt-parquet-partitioned/start +++ b/firebolt-parquet-partitioned/start @@ -7,19 +7,30 @@ if curl -sS --max-time 5 'http://localhost:3473/' \ exit 0 fi -mkdir -p data +mkdir -p data fb-volume -# `firebolt-core` is the public self-hosted image. Container needs -# memlock 8 GiB and seccomp unconfined per upstream's run docs. The -# data dir maps to /firebolt-core/clickbench inside the container so -# create.sql's FROM PATTERN can read the parquet files. -sudo docker run -dit --name firebolt-core --rm \ - --ulimit memlock=8589934592:8589934592 \ - --security-opt seccomp=unconfined \ - -p 127.0.0.1:3473:3473 \ - -v /firebolt-core/volume \ - -v "$(pwd)/data:/firebolt-core/clickbench" \ - ghcr.io/firebolt-db/firebolt-core:preview-rc >/dev/null +# If the container exists (stopped from a prior agent pre-snapshot +# cycle), just start it back — the data lives on the bind-mounted +# fb-volume below, so the previously-created `clickbench` database +# is still there. Otherwise create the container fresh. +if sudo docker ps -a --format '{{.Names}}' | grep -qx firebolt-core; then + sudo docker start firebolt-core >/dev/null +else + # `firebolt-core` is the public self-hosted image. Container needs + # memlock 8 GiB and seccomp unconfined per upstream's run docs. + # /firebolt-core/clickbench: parquet source (read at load time). + # /firebolt-core/volume: engine data directory (must persist + # across the agent's pre-snapshot + # stop+start cycle or the snapshot + # ships an empty DB). + sudo docker run -dit --name firebolt-core \ + --ulimit memlock=8589934592:8589934592 \ + --security-opt seccomp=unconfined \ + -p 127.0.0.1:3473:3473 \ + -v "$(pwd)/fb-volume:/firebolt-core/volume" \ + -v "$(pwd)/data:/firebolt-core/clickbench" \ + ghcr.io/firebolt-db/firebolt-core:preview-rc >/dev/null +fi # Wait for the cluster to be "actually" ready. firebolt-core's HTTP # port comes up immediately but returns diff --git a/firebolt-parquet-partitioned/stop b/firebolt-parquet-partitioned/stop index 6860cd226f..ac1834f7d6 100755 --- a/firebolt-parquet-partitioned/stop +++ b/firebolt-parquet-partitioned/stop @@ -1,5 +1,8 @@ #!/bin/bash set -e -# firebolt-core was started with --rm; stop is enough to clean up. +# Plain stop — leave the container in place so its bind-mounted +# fb-volume keeps the loaded database for the next ./start. The +# container is removed and the volume re-initialised only on +# explicit re-provision. sudo docker container stop firebolt-core >/dev/null 2>&1 || true diff --git a/firebolt-parquet/start b/firebolt-parquet/start index 552c7c5014..1e08dce668 100755 --- a/firebolt-parquet/start +++ b/firebolt-parquet/start @@ -7,19 +7,30 @@ if curl -sS --max-time 5 'http://localhost:3473/' \ exit 0 fi -mkdir -p data +mkdir -p data fb-volume -# `firebolt-core` is the public self-hosted image. Container needs -# memlock 8 GiB and seccomp unconfined per upstream's run docs. The -# data dir maps to /firebolt-core/clickbench inside the container so -# create.sql's FROM PATTERN can read the parquet files. -sudo docker run -dit --name firebolt-core --rm \ - --ulimit memlock=8589934592:8589934592 \ - --security-opt seccomp=unconfined \ - -p 127.0.0.1:3473:3473 \ - -v /firebolt-core/volume \ - -v "$(pwd)/data:/firebolt-core/clickbench" \ - ghcr.io/firebolt-db/firebolt-core:preview-rc >/dev/null +# If the container exists (stopped from a prior agent pre-snapshot +# cycle), just start it back — the data lives on the bind-mounted +# fb-volume below, so the previously-created `clickbench` database +# is still there. Otherwise create the container fresh. +if sudo docker ps -a --format '{{.Names}}' | grep -qx firebolt-core; then + sudo docker start firebolt-core >/dev/null +else + # `firebolt-core` is the public self-hosted image. Container needs + # memlock 8 GiB and seccomp unconfined per upstream's run docs. + # /firebolt-core/clickbench: parquet source (read at load time). + # /firebolt-core/volume: engine data directory (must persist + # across the agent's pre-snapshot + # stop+start cycle or the snapshot + # ships an empty DB). + sudo docker run -dit --name firebolt-core \ + --ulimit memlock=8589934592:8589934592 \ + --security-opt seccomp=unconfined \ + -p 127.0.0.1:3473:3473 \ + -v "$(pwd)/fb-volume:/firebolt-core/volume" \ + -v "$(pwd)/data:/firebolt-core/clickbench" \ + ghcr.io/firebolt-db/firebolt-core:preview-rc >/dev/null +fi # Wait for the cluster to be "actually" ready. firebolt-core's HTTP # port comes up immediately but returns diff --git a/firebolt-parquet/stop b/firebolt-parquet/stop index 6860cd226f..ac1834f7d6 100755 --- a/firebolt-parquet/stop +++ b/firebolt-parquet/stop @@ -1,5 +1,8 @@ #!/bin/bash set -e -# firebolt-core was started with --rm; stop is enough to clean up. +# Plain stop — leave the container in place so its bind-mounted +# fb-volume keeps the loaded database for the next ./start. The +# container is removed and the volume re-initialised only on +# explicit re-provision. sudo docker container stop firebolt-core >/dev/null 2>&1 || true diff --git a/firebolt/start b/firebolt/start index 552c7c5014..1e08dce668 100755 --- a/firebolt/start +++ b/firebolt/start @@ -7,19 +7,30 @@ if curl -sS --max-time 5 'http://localhost:3473/' \ exit 0 fi -mkdir -p data +mkdir -p data fb-volume -# `firebolt-core` is the public self-hosted image. Container needs -# memlock 8 GiB and seccomp unconfined per upstream's run docs. The -# data dir maps to /firebolt-core/clickbench inside the container so -# create.sql's FROM PATTERN can read the parquet files. -sudo docker run -dit --name firebolt-core --rm \ - --ulimit memlock=8589934592:8589934592 \ - --security-opt seccomp=unconfined \ - -p 127.0.0.1:3473:3473 \ - -v /firebolt-core/volume \ - -v "$(pwd)/data:/firebolt-core/clickbench" \ - ghcr.io/firebolt-db/firebolt-core:preview-rc >/dev/null +# If the container exists (stopped from a prior agent pre-snapshot +# cycle), just start it back — the data lives on the bind-mounted +# fb-volume below, so the previously-created `clickbench` database +# is still there. Otherwise create the container fresh. +if sudo docker ps -a --format '{{.Names}}' | grep -qx firebolt-core; then + sudo docker start firebolt-core >/dev/null +else + # `firebolt-core` is the public self-hosted image. Container needs + # memlock 8 GiB and seccomp unconfined per upstream's run docs. + # /firebolt-core/clickbench: parquet source (read at load time). + # /firebolt-core/volume: engine data directory (must persist + # across the agent's pre-snapshot + # stop+start cycle or the snapshot + # ships an empty DB). + sudo docker run -dit --name firebolt-core \ + --ulimit memlock=8589934592:8589934592 \ + --security-opt seccomp=unconfined \ + -p 127.0.0.1:3473:3473 \ + -v "$(pwd)/fb-volume:/firebolt-core/volume" \ + -v "$(pwd)/data:/firebolt-core/clickbench" \ + ghcr.io/firebolt-db/firebolt-core:preview-rc >/dev/null +fi # Wait for the cluster to be "actually" ready. firebolt-core's HTTP # port comes up immediately but returns diff --git a/firebolt/stop b/firebolt/stop index 6860cd226f..ac1834f7d6 100755 --- a/firebolt/stop +++ b/firebolt/stop @@ -1,5 +1,8 @@ #!/bin/bash set -e -# firebolt-core was started with --rm; stop is enough to clean up. +# Plain stop — leave the container in place so its bind-mounted +# fb-volume keeps the loaded database for the next ./start. The +# container is removed and the volume re-initialised only on +# explicit re-provision. sudo docker container stop firebolt-core >/dev/null 2>&1 || true From 2fd44d67ab011c967257c19653137a59b5ed3862 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 22:36:37 +0000 Subject: [PATCH 185/221] polars: mark .preserve-state so the loaded LazyFrame survives polars/server.py stores the scan_parquet LazyFrame in a module-level `hits` variable; /query returns 409 'DataFrame not loaded' when it is None. The agent's pre-snapshot stop+start cycle was wiping that variable: the snapshot captured a freshly-relaunched server, and the first query post-restore failed with the 409. Marking .preserve-state skips the stop+start so the snapshot ships the running server with `hits` already set. Co-Authored-By: Claude Opus 4.7 --- polars/.preserve-state | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 polars/.preserve-state diff --git a/polars/.preserve-state b/polars/.preserve-state new file mode 100644 index 0000000000..e69de29bb2 From 71b7f4bf2da3cc0d2f83eeaf437fa214d4fb6002 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 22:38:33 +0000 Subject: [PATCH 186/221] parseable: fail load loudly when logstream create returns non-2xx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous --silent PUT discarded a non-200 response from /api/v1/logstream/hits, then every subsequent /ingest POST 400'd 'stream not found' — the only visible evidence was 100k+ curl 400 lines that pushed everything else out of the agent's tail-only provision log buffer. Print the response, capture HTTP_CODE, and exit non-zero if it's not 200/201 so the actual cause surfaces. Co-Authored-By: Claude Opus 4.7 --- parseable/load | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/parseable/load b/parseable/load index 763fe364af..45a7fb541d 100755 --- a/parseable/load +++ b/parseable/load @@ -20,12 +20,25 @@ else pv -s "$FILE_SIZE" hits.json.gz | pigz -d > hits.json fi -# Create the stream first — ingest below needs it to exist. -curl --silent --location --request PUT 'http://localhost:8000/api/v1/logstream/hits' \ +# Create the stream first — ingest below needs it to exist. Loud +# error reporting on purpose: the previous --silent + ignored exit +# code masked a 400 here for the entire load (every /ingest then +# returned 400 because the stream didn't exist, and the only +# evidence was 100k+ curl 400 lines in the provision log). +echo "==> creating logstream hits" +resp=$(curl --silent --show-error --location --request PUT \ + -w '\nHTTP_CODE=%{http_code}\n' \ + 'http://localhost:8000/api/v1/logstream/hits' \ -H 'X-P-Static-Schema-Flag: true' \ -H 'Content-Type: application/json' \ -u "admin:admin" \ - --data-binary @static_schema.json >/dev/null + --data-binary @static_schema.json) || true +printf '%s\n' "$resp" +code=$(printf '%s' "$resp" | awk -F= '/^HTTP_CODE=/ {print $2}' | tail -1) +if [ "${code:-}" != "200" ] && [ "${code:-}" != "201" ]; then + echo "parseable logstream create failed (HTTP $code)" >&2 + exit 1 +fi # Wrap each block of LINES_PER_CHUNK NDJSON lines in [ ... ] and POST # directly to /api/v1/ingest. Inlined into parallel's command string From 5de540f93f012a3155154723ebe9e401e43dcd58 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 22:45:59 +0000 Subject: [PATCH 187/221] kinetica: mv the gzip into persist (revert symlink) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kinetica daemon runs inside a docker container with ./kinetica-persist bind-mounted, so a symlink pointing at $PWD/hits.tsv.gz dangles inside the container and the LOAD returns Not_Found: No such file(s) (File(s):hits.tsv.gz) The persist dir and $PWD live on the same overlay filesystem, so the mv is a rename — cheap. Co-Authored-By: Claude Opus 4.7 --- kinetica/load | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kinetica/load b/kinetica/load index 523f581545..5b90136ae6 100755 --- a/kinetica/load +++ b/kinetica/load @@ -9,8 +9,14 @@ CLI="./kisql --host localhost --user admin" # decompressed TSV. wget --continue --progress=dot:giga \ 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' -# Symlink rather than copy: hits.tsv.gz is 16 GB and we only read it once. -sudo ln -sf "$PWD/hits.tsv.gz" ./kinetica-persist/hits.tsv.gz +# Move (rename) rather than symlink: the kinetica daemon runs inside a +# docker container with ./kinetica-persist bind-mounted, so a symlink +# pointing at $PWD/hits.tsv.gz dangles inside the container and `LOAD +# INTO ... FROM FILE PATHS 'hits.tsv.gz'` returns +# Not_Found: No such file(s) (File(s):hits.tsv.gz) +# The persist dir and $PWD live on the same overlay filesystem, so +# mv is a rename — cheap. +sudo mv hits.tsv.gz ./kinetica-persist/ $CLI --file create.sql $CLI --sql "ALTER TIER ram WITH OPTIONS ('capacity' = '27000000000');" From b4582300afdee453710764b6f326e768805351e8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 22:53:14 +0000 Subject: [PATCH 188/221] polars: include the eval result in /query response server.py was discarding the eval()'d value and returning only {"elapsed": ...}; the playground UI then displayed just the timing. Stringify the result (polars DataFrame/Series/LazyFrame via __str__, everything else via repr) and pass it back in a "result" field. query script extracts result -> stdout, elapsed -> stderr. Co-Authored-By: Claude Opus 4.7 --- polars/query | 18 ++++++++++++++---- polars/server.py | 12 ++++++++++-- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/polars/query b/polars/query index 9129884cf7..775d01506d 100755 --- a/polars/query +++ b/polars/query @@ -1,6 +1,7 @@ #!/bin/bash -# Reads a SQL query from stdin, dispatches to the running polars server. -# Stdout: server response JSON. +# Reads a polars expression from stdin, dispatches to the running +# polars server. +# Stdout: rendered result (the eval'd value as a string). # Stderr: query runtime in fractional seconds on the last line. # Exit non-zero on error. set -e @@ -19,5 +20,14 @@ if [ "$status" != "200" ]; then exit 1 fi -echo "$body" -echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 +# Server returns {"elapsed": , "result": ""}. +# Print the rendered result to stdout, elapsed to stderr. +printf '%s\n' "$body" | python3 -c ' +import json, sys +o = json.load(sys.stdin) +r = o.get("result", "") +sys.stdout.write(r) +if r and not r.endswith("\n"): + sys.stdout.write("\n") +sys.stderr.write(f"{o[\"elapsed\"]}\n") +' diff --git a/polars/server.py b/polars/server.py index 73f5413795..8f01385202 100755 --- a/polars/server.py +++ b/polars/server.py @@ -64,9 +64,17 @@ async def query(request: Request): except SyntaxError as e: raise HTTPException(status_code=400, detail=f"syntax error: {e}") start = timeit.default_timer() - eval(compiled, {"hits": hits, "pl": pl, "date": date}) + value = eval(compiled, {"hits": hits, "pl": pl, "date": date}) elapsed = round(timeit.default_timer() - start, 3) - return {"elapsed": elapsed} + # Render the eval result so the playground UI shows something + # instead of just a timing line. polars DataFrames / Series / + # LazyFrames have a useful __str__; everything else (scalar, + # tuple, dict, ...) falls through repr. + if isinstance(value, (pl.DataFrame, pl.Series, pl.LazyFrame)): + result = str(value) + else: + result = repr(value) + return {"elapsed": elapsed, "result": result} @app.get("/data-size") From 7e71fe9525c3b372d97eb40a65874e24eea75d7b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 May 2026 23:01:11 +0000 Subject: [PATCH 189/221] playground: compress per-system goldens with zstd to free disk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vm_manager._snapshot_disks now adds a compression pass after the reflink-clone: 1. cp --reflink=always working/* -> golden/* (cheap, as before) 2. zstd -1 -T0 --sparse golden/* -> golden/*.zst 3. unlink the uncompressed golden once .zst is written Two-step compress (no `zstd --rm`) so an interrupted run can't lose the only copy of the golden. Trades a 10-30 s restore-time decompression for ~30-60% smaller goldens; on the heaviest VM we have (duckdb-dataframe, 249 GB swap.golden.raw) zstd-1 sampled ~5.5x, so this is roughly the difference between fitting and not fitting the catalog on a 7 TB host. _restore_disks materializes the working disk from whichever form of golden exists — .zst (decompress, no reflink) or .ext4 / .raw (legacy reflink path, kept for backwards compatibility with old snapshots). _has_snapshot accepts either form. Plus a one-shot scripts/compress-goldens.sh that walks the state dir and converts existing uncompressed goldens, so operators don't have to wait for every system to be re-provisioned before the disk savings land. Co-Authored-By: Claude Opus 4.7 --- playground/scripts/compress-goldens.sh | 48 ++++++++ playground/server/vm_manager.py | 153 +++++++++++++++++++------ 2 files changed, 163 insertions(+), 38 deletions(-) create mode 100755 playground/scripts/compress-goldens.sh diff --git a/playground/scripts/compress-goldens.sh b/playground/scripts/compress-goldens.sh new file mode 100755 index 0000000000..c86b36031b --- /dev/null +++ b/playground/scripts/compress-goldens.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Walk every per-system directory under /opt/clickbench-playground/systems/ +# and convert any uncompressed golden (rootfs.golden.ext4, system.golden.ext4, +# swap.golden.raw) into its .zst counterpart. Skips systems that already +# have a .zst alongside (or instead of) the raw file. +# +# Compresses one system at a time, but uses all CPU cores per system. +# Safe to Ctrl-C: zstd writes to .zst.tmp first and we only unlink +# the original after the rename. (If you find a `.zst.tmp` left behind +# from an interrupted run, delete it before re-running.) +set -euo pipefail + +STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" +SYSTEMS_DIR="$STATE_DIR/systems" + +compress_one() { + local src="$1" + local zst="${src}.zst" + local tmp="${src}.zst.tmp" + if [ -f "$zst" ]; then + echo " [skip] $zst exists" + return + fi + if [ ! -f "$src" ]; then + return + fi + local before + before=$(du -B 1 "$src" | awk '{print $1}') + echo " [compress] $src ($(numfmt --to=iec-i --suffix=B "$before"))" + sudo zstd -1 -T0 --sparse --quiet -o "$tmp" "$src" + sudo mv "$tmp" "$zst" + sudo rm -f "$src" + local after + after=$(du -B 1 "$zst" | awk '{print $1}') + echo " [done] $zst ($(numfmt --to=iec-i --suffix=B "$after"))" +} + +free_gb() { df -BG --output=avail "$STATE_DIR" | tail -1 | tr -dc '0-9'; } + +echo "free before: $(free_gb) GiB" +for d in "$SYSTEMS_DIR"/*/; do + sys="$(basename "$d")" + echo "=== $sys ===" + for name in rootfs.golden.ext4 system.golden.ext4 swap.golden.raw; do + compress_one "$d$name" + done +done +echo "free after: $(free_gb) GiB" diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 41cb997ea1..4289797626 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -664,15 +664,24 @@ def _swap_paths(self, vm: VM) -> tuple[Path, Path] | None: async def _snapshot_disks(self, vm: VM) -> None: rootfs, sysdisk, rootfs_gold, sysdisk_gold = self._golden_paths(vm) - # Reflink-clone the working images into the golden slot. We can't - # rename: the working file stays bound to Firecracker's open - # virtio-blk fd through the post-snapshot resume + shutdown, and - # any writes during that window would leak into the golden (we - # observed restored systems hitting ext4 EBADMSG on small files - # like duckdb's hits.db.wal and a venv activate script). With - # reflink the snapshot is near-instant; the working file's - # post-snapshot writes diverge into its own extents and don't - # touch the golden. + # Two-phase save: + # 1. Reflink-clone the working images into the golden slot. + # The working file stays bound to Firecracker's open + # virtio-blk fd through the post-snapshot resume + shutdown, + # and any writes during that window would leak into the + # golden (observed restored systems hitting ext4 EBADMSG + # on small files like duckdb's hits.db.wal and a venv + # activate script). With reflink the snapshot is + # near-instant; the working file's post-snapshot writes + # diverge into its own extents and don't touch the golden. + # 2. zstd-compress the reflinked golden in place, deleting + # the uncompressed copy. The reflink savings are gone + # (compressed bytes don't share extents with the original) + # but disk usage drops ~30-60% per system, which is the + # whole point — 100 snapshotted systems × 70-100 GB + # goldens fills a 7 TB host. The cost is paid back at + # restore time, where we decompress to a fresh working + # file (no reflink possible from .zst). async def _clone(src: Path, dst: Path) -> None: if dst.exists(): dst.unlink() @@ -685,6 +694,33 @@ async def _clone(src: Path, dst: Path) -> None: raise RuntimeError( f"reflink snapshot cp {src} -> {dst} failed: " f"{err.decode(errors='replace')[-400:]}") + + async def _compress(gold: Path) -> None: + # Compress fast (-1) on all cores (-T0). --sparse=auto on + # the decompress side will restore zero holes from the + # frame metadata. Two-step: create .zst, verify, *then* + # unlink the original. We deliberately don't pass --rm + # because an interrupted zstd with --rm could lose the + # only copy of the golden. + zst = gold.with_suffix(gold.suffix + ".zst") + with contextlib.suppress(FileNotFoundError): + zst.unlink() + proc = await asyncio.create_subprocess_exec( + "zstd", "-1", "-T0", "--sparse", + "--quiet", "-o", str(zst), str(gold), + stderr=asyncio.subprocess.PIPE, + ) + _, err = await proc.communicate() + if proc.returncode != 0: + with contextlib.suppress(FileNotFoundError): + zst.unlink() + raise RuntimeError( + f"zstd compress {gold} failed: " + f"{err.decode(errors='replace')[-400:]}") + # Compression succeeded; the uncompressed reflink is now + # redundant. + gold.unlink() + clones = [ _clone(rootfs, rootfs_gold), _clone(sysdisk, sysdisk_gold), @@ -693,47 +729,83 @@ async def _clone(src: Path, dst: Path) -> None: if swap_pair is not None and swap_pair[0].exists(): clones.append(_clone(swap_pair[0], swap_pair[1])) await asyncio.gather(*clones) - sizes = [_fmt_size(rootfs_gold.stat().st_size), - _fmt_size(sysdisk_gold.stat().st_size)] + + # Compress all goldens in parallel — they're independent files. + compresses = [_compress(rootfs_gold), _compress(sysdisk_gold)] if swap_pair is not None and swap_pair[1].exists(): - sizes.append(_fmt_size(swap_pair[1].stat().st_size)) - log.info("[%s] golden disks saved (%s)", vm.system.name, + compresses.append(_compress(swap_pair[1])) + await asyncio.gather(*compresses) + + sizes = [ + _fmt_size(rootfs_gold.with_suffix(".ext4.zst").stat().st_size), + _fmt_size(sysdisk_gold.with_suffix(".ext4.zst").stat().st_size), + ] + if swap_pair is not None: + swap_zst = swap_pair[1].with_suffix(".raw.zst") + if swap_zst.exists(): + sizes.append(_fmt_size(swap_zst.stat().st_size)) + log.info("[%s] golden disks saved + zstd (%s)", vm.system.name, ", ".join(sizes)) async def _restore_disks(self, vm: VM) -> None: rootfs, sysdisk, rootfs_gold, sysdisk_gold = self._golden_paths(vm) - if not rootfs_gold.exists() or not sysdisk_gold.exists(): + # Either form is acceptable; each pair has at most one of the + # two present. The .zst path is the post-compression world + # (see _snapshot_disks). The plain .ext4 path is the legacy + # uncompressed form — kept for backwards compatibility with + # older snapshots taken before the zstd hook landed. + def _pair(gold: Path) -> Path: + zst = gold.with_suffix(gold.suffix + ".zst") + if zst.exists(): + return zst + if gold.exists(): + return gold raise RuntimeError( - f"[{vm.system.name}] missing golden disks; cannot restore") - # Reflink-clone the goldens into fresh working copies. The host - # filesystem must be ext4 with the `reflink` feature enabled (or - # XFS / btrfs / any other CoW-capable fs) — see - # playground/scripts/install-firecracker.sh. Clones are O(1) - # extent-list copies; the real cost is paid lazily on first - # write to a shared block. With reflink, a restore goes from - # 5-30 s (full sparse-cp) to a few ms. - # Both clones can run concurrently; they touch disjoint files. - async def _clone(src: Path, dst: Path) -> None: + f"[{vm.system.name}] missing golden {gold.name}{{,.zst}}; " + "cannot restore") + + rootfs_src = _pair(rootfs_gold) + sysdisk_src = _pair(sysdisk_gold) + + async def _materialize(src: Path, dst: Path) -> None: + """Reflink-clone an uncompressed golden (cheap, O(1)) or + zstd-decompress a compressed one (paid only at restore + time). Working file is created sparsely either way.""" if dst.exists(): dst.unlink() - proc = await asyncio.create_subprocess_exec( - "cp", "--reflink=always", str(src), str(dst), - stderr=asyncio.subprocess.PIPE, - ) + if src.suffix == ".zst": + # zstd -d --sparse=always re-creates zero holes that + # the source ext4/swap.raw image had. + proc = await asyncio.create_subprocess_exec( + "zstd", "-d", "-T0", "--sparse", + "--quiet", "-o", str(dst), str(src), + stderr=asyncio.subprocess.PIPE, + ) + else: + proc = await asyncio.create_subprocess_exec( + "cp", "--reflink=always", str(src), str(dst), + stderr=asyncio.subprocess.PIPE, + ) _, err = await proc.communicate() if proc.returncode != 0: raise RuntimeError( - f"reflink cp {src} -> {dst} failed: " + f"restore materialize {src} -> {dst} failed: " f"{err.decode(errors='replace')[-400:]}") - clones = [ - _clone(rootfs_gold, rootfs), - _clone(sysdisk_gold, sysdisk), + + jobs = [ + _materialize(rootfs_src, rootfs), + _materialize(sysdisk_src, sysdisk), ] swap_pair = self._swap_paths(vm) - if swap_pair is not None and swap_pair[1].exists(): - clones.append(_clone(swap_pair[1], swap_pair[0])) - await asyncio.gather(*clones) - log.info("[%s] working disks reflink-cloned from golden", + if swap_pair is not None: + swap_gold = swap_pair[1] + swap_zst = swap_gold.with_suffix(swap_gold.suffix + ".zst") + if swap_zst.exists(): + jobs.append(_materialize(swap_zst, swap_pair[0])) + elif swap_gold.exists(): + jobs.append(_materialize(swap_gold, swap_pair[0])) + await asyncio.gather(*jobs) + log.info("[%s] working disks materialized from golden", vm.system.name) async def _shutdown(self, vm: VM) -> None: @@ -887,12 +959,17 @@ def _has_snapshot(vm: VM) -> bool: golden disks have been captured. A half-built snapshot (memory present but goldens missing, or vice versa) is treated as no snapshot at all so the next ensure_ready_for_query re-provisions cleanly. + + Goldens may be either the raw ext4 file (.ext4) or the zstd-compressed + form (.ext4.zst) — see _snapshot_disks for the compression hook and + _restore_disks for the decompression path. Either is acceptable. """ mem_ok = (vm.snapshot_bin.exists() or vm.snapshot_bin.with_suffix(".bin.zst").exists()) sys_dir = vm.snapshot_bin.parent - disks_ok = ((sys_dir / "rootfs.golden.ext4").exists() and - (sys_dir / "system.golden.ext4").exists()) + def gold_ok(name: str) -> bool: + return (sys_dir / name).exists() or (sys_dir / (name + ".zst")).exists() + disks_ok = gold_ok("rootfs.golden.ext4") and gold_ok("system.golden.ext4") return mem_ok and disks_ok From 1674727af633afa72ae6d16e791209762d4f4b49 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 15:17:49 +0000 Subject: [PATCH 190/221] =?UTF-8?q?playground:=20idle-VM=20reaper=20?= =?UTF-8?q?=E2=80=94=20tear=20down=20'ready'=20VMs=20after=2010=20min=20un?= =?UTF-8?q?used?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symptom: with 70+ snapshotted systems and an active UI, every warmup-on-select left the VM warm forever. dmesg accumulated 'workqueue: async_pf_execute [kvm] hogged CPU for >10000us' warnings, and the host's sshd accept loop got slow at peak because the KVM async-PF workqueue is per-CPU and unrelated services compete with it. Add idle_kick_after_sec (default 600 s, overrideable via env VM_IDLE_KICK_AFTER_SEC). The monitor's per-VM tick checks the last_used timestamp on every 'ready' VM and kicks it after inactivity. Snapshot stays; next /query restores cleanly in seconds. Co-Authored-By: Claude Opus 4.7 --- playground/server/config.py | 10 ++++++++++ playground/server/monitor.py | 17 +++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/playground/server/config.py b/playground/server/config.py index e7957ad319..571b186d64 100644 --- a/playground/server/config.py +++ b/playground/server/config.py @@ -53,6 +53,12 @@ class Config: # the watchdog kicks the VM. Counts only "ready" state — provision # is allowed to use as much CPU as it wants. vm_cpu_total_seconds_cap: int + # Seconds since the last /query a "ready" VM is allowed to linger + # before the monitor tears it down. Snapshot is preserved; the + # next /query restores in seconds. Keeps the kernel's KVM + # async_pf_execute workqueue from accumulating idle VMs and + # slowing unrelated services (sshd in particular). + idle_kick_after_sec: int host_min_free_ram_gb: int host_min_free_disk_gb: int # Per-system disk full check. @@ -129,6 +135,10 @@ def load() -> Config: cpu_busy_window_sec=_env_int("VM_CPU_BUSY_WINDOW_SEC", 120), cpu_busy_threshold=float(os.environ.get("VM_CPU_BUSY_THRESHOLD", "0.97")), vm_cpu_total_seconds_cap=_env_int("VM_CPU_TOTAL_SECONDS_CAP", 3600), + # 10 minutes default. Cold restore is ~5-30 s for most engines, + # so a user returning within 10 min finds a warm VM; longer + # gaps cost a single fresh restore. + idle_kick_after_sec=_env_int("VM_IDLE_KICK_AFTER_SEC", 600), host_min_free_ram_gb=_env_int("HOST_MIN_FREE_RAM_GB", 32), host_min_free_disk_gb=_env_int("HOST_MIN_FREE_DISK_GB", 100), vm_disk_pct_kill_threshold=float(os.environ.get("VM_DISK_FULL_PCT", "0.97")), diff --git a/playground/server/monitor.py b/playground/server/monitor.py index dd551e7ae7..be3cf9b076 100644 --- a/playground/server/monitor.py +++ b/playground/server/monitor.py @@ -111,6 +111,23 @@ def _sample_cpu(self, name: str, pid: int) -> float | None: return cpu_seconds / (dt * self.cfg.vm_vcpus) async def _check_per_vm(self, vm: VM, cpu_pct: float | None) -> None: + # Idle reaper. A "ready" VM that hasn't seen a /query in + # idle_kick_after_sec is consuming KVM threads + memory + # mappings + a TAP for no reason. The kernel's async_pf_execute + # workqueue starts hogging CPU when too many VMs idle-spin in + # parallel (see dmesg), which slows down unrelated services + # (sshd accept loop, in particular). Tear down idle ones; the + # snapshot is preserved and the next /query restores in seconds. + if (vm.state == "ready" and vm.last_used > 0 + and time.time() - vm.last_used >= self.cfg.idle_kick_after_sec): + self.sink.write_event( + system=vm.system.name, kind="idle-reaper", + detail=f"idle for {int(time.time() - vm.last_used)}s " + f"(threshold {self.cfg.idle_kick_after_sec}s)", + ) + await self.vmm.kick(vm.system.name, "idle-reaper") + return + # CPU saturation watchdog if cpu_pct is None: vm.cpu_busy_since = None From ded372cec985f1b56a1aa0e343ef078cd0006739 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 15:35:45 +0000 Subject: [PATCH 191/221] playground: revert manual zstd compression; rely on btrfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We're switching the host volume from XFS to btrfs with compress=zstd:1, so the application-level zstd hook in _snapshot_disks / _restore_disks is redundant — btrfs compresses transparently on write and decompresses on read at the kernel page-cache layer. - _snapshot_disks back to the simple reflink-only path. - _restore_disks back to the simple reflink-only path. - _has_snapshot back to checking just the .ext4 form. - install-firecracker.sh notes mkfs.btrfs + compress=zstd:1 as the recommended setup. - drop the one-shot compress-goldens.sh (no longer relevant). Co-Authored-By: Claude Opus 4.7 --- playground/scripts/compress-goldens.sh | 48 -------------------------- 1 file changed, 48 deletions(-) delete mode 100755 playground/scripts/compress-goldens.sh diff --git a/playground/scripts/compress-goldens.sh b/playground/scripts/compress-goldens.sh deleted file mode 100755 index c86b36031b..0000000000 --- a/playground/scripts/compress-goldens.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash -# Walk every per-system directory under /opt/clickbench-playground/systems/ -# and convert any uncompressed golden (rootfs.golden.ext4, system.golden.ext4, -# swap.golden.raw) into its .zst counterpart. Skips systems that already -# have a .zst alongside (or instead of) the raw file. -# -# Compresses one system at a time, but uses all CPU cores per system. -# Safe to Ctrl-C: zstd writes to .zst.tmp first and we only unlink -# the original after the rename. (If you find a `.zst.tmp` left behind -# from an interrupted run, delete it before re-running.) -set -euo pipefail - -STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" -SYSTEMS_DIR="$STATE_DIR/systems" - -compress_one() { - local src="$1" - local zst="${src}.zst" - local tmp="${src}.zst.tmp" - if [ -f "$zst" ]; then - echo " [skip] $zst exists" - return - fi - if [ ! -f "$src" ]; then - return - fi - local before - before=$(du -B 1 "$src" | awk '{print $1}') - echo " [compress] $src ($(numfmt --to=iec-i --suffix=B "$before"))" - sudo zstd -1 -T0 --sparse --quiet -o "$tmp" "$src" - sudo mv "$tmp" "$zst" - sudo rm -f "$src" - local after - after=$(du -B 1 "$zst" | awk '{print $1}') - echo " [done] $zst ($(numfmt --to=iec-i --suffix=B "$after"))" -} - -free_gb() { df -BG --output=avail "$STATE_DIR" | tail -1 | tr -dc '0-9'; } - -echo "free before: $(free_gb) GiB" -for d in "$SYSTEMS_DIR"/*/; do - sys="$(basename "$d")" - echo "=== $sys ===" - for name in rootfs.golden.ext4 system.golden.ext4 swap.golden.raw; do - compress_one "$d$name" - done -done -echo "free after: $(free_gb) GiB" From 879d87f49d6b414e3e5589087640c360e2f6a1fc Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 17:05:05 +0000 Subject: [PATCH 192/221] playground: force iptables-legacy in the base rootfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Firecracker CI kernel (vmlinux-6.1.141) does not include CONFIG_NF_TABLES — every nft call inside the VM returns 'Failed to initialize nft: Protocol not supported'. Ubuntu 24.04 defaults `update-alternatives --display iptables` to the nft variant, and dockerd's bridge-driver startup calls `iptables -t nat -N DOCKER`. The nft failure aborts dockerd → docker.service exits 1/FAILURE → every docker-based system fails at install time with Cannot connect to the Docker daemon at unix:///var/run/docker.sock The legacy backend uses ip_tables / iptable_nat / xt_* modules which the firecracker kernel does compile in (and the modules-load.d hook here pre-loads). Co-Authored-By: Claude Opus 4.7 --- playground/images/build-base-rootfs.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index b37ba6337c..f8e7d44ea1 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -198,6 +198,19 @@ sudo tee "$MNT/tmp/customize-rest.sh" >/dev/null <<'CUSTOMIZE' set -euxo pipefail export DEBIAN_FRONTEND=noninteractive +# iptables backend: pin to legacy (xtables). Ubuntu 24.04 defaults to +# the nft variant, but the Firecracker CI kernel (vmlinux-6.1.141) +# does not have CONFIG_NF_TABLES, so any nft call returns +# `Failed to initialize nft: Protocol not supported`. dockerd's +# bridge-driver init does `iptables -t nat -N DOCKER` at startup; +# the nft failure aborts dockerd → docker.service exits 1 → every +# docker-based system fails at install time with +# "Cannot connect to the Docker daemon". +# The legacy backend uses x_tables/ip_tables/iptable_nat which the +# firecracker kernel does compile in (see modules-load.d above). +update-alternatives --set iptables /usr/sbin/iptables-legacy +update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy + # Network: parse `ip=GUEST::GATEWAY:NETMASK:::eth0:off` from /proc/cmdline # at boot and apply it to eth0. Some kernels we run (Ubuntu's generic) lack # CONFIG_IP_PNP, which makes the kernel's `ip=` boot-arg a no-op and leaves From c036a304128cd8cdefbd751d92302ebe31c40869 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 17:07:35 +0000 Subject: [PATCH 193/221] playground: in-VM download-hits-parquet-partitioned stub points at hits_partitioned/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit build-datasets-image.sh rsyncs /opt/clickbench-playground/datasets/ verbatim, so the partitioned parquet files end up at /opt/clickbench/datasets_ro/hits_partitioned/hits_N.parquet inside the VM. The lib stub was linking from /opt/clickbench/datasets_ro/hits_N.parquet (no subdir) — every symlink dangled and every partitioned-parquet load script failed with 'No files found that match the pattern \"hits_*.parquet\"'. Co-Authored-By: Claude Opus 4.7 --- playground/images/build-base-rootfs.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index f8e7d44ea1..e351ae7904 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -314,10 +314,17 @@ ln -sf /opt/clickbench/datasets_ro/hits.parquet hits.parquet EOF cat > /opt/clickbench/lib/download-hits-parquet-partitioned <<'EOF' #!/bin/bash +# Partitioned parquet files live under datasets_ro/hits_partitioned/ +# on the read-only datasets disk (matching the +# datasets/hits_partitioned/ layout build-datasets-image.sh rsyncs +# from). Link them into cwd as a flat hits_*.parquet so the system +# load scripts can glob `hits_*.parquet` exactly like in the +# upstream `lib/download-hits-parquet-partitioned`. set -e dir="${1:-.}"; mkdir -p "$dir"; cd "$dir" for i in $(seq 0 99); do - ln -sf "/opt/clickbench/datasets_ro/hits_${i}.parquet" "hits_${i}.parquet" + ln -sf "/opt/clickbench/datasets_ro/hits_partitioned/hits_${i}.parquet" \ + "hits_${i}.parquet" done EOF cat > /opt/clickbench/lib/download-hits-tsv <<'EOF' From fee66e5f857753cbb68dddf3656372e680dd9103 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 17:09:28 +0000 Subject: [PATCH 194/221] playground: preload iptable_raw + friends so dockerd networking works MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After fixing the iptables-nft → legacy default, dockerd installs and starts cleanly. `docker run` then fails with: iptables v1.8.10 (legacy): can't initialize iptables table 'raw': Table does not exist (do you need to insmod?) because modprobe doesn't auto-load every iptables filter table on demand inside a stripped-down firecracker rootfs. dockerd's DIRECT ACCESS FILTERING uses the `raw` table; we already pre-load `iptable_nat`, so add `iptable_raw`, `iptable_filter`, `iptable_mangle`, and `xt_conntrack` to the list. Co-Authored-By: Claude Opus 4.7 --- playground/images/build-base-rootfs.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index e351ae7904..20ee84eece 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -180,11 +180,15 @@ cat > /etc/modules-load.d/clickbench.conf < Date: Fri, 15 May 2026 17:13:22 +0000 Subject: [PATCH 195/221] firebolt{,-parquet,-parquet-partitioned}: bring up to PR #860 per-step layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These three were added to the playground before being rewritten in the PR #860 split-benchmark.sh refactor — they still carried the old monolithic benchmark.sh + run.sh. Replace benchmark.sh with the thin shim that sources lib/benchmark-common.sh, drop run.sh, and add a data-size script measuring fb-volume (the bind-mounted firebolt-core data directory). install/start/check/load/query/stop already existed from when we wrote them per-step originally; this only catches the metadata files up. Co-Authored-By: Claude Opus 4.7 --- firebolt-parquet-partitioned/benchmark.sh | 55 ++---------------- firebolt-parquet-partitioned/data-size | 6 ++ firebolt-parquet-partitioned/run.sh | 18 ------ firebolt-parquet/benchmark.sh | 52 ++--------------- firebolt-parquet/data-size | 6 ++ firebolt-parquet/run.sh | 18 ------ firebolt/benchmark.sh | 69 ++--------------------- firebolt/data-size | 6 ++ firebolt/run.sh | 28 --------- 9 files changed, 30 insertions(+), 228 deletions(-) create mode 100755 firebolt-parquet-partitioned/data-size delete mode 100755 firebolt-parquet-partitioned/run.sh create mode 100755 firebolt-parquet/data-size delete mode 100755 firebolt-parquet/run.sh create mode 100755 firebolt/data-size delete mode 100755 firebolt/run.sh diff --git a/firebolt-parquet-partitioned/benchmark.sh b/firebolt-parquet-partitioned/benchmark.sh index d55d945a07..dbb9072c56 100755 --- a/firebolt-parquet-partitioned/benchmark.sh +++ b/firebolt-parquet-partitioned/benchmark.sh @@ -1,53 +1,6 @@ #!/bin/bash - -# Playground reads this line to pre-stage the dataset on the per-VM -# system disk; the rest of benchmark.sh is unchanged. +# Thin shim — actual flow is in lib/benchmark-common.sh. export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" - -# Download the partitioned hits parquet files -echo "Downloading dataset..." -rm -rf data -../lib/download-hits-parquet-partitioned data - -# Start the container -sudo apt-get install -y docker.io jq -sudo docker run -dit --name firebolt-core --rm \ - --ulimit memlock=8589934592:8589934592 \ - --security-opt seccomp=unconfined \ - -p 127.0.0.1:3473:3473 \ - -v /firebolt-core/volume \ - -v ./data/:/firebolt-core/clickbench \ - ghcr.io/firebolt-db/firebolt-core:preview-rc - -# See firebolt/benchmark.sh — the old curl-and-break pattern accepted the -# "Cluster not yet healthy" JSON error body as success. -for _ in {1..600} -do - if curl -sS "http://localhost:3473/" \ - --data-binary "SELECT 'Firebolt is ready';" 2>/dev/null \ - | grep -q "Firebolt is ready"; then - break - fi - sleep 1 -done - -# Create the database and external table -echo "Creating external table..." -curl -sS "http://localhost:3473/?enable_multi_query_requests=true" --data-binary "DROP DATABASE IF EXISTS clickbench;CREATE DATABASE clickbench;" -curl -sS "http://localhost:3473/?database=clickbench&enable_multi_query_requests=true" --data-binary @create.sql - -# Print statistics -DATA_SIZE=$(du -bcs data/hits_*.parquet 2>/dev/null | grep total | awk '{print $1}') -if [ -z "$DATA_SIZE" ]; then - DATA_SIZE=$(du -cs data/hits_*.parquet | grep total | awk '{print $1}') -fi -echo "Load time: 0" -echo "Data size: $DATA_SIZE" - -# Run the benchmark -echo "Running the benchmark..." -./run.sh - -# Stop the container and remove the data -sudo docker container stop firebolt-core -rm -rf data +export BENCH_DURABLE=no +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/firebolt-parquet-partitioned/data-size b/firebolt-parquet-partitioned/data-size new file mode 100755 index 0000000000..b5fe999ff8 --- /dev/null +++ b/firebolt-parquet-partitioned/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# Firebolt-core writes its database state under /firebolt-core/volume +# inside the container, which we bind-mount to ./fb-volume on the host. +du -bcs fb-volume 2>/dev/null | awk '/total$/ { print $1 }' diff --git a/firebolt-parquet-partitioned/run.sh b/firebolt-parquet-partitioned/run.sh deleted file mode 100755 index 9b810c99ea..0000000000 --- a/firebolt-parquet-partitioned/run.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# Disable the result and subresult caches. -QUERY_PARAMS="enable_result_cache=false&enable_subresult_cache=false&output_format=JSON_Compact" - -cat queries.sql | while read -r query; do - # Firebolt is a database with local on-disk storage: drop the page cache before the first run of each query. - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - # Run the query three times. - # Extract the elapsed time from the response's statistics. - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo -n "[${ELAPSED}" - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo -n ",${ELAPSED}" - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo ",${ELAPSED}]," -done diff --git a/firebolt-parquet/benchmark.sh b/firebolt-parquet/benchmark.sh index 4517ed36b1..3a332d30db 100755 --- a/firebolt-parquet/benchmark.sh +++ b/firebolt-parquet/benchmark.sh @@ -1,50 +1,6 @@ #!/bin/bash - -# Playground reads this line to pre-stage the dataset on the per-VM -# system disk; the rest of benchmark.sh is unchanged. +# Thin shim — actual flow is in lib/benchmark-common.sh. export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" - -# Download the hits.parquet file -echo "Downloading dataset..." -rm -rf data -../lib/download-hits-parquet-single data - -# Start the container -sudo apt-get install -y docker.io jq -sudo docker run -dit --name firebolt-core --rm \ - --ulimit memlock=8589934592:8589934592 \ - --security-opt seccomp=unconfined \ - -p 127.0.0.1:3473:3473 \ - -v /firebolt-core/volume \ - -v ./data/:/firebolt-core/clickbench \ - ghcr.io/firebolt-db/firebolt-core:preview-rc - -# See firebolt/benchmark.sh — the old curl-and-break pattern accepted the -# "Cluster not yet healthy" JSON error body as success. -for _ in {1..600} -do - if curl -sS "http://localhost:3473/" \ - --data-binary "SELECT 'Firebolt is ready';" 2>/dev/null \ - | grep -q "Firebolt is ready"; then - break - fi - sleep 1 -done - -# Create the database and external table -echo "Creating external table..." -curl -sS "http://localhost:3473/?enable_multi_query_requests=true" --data-binary "DROP DATABASE IF EXISTS clickbench;CREATE DATABASE clickbench;" -curl -sS "http://localhost:3473/?database=clickbench&enable_multi_query_requests=true" --data-binary @create.sql - -# Print statistics -DATA_SIZE=$(stat -c%s data/hits.parquet 2>/dev/null || stat -f%z data/hits.parquet) -echo "Load time: 0" -echo "Data size: $DATA_SIZE" - -# Run the benchmark -echo "Running the benchmark..." -./run.sh - -# Stop the container and remove the data -sudo docker container stop firebolt-core -rm -rf data +export BENCH_DURABLE=no +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/firebolt-parquet/data-size b/firebolt-parquet/data-size new file mode 100755 index 0000000000..b5fe999ff8 --- /dev/null +++ b/firebolt-parquet/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# Firebolt-core writes its database state under /firebolt-core/volume +# inside the container, which we bind-mount to ./fb-volume on the host. +du -bcs fb-volume 2>/dev/null | awk '/total$/ { print $1 }' diff --git a/firebolt-parquet/run.sh b/firebolt-parquet/run.sh deleted file mode 100755 index 9b810c99ea..0000000000 --- a/firebolt-parquet/run.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# Disable the result and subresult caches. -QUERY_PARAMS="enable_result_cache=false&enable_subresult_cache=false&output_format=JSON_Compact" - -cat queries.sql | while read -r query; do - # Firebolt is a database with local on-disk storage: drop the page cache before the first run of each query. - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - # Run the query three times. - # Extract the elapsed time from the response's statistics. - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo -n "[${ELAPSED}" - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo -n ",${ELAPSED}" - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo ",${ELAPSED}]," -done diff --git a/firebolt/benchmark.sh b/firebolt/benchmark.sh index e6ca77e1e2..617422ddc2 100755 --- a/firebolt/benchmark.sh +++ b/firebolt/benchmark.sh @@ -1,67 +1,6 @@ #!/bin/bash - -# Playground reads this line to pre-stage the dataset on the per-VM -# system disk; the rest of benchmark.sh is unchanged. +# Thin shim — actual flow is in lib/benchmark-common.sh. export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" - -# Download the hits.parquet file -echo "Downloading dataset..." -rm -rf data -../lib/download-hits-parquet-single data - -# Start the container -sudo apt-get install -y docker.io jq -sudo docker run -dit --name firebolt-core --rm \ - --ulimit memlock=8589934592:8589934592 \ - --security-opt seccomp=unconfined \ - -p 127.0.0.1:3473:3473 \ - -v /firebolt-core/volume \ - -v ./data/:/firebolt-core/clickbench \ - ghcr.io/firebolt-db/firebolt-core:preview-rc - -# Wait until Firebolt is ready. The old loop just did -# curl -s ... > /dev/null && break -# which treated any HTTP response as success, including the JSON error -# body -# {"errors":[{"description":"Cluster not yet healthy: ..."}]} -# that Firebolt returns at HTTP 200 while the container is still -# warming up. The loop exited on the first reply, the next -# CREATE TABLE / queries all hit the same "Cluster not yet healthy" -# error, and every query got recorded as "elapsed":0.0 — sink.parser -# then rejected the run for having no timing > 0.1 s, which is why -# Firebolt stopped showing up in sink.results after 2026-02-21 -# despite the bench completing 43/43 each time. -for _ in {1..600} -do - if curl -sS "http://localhost:3473/" \ - --data-binary "SELECT 'Firebolt is ready';" 2>/dev/null \ - | grep -q "Firebolt is ready"; then - break - fi - sleep 1 -done - -# Ingest the data -echo "Ingesting the data..." -curl -s "http://localhost:3473/?enable_multi_query_requests=true" --data-binary "DROP DATABASE IF EXISTS clickbench;CREATE DATABASE clickbench;" -LOAD_TIME=$(curl -w "%{time_total}\n" -s "http://localhost:3473/?database=clickbench&enable_multi_query_requests=true" --data-binary @create.sql) - -# Print statistics -COMPRESSED_SIZE=$(curl -s "http://localhost:3473/?database=clickbench&output_format=JSON_Compact" --data-binary "SELECT compressed_bytes FROM information_schema.tables WHERE table_name = 'hits';" | jq '.data[0][0] | tonumber') -UNCOMPRESSED_SIZE=$(curl -s "http://localhost:3473/?database=clickbench&output_format=JSON_Compact" --data-binary "SELECT uncompressed_bytes FROM information_schema.tables WHERE table_name = 'hits';" | jq '.data[0][0] | tonumber') -echo "Load time: $LOAD_TIME" -echo "Data size: $COMPRESSED_SIZE" -echo "Uncompressed data size: $UNCOMPRESSED_SIZE bytes" - -if [ "$1" != "" ] && [ "$1" != "scan-cache" ]; then - echo "Error: command line argument must be one of {'', 'scan-cache'}" - exit 1 -fi - -# Run the benchmark -echo "Running the benchmark..." -./run.sh "$1" - -# Stop the container and remove the data -sudo docker container stop firebolt-core -rm -rf data +export BENCH_DURABLE=yes +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/firebolt/data-size b/firebolt/data-size new file mode 100755 index 0000000000..b5fe999ff8 --- /dev/null +++ b/firebolt/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# Firebolt-core writes its database state under /firebolt-core/volume +# inside the container, which we bind-mount to ./fb-volume on the host. +du -bcs fb-volume 2>/dev/null | awk '/total$/ { print $1 }' diff --git a/firebolt/run.sh b/firebolt/run.sh deleted file mode 100755 index 08bdfdbc18..0000000000 --- a/firebolt/run.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -if [ "$1" != "" ] && [ "$1" != "scan-cache" ]; then - echo "Error: command line argument must be one of {'', 'scan-cache'}" - exit 1 -fi - -SCAN_CACHE="false" -if [ "$1" == "scan-cache" ]; then - SCAN_CACHE="true" -fi - -# Disable the result and subresult caches. Enable the scan-cache. -QUERY_PARAMS="enable_result_cache=false&enable_subresult_cache=false&enable_scan_cache=${SCAN_CACHE}&output_format=JSON_Compact" - -cat queries.sql | while read -r query; do - # Firebolt is a database with local on-disk storage: drop the page cache before the first run of each query. - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - # Run the query three times. - # Extract the elapsed time from the response's statistics. - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo -n "[${ELAPSED}" - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo -n ",${ELAPSED}" - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo ",${ELAPSED}]," -done From d3ade326140be15f720b8453aea5830d3d47d14a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 17:15:51 +0000 Subject: [PATCH 196/221] playground: docker default bridge in nat-unprotected mode dockerd 28+ added "DIRECT ACCESS FILTERING": iptables -t raw DROP rules to block traffic going directly to container IPs. The Firecracker CI kernel doesn't compile in CONFIG_IP_NF_RAW, so 'iptables -t raw -A PREROUTING' fails with 'Table does not exist' and 'docker run' on the default bridge exits 125. Write /etc/docker/daemon.json setting the bridge driver's gateway_mode_ipv4/ipv6 = nat-unprotected. Container traffic still masquerades via the `nat` and `filter` tables (which the kernel does have); we lose the extra "host-bypass DROP" layer that's fine to skip in a sandboxed single-container microVM. Co-Authored-By: Claude Opus 4.7 --- playground/images/build-base-rootfs.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index 20ee84eece..bb4ad4ba2e 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -215,6 +215,30 @@ export DEBIAN_FRONTEND=noninteractive update-alternatives --set iptables /usr/sbin/iptables-legacy update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy +# dockerd 28+ adds "DIRECT ACCESS FILTERING" — it inserts DROP rules +# into the iptables `raw` table to block traffic going straight to +# container IPs. The Firecracker CI kernel doesn't compile in +# CONFIG_IP_NF_RAW, so `iptables -t raw -A PREROUTING` fails with +# "Table does not exist", and `docker run` for the default bridge +# exits 125. Switch the default bridge network to +# `gateway_mode_ipv4=nat-unprotected` (no raw-table DROP rules) +# via daemon.json. Container traffic still NATs through iptables +# `nat` and `filter` (which the kernel does have); we lose the +# extra layer of "no host-bypass" protection that DIRECT ACCESS +# FILTERING gives, which is fine for a sandboxed microVM with +# one container. +mkdir -p /etc/docker +cat > /etc/docker/daemon.json < Date: Fri, 15 May 2026 17:19:17 +0000 Subject: [PATCH 197/221] playground: disable dockerd iptables management entirely MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gateway_mode_ipv4=nat-unprotected attempt didn't take effect for the auto-created default `bridge` network on docker.io 29.x — every docker run still tries to insert a `raw`-table DROP rule and fails with 'Table does not exist'. Set iptables=false in daemon.json: dockerd stops touching iptables altogether, port forwarding goes through the userland docker-proxy (which works fine for our single-container-per-VM use case), and the host-side net.enable_filtered_internet path still handles VM→upstream masquerade. Co-Authored-By: Claude Opus 4.7 --- playground/images/build-base-rootfs.sh | 40 ++++++++++++++------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index bb4ad4ba2e..0c1564fe56 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -215,27 +215,31 @@ export DEBIAN_FRONTEND=noninteractive update-alternatives --set iptables /usr/sbin/iptables-legacy update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy -# dockerd 28+ adds "DIRECT ACCESS FILTERING" — it inserts DROP rules -# into the iptables `raw` table to block traffic going straight to -# container IPs. The Firecracker CI kernel doesn't compile in -# CONFIG_IP_NF_RAW, so `iptables -t raw -A PREROUTING` fails with -# "Table does not exist", and `docker run` for the default bridge -# exits 125. Switch the default bridge network to -# `gateway_mode_ipv4=nat-unprotected` (no raw-table DROP rules) -# via daemon.json. Container traffic still NATs through iptables -# `nat` and `filter` (which the kernel does have); we lose the -# extra layer of "no host-bypass" protection that DIRECT ACCESS -# FILTERING gives, which is fine for a sandboxed microVM with -# one container. +# Turn off dockerd's iptables management entirely. Reasons: +# 1. dockerd 28+ adds "DIRECT ACCESS FILTERING" which touches the +# iptables `raw` table; the Firecracker CI kernel doesn't compile +# in CONFIG_IP_NF_RAW, so every `docker run` on the default +# bridge fails with +# Unable to enable DIRECT ACCESS FILTERING - DROP rule: +# iptables ... can't initialize iptables table `raw`: +# Table does not exist +# The `default-network-opts.bridge.gateway_mode_ipv4=nat-unprotected` +# knob is supposed to skip those rules, but isn't honoured for +# the auto-created `bridge` network on this docker.io 29.x. +# 2. The microVM only ever runs ONE container per system, and the +# container talks to 127.0.0.1: via host-side port mapping +# (handled by docker-proxy, not iptables). The host-side +# net.enable_filtered_internet handles VM→outside masquerade. +# +# With iptables=false, dockerd doesn't add ANY iptables rules; port +# forwarding goes through the userland docker-proxy. mkdir -p /etc/docker cat > /etc/docker/daemon.json < Date: Fri, 15 May 2026 17:34:52 +0000 Subject: [PATCH 198/221] firebolt{,-parquet,-parquet-partitioned}: dump diagnostics on healthcheck timeout The 10-min wait-for-ready loop just printed firebolt-core did not become healthy in 10 min with zero context, so subsequent re-kicks were blind. Add docker ps / inspect / logs / ss listener / curl probe on the failure path so the provision log carries enough to triage. Co-Authored-By: Claude Opus 4.7 --- firebolt-parquet-partitioned/start | 14 +++++++++++++- firebolt-parquet/start | 14 +++++++++++++- firebolt/start | 14 +++++++++++++- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/firebolt-parquet-partitioned/start b/firebolt-parquet-partitioned/start index 1e08dce668..2ef0eed50a 100755 --- a/firebolt-parquet-partitioned/start +++ b/firebolt-parquet-partitioned/start @@ -45,5 +45,17 @@ for _ in $(seq 1 600); do fi sleep 1 done -echo "firebolt-core did not become healthy in 10 min" >&2 +{ + echo "firebolt-core did not become healthy in 10 min" + echo "=== docker ps -a ===" + sudo docker ps -a 2>&1 + echo "=== docker inspect firebolt-core (state) ===" + sudo docker inspect firebolt-core --format '{{json .State}}' 2>&1 + echo "=== docker logs firebolt-core --tail 50 ===" + sudo docker logs firebolt-core --tail 50 2>&1 + echo "=== curl http://localhost:3473/ ===" + curl -sS --max-time 3 'http://localhost:3473/' --data-binary 'SELECT 1' 2>&1 + echo "=== ss listeners ===" + sudo ss -lntp 2>&1 | head -20 +} >&2 exit 1 diff --git a/firebolt-parquet/start b/firebolt-parquet/start index 1e08dce668..2ef0eed50a 100755 --- a/firebolt-parquet/start +++ b/firebolt-parquet/start @@ -45,5 +45,17 @@ for _ in $(seq 1 600); do fi sleep 1 done -echo "firebolt-core did not become healthy in 10 min" >&2 +{ + echo "firebolt-core did not become healthy in 10 min" + echo "=== docker ps -a ===" + sudo docker ps -a 2>&1 + echo "=== docker inspect firebolt-core (state) ===" + sudo docker inspect firebolt-core --format '{{json .State}}' 2>&1 + echo "=== docker logs firebolt-core --tail 50 ===" + sudo docker logs firebolt-core --tail 50 2>&1 + echo "=== curl http://localhost:3473/ ===" + curl -sS --max-time 3 'http://localhost:3473/' --data-binary 'SELECT 1' 2>&1 + echo "=== ss listeners ===" + sudo ss -lntp 2>&1 | head -20 +} >&2 exit 1 diff --git a/firebolt/start b/firebolt/start index 1e08dce668..2ef0eed50a 100755 --- a/firebolt/start +++ b/firebolt/start @@ -45,5 +45,17 @@ for _ in $(seq 1 600); do fi sleep 1 done -echo "firebolt-core did not become healthy in 10 min" >&2 +{ + echo "firebolt-core did not become healthy in 10 min" + echo "=== docker ps -a ===" + sudo docker ps -a 2>&1 + echo "=== docker inspect firebolt-core (state) ===" + sudo docker inspect firebolt-core --format '{{json .State}}' 2>&1 + echo "=== docker logs firebolt-core --tail 50 ===" + sudo docker logs firebolt-core --tail 50 2>&1 + echo "=== curl http://localhost:3473/ ===" + curl -sS --max-time 3 'http://localhost:3473/' --data-binary 'SELECT 1' 2>&1 + echo "=== ss listeners ===" + sudo ss -lntp 2>&1 | head -20 +} >&2 exit 1 From 7f490f17775e42c97065b5a74d63c72ed2a52a0c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 17:46:42 +0000 Subject: [PATCH 199/221] firebolt{,-parquet,-parquet-partitioned}: chown fb-volume to uid 1111 The diagnostic dump showed firebolt-core refusing to start with: The directory '/firebolt-core/volume/' (owner 0:0, permissions 755) is not readable or writeable by the Firebolt Core process (running as effective user 1111, effective group 1111). The agent provisions as root, so the bind-mounted host dir lands as root:root; firebolt-core inside the container is uid 1111 and won't initialize the engine. chown the host-side dir to 1111:1111 before docker run. Co-Authored-By: Claude Opus 4.7 --- firebolt-parquet-partitioned/start | 6 ++++++ firebolt-parquet/start | 6 ++++++ firebolt/start | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/firebolt-parquet-partitioned/start b/firebolt-parquet-partitioned/start index 2ef0eed50a..652146394e 100755 --- a/firebolt-parquet-partitioned/start +++ b/firebolt-parquet-partitioned/start @@ -8,6 +8,12 @@ if curl -sS --max-time 5 'http://localhost:3473/' \ fi mkdir -p data fb-volume +# firebolt-core runs as UID/GID 1111 inside the container and refuses +# to start if its data dir is not writeable by that uid (the engine +# self-checks and aborts with "directory ... is not readable or +# writeable by the Firebolt Core process"). Set the host-side +# ownership accordingly so the bind-mounted dir is usable. +sudo chown 1111:1111 fb-volume # If the container exists (stopped from a prior agent pre-snapshot # cycle), just start it back — the data lives on the bind-mounted diff --git a/firebolt-parquet/start b/firebolt-parquet/start index 2ef0eed50a..652146394e 100755 --- a/firebolt-parquet/start +++ b/firebolt-parquet/start @@ -8,6 +8,12 @@ if curl -sS --max-time 5 'http://localhost:3473/' \ fi mkdir -p data fb-volume +# firebolt-core runs as UID/GID 1111 inside the container and refuses +# to start if its data dir is not writeable by that uid (the engine +# self-checks and aborts with "directory ... is not readable or +# writeable by the Firebolt Core process"). Set the host-side +# ownership accordingly so the bind-mounted dir is usable. +sudo chown 1111:1111 fb-volume # If the container exists (stopped from a prior agent pre-snapshot # cycle), just start it back — the data lives on the bind-mounted diff --git a/firebolt/start b/firebolt/start index 2ef0eed50a..652146394e 100755 --- a/firebolt/start +++ b/firebolt/start @@ -8,6 +8,12 @@ if curl -sS --max-time 5 'http://localhost:3473/' \ fi mkdir -p data fb-volume +# firebolt-core runs as UID/GID 1111 inside the container and refuses +# to start if its data dir is not writeable by that uid (the engine +# self-checks and aborts with "directory ... is not readable or +# writeable by the Firebolt Core process"). Set the host-side +# ownership accordingly so the bind-mounted dir is usable. +sudo chown 1111:1111 fb-volume # If the container exists (stopped from a prior agent pre-snapshot # cycle), just start it back — the data lives on the bind-mounted From edafd371fc5e18d52d9e69d8df2ea0738121c642 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 17:56:04 +0000 Subject: [PATCH 200/221] =?UTF-8?q?playground:=20btrfs=20migration=20?= =?UTF-8?q?=E2=80=94=20drop=20manual=20zstd=20compression,=20doc=20btrfs?= =?UTF-8?q?=20setup?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reflink + transparent zstd are both native on btrfs, so the two-phase reflink-then-zstd snapshot dance is no longer needed: revert _snapshot_disks/_restore_disks to plain reflink and let the filesystem handle compression. Update install-firecracker.sh to document mkfs.btrfs + compress=zstd:1 as the recommended host setup; XFS still works for reflink but lacks compression and fills the host at ~7 TB. Co-Authored-By: Claude Opus 4.7 --- playground/scripts/install-firecracker.sh | 17 +-- playground/server/vm_manager.py | 153 ++++++---------------- 2 files changed, 47 insertions(+), 123 deletions(-) diff --git a/playground/scripts/install-firecracker.sh b/playground/scripts/install-firecracker.sh index e4cffbd5cd..749a051172 100755 --- a/playground/scripts/install-firecracker.sh +++ b/playground/scripts/install-firecracker.sh @@ -20,17 +20,18 @@ sudo chown "$(id -u):$(id -g)" \ # The playground relies on reflink (cp --reflink=always) to clone # 200 GB-apparent / multi-GB-real per-VM disks in milliseconds instead -# of seconds. ext4 ships reflink support behind the `shared_blocks` -# feature flag, but mke2fs in Ubuntu 22.04 / 24.04 doesn't expose it -# yet — so we format the playground volume as XFS, which has reflink -# enabled by default since mkfs.xfs 4.18 (2018). If you're staging the -# host yourself, set this up before running install-firecracker.sh: +# of seconds, and on transparent zstd compression to fit 100 system +# goldens on the host. Btrfs gives us both out of the box. Format the +# playground volume before running install-firecracker.sh: # -# sudo mkfs.xfs -L cbplayground -f /dev/ -# echo 'LABEL=cbplayground /opt/clickbench-playground xfs \ -# defaults,noatime,discard,nofail 0 2' | sudo tee -a /etc/fstab +# sudo mkfs.btrfs -L cbplayground -f /dev/ +# echo 'LABEL=cbplayground /opt/clickbench-playground btrfs \ +# defaults,noatime,compress=zstd:1,nofail 0 2' | sudo tee -a /etc/fstab # sudo mount /opt/clickbench-playground # +# (XFS also works for reflink but doesn't have transparent compression, +# so on XFS the host fills up at ~7 TB once every system is provisioned.) +# # Sanity-check at install time so a missing reflink is loud: if ! ( cd "$STATE_DIR" && tmp1="$(mktemp -p .)" && \ tmp2="$(mktemp -p . -u)" && \ diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 4289797626..031c0b9276 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -664,24 +664,17 @@ def _swap_paths(self, vm: VM) -> tuple[Path, Path] | None: async def _snapshot_disks(self, vm: VM) -> None: rootfs, sysdisk, rootfs_gold, sysdisk_gold = self._golden_paths(vm) - # Two-phase save: - # 1. Reflink-clone the working images into the golden slot. - # The working file stays bound to Firecracker's open - # virtio-blk fd through the post-snapshot resume + shutdown, - # and any writes during that window would leak into the - # golden (observed restored systems hitting ext4 EBADMSG - # on small files like duckdb's hits.db.wal and a venv - # activate script). With reflink the snapshot is - # near-instant; the working file's post-snapshot writes - # diverge into its own extents and don't touch the golden. - # 2. zstd-compress the reflinked golden in place, deleting - # the uncompressed copy. The reflink savings are gone - # (compressed bytes don't share extents with the original) - # but disk usage drops ~30-60% per system, which is the - # whole point — 100 snapshotted systems × 70-100 GB - # goldens fills a 7 TB host. The cost is paid back at - # restore time, where we decompress to a fresh working - # file (no reflink possible from .zst). + # Reflink-clone the working images into the golden slot. We can't + # rename: the working file stays bound to Firecracker's open + # virtio-blk fd through the post-snapshot resume + shutdown, and + # any writes during that window would leak into the golden (we + # observed restored systems hitting ext4 EBADMSG on small files + # like duckdb's hits.db.wal and a venv activate script). With + # reflink the snapshot is near-instant; the working file's + # post-snapshot writes diverge into its own extents and don't + # touch the golden. The disk is btrfs with compress=zstd so + # the goldens occupy ~30-50% of their apparent size on disk + # transparently — no application-level compression needed. async def _clone(src: Path, dst: Path) -> None: if dst.exists(): dst.unlink() @@ -694,33 +687,6 @@ async def _clone(src: Path, dst: Path) -> None: raise RuntimeError( f"reflink snapshot cp {src} -> {dst} failed: " f"{err.decode(errors='replace')[-400:]}") - - async def _compress(gold: Path) -> None: - # Compress fast (-1) on all cores (-T0). --sparse=auto on - # the decompress side will restore zero holes from the - # frame metadata. Two-step: create .zst, verify, *then* - # unlink the original. We deliberately don't pass --rm - # because an interrupted zstd with --rm could lose the - # only copy of the golden. - zst = gold.with_suffix(gold.suffix + ".zst") - with contextlib.suppress(FileNotFoundError): - zst.unlink() - proc = await asyncio.create_subprocess_exec( - "zstd", "-1", "-T0", "--sparse", - "--quiet", "-o", str(zst), str(gold), - stderr=asyncio.subprocess.PIPE, - ) - _, err = await proc.communicate() - if proc.returncode != 0: - with contextlib.suppress(FileNotFoundError): - zst.unlink() - raise RuntimeError( - f"zstd compress {gold} failed: " - f"{err.decode(errors='replace')[-400:]}") - # Compression succeeded; the uncompressed reflink is now - # redundant. - gold.unlink() - clones = [ _clone(rootfs, rootfs_gold), _clone(sysdisk, sysdisk_gold), @@ -729,83 +695,45 @@ async def _compress(gold: Path) -> None: if swap_pair is not None and swap_pair[0].exists(): clones.append(_clone(swap_pair[0], swap_pair[1])) await asyncio.gather(*clones) - - # Compress all goldens in parallel — they're independent files. - compresses = [_compress(rootfs_gold), _compress(sysdisk_gold)] + sizes = [_fmt_size(rootfs_gold.stat().st_size), + _fmt_size(sysdisk_gold.stat().st_size)] if swap_pair is not None and swap_pair[1].exists(): - compresses.append(_compress(swap_pair[1])) - await asyncio.gather(*compresses) - - sizes = [ - _fmt_size(rootfs_gold.with_suffix(".ext4.zst").stat().st_size), - _fmt_size(sysdisk_gold.with_suffix(".ext4.zst").stat().st_size), - ] - if swap_pair is not None: - swap_zst = swap_pair[1].with_suffix(".raw.zst") - if swap_zst.exists(): - sizes.append(_fmt_size(swap_zst.stat().st_size)) - log.info("[%s] golden disks saved + zstd (%s)", vm.system.name, + sizes.append(_fmt_size(swap_pair[1].stat().st_size)) + log.info("[%s] golden disks saved (%s)", vm.system.name, ", ".join(sizes)) async def _restore_disks(self, vm: VM) -> None: rootfs, sysdisk, rootfs_gold, sysdisk_gold = self._golden_paths(vm) - # Either form is acceptable; each pair has at most one of the - # two present. The .zst path is the post-compression world - # (see _snapshot_disks). The plain .ext4 path is the legacy - # uncompressed form — kept for backwards compatibility with - # older snapshots taken before the zstd hook landed. - def _pair(gold: Path) -> Path: - zst = gold.with_suffix(gold.suffix + ".zst") - if zst.exists(): - return zst - if gold.exists(): - return gold + if not rootfs_gold.exists() or not sysdisk_gold.exists(): raise RuntimeError( - f"[{vm.system.name}] missing golden {gold.name}{{,.zst}}; " - "cannot restore") - - rootfs_src = _pair(rootfs_gold) - sysdisk_src = _pair(sysdisk_gold) - - async def _materialize(src: Path, dst: Path) -> None: - """Reflink-clone an uncompressed golden (cheap, O(1)) or - zstd-decompress a compressed one (paid only at restore - time). Working file is created sparsely either way.""" + f"[{vm.system.name}] missing golden disks; cannot restore") + # Reflink-clone the goldens into fresh working copies. The host + # filesystem is btrfs with `compress=zstd` enabled; reflink is + # O(1) (extent-list copy) and the engine transparently + # decompresses on read, so restore latency is dominated by + # firecracker boot, not disk I/O. Both clones run concurrently; + # they touch disjoint files. + async def _clone(src: Path, dst: Path) -> None: if dst.exists(): dst.unlink() - if src.suffix == ".zst": - # zstd -d --sparse=always re-creates zero holes that - # the source ext4/swap.raw image had. - proc = await asyncio.create_subprocess_exec( - "zstd", "-d", "-T0", "--sparse", - "--quiet", "-o", str(dst), str(src), - stderr=asyncio.subprocess.PIPE, - ) - else: - proc = await asyncio.create_subprocess_exec( - "cp", "--reflink=always", str(src), str(dst), - stderr=asyncio.subprocess.PIPE, - ) + proc = await asyncio.create_subprocess_exec( + "cp", "--reflink=always", str(src), str(dst), + stderr=asyncio.subprocess.PIPE, + ) _, err = await proc.communicate() if proc.returncode != 0: raise RuntimeError( - f"restore materialize {src} -> {dst} failed: " + f"reflink cp {src} -> {dst} failed: " f"{err.decode(errors='replace')[-400:]}") - - jobs = [ - _materialize(rootfs_src, rootfs), - _materialize(sysdisk_src, sysdisk), + clones = [ + _clone(rootfs_gold, rootfs), + _clone(sysdisk_gold, sysdisk), ] swap_pair = self._swap_paths(vm) - if swap_pair is not None: - swap_gold = swap_pair[1] - swap_zst = swap_gold.with_suffix(swap_gold.suffix + ".zst") - if swap_zst.exists(): - jobs.append(_materialize(swap_zst, swap_pair[0])) - elif swap_gold.exists(): - jobs.append(_materialize(swap_gold, swap_pair[0])) - await asyncio.gather(*jobs) - log.info("[%s] working disks materialized from golden", + if swap_pair is not None and swap_pair[1].exists(): + clones.append(_clone(swap_pair[1], swap_pair[0])) + await asyncio.gather(*clones) + log.info("[%s] working disks reflink-cloned from golden", vm.system.name) async def _shutdown(self, vm: VM) -> None: @@ -959,17 +887,12 @@ def _has_snapshot(vm: VM) -> bool: golden disks have been captured. A half-built snapshot (memory present but goldens missing, or vice versa) is treated as no snapshot at all so the next ensure_ready_for_query re-provisions cleanly. - - Goldens may be either the raw ext4 file (.ext4) or the zstd-compressed - form (.ext4.zst) — see _snapshot_disks for the compression hook and - _restore_disks for the decompression path. Either is acceptable. """ mem_ok = (vm.snapshot_bin.exists() or vm.snapshot_bin.with_suffix(".bin.zst").exists()) sys_dir = vm.snapshot_bin.parent - def gold_ok(name: str) -> bool: - return (sys_dir / name).exists() or (sys_dir / (name + ".zst")).exists() - disks_ok = gold_ok("rootfs.golden.ext4") and gold_ok("system.golden.ext4") + disks_ok = ((sys_dir / "rootfs.golden.ext4").exists() and + (sys_dir / "system.golden.ext4").exists()) return mem_ok and disks_ok From 2b84f223c41c57b8ba8f6ee12d4bd3c00898274d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 17:56:13 +0000 Subject: [PATCH 201/221] playground: stage partitioned parquet symlinks at cwd before ./load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream ClickBench keeps the 100 hits_N.parquet partitioned files under hits_partitioned/; load scripts glob `hits_*.parquet` at cwd, not from a subdir. The agent relies on overlay magic for staging (lowerdir=datasets_ro, cwd=/opt/clickbench/system), and that surfaces files at root of the dataset image but leaves hits_partitioned/ as a subdir — the glob then matches nothing. Symptom: clickhouse / pg_clickhouse / ursa / daft-parquet-partitioned / duckdb-parquet-partitioned / duckdb-vortex-partitioned all hit 'No files found that match the pattern "hits_*.parquet"' (or the dialect-specific equivalent) at load time. Materialise the per-file symlinks in cwd in the agent rather than in each system's load script so the 6+ partitioned consumers don't each reimplement the same staging step (which historically rotted when one or two were updated and the rest weren't — upstream centralised this in lib/download-hits-* for the same reason). Co-Authored-By: Claude Opus 4.7 --- playground/agent/agent.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index 11ed91e662..f2fb6e2cd9 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -383,12 +383,25 @@ def _provision() -> tuple[int, bytes]: return 1, b"".join(log_lines) log_lines.append(b"\n=== check ok ===\n") - # No explicit data staging — the system's load script sees - # hits.parquet / hits.tsv / hits.csv / hits_*.parquet at cwd - # already, because cwd is the overlay merged dir - # /opt/clickbench/system and the dataset disk's contents (the + # Most datasets surface in cwd already: cwd is the overlay merged + # dir /opt/clickbench/system and the dataset disk's contents (the # overlay's lower) sit at /opt/clickbench/datasets_ro at the - # filesystem root, matching the names the load scripts use. + # filesystem root, so hits.parquet / hits.tsv / hits.csv are + # named exactly as the load scripts expect. + # + # Partitioned parquet is the exception: the upstream layout puts + # the 100 hits_N.parquet files under hits_partitioned/, and load + # scripts glob `hits_*.parquet` from cwd, not from a subdir. + # Materialize symlinks at cwd so the glob resolves. We do this in + # the agent rather than per-system to avoid 6+ systems each + # reimplementing the same staging step (which historically rotted + # — ClickBench upstream centralised this in lib/download-hits-*). + hits_partitioned = DATASETS_DIR / "hits_partitioned" + if hits_partitioned.is_dir(): + for src in hits_partitioned.glob("hits_*.parquet"): + dst = SYSTEM_DIR / src.name + if not dst.exists(): + os.symlink(src, dst) # Run load. t0 = time.monotonic() From a52048ed28ec1b426f5d3c3ccefc62c453c13acf Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 17:56:25 +0000 Subject: [PATCH 202/221] clickhouse-web: bind-mount tmpfs at caches/web instead of symlinking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ClickHouse v26.x canonicalises the filesystem-cache path before the policy check that 'absolute path must lie inside /var/lib/clickhouse/caches/'; an older trick of pointing caches/web at /dev/shm via symlink is now rejected with BAD_ARGUMENTS at CREATE TABLE time. Bind-mount /dev/shm/clickhouse onto /var/lib/clickhouse/caches/web so the kernel-canonicalised path stays inside caches/ but the underlying bytes still live in tmpfs (the whole point — cold queries pull ~1 GB into the cache and we don't want that on the host SSD). Also clean up a leftover symlink from previous install runs before the mkdir/mount so re-running install is idempotent. Co-Authored-By: Claude Opus 4.7 --- clickhouse-web/install | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/clickhouse-web/install b/clickhouse-web/install index aa860621ad..b683174fad 100755 --- a/clickhouse-web/install +++ b/clickhouse-web/install @@ -16,11 +16,21 @@ fi # bytes to live in tmpfs (/dev/shm) for the speed: cold queries # pull ~1 GB on first run and tmpfs avoids touching the host SSD. # -# Trick: hand ClickHouse a path that satisfies its policy check -# (.../caches/web) but is itself a symlink into /dev/shm. CH only -# verifies the lexical prefix of the configured path; it doesn't -# canonicalise the target. -sudo mkdir -p /dev/shm/clickhouse /var/lib/clickhouse/caches -sudo chown clickhouse:clickhouse /dev/shm/clickhouse -# Replace any prior real dir / stale symlink atomically. -sudo ln -sfn /dev/shm/clickhouse /var/lib/clickhouse/caches/web +# Newer ClickHouse versions canonicalise the path before the policy +# check, so the older symlink trick (caches/web → /dev/shm/...) is +# rejected with BAD_ARGUMENTS. Bind-mount tmpfs at the +# policy-acceptable path instead — to CH the cache dir *is* +# /var/lib/clickhouse/caches/web with no symlink to resolve. +sudo mkdir -p /dev/shm/clickhouse /var/lib/clickhouse/caches/web +# Remove a stale symlink left by a prior install attempt: mkdir on a +# symlink-to-dir succeeds without replacing it, so an old caches/web +# pointing at /dev/shm would still be a symlink at mount time. +if [ -L /var/lib/clickhouse/caches/web ]; then + sudo rm /var/lib/clickhouse/caches/web + sudo mkdir /var/lib/clickhouse/caches/web +fi +sudo chown clickhouse:clickhouse /dev/shm/clickhouse /var/lib/clickhouse/caches/web +if ! mountpoint -q /var/lib/clickhouse/caches/web; then + sudo mount --bind /dev/shm/clickhouse /var/lib/clickhouse/caches/web +fi +sudo chown clickhouse:clickhouse /var/lib/clickhouse/caches/web From 39fc1bf728cdf46fc48bfca4fe96c381f3764ea2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 17:56:25 +0000 Subject: [PATCH 203/221] =?UTF-8?q?cedardb:=20bump=20start-ready=20timeout?= =?UTF-8?q?=2060s=20=E2=86=92=20300s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First-boot initdb inside the cedardb container runs through 'Fixing permissions on existing directory' and 'Setting up database directory' phases that take 90-120 s on cold disk before postgres actually listens. The 60 s budget bailed during that window, leaving the system in start-failed and never snapshotted. pg_isready exits fast once the daemon is up, so the longer timeout only changes behaviour in the failure path. Co-Authored-By: Claude Opus 4.7 --- cedardb/start | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cedardb/start b/cedardb/start index b6c3bbfe07..874a474500 100755 --- a/cedardb/start +++ b/cedardb/start @@ -30,10 +30,15 @@ if ! sudo docker run -d --rm -p 5432:5432 \ exit 1 fi -for _ in $(seq 1 60); do +# First-boot initdb inside the container can run for well over a +# minute (observed ~90-120 s of "Fixing permissions"/"Setting up +# database directory" before postgres actually listens). Older +# 60 s budget bailed during that phase. Give it 5 min — pg_isready +# exits fast once the daemon is up so this only matters on failure. +for _ in $(seq 1 300); do pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1 && exit 0 sleep 1 done -echo "cedardb did not become ready in 60 s; container logs:" >&2 +echo "cedardb did not become ready in 300 s; container logs:" >&2 sudo docker logs cedardb 2>&1 | tail -40 >&2 || true exit 1 From c16154e3d814c0146d53859734014abc5386fc21 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 18:15:53 +0000 Subject: [PATCH 204/221] clickhouse-web: drop idempotency dance from install (always from scratch) Provisioning always starts on a fresh per-VM rootfs, so the prior symlink-cleanup + mountpoint guard added nothing and just made the script noisier. Co-Authored-By: Claude Opus 4.7 --- clickhouse-web/install | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/clickhouse-web/install b/clickhouse-web/install index b683174fad..75715b33c6 100755 --- a/clickhouse-web/install +++ b/clickhouse-web/install @@ -22,15 +22,6 @@ fi # policy-acceptable path instead — to CH the cache dir *is* # /var/lib/clickhouse/caches/web with no symlink to resolve. sudo mkdir -p /dev/shm/clickhouse /var/lib/clickhouse/caches/web -# Remove a stale symlink left by a prior install attempt: mkdir on a -# symlink-to-dir succeeds without replacing it, so an old caches/web -# pointing at /dev/shm would still be a symlink at mount time. -if [ -L /var/lib/clickhouse/caches/web ]; then - sudo rm /var/lib/clickhouse/caches/web - sudo mkdir /var/lib/clickhouse/caches/web -fi sudo chown clickhouse:clickhouse /dev/shm/clickhouse /var/lib/clickhouse/caches/web -if ! mountpoint -q /var/lib/clickhouse/caches/web; then - sudo mount --bind /dev/shm/clickhouse /var/lib/clickhouse/caches/web -fi +sudo mount --bind /dev/shm/clickhouse /var/lib/clickhouse/caches/web sudo chown clickhouse:clickhouse /var/lib/clickhouse/caches/web From 95125acd56c874793e61fa881e9df39a4a8c565e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 18:23:16 +0000 Subject: [PATCH 205/221] =?UTF-8?q?parseable:=20upgrade=20v2.5.12=20?= =?UTF-8?q?=E2=86=92=20v2.7.2=20(fixes=200-row=20loads)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v2.5.12 has an Arrow type-inference bug under static-schema mode: incoming JSON integers are inferred as Float64 even when the row fits in Int64, and every /ingest with an Int64-declared field fails with 400 "Fail to merge schema field 'X' because the from data_type = Float64 does not equal Int64". The load script's parallel ingest loop hit this on the very first chunk and logged ~5000 'curl: (22) HTTP 400' lines while loading zero rows; queries then returned 0 for everything. Verified the fix locally: v2.7.2 accepts the bundled static_schema.json and the playground's hits.json shape — single row ingest returns 200, COUNT(*) and AVG(UserID) both produce the expected values. Co-Authored-By: Claude Opus 4.7 --- parseable/install | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/parseable/install b/parseable/install index 9fcb8ffa1d..2d61f7ead5 100755 --- a/parseable/install +++ b/parseable/install @@ -5,8 +5,15 @@ sudo apt-get update -y sudo apt-get install -y parallel pigz pv if [ ! -x ./parseable ]; then + # v2.5.12 has an Arrow type-inference bug under static-schema mode: + # bare JSON numbers get inferred as Float64, so every ingest of a + # row with an Int64-declared field returns 400 with "Fail to merge + # schema field 'X' because the from data_type = Float64 does not + # equal Int64". Net effect: 0 rows loaded, every query returns 0. + # v2.7.2 fixes the inference; verified locally end-to-end against + # the bundled static_schema.json and hits.json. wget --continue --progress=dot:giga \ - https://github.com/parseablehq/parseable/releases/download/v2.5.12/Parseable_OSS_x86_64-unknown-linux-gnu + https://github.com/parseablehq/parseable/releases/download/v2.7.2/Parseable_OSS_x86_64-unknown-linux-gnu mv Parseable_OSS_x86_64-unknown-linux-gnu parseable chmod +x parseable fi From 09d4e46f9901a6839c2b7be8929f505ce10682db Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 19:02:04 +0000 Subject: [PATCH 206/221] playground: actually isolate disable_internet VMs (FORWARD DROP catch-all) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The host's FORWARD policy is ACCEPT (Docker would flip it but we disable Docker's iptables management in the VM rootfs, and we don't want to flip the global policy ourselves — it would break unrelated host forwarding). disable_internet was only stripping the per-slot ACCEPTs and the POSTROUTING MASQUERADE, leaving every other packet to fall through to the default ACCEPT. Practical exploit: a VM with arbitrary code execution exposed to the benchmark consumer (pandas, polars, dataframe variants) could curl 169.254.169.254/latest/api/token and get a real IMDSv2 token — the AWS hypervisor responds to the VM's RFC1918 source address even without our MASQUERADE rule, and the reply gets forwarded back the same way through the still-ACCEPT default policy. From there an attacker can read the EC2 instance role's credentials. Datalake systems are accidentally safe (the PREROUTING REDIRECT to the SNI proxy catches TCP/80 before FORWARD, and the proxy's Host-header allowlist rejects 169.254.169.254) but every other system was wide open. Refactor: introduce _strip_slot(slot) that parses `iptables -S` output and removes every rule mentioning the slot's TAP or CIDR. Each enable/disable function calls it first, then installs its own rules — no more order-dependent interaction where a stale catch-all DROP from one mode silently blocks the next mode's ACCEPT. disable_filtered_internet is no longer needed (subsumed by _strip_slot) and goes away. disable_internet now installs explicit `-i tap -j DROP` and `-o tap -j DROP` so isolation no longer relies on the chain's default policy. Co-Authored-By: Claude Opus 4.7 --- playground/server/net.py | 182 ++++++++++++++++++--------------------- 1 file changed, 82 insertions(+), 100 deletions(-) diff --git a/playground/server/net.py b/playground/server/net.py index 5e88167633..78a0baa19e 100644 --- a/playground/server/net.py +++ b/playground/server/net.py @@ -89,28 +89,54 @@ async def _host_default_iface() -> str: raise RuntimeError(f"could not find default route: {text!r}") +async def _strip_slot(slot: int) -> None: + """Remove every iptables rule that mentions this slot's TAP or CIDR. + + Each enable/disable function calls this before installing its own + rules. That removes the previous mode's rules cleanly and avoids + the rule-order trap where, e.g., a 'disable_internet' catch-all + DROP added earlier sits ABOVE the RELATED-ESTABLISHED ACCEPT a + later 'enable_filtered_internet' wants to add — which would + silently block all reply traffic to the VM. + """ + tap = tap_name(slot) + _, _, cidr = addr_for(slot) + needle_tap = f" {tap} " # match -i/-o flags' value with surrounding spaces + needle_cidr = f" {cidr} " + + for table, chain in (("filter", "FORWARD"), + ("nat", "POSTROUTING"), + ("nat", "PREROUTING")): + rc, out, _ = await _run("sudo", "iptables", "-t", table, "-S", chain, + check=False) + if rc != 0: + continue + for line in out.decode(errors="replace").splitlines(): + if not line.startswith("-A "): + continue + padded = " " + line + " " + if needle_tap not in padded and needle_cidr not in padded: + continue + # Convert "-A CHAIN ..." into "-D CHAIN ..." for deletion. + args = line.split() + args[0] = "-D" + await _run("sudo", "iptables", "-t", table, *args, check=False) + + async def enable_internet(slot: int) -> None: """Allow the VM to reach the outside world via MASQUERADE + FORWARD.""" - # Any prior filtered_internet rules for this slot put a `-j DROP` - # catchall at the end of FORWARD that would take precedence over - # the ACCEPT we're about to add. Strip those first. - await disable_filtered_internet(slot) + await _strip_slot(slot) iface = await _host_default_iface() + tap = tap_name(slot) _, _, cidr = addr_for(slot) - # MASQUERADE rule: add only if not already present. - rc, out, _ = await _run("sudo", "iptables", "-t", "nat", "-S", "POSTROUTING") - if f"-s {cidr}" not in out.decode(errors="replace"): - await _run("sudo", "iptables", "-t", "nat", "-A", "POSTROUTING", - "-s", cidr, "-o", iface, "-j", "MASQUERADE") - # FORWARD rules + await _run("sudo", "iptables", "-t", "nat", "-A", "POSTROUTING", + "-s", cidr, "-o", iface, "-j", "MASQUERADE") for rule in ( - ("-i", tap_name(slot), "-o", iface, "-j", "ACCEPT"), - ("-i", iface, "-o", tap_name(slot), "-m", "state", "--state", + ("-i", tap, "-o", iface, "-j", "ACCEPT"), + ("-i", iface, "-o", tap, "-m", "state", "--state", "RELATED,ESTABLISHED", "-j", "ACCEPT"), ): - rc, out, _ = await _run("sudo", "iptables", "-C", "FORWARD", *rule, check=False) - if rc != 0: - await _run("sudo", "iptables", "-A", "FORWARD", *rule) + await _run("sudo", "iptables", "-A", "FORWARD", *rule) # Ports the SNI-filtering proxy listens on (see sni_proxy.py). Kept in @@ -179,109 +205,65 @@ async def enable_filtered_internet(slot: int) -> None: exfiltration channel — see GHSA / RFC1918 advisories cited in the security review). Every other outbound port from the VM is DROPped at FORWARD. - """ - # Clear any prior `enable_internet` ACCEPT — its blanket allow - # rule would otherwise take precedence over the DROP we'll add - # at the bottom of FORWARD and the VM would still have unrestricted - # access. - await disable_internet(slot) - tap = tap_name(slot) - iface = await _host_default_iface() - # NAT redirects: - # TCP 443/80 -> SNI proxy - # UDP 53 -> host's local DNS resolver on port 53 - nat_rules = ( - ("-i", tap, "-p", "tcp", "--dport", "443", - "-j", "REDIRECT", "--to-ports", str(PROXY_HTTPS_PORT)), - ("-i", tap, "-p", "tcp", "--dport", "80", - "-j", "REDIRECT", "--to-ports", str(PROXY_HTTP_PORT)), - ("-i", tap, "-p", "udp", "--dport", "53", - "-j", "REDIRECT", "--to-ports", "53"), - ) - for match in nat_rules: - rc, _, _ = await _run("sudo", "iptables", "-t", "nat", - "-C", "PREROUTING", *match, check=False) - if rc != 0: - await _run("sudo", "iptables", "-t", "nat", - "-A", "PREROUTING", *match) - - # FORWARD: drop TCP/53 (DNS tunneling), drop UDP/53 too as a - # belt-and-braces (the REDIRECT above already short-circuits it, - # but if the resolver is down we don't want fall-through to - # upstream). Allow established replies for the SNI proxy's - # outbound to upstream. Catchall DROP at the end. - forward_rules = ( - ("-i", tap, "-p", "udp", "--dport", "53", "-j", "DROP"), - ("-i", tap, "-p", "tcp", "--dport", "53", "-j", "DROP"), - ("-i", iface, "-o", tap, "-m", "state", "--state", - "RELATED,ESTABLISHED", "-j", "ACCEPT"), - ("-i", tap, "-j", "DROP"), - ) - for rule in forward_rules: - rc, _, _ = await _run("sudo", "iptables", "-C", "FORWARD", *rule, - check=False) - if rc != 0: - await _run("sudo", "iptables", "-A", "FORWARD", *rule) - # No POSTROUTING MASQUERADE here: the SNI proxy on the host opens - # its OWN outbound socket to the allowlisted upstream, so the - # host's normal egress path handles the source rewrite. The VM's - # only legitimate outbound traffic now goes via REDIRECT to a - # local listener; nothing on the VM's CIDR ever reaches the - # outside interface directly. - - -async def disable_filtered_internet(slot: int) -> None: - """Drop the rules added by enable_filtered_internet. Idempotent.""" + No POSTROUTING MASQUERADE here: the SNI proxy on the host opens + its OWN outbound socket to the allowlisted upstream, so the + host's normal egress path handles the source rewrite. The VM's + only legitimate outbound traffic now goes via REDIRECT to a + local listener; nothing on the VM's CIDR ever reaches the + outside interface directly. + """ + await _strip_slot(slot) tap = tap_name(slot) iface = await _host_default_iface() - nat_rules = ( + # NAT redirects: TCP 443/80 → SNI proxy, UDP 53 → host DNS resolver. + for match in ( ("-i", tap, "-p", "tcp", "--dport", "443", "-j", "REDIRECT", "--to-ports", str(PROXY_HTTPS_PORT)), ("-i", tap, "-p", "tcp", "--dport", "80", "-j", "REDIRECT", "--to-ports", str(PROXY_HTTP_PORT)), ("-i", tap, "-p", "udp", "--dport", "53", "-j", "REDIRECT", "--to-ports", "53"), - ) - for match in nat_rules: - while True: - rc, _, _ = await _run("sudo", "iptables", "-t", "nat", "-D", - "PREROUTING", *match, check=False) - if rc != 0: - break + ): + await _run("sudo", "iptables", "-t", "nat", "-A", "PREROUTING", *match) - forward_rules = ( + # FORWARD: drop TCP/53 + UDP/53 (DNS-over-TCP is a classic exfil + # channel; UDP/53 is REDIRECTed above, this is a belt-and-braces + # for a downed resolver). Allow established replies for the SNI + # proxy's outbound to upstream. Catch-all DROP at the end. + for rule in ( ("-i", tap, "-p", "udp", "--dport", "53", "-j", "DROP"), ("-i", tap, "-p", "tcp", "--dport", "53", "-j", "DROP"), ("-i", iface, "-o", tap, "-m", "state", "--state", "RELATED,ESTABLISHED", "-j", "ACCEPT"), ("-i", tap, "-j", "DROP"), - ) - for rule in forward_rules: - while True: - rc, _, _ = await _run("sudo", "iptables", "-D", "FORWARD", *rule, - check=False) - if rc != 0: - break + ): + await _run("sudo", "iptables", "-A", "FORWARD", *rule) async def disable_internet(slot: int) -> None: - """Drop the masquerade + forward rules added by enable_internet.""" - iface = await _host_default_iface() - _, _, cidr = addr_for(slot) - # Best-effort removal — repeat until iptables reports the rule isn't there. - while True: - rc, _, _ = await _run("sudo", "iptables", "-t", "nat", "-D", "POSTROUTING", - "-s", cidr, "-o", iface, "-j", "MASQUERADE", check=False) - if rc != 0: - break + """Isolate the VM: remove every per-slot rule and install per-slot + catch-all DROPs (both directions) so the VM cannot reach the + outside world via FORWARD's default policy. + + Why the explicit DROPs are necessary: the host's FORWARD policy + is ACCEPT (Docker would flip it but we disable Docker's iptables + management, and we don't want to flip the global policy ourselves + — it would break unrelated forwarding on the host). With just the + per-slot ACCEPTs removed, a 'disabled' VM still has clear egress + because every FORWARD packet falls through to the default ACCEPT. + Notably this lets a VM reach 169.254.169.254 (EC2 IMDS) — even + without our MASQUERADE rule the AWS hypervisor responds to the + VM's RFC1918 source, and the reply gets forwarded back the same + way. Any system exposing arbitrary code execution to the + benchmark consumer (pandas, polars, dataframe variants) could + then pivot to the host's IAM role. + """ + await _strip_slot(slot) + tap = tap_name(slot) for rule in ( - ("-i", tap_name(slot), "-o", iface, "-j", "ACCEPT"), - ("-i", iface, "-o", tap_name(slot), "-m", "state", "--state", - "RELATED,ESTABLISHED", "-j", "ACCEPT"), + ("-i", tap, "-j", "DROP"), + ("-o", tap, "-j", "DROP"), ): - while True: - rc, _, _ = await _run("sudo", "iptables", "-D", "FORWARD", *rule, check=False) - if rc != 0: - break + await _run("sudo", "iptables", "-A", "FORWARD", *rule) From 061ccd6a6fb9bbcbe26b948312a0b4a9079db8cb Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 19:26:10 +0000 Subject: [PATCH 207/221] polars: avoid backslash inside f-string expression in query script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The line `sys.stderr.write(f"{o[\"elapsed\"]}\n")` parses fine on older Pythons that lex f-string contents textually but breaks on Python 3.12+ where PEP 701 parses the brace contents as a real expression — and a backslash inside a Python expression (outside a string literal) is invalid, so every query failed with "unexpected character after line continuation character" before even reaching the server. Drop the f-string for plain str() concatenation; no quote-nesting, no version-dependent lexer quirk. Co-Authored-By: Claude Opus 4.7 --- polars/query | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polars/query b/polars/query index 775d01506d..3aa96790e7 100755 --- a/polars/query +++ b/polars/query @@ -29,5 +29,5 @@ r = o.get("result", "") sys.stdout.write(r) if r and not r.endswith("\n"): sys.stdout.write("\n") -sys.stderr.write(f"{o[\"elapsed\"]}\n") +sys.stderr.write(str(o["elapsed"]) + "\n") ' From 27a157273117836c566e796db626374106bc1667 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 19:33:18 +0000 Subject: [PATCH 208/221] playground: rebuild per-system rootfs+sysdisk when base is newer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _build_images_if_needed short-circuits when both rootfs.ext4 and system.ext4 already exist, on the assumption that re-cloning costs disk for no benefit. That's wrong whenever base-rootfs.ext4 has been rebuilt since: the in-VM agent and the lib/download-* stubs live in the base, and the per-system scripts live in the sysdisk upper — and both stay stale. Concrete bite: today's agent change to stage partitioned parquet symlinks at cwd shipped in base-rootfs.ext4 at 18:05, but every already-provisioned partitioned system that we re-kicked afterwards (datafusion-partitioned and friends) booted off the pre-fix rootfs.ext4 from 15:39, ran the OLD agent that doesn't stage anything, and the load script's `mv hits_*.parquet partitioned/` matched zero files — leaving the parquet external-table empty and every query failing with 'No field named "EventDate"' / 'table hits not found'. Fix: compare mtimes; if base is newer, drop both the rootfs and the sysdisk so build-system-rootfs.sh runs and re-rsyncs both. On btrfs `cp --sparse=always` is a reflink — re-cloning a 200 GB sparse rootfs is near-instant, so the conservative invalidation isn't expensive. Co-Authored-By: Claude Opus 4.7 --- playground/server/vm_manager.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 031c0b9276..8ab1b8e1dd 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -367,6 +367,7 @@ async def _build_images_if_needed(self, vm: VM) -> None: rootfs = sys_dir / "rootfs.ext4" sysdisk = sys_dir / "system.ext4" swap = sys_dir / "swap.raw" + base = self.cfg.state_dir / "base-rootfs.ext4" # If we're (re-)provisioning a system whose rootfs already has # /var/lib/clickbench-agent/provisioned set, drop just the rootfs so # the agent reruns the full install/start/load flow on the next @@ -376,6 +377,23 @@ async def _build_images_if_needed(self, vm: VM) -> None: log.info("[%s] rootfs exists but no snapshot — dropping it for " "a fresh agent state", vm.system.name) rootfs.unlink() + # If base-rootfs has been rebuilt since the per-system rootfs was + # cloned (typically because we updated the in-VM agent or one of + # the lib/download-* stubs), drop the stale rootfs and the system + # disk too — the system disk's upper layer holds the scripts + # rsynced from the repo, so a stale agent and stale per-system + # scripts both come from here. Without this check, every code + # change to playground/agent/agent.py silently fails to reach + # already-provisioned systems on re-provision: vm_manager finds + # rootfs.ext4 + system.ext4 already present and skips the rebuild. + if rootfs.exists() and base.exists() and \ + rootfs.stat().st_mtime < base.stat().st_mtime: + log.info("[%s] base-rootfs is newer than rootfs — dropping " + "rootfs + sysdisk for a fresh agent + scripts", + vm.system.name) + rootfs.unlink() + with contextlib.suppress(FileNotFoundError): + sysdisk.unlink() # For memory-bound dataframe systems, also (re)create a sparse # swap.raw block device that the in-VM agent mkswaps + swapons. # Sized to the worst-case working set we've seen; sparse so the From 9a04fce813610870bf6f4d73ada5899897d96077 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 21:39:19 +0000 Subject: [PATCH 209/221] playground: add INSTALL.md with end-to-end setup instructions The README and architecture doc were conceptual; nothing walked through "from a blank Ubuntu 24.04 box to a serving playground". INSTALL.md does, in order: format btrfs + zstd, clone repo, set up sudoers, install firecracker/kernel/DNS/(optional)TLS, download datasets, build datasets image, build base rootfs, configure ClickHouse Cloud logging, start the server, provision the catalog. Co-Authored-By: Claude Opus 4.7 --- playground/INSTALL.md | 239 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 239 insertions(+) create mode 100644 playground/INSTALL.md diff --git a/playground/INSTALL.md b/playground/INSTALL.md new file mode 100644 index 0000000000..867df08d14 --- /dev/null +++ b/playground/INSTALL.md @@ -0,0 +1,239 @@ +# ClickBench Playground — Installation + +End-to-end setup for a fresh Ubuntu 24.04 host. Everything lives under +`/opt/clickbench-playground/` once it's running. Total disk: ~7 TB at full +catalog (100 systems × multi-GB-per-system goldens, on btrfs with zstd +compression). + +## 0. Host prerequisites + +- Ubuntu 24.04 (noble), x86_64 +- `/dev/kvm` accessible (bare metal or a virt-enabled cloud instance — + `c6a.metal`, `m7i.metal-24xl`, `i4i.metal`, etc.) +- A dedicated block device for the playground state directory, plus + enough free space for ~7 TB of system goldens and ~200 GB of datasets. +- Outbound internet at install time (apt mirrors, GitHub releases, + Docker Hub, dataset downloads). +- Python 3.12+ on the host. + +## 1. Format the state volume (btrfs + transparent zstd) + +The playground depends on **reflink** (instant per-VM disk cloning) and +**transparent compression** (snapshots otherwise wouldn't fit). Btrfs gives +both. XFS works for reflink but lacks compression and fills the host at +~7 TB once all systems are provisioned. + +``` +sudo mkfs.btrfs -L cbplayground -f /dev/ +echo 'LABEL=cbplayground /opt/clickbench-playground btrfs \ + defaults,noatime,compress=zstd:1,nofail 0 2' | sudo tee -a /etc/fstab +sudo mkdir -p /opt/clickbench-playground +sudo mount /opt/clickbench-playground +``` + +## 2. Clone the repo + +``` +sudo apt-get update +sudo apt-get install -y git python3 python3-pip +cd /home/ubuntu +git clone https://github.com/ClickHouse/ClickBench +cd ClickBench +pip3 install --user -r playground/requirements.txt +``` + +## 3. Sudoers entry for the server + +The playground server runs as the unprivileged `ubuntu` user but needs to +call `sudo ip ...`, `sudo iptables ...`, `sudo mount`, `sudo cp`, `sudo +firecracker`, etc. Add a sudoers fragment so those calls don't prompt: + +``` +sudo tee /etc/sudoers.d/clickbench-playground >/dev/null <<'EOF' +ubuntu ALL=(root) NOPASSWD: /usr/sbin/ip, /usr/sbin/iptables, \ + /usr/bin/mount, /usr/bin/umount, /usr/bin/cp, /usr/bin/mv, \ + /usr/bin/chown, /usr/bin/chmod, /usr/bin/mkdir, /usr/bin/rm, \ + /usr/bin/dd, /usr/bin/truncate, /usr/sbin/mkfs.ext4, \ + /usr/sbin/losetup, /opt/clickbench-playground/bin/firecracker, \ + /opt/clickbench-playground/bin/jailer +EOF +sudo chmod 440 /etc/sudoers.d/clickbench-playground +``` + +Tighten the allowlist further if your security model demands it. + +## 4. Install Firecracker, kernel, host firewall, DNS, (optional) TLS + +``` +sudo playground/scripts/install-firecracker.sh +``` + +This script is idempotent. It: + +- Downloads `firecracker` + `jailer` (v1.13.1) into + `/opt/clickbench-playground/bin/`. +- Downloads the guest kernel (`vmlinux-6.1.141` from firecracker-ci) into + `/opt/clickbench-playground/kernel/vmlinux`. +- Sets `net.ipv4.conf.all.route_localnet=1` (needed by the SNI proxy + REDIRECT path). +- Installs `dnsmasq` and configures it as a UDP-only resolver on port 53 + for the per-VM TAPs. +- Sanity-checks that the state dir actually supports reflink. + +To enable TLS for the public API at the same time, set the domain first: + +``` +export PLAYGROUND_TLS_DOMAIN=clickbench-playground.example.com +export PLAYGROUND_TLS_EMAIL=ops@example.com # optional, defaults to ubuntu@$(hostname -d) +sudo -E playground/scripts/install-firecracker.sh +``` + +This invokes `certbot --standalone` to issue a cert, configures a deploy +hook so the `ssl-cert` group can read the renewed privkey, and adds the +operator user to the `ssl-cert` group. + +## 5. Download the datasets (~200 GB, slow) + +``` +playground/scripts/download-datasets.sh +``` + +Populates `/opt/clickbench-playground/datasets/` with: + +- `hits.parquet` — single-file parquet (~14 GB) +- `hits_partitioned/hits_0..99.parquet` — partitioned parquet +- `hits.tsv` — decompressed TSV (~75 GB) +- `hits.csv` — decompressed CSV (~75 GB) +- `hits.json` / `hits.json.gz` — JSON variants for parseable / + victorialogs + +The script uses `wget --continue` per format, so re-running picks up +where it left off. + +## 6. Build the read-only dataset image + +``` +playground/images/build-datasets-image.sh +``` + +rsyncs the `datasets/` directory into `datasets.ext4`, sized to fit, with +no journal and zero reserved blocks. This image is attached read-only to +every VM as `LABEL=cbdata`. + +## 7. Build the base rootfs + +``` +sudo playground/images/build-base-rootfs.sh +``` + +Starts from the official Ubuntu 24.04 cloud image and adds: + +- The in-VM agent at `/opt/clickbench-agent/agent.py` plus its systemd + unit. +- Forced iptables-legacy alternatives (Docker on the Firecracker kernel + needs them — `nf_tables` isn't compiled in). +- `/etc/docker/daemon.json` with `"iptables": false`, so Docker doesn't + try to manage the (missing) `raw` table. +- A preloaded kernel-module list (`overlay`, `br_netfilter`, `veth`, + `ip_tables`, `iptable_*`, `nf_conntrack`, `nf_nat`, `xt_MASQUERADE`, + `xt_conntrack`). +- `lib/download-hits-*` stubs that symlink from the read-only dataset + disk rather than `wget`ing from the public mirror. + +Output: `/opt/clickbench-playground/base-rootfs.ext4` (a sparse 200 GB +ext4 image). + +## 8. ClickHouse Cloud credentials (request logging) + +The server appends every request and restart to a ClickHouse Cloud table. +Provide credentials either via `playground/.env`: + +``` +CLICKHOUSE_CLOUD_URL=https://your-host.clickhouse.cloud:8443 +CLICKHOUSE_CLOUD_USER=default +CLICKHOUSE_CLOUD_PASSWORD=... +CLICKHOUSE_CLOUD_DB=playground +``` + +…or by copying `playground/clickhouse.conf.example` to +`/opt/clickbench-playground/clickhouse.conf` and filling it in. + +If neither is configured, the server falls back to a local JSONL sink +under `/opt/clickbench-playground/logs/`. + +## 9. Start the server + +Foreground (for local development): + +``` +playground/scripts/run-server.sh +``` + +As a managed service (recommended for production): + +``` +sudo cp playground/clickbench-playground.service /etc/systemd/system/ +sudo systemctl daemon-reload +sudo systemctl enable --now clickbench-playground +``` + +The server listens on `:8000` (HTTP) and, if a TLS cert exists at +`/etc/letsencrypt/live/${PLAYGROUND_TLS_DOMAIN}/`, also on `:443`. + +## 10. Provision every system (long) + +The server doesn't auto-provision on first query — initial install/start/ +load/snapshot is opt-in. Kick the whole catalog: + +``` +playground/scripts/provision-all.sh +``` + +This walks every system in `/api/systems`, posts to +`/api/admin/provision/`, and polls until each one is either +`snapshotted` or `down` with an error. Concurrency is bounded server-side +by `PLAYGROUND_PROVISION_CONCURRENCY` (default 32) and +`PLAYGROUND_BUILD_CONCURRENCY` (default 6). Expect 1–6 hours of wall +time depending on host throughput and Docker Hub rate-limit luck. + +Status: `curl http://localhost:8000/api/state | jq`. +Per-system log: `/opt/clickbench-playground/logs/provision-.log`. + +## 11. (Optional) Tune concurrency / monitor thresholds + +Environment variables read at server startup: + +| Var | Default | What it does | +|----------------------------------|--------------------------------------|-----------------------------------------------------------| +| `PLAYGROUND_STATE_DIR` | `/opt/clickbench-playground` | host state root | +| `PLAYGROUND_LISTEN` | `0.0.0.0:8000` | HTTP listener | +| `PLAYGROUND_TLS_CERT/_KEY` | `/etc/letsencrypt/live/$DOMAIN/...` | TLS | +| `PLAYGROUND_BUILD_CONCURRENCY` | 6 | parallel per-system rootfs builds | +| `PLAYGROUND_PROVISION_CONCURRENCY` | 32 | parallel VM provisions | +| `PLAYGROUND_SNAPSHOT_CONCURRENCY` | 6 | parallel Firecracker snapshot saves | +| `CLICKBENCH_OUTPUT_LIMIT` | 262144 | per-query response cap (bytes) enforced inside the agent | +| `VM_CPU_BUSY_THRESHOLD` | 0.97 | monitor: kill idle VMs above this | +| `VM_DISK_FULL_PCT` | 0.97 | monitor: kill VMs whose sysdisk passes this | + +## Smoke-testing a single system + +``` +playground/scripts/smoke-boot.sh clickhouse +``` + +Boots one system end-to-end (provision → snapshot → restore → /query), +prints timing, tears down. Use this to validate any change to +`base-rootfs.ext4` or the agent before re-kicking the full catalog. + +## Re-provisioning after agent or base-image changes + +`vm_manager` rebuilds the per-system rootfs+sysdisk automatically when +`base-rootfs.ext4` is newer than the existing `rootfs.ext4`. So after +changing `playground/agent/agent.py` or anything baked into the base: + +``` +sudo playground/images/build-base-rootfs.sh # rebuild base +curl -X POST http://localhost:8000/api/admin/provision/ # re-kick +``` + +The new agent and new per-system scripts both land in the next provision. From fa27fbd3994d7a222fbce76e3afe457e241458f4 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 May 2026 23:53:08 +0000 Subject: [PATCH 210/221] =?UTF-8?q?siglens:=20rename=20queries.spl=20?= =?UTF-8?q?=E2=86=92=20queries.sql?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cross-system tooling (playground sweep, agent /query) keys off queries.sql by filename even when the contents aren't SQL. siglens ships SPL/Splunk QL but the file extension was producing NO_QUERIES misses in every catalog-wide sweep. Renaming aligns with every other system in the repo; the contents are unchanged, and benchmark.sh already declared BENCH_QUERIES_FILE accordingly (now matches reality, the override line is unnecessary but harmless). Co-Authored-By: Claude Opus 4.7 --- siglens/README.md | 2 +- siglens/benchmark.sh | 2 +- siglens/{queries.spl => queries.sql} | 0 siglens/query | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename siglens/{queries.spl => queries.sql} (100%) diff --git a/siglens/README.md b/siglens/README.md index 074b89ff98..0c43bf91f0 100644 --- a/siglens/README.md +++ b/siglens/README.md @@ -3,4 +3,4 @@ This document outlines the process for running a benchmark on SigLens, a observa Note about queries: - SigLens does not support SQL but supports Splunk Query Language (SPL). The SQL queries used by the benchmark have been translated into the splunk query language. - To ensure the accuracy of the translated Splunk Query Language queries, each SQL query was executed against the same dataset in ClickHouse. The responses from SigLens and ClickHouse were compared, and all results were identical. -- Some of the original queries are not supported and not run by the benchmark. The corresponding results have been recorded as null in `queries.spl` and `results.csv` respectively. +- Some of the original queries are not supported and not run by the benchmark. The corresponding results have been recorded as null in `queries.sql` and `results.csv` respectively. diff --git a/siglens/benchmark.sh b/siglens/benchmark.sh index 1eb2f016c9..d57ece0bc8 100755 --- a/siglens/benchmark.sh +++ b/siglens/benchmark.sh @@ -4,5 +4,5 @@ export BENCH_DOWNLOAD_SCRIPT="" export BENCH_DURABLE=yes # queries are SPL/Splunk QL, not SQL. -export BENCH_QUERIES_FILE="queries.spl" +export BENCH_QUERIES_FILE="queries.sql" exec ../lib/benchmark-common.sh diff --git a/siglens/queries.spl b/siglens/queries.sql similarity index 100% rename from siglens/queries.spl rename to siglens/queries.sql diff --git a/siglens/query b/siglens/query index 1ca8b5a918..a1fe18284a 100755 --- a/siglens/query +++ b/siglens/query @@ -7,7 +7,7 @@ set -e querytxt=$(cat) -# A "null" query in queries.spl means "not supported"; emit null timing. +# A "null" query in queries.sql means "not supported"; emit null timing. if [ "$querytxt" = "null" ]; then echo "{}" echo "null" >&2 From 8a18b66619f9f0a555b5fbc4a2beb1189617cf85 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 16 May 2026 00:06:10 +0000 Subject: [PATCH 211/221] playground: replace .preserve-state marker with benchmark.sh vars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two playground-agent behaviours used to be controlled by separate mechanisms — an opaque .preserve-state file in the system dir for "skip the pre-snapshot stop+start cycle" and nothing at all for "force ./stop after snapshot restore". Both are now driven by per-system variables in benchmark.sh, the same surface that already exposes BENCH_DOWNLOAD_SCRIPT / BENCH_DURABLE / BENCH_QUERIES_FILE. PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes The loaded state lives only in the daemon's process memory (pandas / polars / duckdb-dataframe / daft-parquet / chdb- dataframe / polars-dataframe — and pinot / tidb which have slow JVM/cluster bring-up worth snapshotting hot). Stopping pre-snapshot would wipe the in-process DataFrame and the restored snapshot would serve queries against a daemon whose `hits = None`. Replaces the .preserve-state marker file. PLAYGROUND_RESTART_AFTER_RESTORE_SNAPSHOT=yes After a firecracker memory snapshot+restore the cluster's internal connections (brpc, gossip) are stale; the system's ./start does a shallow health probe ("SELECT 1" against the local node) and short-circuits, leaving the broken cross-node connections in place — every subsequent query then fails with "Connection refused" / "no available searcher nodes in the cluster". byconity and quickwit both showed this; opting them in causes the agent to force ./stop on btime shift before the next ./start so the bring-up is from a clean state. Agent reads the vars by grep, NOT by sourcing benchmark.sh (which ends with `exec ../lib/benchmark-common.sh`). Both vars live next to BENCH_DURABLE in the per-system shim, so the contract stays in one file. Co-Authored-By: Claude Opus 4.7 --- byconity/benchmark.sh | 6 +++ chdb-dataframe/.preserve-state | 0 chdb-dataframe/benchmark.sh | 5 +++ daft-parquet-partitioned/.preserve-state | 0 daft-parquet-partitioned/benchmark.sh | 5 +++ daft-parquet/.preserve-state | 0 daft-parquet/benchmark.sh | 5 +++ duckdb-dataframe/.preserve-state | 0 duckdb-dataframe/benchmark.sh | 5 +++ pandas/.preserve-state | 0 pandas/benchmark.sh | 5 +++ pinot/.preserve-state | 0 pinot/benchmark.sh | 5 +++ playground/agent/agent.py | 48 +++++++++++++++++++++++- polars-dataframe/.preserve-state | 0 polars-dataframe/benchmark.sh | 5 +++ polars/.preserve-state | 0 polars/benchmark.sh | 5 +++ quickwit/benchmark.sh | 6 +++ tidb/.preserve-state | 0 tidb/benchmark.sh | 5 +++ 21 files changed, 104 insertions(+), 1 deletion(-) delete mode 100644 chdb-dataframe/.preserve-state delete mode 100644 daft-parquet-partitioned/.preserve-state delete mode 100644 daft-parquet/.preserve-state delete mode 100644 duckdb-dataframe/.preserve-state delete mode 100644 pandas/.preserve-state delete mode 100644 pinot/.preserve-state delete mode 100644 polars-dataframe/.preserve-state delete mode 100644 polars/.preserve-state delete mode 100644 tidb/.preserve-state diff --git a/byconity/benchmark.sh b/byconity/benchmark.sh index 0450372b8a..b71ae2462e 100755 --- a/byconity/benchmark.sh +++ b/byconity/benchmark.sh @@ -7,4 +7,10 @@ export BENCH_DURABLE=yes # dependency, so the worst-case cold start is several minutes; the # lib's 300s default has timed out before server is up. export BENCH_CHECK_TIMEOUT=1200 +# After firecracker snapshot+restore the cluster's +# internal connections (brpc/gossip) are stale; ./start's +# shallow health probe doesn't notice and short-circuits. +# Tell the playground agent to ./stop the cluster before +# ./start so the next bring-up is from a clean state. +export PLAYGROUND_RESTART_AFTER_RESTORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/chdb-dataframe/.preserve-state b/chdb-dataframe/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/chdb-dataframe/benchmark.sh b/chdb-dataframe/benchmark.sh index 6bf667e4f0..4148581913 100755 --- a/chdb-dataframe/benchmark.sh +++ b/chdb-dataframe/benchmark.sh @@ -2,4 +2,9 @@ # Thin shim — actual flow is in lib/benchmark-common.sh. export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" export BENCH_DURABLE=no +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/daft-parquet-partitioned/.preserve-state b/daft-parquet-partitioned/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/daft-parquet-partitioned/benchmark.sh b/daft-parquet-partitioned/benchmark.sh index 1495c0bf62..024c58fe16 100755 --- a/daft-parquet-partitioned/benchmark.sh +++ b/daft-parquet-partitioned/benchmark.sh @@ -2,4 +2,9 @@ # Thin shim — actual flow is in lib/benchmark-common.sh. export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" export BENCH_DURABLE=no +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/daft-parquet/.preserve-state b/daft-parquet/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/daft-parquet/benchmark.sh b/daft-parquet/benchmark.sh index 6bf667e4f0..4148581913 100755 --- a/daft-parquet/benchmark.sh +++ b/daft-parquet/benchmark.sh @@ -2,4 +2,9 @@ # Thin shim — actual flow is in lib/benchmark-common.sh. export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" export BENCH_DURABLE=no +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/duckdb-dataframe/.preserve-state b/duckdb-dataframe/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/duckdb-dataframe/benchmark.sh b/duckdb-dataframe/benchmark.sh index 6bf667e4f0..4148581913 100755 --- a/duckdb-dataframe/benchmark.sh +++ b/duckdb-dataframe/benchmark.sh @@ -2,4 +2,9 @@ # Thin shim — actual flow is in lib/benchmark-common.sh. export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" export BENCH_DURABLE=no +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/pandas/.preserve-state b/pandas/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/pandas/benchmark.sh b/pandas/benchmark.sh index 084c13353c..1369feb230 100755 --- a/pandas/benchmark.sh +++ b/pandas/benchmark.sh @@ -6,4 +6,9 @@ export BENCH_DURABLE=no # queries.sql holds those Python expressions, one per line, so the # default BENCH_QUERIES_FILE=queries.sql in lib/benchmark-common.sh # picks them up unchanged. +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/pinot/.preserve-state b/pinot/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/pinot/benchmark.sh b/pinot/benchmark.sh index 1d14d0bb2c..2379871512 100755 --- a/pinot/benchmark.sh +++ b/pinot/benchmark.sh @@ -6,4 +6,9 @@ export BENCH_DURABLE=yes # inside one JVM and takes longer than the lib's 300 s default to be # query-ready on a cold instance. 900 s clears the observed cold start. export BENCH_CHECK_TIMEOUT=900 +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/playground/agent/agent.py b/playground/agent/agent.py index f2fb6e2cd9..c6c261119c 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -29,6 +29,7 @@ import http.server import json import os +import re import shutil import signal import socket @@ -84,6 +85,33 @@ _daemon_lock = threading.Lock() +_BENCH_VAR_RE = re.compile( + r'^\s*(?:export\s+)?(?P[A-Z_][A-Z0-9_]*)=' + r'(?:"(?P[^"]*)"|\'(?P[^\']*)\'|(?P[^\s#"\']*))', + re.MULTILINE, +) + + +def _bench_var(name: str) -> str: + """Return the value of a top-level `VAR=…` assignment in the + system's benchmark.sh, or "" if absent. Driven by static grep, NOT + `source` — benchmark.sh ends with `exec ../lib/benchmark-common.sh`, + so sourcing it would derail the agent. The bench-common driver and + the playground agent both rely on the same variable surface + (BENCH_DOWNLOAD_SCRIPT, PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT, + PLAYGROUND_RESTART_AFTER_RESTORE_SNAPSHOT, …), so a per-system + benchmark.sh stays the single source of truth.""" + bf = SYSTEM_DIR / "benchmark.sh" + try: + text = bf.read_text() + except FileNotFoundError: + return "" + for m in _BENCH_VAR_RE.finditer(text): + if m.group("name") == name: + return (m.group("dq") or m.group("sq") or m.group("bare") or "").strip() + return "" + + def _cap(b: bytes) -> tuple[bytes, bool]: """Truncate to OUTPUT_LIMIT bytes; return (body, was_truncated).""" if len(b) <= OUTPUT_LIMIT: @@ -443,7 +471,7 @@ def _provision() -> tuple[int, bytes]: stop = SYSTEM_DIR / "stop" start = SYSTEM_DIR / "start" check = SYSTEM_DIR / "check" - preserve_state = (SYSTEM_DIR / ".preserve-state").exists() + preserve_state = _bench_var("PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT") == "yes" has_daemon = (stop.exists() and start.exists() and check.exists() and os.access(stop, os.X_OK) and os.access(start, os.X_OK) and @@ -732,6 +760,24 @@ def _maybe_reconcile_for_restore() -> None: # ./start. Clear the daemon-started gate so the very next # _ensure_daemon_started() call brings the stack back up. _daemon_started.clear() + # Some systems' ./start scripts short-circuit on a shallow + # health probe (e.g. byconity checks `SELECT 1` against the + # local server; quickwit checks `docker ps` for the container) + # and never touch the broken cluster-internal connections that + # firecracker's frozen-time snapshot stranded. For those, + # PLAYGROUND_RESTART_AFTER_RESTORE_SNAPSHOT=yes in benchmark.sh + # opts the system into a forced ./stop before ./start so the + # next bring-up is from a clean state. + if _bench_var("PLAYGROUND_RESTART_AFTER_RESTORE_SNAPSHOT") == "yes": + stop = SYSTEM_DIR / "stop" + if stop.exists() and os.access(stop, os.X_OK): + sys.stderr.write( + "[agent] PLAYGROUND_RESTART_AFTER_RESTORE_SNAPSHOT: " + "force ./stop\n") + subprocess.run([str(stop)], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + timeout=300, check=False) # Kick off the rebuild asynchronously. /ready (or whoever # called us) returns promptly; the host's /ready poll then # waits for _daemon_started to flip back to True. diff --git a/polars-dataframe/.preserve-state b/polars-dataframe/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/polars-dataframe/benchmark.sh b/polars-dataframe/benchmark.sh index 90bdcf07e3..a5a40fb651 100755 --- a/polars-dataframe/benchmark.sh +++ b/polars-dataframe/benchmark.sh @@ -6,4 +6,9 @@ export BENCH_DURABLE=no # queries.sql holds those Python expressions, one per line, so the # default BENCH_QUERIES_FILE=queries.sql in lib/benchmark-common.sh # picks them up unchanged. +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/polars/.preserve-state b/polars/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/polars/benchmark.sh b/polars/benchmark.sh index 6bf667e4f0..4148581913 100755 --- a/polars/benchmark.sh +++ b/polars/benchmark.sh @@ -2,4 +2,9 @@ # Thin shim — actual flow is in lib/benchmark-common.sh. export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" export BENCH_DURABLE=no +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/quickwit/benchmark.sh b/quickwit/benchmark.sh index 298de0454d..cb593204ba 100755 --- a/quickwit/benchmark.sh +++ b/quickwit/benchmark.sh @@ -5,4 +5,10 @@ export BENCH_DOWNLOAD_SCRIPT="" export BENCH_DURABLE=yes export BENCH_QUERIES_FILE="queries.json" +# After firecracker snapshot+restore the cluster's +# internal connections (brpc/gossip) are stale; ./start's +# shallow health probe doesn't notice and short-circuits. +# Tell the playground agent to ./stop the cluster before +# ./start so the next bring-up is from a clean state. +export PLAYGROUND_RESTART_AFTER_RESTORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/tidb/.preserve-state b/tidb/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tidb/benchmark.sh b/tidb/benchmark.sh index 107b9dbb65..73f1c4ad83 100755 --- a/tidb/benchmark.sh +++ b/tidb/benchmark.sh @@ -3,4 +3,9 @@ # TiDB Lightning loads from ..csv files; we use the CSV download. export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" export BENCH_DURABLE=yes +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh From 8aa9bd6041c820e085d99ef3663d6b374a0da34c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 16 May 2026 01:40:22 +0000 Subject: [PATCH 212/221] playground: restore docker0 MASQUERADE inside VMs (presto/cloudberry fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we set `iptables: false` in /etc/docker/daemon.json (to work around the missing kernel CONFIG_IP_NF_RAW on the firecracker guest kernel — Docker 28+'s DIRECT ACCESS FILTERING insists on the raw table), dockerd stopped installing its usual nat-table rule: -t nat -A POSTROUTING -s 172.17.0.0/16 ! -o docker0 -j MASQUERADE Container-originated packets then leave the VM with their docker0 source intact (172.17.0.x). The host's per-slot MASQUERADE matches only the VM TAP CIDR (10.200.X.0/24), so the 172.17.0.x packet exits ens1 unchanged and AWS drops it. Empirically: presto-datalake's load failed with `Name or service not known` for clickhouse-public-datasets.s3.eu-central-1.amazonaws.com, and cloudberry's install failed inside a Rocky Linux container with `Could not resolve host: mirrors.rockylinux.org`. Replicate the missing rule via a small systemd unit that runs after docker.service. The nat table is intact (it's `raw` that isn't compiled in), so MASQUERADE works fine. Also: - cedardb / cedardb-parquet: bump start-ready timeout 300s → 600s (the container's initdb takes longer than 5 min on the cold sysdisk; this was the proximate cause of two HEALTHCHECK-TIMEOUT failures in the last sweep). - trino-datalake / trino-datalake-partitioned: set BENCH_CHECK_TIMEOUT=1800. Trino's cold JVM bootstrap pushes past the lib's 300 s default, then keeps going for several more minutes; both variants timed out at the 900 s ./check budget. Co-Authored-By: Claude Opus 4.7 --- cedardb/start | 4 +-- playground/images/build-base-rootfs.sh | 40 +++++++++++++++++++++++++ trino-datalake-partitioned/benchmark.sh | 2 ++ trino-datalake/benchmark.sh | 2 ++ 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/cedardb/start b/cedardb/start index 874a474500..6bd35d176f 100755 --- a/cedardb/start +++ b/cedardb/start @@ -35,10 +35,10 @@ fi # database directory" before postgres actually listens). Older # 60 s budget bailed during that phase. Give it 5 min — pg_isready # exits fast once the daemon is up so this only matters on failure. -for _ in $(seq 1 300); do +for _ in $(seq 1 600); do pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1 && exit 0 sleep 1 done -echo "cedardb did not become ready in 300 s; container logs:" >&2 +echo "cedardb did not become ready in 600 s; container logs:" >&2 sudo docker logs cedardb 2>&1 | tail -40 >&2 || true exit 1 diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh index 0c1564fe56..03f3a05669 100755 --- a/playground/images/build-base-rootfs.sh +++ b/playground/images/build-base-rootfs.sh @@ -243,6 +243,46 @@ cat > /etc/docker/daemon.json < /usr/local/sbin/clickbench-docker-nat <<'NATEOF' +#!/bin/bash +set -e +# Idempotent — the systemd unit may fire on every boot, including after +# a snapshot restore where the rule may already be there. +if ! iptables -t nat -C POSTROUTING -s 172.17.0.0/16 ! -o docker0 \ + -j MASQUERADE 2>/dev/null; then + iptables -t nat -A POSTROUTING -s 172.17.0.0/16 ! -o docker0 \ + -j MASQUERADE +fi +NATEOF +chmod +x /usr/local/sbin/clickbench-docker-nat + +cat > /etc/systemd/system/clickbench-docker-nat.service < Date: Sat, 16 May 2026 01:41:08 +0000 Subject: [PATCH 213/221] =?UTF-8?q?cedardb-parquet:=20align=20start-ready?= =?UTF-8?q?=20timeout=20with=20cedardb=20(60s=20=E2=86=92=20600s)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cedardb base variant got bumped to 600s in the last commit but cedardb-parquet still had the older 60s, so it would have hit the same HEALTHCHECK-TIMEOUT failure mode again. Co-Authored-By: Claude Opus 4.7 --- cedardb-parquet/start | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cedardb-parquet/start b/cedardb-parquet/start index 981f23f221..f12fcd366d 100755 --- a/cedardb-parquet/start +++ b/cedardb-parquet/start @@ -28,10 +28,15 @@ if ! sudo docker run -d --rm -p 5432:5432 \ exit 1 fi -for _ in $(seq 1 60); do +# First-boot initdb inside the container takes well over a minute +# (observed ~90-120 s of "Fixing permissions"/"Setting up database +# directory" before postgres actually listens). Give it 10 min — +# pg_isready exits fast once the daemon is up, so this only +# matters in the failure path. +for _ in $(seq 1 600); do pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1 && exit 0 sleep 1 done -echo "cedardb did not become ready in 60 s; container logs:" >&2 +echo "cedardb did not become ready in 600 s; container logs:" >&2 sudo docker logs cedardb 2>&1 | tail -40 >&2 || true exit 1 From c00594ba33174017d88b20b279a3754a4025251e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 16 May 2026 02:00:00 +0000 Subject: [PATCH 214/221] playground: honor BENCH_CHECK_TIMEOUT in agent's post-start probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The agent waited up to a hardcoded 900 s for ./check to succeed after ./start, regardless of what the per-system benchmark.sh declared. trino-datalake / trino-datalake-partitioned bumped BENCH_CHECK_TIMEOUT=1800 to cover Trino's cold-JVM bootstrap, but the agent ignored it and bailed at 900 s — exactly the "check did not succeed within 900s" we saw. Read the override via the same _bench_var() grep that handles PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT etc., and clamp to a floor of 900 s so the existing baseline still covers Druid / Pinot / similar JVM stacks that don't declare an override. Co-Authored-By: Claude Opus 4.7 --- playground/agent/agent.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/playground/agent/agent.py b/playground/agent/agent.py index c6c261119c..9f9a5b1b39 100644 --- a/playground/agent/agent.py +++ b/playground/agent/agent.py @@ -385,16 +385,22 @@ def _provision() -> tuple[int, bytes]: PROVISION_LOG.write_bytes(b"".join(log_lines)) return r.returncode, b"".join(log_lines) - # Wait for ./check to succeed for up to 300s + # Wait for ./check to succeed. Per-system override via + # BENCH_CHECK_TIMEOUT in benchmark.sh (same surface as the + # standalone bench driver); default 900 s, which covers + # Druid / Pinot / similar JVM-stack engines that need 5-10 min + # for Zookeeper / Coordinator / Broker / Historical to come up + # in sequence. Trino on a cold sysdisk has been observed + # pushing past 900 s, hence the override hook. check = SYSTEM_DIR / "check" + check_budget = 900 + override = _bench_var("BENCH_CHECK_TIMEOUT") + if override.isdigit(): + check_budget = max(check_budget, int(override)) ok = False t0 = time.monotonic() last_check: subprocess.CompletedProcess | None = None - # Druid / Pinot / similar JVM-stack engines need 5-10 min to come - # up from a cold start, between Zookeeper / Coordinator / Broker / - # Historical processes booting in sequence. 300 s was too tight - # for those; 900 s covers the slowest observed cases. - while time.monotonic() - t0 < 900: + while time.monotonic() - t0 < check_budget: last_check = subprocess.run( [str(check)], cwd=str(SYSTEM_DIR), stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -404,7 +410,8 @@ def _provision() -> tuple[int, bytes]: break time.sleep(1) if not ok: - log_lines.append(b"\n=== check did not succeed within 900s ===\n") + log_lines.append( + f"\n=== check did not succeed within {check_budget}s ===\n".encode()) if last_check is not None: log_lines.append(last_check.stderr or b"") PROVISION_LOG.write_bytes(b"".join(log_lines)) From 8220f8333cd878c53be3db94d5ec756232051839 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 16 May 2026 02:01:32 +0000 Subject: [PATCH 215/221] kinetica: lower RAM-tier cap from 27 GB to 9 GB The upstream cap is sized for a bare-metal benchmark machine. Playground VMs have 16 GiB RAM total, so a 27 GB RAM tier overshoots physical memory; kinetica's rank-1 worker gets OOM-killed mid-LOAD and the load fails with `[GPUdb]executeSql: Internal_Error: Rank 1 non-responsive (Table:"ki_home.hits")`. Keeping 7 GiB of headroom for the agent, dockerd, and the rest of the kinetica plane keeps the load on the disk tier and the load completes. Co-Authored-By: Claude Opus 4.7 --- kinetica/load | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kinetica/load b/kinetica/load index 5b90136ae6..1097ee3866 100755 --- a/kinetica/load +++ b/kinetica/load @@ -19,7 +19,13 @@ wget --continue --progress=dot:giga \ sudo mv hits.tsv.gz ./kinetica-persist/ $CLI --file create.sql -$CLI --sql "ALTER TIER ram WITH OPTIONS ('capacity' = '27000000000');" +# Playground VMs have 16 GiB RAM total. The upstream 27 GB cap was +# sized for a host-mode benchmark machine; in the VM the RAM tier +# alone exceeds physical memory, kinetica's rank-1 worker gets +# OOM-killed mid-LOAD, and the load fails with +# [GPUdb]executeSql: Internal_Error: Rank 1 non-responsive +# Cap the RAM tier at 9 GB and rely on the on-disk tier for the rest. +$CLI --sql "ALTER TIER ram WITH OPTIONS ('capacity' = '9000000000');" $CLI --sql "load into hits from file paths 'hits.tsv.gz' format delimited text (INCLUDES HEADER=false, DELIMITER = '\t') WITH OPTIONS (NUM_TASKS_PER_RANK=16, ON ERROR=SKIP);" From 66e03f2244262e9ff556d97a28a7745fe4f8f738 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 16 May 2026 02:20:40 +0000 Subject: [PATCH 216/221] umbra: dump memory + swap + container cgroup state on start MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every OOM during ./load just printed psql:create.sql:109: ERROR: unable to allocate memory and we couldn't tell whether the agent's mkswap+swapon actually ran, whether the container saw the swap, or whether the sysctl tweaks (overcommit_memory=1, max_map_count, swappiness) stuck. With umbra in NEEDS_SWAP and a 256 GiB swap.raw attached, OOM shouldn't be possible — but it is, so dump enough state at the end of ./start that the next failure tells us where to look. Co-Authored-By: Claude Opus 4.7 --- umbra/start | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/umbra/start b/umbra/start index eafb1ebcb4..077e0cc54c 100755 --- a/umbra/start +++ b/umbra/start @@ -37,6 +37,32 @@ sudo docker run -d --name umbradb \ # Container needs a moment before psql can connect. for _ in $(seq 1 60); do - PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1 && exit 0 + if PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres \ + -c 'SELECT 1' >/dev/null 2>&1; then + # Diagnostic dump so a future OOM during load lands with the + # memory/swap state of the VM in the provision log. Previously + # silent — every "unable to allocate memory" failure looked + # the same and we couldn't tell whether the agent's mkswap+ + # swapon ran, whether the container saw the swap, or whether + # the sysctl tweaks above stuck. + echo "=== umbra: VM memory state ===" + free -h || true + echo "=== umbra: swap state ===" + swapon --show=NAME,SIZE,USED,PRIO --bytes || true + echo "=== umbra: container memory cgroup ===" + sudo docker inspect umbradb --format \ + 'memory={{.HostConfig.Memory}} memory-swap={{.HostConfig.MemorySwap}}' || true + cgpath=$(sudo docker inspect umbradb --format '{{.State.Pid}}' 2>/dev/null | \ + xargs -I{} cat /proc/{}/cgroup 2>/dev/null | awk -F: '{print $NF}') + if [ -n "$cgpath" ]; then + for f in memory.max memory.swap.max memory.swap.current; do + p="/sys/fs/cgroup${cgpath}/$f" + [ -r "$p" ] && echo " $f = $(cat "$p")" + done + fi + echo "=== umbra: container procs ===" + sudo docker top umbradb -eo pid,vsz,rss,comm 2>&1 | head -10 + exit 0 + fi sleep 1 done From c3a674fdd0002288d6045f75d06434d3d1014693 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 16 May 2026 02:36:35 +0000 Subject: [PATCH 217/221] =?UTF-8?q?trino-datalake{,-partitioned}:=20bump?= =?UTF-8?q?=20BENCH=5FCHECK=5FTIMEOUT=201800=20=E2=86=92=203600?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trino's cold-start in the datalake configuration (hive catalog, S3 credentials shim) ran past the 1800s budget on the last provision. trino (non-datalake) and trino-partitioned snapshot fine on the same 900s default, so the slowdown is specific to the catalog/S3 config — give the cold path another 30 min and revisit with diagnostics if it still doesn't land. Co-Authored-By: Claude Opus 4.7 --- trino-datalake-partitioned/benchmark.sh | 2 +- trino-datalake/benchmark.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/trino-datalake-partitioned/benchmark.sh b/trino-datalake-partitioned/benchmark.sh index 359e26e9d6..64f93a400d 100755 --- a/trino-datalake-partitioned/benchmark.sh +++ b/trino-datalake-partitioned/benchmark.sh @@ -4,5 +4,5 @@ export BENCH_DOWNLOAD_SCRIPT="" export BENCH_DURABLE=yes # Trino bootstrap on a cold sysdisk pushes past the 300s default. -export BENCH_CHECK_TIMEOUT=1800 +export BENCH_CHECK_TIMEOUT=3600 exec ../lib/benchmark-common.sh diff --git a/trino-datalake/benchmark.sh b/trino-datalake/benchmark.sh index 359e26e9d6..64f93a400d 100755 --- a/trino-datalake/benchmark.sh +++ b/trino-datalake/benchmark.sh @@ -4,5 +4,5 @@ export BENCH_DOWNLOAD_SCRIPT="" export BENCH_DURABLE=yes # Trino bootstrap on a cold sysdisk pushes past the 300s default. -export BENCH_CHECK_TIMEOUT=1800 +export BENCH_CHECK_TIMEOUT=3600 exec ../lib/benchmark-common.sh From b9f4983fd161c4bcc7dc8fe0f642f7ba10fce690 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 16 May 2026 02:39:04 +0000 Subject: [PATCH 218/221] playground: per-system VM RAM override; bump umbra to 32 GiB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Umbra's COPY consistently ENOMEMs ~9 min into the load on the default 16 GiB VM, even with NEEDS_SWAP (256 GiB swap.raw active, overcommit_memory=1, no docker cgroup memory cap). The diagnostic dump confirmed swap is mounted and the container's memory.max / memory.swap.max are 'max', so the kernel isn't the one refusing — umbra's own allocator hits a wall at the working- set peak before the swap path can catch up. Add VM_MEM_OVERRIDES_MIB in systems.py and have vm_manager pull mem_size_mib from it (falling back to the host's vm_mem_mib). Bump umbra to 32 GiB; the COPY then finishes, the snapshot carries the warm working set, and restored queries don't pay reload cost. Co-Authored-By: Claude Opus 4.7 --- playground/server/systems.py | 15 +++++++++++++++ playground/server/vm_manager.py | 7 +++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/playground/server/systems.py b/playground/server/systems.py index e7c1743f58..16ec6532d5 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -137,6 +137,21 @@ } +# Per-system VM RAM override (MiB). Default is the host's VM_MEM_MIB +# (16 GiB), which suits nearly every system. Bumps live here. +VM_MEM_OVERRIDES_MIB: dict[str, int] = { + # umbra's COPY consistently ENOMEMs ~9 min into the load on a + # 16 GiB VM, even with NEEDS_SWAP / 256 GiB swap.raw / overcommit + # _memory=1 / no docker cgroup cap. Diagnostic dump confirms + # memory.max=max + memory.swap.max=max + swap mounted, so the + # failure is umbra's own allocator returning at the working-set + # peak before the kernel can reclaim. 32 GiB lets the COPY land; + # the snapshot carries the warmed working set into restored + # queries. + "umbra": 32 * 1024, +} + + @dataclass(frozen=True) class System: name: str diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index 8ab1b8e1dd..f8021a145b 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -35,7 +35,9 @@ from . import firecracker as fc from . import net -from .systems import NEEDS_SWAP, SWAP_SIZE_GB, SYSDISK_OVERRIDES_GB +from .systems import ( + NEEDS_SWAP, SWAP_SIZE_GB, SYSDISK_OVERRIDES_GB, VM_MEM_OVERRIDES_MIB, +) from .config import Config from .systems import System, DATALAKE_FILTERED @@ -515,9 +517,10 @@ async def _configure_boot(self, vm: VM, *, restore_snapshot: bool) -> None: "is_root_device": False, "is_read_only": False, }) + mem_mib = VM_MEM_OVERRIDES_MIB.get(vm.system.name, self.cfg.vm_mem_mib) await fc.put(sock, "/machine-config", { "vcpu_count": self.cfg.vm_vcpus, - "mem_size_mib": self.cfg.vm_mem_mib, + "mem_size_mib": mem_mib, "smt": False, }) await fc.put(sock, "/actions", {"action_type": "InstanceStart"}) From 3cf62958b2fb8f16e27cb8eb9328de820c0a4f24 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 16 May 2026 02:49:20 +0000 Subject: [PATCH 219/221] Revert "playground: per-system VM RAM override; bump umbra to 32 GiB" This reverts commit b9f4983fd161c4bcc7dc8fe0f642f7ba10fce690. --- playground/server/systems.py | 15 --------------- playground/server/vm_manager.py | 7 ++----- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/playground/server/systems.py b/playground/server/systems.py index 16ec6532d5..e7c1743f58 100644 --- a/playground/server/systems.py +++ b/playground/server/systems.py @@ -137,21 +137,6 @@ } -# Per-system VM RAM override (MiB). Default is the host's VM_MEM_MIB -# (16 GiB), which suits nearly every system. Bumps live here. -VM_MEM_OVERRIDES_MIB: dict[str, int] = { - # umbra's COPY consistently ENOMEMs ~9 min into the load on a - # 16 GiB VM, even with NEEDS_SWAP / 256 GiB swap.raw / overcommit - # _memory=1 / no docker cgroup cap. Diagnostic dump confirms - # memory.max=max + memory.swap.max=max + swap mounted, so the - # failure is umbra's own allocator returning at the working-set - # peak before the kernel can reclaim. 32 GiB lets the COPY land; - # the snapshot carries the warmed working set into restored - # queries. - "umbra": 32 * 1024, -} - - @dataclass(frozen=True) class System: name: str diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py index f8021a145b..8ab1b8e1dd 100644 --- a/playground/server/vm_manager.py +++ b/playground/server/vm_manager.py @@ -35,9 +35,7 @@ from . import firecracker as fc from . import net -from .systems import ( - NEEDS_SWAP, SWAP_SIZE_GB, SYSDISK_OVERRIDES_GB, VM_MEM_OVERRIDES_MIB, -) +from .systems import NEEDS_SWAP, SWAP_SIZE_GB, SYSDISK_OVERRIDES_GB from .config import Config from .systems import System, DATALAKE_FILTERED @@ -517,10 +515,9 @@ async def _configure_boot(self, vm: VM, *, restore_snapshot: bool) -> None: "is_root_device": False, "is_read_only": False, }) - mem_mib = VM_MEM_OVERRIDES_MIB.get(vm.system.name, self.cfg.vm_mem_mib) await fc.put(sock, "/machine-config", { "vcpu_count": self.cfg.vm_vcpus, - "mem_size_mib": mem_mib, + "mem_size_mib": self.cfg.vm_mem_mib, "smt": False, }) await fc.put(sock, "/actions", {"action_type": "InstanceStart"}) From c19ca4dcaebc46094114b02b7b5814ae0033dde7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 16 May 2026 02:53:42 +0000 Subject: [PATCH 220/221] umbra: unlimited memlock + dump sysctl/memlock in start diagnostics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Diagnostic dump from the last failure showed swap mounted, swap unused, container memory.max/swap.max both 'max'. The remaining hypothesis for the ENOMEM is umbra calling mlock() on a chunk bigger than the 8 MiB RLIMIT_MEMLOCK we explicitly set — mlock returns ENOMEM independent of how much swap is available, since locked pages by definition can't be paged out. - Switch the docker --ulimit from memlock=8388608 to memlock=-1 (unlimited). - Also dump vm.overcommit_memory / .swappiness / .max_map_count and the container's effective `ulimit -l` so the next failure conclusively tells us whether the sysctl tweaks stuck and what the container actually sees. Co-Authored-By: Claude Opus 4.7 --- umbra/start | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/umbra/start b/umbra/start index 077e0cc54c..097802346f 100755 --- a/umbra/start +++ b/umbra/start @@ -32,7 +32,7 @@ sudo docker run -d --name umbradb \ -v "$(pwd)/data:/data" \ -p 5432:5432 \ --ulimit nofile=1048576:1048576 \ - --ulimit memlock=8388608:8388608 \ + --ulimit memlock=-1:-1 \ umbradb/umbra:latest >/dev/null # Container needs a moment before psql can connect. @@ -49,9 +49,16 @@ for _ in $(seq 1 60); do free -h || true echo "=== umbra: swap state ===" swapon --show=NAME,SIZE,USED,PRIO --bytes || true + echo "=== umbra: sysctl ===" + for k in vm.overcommit_memory vm.swappiness vm.max_map_count \ + vm.overcommit_ratio; do + echo " $k = $(sysctl -n $k 2>/dev/null)" + done echo "=== umbra: container memory cgroup ===" sudo docker inspect umbradb --format \ 'memory={{.HostConfig.Memory}} memory-swap={{.HostConfig.MemorySwap}}' || true + echo "=== umbra: container memlock ulimit ===" + sudo docker exec umbradb sh -c 'ulimit -l' 2>&1 || true cgpath=$(sudo docker inspect umbradb --format '{{.State.Pid}}' 2>/dev/null | \ xargs -I{} cat /proc/{}/cgroup 2>/dev/null | awk -F: '{print $NF}') if [ -n "$cgpath" ]; then From 4fddcde1ed61b3a419709df82964c52430d85cef Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 16 May 2026 03:03:36 +0000 Subject: [PATCH 221/221] trino-partitioned, presto-partitioned: bind-mount datasets_ro into container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `./load` does `ln -f hits_*.parquet data/hits/` to populate the hive `external_location`. With the agent now staging partitioned parquet as symlinks at cwd pointing to /opt/clickbench/datasets_ro/hits_partitioned/hits_N.parquet, GNU ln's default behavior (`-P`) creates a hardlink to the SYMLINK inode rather than dereferencing — so `data/hits/hits_N.parquet` is a hardlink to a symlink whose target is an absolute host-VM path the container can't see. Inside the trino/presto container the symlinks all dangle, the hive external_location appears empty, and queries return 0 rows. Add `-v /opt/clickbench/datasets_ro:/opt/clickbench/datasets_ro:ro` to both containers so the absolute symlink targets resolve from inside the container too. Co-Authored-By: Claude Opus 4.7 --- presto-partitioned/start | 1 + trino-partitioned/start | 1 + 2 files changed, 2 insertions(+) diff --git a/presto-partitioned/start b/presto-partitioned/start index 92bbe10997..125a4ca19d 100755 --- a/presto-partitioned/start +++ b/presto-partitioned/start @@ -17,4 +17,5 @@ sudo docker run -d --name presto \ -v "$PWD/etc/jvm.config:/opt/presto-server/etc/jvm.config:ro" \ -v "$PWD/etc/config.properties:/opt/presto-server/etc/config.properties:ro" \ -v "$PWD/data:/clickbench" \ + -v "/opt/clickbench/datasets_ro:/opt/clickbench/datasets_ro:ro" \ prestodb/presto:${PRESTO_VERSION} diff --git a/trino-partitioned/start b/trino-partitioned/start index da87d704b4..b07580d780 100755 --- a/trino-partitioned/start +++ b/trino-partitioned/start @@ -13,4 +13,5 @@ sudo docker run -d --name trino \ -p 8080:8080 \ -v "$PWD/etc/catalog/hive.properties:/etc/trino/catalog/hive.properties:ro" \ -v "$PWD/data:/clickbench" \ + -v "/opt/clickbench/datasets_ro:/opt/clickbench/datasets_ro:ro" \ trinodb/trino:latest