diff --git a/byconity/benchmark.sh b/byconity/benchmark.sh index 0450372b8a..b71ae2462e 100755 --- a/byconity/benchmark.sh +++ b/byconity/benchmark.sh @@ -7,4 +7,10 @@ export BENCH_DURABLE=yes # dependency, so the worst-case cold start is several minutes; the # lib's 300s default has timed out before the server is up. export BENCH_CHECK_TIMEOUT=1200 +# After firecracker snapshot+restore the cluster's +# internal connections (brpc/gossip) are stale; ./start's +# shallow health probe doesn't notice and short-circuits. +# Tell the playground agent to ./stop the cluster before +# ./start so the next bring-up is from a clean state. +export PLAYGROUND_RESTART_AFTER_RESTORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/cedardb-parquet/start b/cedardb-parquet/start index 981f23f221..f12fcd366d 100755 --- a/cedardb-parquet/start +++ b/cedardb-parquet/start @@ -28,10 +28,15 @@ if ! sudo docker run -d --rm -p 5432:5432 \ exit 1 fi -for _ in $(seq 1 60); do +# First-boot initdb inside the container takes well over a minute +# (observed ~90-120 s of "Fixing permissions"/"Setting up database +# directory" before postgres actually listens). Give it 10 min — +# pg_isready exits fast once the daemon is up, so this only +# matters in the failure path. +for _ in $(seq 1 600); do pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1 && exit 0 sleep 1 done -echo "cedardb did not become ready in 60 s; container logs:" >&2 +echo "cedardb did not become ready in 600 s; container logs:" >&2 sudo docker logs cedardb 2>&1 | tail -40 >&2 || true exit 1 diff --git a/cedardb/start b/cedardb/start index b6c3bbfe07..6bd35d176f 100755 --- a/cedardb/start +++ b/cedardb/start @@ -30,10 +30,15 @@ if ! sudo docker run -d --rm -p 5432:5432 \ exit 1 fi -for _ in $(seq 1 60); do +# First-boot initdb inside the container can run for well over a +# minute (observed ~90-120 s of "Fixing permissions"/"Setting up +# database directory" before postgres actually listens). Older +# 60 s budget bailed during that phase. Give it 10 min — pg_isready +# exits fast once the daemon is up so this only matters on failure. +for _ in $(seq 1 600); do pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1 && exit 0 sleep 1 done -echo "cedardb did not become ready in 60 s; container logs:" >&2 +echo "cedardb did not become ready in 600 s; container logs:" >&2 sudo docker logs cedardb 2>&1 | tail -40 >&2 || true exit 1 diff --git a/chdb-dataframe/benchmark.sh b/chdb-dataframe/benchmark.sh index 6bf667e4f0..4148581913 100755 --- a/chdb-dataframe/benchmark.sh +++ b/chdb-dataframe/benchmark.sh @@ -2,4 +2,9 @@ # Thin shim — actual flow is in lib/benchmark-common.sh. export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" export BENCH_DURABLE=no +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon.
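+# (How the playground flags are consumed is visible later in this +# diff: the in-VM agent greps benchmark.sh statically for top-level +# `export VAR=value` assignments via _bench_var/_BENCH_VAR_RE in +# playground/agent/agent.py, so this must stay a plain, unconditional +# export line.)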
+export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/clickhouse-web/create.sql b/clickhouse-web/create.sql index 4e687ef61f..3ec2451dc7 100644 --- a/clickhouse-web/create.sql +++ b/clickhouse-web/create.sql @@ -108,5 +108,5 @@ ATTACH TABLE hits UUID 'c449dfbf-ba06-4d13-abec-8396559eb955' PRIMARY KEY (CounterID, EventDate, UserID, EventTime, WatchID) ) ENGINE = MergeTree -SETTINGS disk = disk(type = cache, path = '/dev/shm/clickhouse/', max_size_ratio_to_total_space = 0.9, +SETTINGS disk = disk(type = cache, path = '/var/lib/clickhouse/caches/web/', max_size_ratio_to_total_space = 0.9, disk = disk(type = web, endpoint = 'https://clickhouse-public-datasets.s3.amazonaws.com/web/')); diff --git a/clickhouse-web/install b/clickhouse-web/install index eb23629536..75715b33c6 100755 --- a/clickhouse-web/install +++ b/clickhouse-web/install @@ -10,6 +10,18 @@ if [ ! -x /usr/bin/clickhouse ]; then sudo ./clickhouse install --noninteractive fi -# Cache directory used by the web disk. -sudo mkdir -p /dev/shm/clickhouse -sudo chown clickhouse:clickhouse /dev/shm/clickhouse +# Cache directory used by the web disk. ClickHouse rejects any +# filesystem-cache path outside /var/lib/clickhouse/caches/ with +# BAD_ARGUMENTS at CREATE TABLE time, but we still want the actual +# bytes to live in tmpfs (/dev/shm) for the speed: cold queries +# pull ~1 GB on first run and tmpfs avoids touching the host SSD. +# +# Newer ClickHouse versions canonicalise the path before the policy +# check, so the older symlink trick (caches/web → /dev/shm/...) is +# rejected with BAD_ARGUMENTS. Bind-mount tmpfs at the +# policy-acceptable path instead — to CH the cache dir *is* +# /var/lib/clickhouse/caches/web with no symlink to resolve. +sudo mkdir -p /dev/shm/clickhouse /var/lib/clickhouse/caches/web +sudo chown clickhouse:clickhouse /dev/shm/clickhouse /var/lib/clickhouse/caches/web +sudo mount --bind /dev/shm/clickhouse /var/lib/clickhouse/caches/web +sudo chown clickhouse:clickhouse /var/lib/clickhouse/caches/web diff --git a/daft-parquet-partitioned/benchmark.sh b/daft-parquet-partitioned/benchmark.sh index 1495c0bf62..024c58fe16 100755 --- a/daft-parquet-partitioned/benchmark.sh +++ b/daft-parquet-partitioned/benchmark.sh @@ -2,4 +2,9 @@ # Thin shim — actual flow is in lib/benchmark-common.sh. export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" export BENCH_DURABLE=no +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/daft-parquet/.preserve-state b/daft-parquet/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/daft-parquet/benchmark.sh b/daft-parquet/benchmark.sh index 6bf667e4f0..4148581913 100755 --- a/daft-parquet/benchmark.sh +++ b/daft-parquet/benchmark.sh @@ -2,4 +2,9 @@ # Thin shim — actual flow is in lib/benchmark-common.sh. export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" export BENCH_DURABLE=no +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. 
+export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/duckdb-dataframe/.preserve-state b/duckdb-dataframe/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/duckdb-dataframe/benchmark.sh b/duckdb-dataframe/benchmark.sh index 6bf667e4f0..4148581913 100755 --- a/duckdb-dataframe/benchmark.sh +++ b/duckdb-dataframe/benchmark.sh @@ -2,4 +2,9 @@ # Thin shim — actual flow is in lib/benchmark-common.sh. export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" export BENCH_DURABLE=no +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/firebolt-parquet-partitioned/benchmark.sh b/firebolt-parquet-partitioned/benchmark.sh index 0e6a62ae64..dbb9072c56 100755 --- a/firebolt-parquet-partitioned/benchmark.sh +++ b/firebolt-parquet-partitioned/benchmark.sh @@ -1,49 +1,6 @@ #!/bin/bash - -# Download the partitioned hits parquet files -echo "Downloading dataset..." -rm -rf data -../lib/download-hits-parquet-partitioned data - -# Start the container -sudo apt-get install -y docker.io jq -sudo docker run -dit --name firebolt-core --rm \ - --ulimit memlock=8589934592:8589934592 \ - --security-opt seccomp=unconfined \ - -p 127.0.0.1:3473:3473 \ - -v /firebolt-core/volume \ - -v ./data/:/firebolt-core/clickbench \ - ghcr.io/firebolt-db/firebolt-core:preview-rc - -# See firebolt/benchmark.sh — the old curl-and-break pattern accepted the -# "Cluster not yet healthy" JSON error body as success. -for _ in {1..600} -do - if curl -sS "http://localhost:3473/" \ - --data-binary "SELECT 'Firebolt is ready';" 2>/dev/null \ - | grep -q "Firebolt is ready"; then - break - fi - sleep 1 -done - -# Create the database and external table -echo "Creating external table..." -curl -sS "http://localhost:3473/?enable_multi_query_requests=true" --data-binary "DROP DATABASE IF EXISTS clickbench;CREATE DATABASE clickbench;" -curl -sS "http://localhost:3473/?database=clickbench&enable_multi_query_requests=true" --data-binary @create.sql - -# Print statistics -DATA_SIZE=$(du -bcs data/hits_*.parquet 2>/dev/null | grep total | awk '{print $1}') -if [ -z "$DATA_SIZE" ]; then - DATA_SIZE=$(du -cs data/hits_*.parquet | grep total | awk '{print $1}') -fi -echo "Load time: 0" -echo "Data size: $DATA_SIZE" - -# Run the benchmark -echo "Running the benchmark..." -./run.sh - -# Stop the container and remove the data -sudo docker container stop firebolt-core -rm -rf data +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" +export BENCH_DURABLE=no +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/firebolt-parquet-partitioned/check b/firebolt-parquet-partitioned/check new file mode 100755 index 0000000000..862722f602 --- /dev/null +++ b/firebolt-parquet-partitioned/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# Firebolt-core's HTTP port answers immediately but may return a +# cluster-not-ready JSON error at HTTP 200. Test for an actual result. 
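+# (Assumption behind the grep below: the default output format prints +# the bare value, so a healthy reply's first line starts with the +# digit 1, while the cluster-not-ready JSON body starts with '{'.)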
+curl -sSf --max-time 5 'http://localhost:3473/' \ + --data-binary 'SELECT 1;' 2>/dev/null | grep -q '^1' diff --git a/firebolt-parquet-partitioned/data-size b/firebolt-parquet-partitioned/data-size new file mode 100755 index 0000000000..b5fe999ff8 --- /dev/null +++ b/firebolt-parquet-partitioned/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# Firebolt-core writes its database state under /firebolt-core/volume +# inside the container, which we bind-mount to ./fb-volume on the host. +du -bcs fb-volume 2>/dev/null | awk '/total$/ { print $1 }' diff --git a/firebolt-parquet-partitioned/install b/firebolt-parquet-partitioned/install new file mode 100755 index 0000000000..38799727d9 --- /dev/null +++ b/firebolt-parquet-partitioned/install @@ -0,0 +1,6 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io jq +sudo docker pull ghcr.io/firebolt-db/firebolt-core:preview-rc diff --git a/firebolt-parquet-partitioned/load b/firebolt-parquet-partitioned/load new file mode 100755 index 0000000000..e309c8968f --- /dev/null +++ b/firebolt-parquet-partitioned/load @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +# Partitioned-parquet variant: stage hits_*.parquet under ./data so +# the container sees them at /firebolt-core/clickbench/*.parquet; +# create.sql declares an external table with FROM PATTERN that +# matches the glob. +mkdir -p data +shopt -s nullglob +for f in hits_*.parquet; do + mv -f "$f" "data/$f" +done +shopt -u nullglob + +curl -sSf 'http://localhost:3473/?enable_multi_query_requests=true' \ + --data-binary 'DROP DATABASE IF EXISTS clickbench;CREATE DATABASE clickbench;' +curl -sSf 'http://localhost:3473/?database=clickbench&enable_multi_query_requests=true' \ + --data-binary @create.sql + +sync diff --git a/firebolt-parquet-partitioned/query b/firebolt-parquet-partitioned/query new file mode 100755 index 0000000000..910591e6b8 --- /dev/null +++ b/firebolt-parquet-partitioned/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it against the firebolt-core +# container via /?database=clickbench. +# Stdout: query result (firebolt's JSON_Compact format). +# Stderr: query runtime in fractional seconds on the last line, +# pulled from the response's `.statistics.elapsed`. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Result + sub-result caches off so timings are real; output_format +# matches what firebolt's run.sh uses for the public benchmark. +PARAMS='database=clickbench&enable_result_cache=false&enable_subresult_cache=false&enable_scan_cache=false&output_format=JSON_Compact' + +resp=$(curl -sS --max-time 600 "http://localhost:3473/?${PARAMS}" \ + --data-binary "$query") + +# Firebolt returns a JSON object whether the query succeeded or not. +# A failed query has an "errors" key; a successful one carries +# "data" + "statistics". +if printf '%s' "$resp" | jq -e '.errors' >/dev/null 2>&1; then + printf '%s\n' "$resp" >&2 + exit 1 +fi + +printf '%s\n' "$resp" +printf '%s\n' "$resp" | jq -r '.statistics.elapsed' >&2 diff --git a/firebolt-parquet-partitioned/run.sh b/firebolt-parquet-partitioned/run.sh deleted file mode 100755 index 9b810c99ea..0000000000 --- a/firebolt-parquet-partitioned/run.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# Disable the result and subresult caches. -QUERY_PARAMS="enable_result_cache=false&enable_subresult_cache=false&output_format=JSON_Compact" - -cat queries.sql | while read -r query; do - # Firebolt is a database with local on-disk storage: drop the page cache before the first run of each query. 
- sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - # Run the query three times. - # Extract the elapsed time from the response's statistics. - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo -n "[${ELAPSED}" - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo -n ",${ELAPSED}" - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo ",${ELAPSED}]," -done diff --git a/firebolt-parquet-partitioned/start b/firebolt-parquet-partitioned/start new file mode 100755 index 0000000000..652146394e --- /dev/null +++ b/firebolt-parquet-partitioned/start @@ -0,0 +1,67 @@ +#!/bin/bash +set -eu + +# Idempotent: if firebolt-core already answers SELECT 1, do nothing. +if curl -sS --max-time 5 'http://localhost:3473/' \ + --data-binary 'SELECT 1;' 2>/dev/null | grep -q '^1'; then + exit 0 +fi + +mkdir -p data fb-volume +# firebolt-core runs as UID/GID 1111 inside the container and refuses +# to start if its data dir is not writeable by that uid (the engine +# self-checks and aborts with "directory ... is not readable or +# writeable by the Firebolt Core process"). Set the host-side +# ownership accordingly so the bind-mounted dir is usable. +sudo chown 1111:1111 fb-volume + +# If the container exists (stopped from a prior agent pre-snapshot +# cycle), just start it back — the data lives on the bind-mounted +# fb-volume below, so the previously-created `clickbench` database +# is still there. Otherwise create the container fresh. +if sudo docker ps -a --format '{{.Names}}' | grep -qx firebolt-core; then + sudo docker start firebolt-core >/dev/null +else + # `firebolt-core` is the public self-hosted image. Container needs + # memlock 8 GiB and seccomp unconfined per upstream's run docs. + # /firebolt-core/clickbench: parquet source (read at load time). + # /firebolt-core/volume: engine data directory (must persist + # across the agent's pre-snapshot + # stop+start cycle or the snapshot + # ships an empty DB). + sudo docker run -dit --name firebolt-core \ + --ulimit memlock=8589934592:8589934592 \ + --security-opt seccomp=unconfined \ + -p 127.0.0.1:3473:3473 \ + -v "$(pwd)/fb-volume:/firebolt-core/volume" \ + -v "$(pwd)/data:/firebolt-core/clickbench" \ + ghcr.io/firebolt-db/firebolt-core:preview-rc >/dev/null +fi + +# Wait for the cluster to be "actually" ready. firebolt-core's HTTP +# port comes up immediately but returns +# {"errors":[{"description":"Cluster not yet healthy: ..."}]} +# at HTTP 200 until the engine threads have warmed; bench against a +# sentinel string instead of HTTP status to avoid that trap. 
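+# (An equivalent probe would POST 'SELECT 1' and run `jq -e '.errors'` +# on the body, as ./query does; matching a sentinel string keeps this +# readiness loop dependent on grep alone.)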
+for _ in $(seq 1 600); do + if curl -sS --max-time 5 'http://localhost:3473/' \ + --data-binary "SELECT 'firebolt-ready';" 2>/dev/null \ + | grep -q 'firebolt-ready'; then + exit 0 + fi + sleep 1 +done +{ + echo "firebolt-core did not become healthy in 10 min" + echo "=== docker ps -a ===" + sudo docker ps -a 2>&1 + echo "=== docker inspect firebolt-core (state) ===" + sudo docker inspect firebolt-core --format '{{json .State}}' 2>&1 + echo "=== docker logs firebolt-core --tail 50 ===" + sudo docker logs firebolt-core --tail 50 2>&1 + echo "=== curl http://localhost:3473/ ===" + curl -sS --max-time 3 'http://localhost:3473/' --data-binary 'SELECT 1' 2>&1 + echo "=== ss listeners ===" + sudo ss -lntp 2>&1 | head -20 +} >&2 +exit 1 diff --git a/firebolt-parquet-partitioned/stop b/firebolt-parquet-partitioned/stop new file mode 100755 index 0000000000..ac1834f7d6 --- /dev/null +++ b/firebolt-parquet-partitioned/stop @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# Plain stop — leave the container in place so its bind-mounted +# fb-volume keeps the loaded database for the next ./start. The +# container is removed and the volume re-initialised only on +# explicit re-provision. +sudo docker container stop firebolt-core >/dev/null 2>&1 || true diff --git a/firebolt-parquet/benchmark.sh b/firebolt-parquet/benchmark.sh index 737a3ca865..3a332d30db 100755 --- a/firebolt-parquet/benchmark.sh +++ b/firebolt-parquet/benchmark.sh @@ -1,46 +1,6 @@ #!/bin/bash - -# Download the hits.parquet file -echo "Downloading dataset..." -rm -rf data -../lib/download-hits-parquet-single data - -# Start the container -sudo apt-get install -y docker.io jq -sudo docker run -dit --name firebolt-core --rm \ - --ulimit memlock=8589934592:8589934592 \ - --security-opt seccomp=unconfined \ - -p 127.0.0.1:3473:3473 \ - -v /firebolt-core/volume \ - -v ./data/:/firebolt-core/clickbench \ - ghcr.io/firebolt-db/firebolt-core:preview-rc - -# See firebolt/benchmark.sh — the old curl-and-break pattern accepted the -# "Cluster not yet healthy" JSON error body as success. -for _ in {1..600} -do - if curl -sS "http://localhost:3473/" \ - --data-binary "SELECT 'Firebolt is ready';" 2>/dev/null \ - | grep -q "Firebolt is ready"; then - break - fi - sleep 1 -done - -# Create the database and external table -echo "Creating external table..." -curl -sS "http://localhost:3473/?enable_multi_query_requests=true" --data-binary "DROP DATABASE IF EXISTS clickbench;CREATE DATABASE clickbench;" -curl -sS "http://localhost:3473/?database=clickbench&enable_multi_query_requests=true" --data-binary @create.sql - -# Print statistics -DATA_SIZE=$(stat -c%s data/hits.parquet 2>/dev/null || stat -f%z data/hits.parquet) -echo "Load time: 0" -echo "Data size: $DATA_SIZE" - -# Run the benchmark -echo "Running the benchmark..." -./run.sh - -# Stop the container and remove the data -sudo docker container stop firebolt-core -rm -rf data +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_DURABLE=no +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/firebolt-parquet/check b/firebolt-parquet/check new file mode 100755 index 0000000000..862722f602 --- /dev/null +++ b/firebolt-parquet/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# Firebolt-core's HTTP port answers immediately but may return a +# cluster-not-ready JSON error at HTTP 200. Test for an actual result. 
+curl -sSf --max-time 5 'http://localhost:3473/' \ + --data-binary 'SELECT 1;' 2>/dev/null | grep -q '^1' diff --git a/firebolt-parquet/data-size b/firebolt-parquet/data-size new file mode 100755 index 0000000000..b5fe999ff8 --- /dev/null +++ b/firebolt-parquet/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# Firebolt-core writes its database state under /firebolt-core/volume +# inside the container, which we bind-mount to ./fb-volume on the host. +du -bcs fb-volume 2>/dev/null | awk '/total$/ { print $1 }' diff --git a/firebolt-parquet/install b/firebolt-parquet/install new file mode 100755 index 0000000000..38799727d9 --- /dev/null +++ b/firebolt-parquet/install @@ -0,0 +1,6 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io jq +sudo docker pull ghcr.io/firebolt-db/firebolt-core:preview-rc diff --git a/firebolt-parquet/load b/firebolt-parquet/load new file mode 100755 index 0000000000..8d3886ee9d --- /dev/null +++ b/firebolt-parquet/load @@ -0,0 +1,17 @@ +#!/bin/bash +set -eu + +# Parquet variant: data stays in ./data (mounted as +# /firebolt-core/clickbench in the container), create.sql declares +# an external table that reads it on every query. +mkdir -p data +if [ -f hits.parquet ]; then + mv -f hits.parquet data/hits.parquet +fi + +curl -sSf 'http://localhost:3473/?enable_multi_query_requests=true' \ + --data-binary 'DROP DATABASE IF EXISTS clickbench;CREATE DATABASE clickbench;' +curl -sSf 'http://localhost:3473/?database=clickbench&enable_multi_query_requests=true' \ + --data-binary @create.sql + +sync diff --git a/firebolt-parquet/query b/firebolt-parquet/query new file mode 100755 index 0000000000..910591e6b8 --- /dev/null +++ b/firebolt-parquet/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it against the firebolt-core +# container via /?database=clickbench. +# Stdout: query result (firebolt's JSON_Compact format). +# Stderr: query runtime in fractional seconds on the last line, +# pulled from the response's `.statistics.elapsed`. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Result + sub-result caches off so timings are real; output_format +# matches what firebolt's run.sh uses for the public benchmark. +PARAMS='database=clickbench&enable_result_cache=false&enable_subresult_cache=false&enable_scan_cache=false&output_format=JSON_Compact' + +resp=$(curl -sS --max-time 600 "http://localhost:3473/?${PARAMS}" \ + --data-binary "$query") + +# Firebolt returns a JSON object whether the query succeeded or not. +# A failed query has an "errors" key; a successful one carries +# "data" + "statistics". +if printf '%s' "$resp" | jq -e '.errors' >/dev/null 2>&1; then + printf '%s\n' "$resp" >&2 + exit 1 +fi + +printf '%s\n' "$resp" +printf '%s\n' "$resp" | jq -r '.statistics.elapsed' >&2 diff --git a/firebolt-parquet/run.sh b/firebolt-parquet/run.sh deleted file mode 100755 index 9b810c99ea..0000000000 --- a/firebolt-parquet/run.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# Disable the result and subresult caches. -QUERY_PARAMS="enable_result_cache=false&enable_subresult_cache=false&output_format=JSON_Compact" - -cat queries.sql | while read -r query; do - # Firebolt is a database with local on-disk storage: drop the page cache before the first run of each query. - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - # Run the query three times. - # Extract the elapsed time from the response's statistics. 
- ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo -n "[${ELAPSED}" - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo -n ",${ELAPSED}" - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo ",${ELAPSED}]," -done diff --git a/firebolt-parquet/start b/firebolt-parquet/start new file mode 100755 index 0000000000..652146394e --- /dev/null +++ b/firebolt-parquet/start @@ -0,0 +1,67 @@ +#!/bin/bash +set -eu + +# Idempotent: if firebolt-core already answers SELECT 1, do nothing. +if curl -sS --max-time 5 'http://localhost:3473/' \ + --data-binary 'SELECT 1;' 2>/dev/null | grep -q '^1'; then + exit 0 +fi + +mkdir -p data fb-volume +# firebolt-core runs as UID/GID 1111 inside the container and refuses +# to start if its data dir is not writeable by that uid (the engine +# self-checks and aborts with "directory ... is not readable or +# writeable by the Firebolt Core process"). Set the host-side +# ownership accordingly so the bind-mounted dir is usable. +sudo chown 1111:1111 fb-volume + +# If the container exists (stopped from a prior agent pre-snapshot +# cycle), just start it back — the data lives on the bind-mounted +# fb-volume below, so the previously-created `clickbench` database +# is still there. Otherwise create the container fresh. +if sudo docker ps -a --format '{{.Names}}' | grep -qx firebolt-core; then + sudo docker start firebolt-core >/dev/null +else + # `firebolt-core` is the public self-hosted image. Container needs + # memlock 8 GiB and seccomp unconfined per upstream's run docs. + # /firebolt-core/clickbench: parquet source (read at load time). + # /firebolt-core/volume: engine data directory (must persist + # across the agent's pre-snapshot + # stop+start cycle or the snapshot + # ships an empty DB). + sudo docker run -dit --name firebolt-core \ + --ulimit memlock=8589934592:8589934592 \ + --security-opt seccomp=unconfined \ + -p 127.0.0.1:3473:3473 \ + -v "$(pwd)/fb-volume:/firebolt-core/volume" \ + -v "$(pwd)/data:/firebolt-core/clickbench" \ + ghcr.io/firebolt-db/firebolt-core:preview-rc >/dev/null +fi + +# Wait for the cluster to be "actually" ready. firebolt-core's HTTP +# port comes up immediately but returns +# {"errors":[{"description":"Cluster not yet healthy: ..."}]} +# at HTTP 200 until the engine threads have warmed; bench against a +# sentinel string instead of HTTP status to avoid that trap. 
+for _ in $(seq 1 600); do + if curl -sS --max-time 5 'http://localhost:3473/' \ + --data-binary "SELECT 'firebolt-ready';" 2>/dev/null \ + | grep -q 'firebolt-ready'; then + exit 0 + fi + sleep 1 +done +{ + echo "firebolt-core did not become healthy in 10 min" + echo "=== docker ps -a ===" + sudo docker ps -a 2>&1 + echo "=== docker inspect firebolt-core (state) ===" + sudo docker inspect firebolt-core --format '{{json .State}}' 2>&1 + echo "=== docker logs firebolt-core --tail 50 ===" + sudo docker logs firebolt-core --tail 50 2>&1 + echo "=== curl http://localhost:3473/ ===" + curl -sS --max-time 3 'http://localhost:3473/' --data-binary 'SELECT 1' 2>&1 + echo "=== ss listeners ===" + sudo ss -lntp 2>&1 | head -20 +} >&2 +exit 1 diff --git a/firebolt-parquet/stop b/firebolt-parquet/stop new file mode 100755 index 0000000000..ac1834f7d6 --- /dev/null +++ b/firebolt-parquet/stop @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# Plain stop — leave the container in place so its bind-mounted +# fb-volume keeps the loaded database for the next ./start. The +# container is removed and the volume re-initialised only on +# explicit re-provision. +sudo docker container stop firebolt-core >/dev/null 2>&1 || true diff --git a/firebolt/benchmark.sh b/firebolt/benchmark.sh index f27bc92f71..617422ddc2 100755 --- a/firebolt/benchmark.sh +++ b/firebolt/benchmark.sh @@ -1,63 +1,6 @@ #!/bin/bash - -# Download the hits.parquet file -echo "Downloading dataset..." -rm -rf data -../lib/download-hits-parquet-single data - -# Start the container -sudo apt-get install -y docker.io jq -sudo docker run -dit --name firebolt-core --rm \ - --ulimit memlock=8589934592:8589934592 \ - --security-opt seccomp=unconfined \ - -p 127.0.0.1:3473:3473 \ - -v /firebolt-core/volume \ - -v ./data/:/firebolt-core/clickbench \ - ghcr.io/firebolt-db/firebolt-core:preview-rc - -# Wait until Firebolt is ready. The old loop just did -# curl -s ... > /dev/null && break -# which treated any HTTP response as success, including the JSON error -# body -# {"errors":[{"description":"Cluster not yet healthy: ..."}]} -# that Firebolt returns at HTTP 200 while the container is still -# warming up. The loop exited on the first reply, the next -# CREATE TABLE / queries all hit the same "Cluster not yet healthy" -# error, and every query got recorded as "elapsed":0.0 — sink.parser -# then rejected the run for having no timing > 0.1 s, which is why -# Firebolt stopped showing up in sink.results after 2026-02-21 -# despite the bench completing 43/43 each time. -for _ in {1..600} -do - if curl -sS "http://localhost:3473/" \ - --data-binary "SELECT 'Firebolt is ready';" 2>/dev/null \ - | grep -q "Firebolt is ready"; then - break - fi - sleep 1 -done - -# Ingest the data -echo "Ingesting the data..." 
-curl -s "http://localhost:3473/?enable_multi_query_requests=true" --data-binary "DROP DATABASE IF EXISTS clickbench;CREATE DATABASE clickbench;" -LOAD_TIME=$(curl -w "%{time_total}\n" -s "http://localhost:3473/?database=clickbench&enable_multi_query_requests=true" --data-binary @create.sql) - -# Print statistics -COMPRESSED_SIZE=$(curl -s "http://localhost:3473/?database=clickbench&output_format=JSON_Compact" --data-binary "SELECT compressed_bytes FROM information_schema.tables WHERE table_name = 'hits';" | jq '.data[0][0] | tonumber') -UNCOMPRESSED_SIZE=$(curl -s "http://localhost:3473/?database=clickbench&output_format=JSON_Compact" --data-binary "SELECT uncompressed_bytes FROM information_schema.tables WHERE table_name = 'hits';" | jq '.data[0][0] | tonumber') -echo "Load time: $LOAD_TIME" -echo "Data size: $COMPRESSED_SIZE" -echo "Uncompressed data size: $UNCOMPRESSED_SIZE bytes" - -if [ "$1" != "" ] && [ "$1" != "scan-cache" ]; then - echo "Error: command line argument must be one of {'', 'scan-cache'}" - exit 1 -fi - -# Run the benchmark -echo "Running the benchmark..." -./run.sh "$1" - -# Stop the container and remove the data -sudo docker container stop firebolt-core -rm -rf data +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_DURABLE=yes +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/firebolt/check b/firebolt/check new file mode 100755 index 0000000000..862722f602 --- /dev/null +++ b/firebolt/check @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +# Firebolt-core's HTTP port answers immediately but may return a +# cluster-not-ready JSON error at HTTP 200. Test for an actual result. +curl -sSf --max-time 5 'http://localhost:3473/' \ + --data-binary 'SELECT 1;' 2>/dev/null | grep -q '^1' diff --git a/firebolt/data-size b/firebolt/data-size new file mode 100755 index 0000000000..b5fe999ff8 --- /dev/null +++ b/firebolt/data-size @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +# Firebolt-core writes its database state under /firebolt-core/volume +# inside the container, which we bind-mount to ./fb-volume on the host. +du -bcs fb-volume 2>/dev/null | awk '/total$/ { print $1 }' diff --git a/firebolt/install b/firebolt/install new file mode 100755 index 0000000000..38799727d9 --- /dev/null +++ b/firebolt/install @@ -0,0 +1,6 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io jq +sudo docker pull ghcr.io/firebolt-db/firebolt-core:preview-rc diff --git a/firebolt/load b/firebolt/load new file mode 100755 index 0000000000..ba2864fe32 --- /dev/null +++ b/firebolt/load @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu + +# Stage hits.parquet where the container can see it (./data is +# bind-mounted as /firebolt-core/clickbench). +mkdir -p data +if [ -f hits.parquet ]; then + mv -f hits.parquet data/hits.parquet +fi + +# create.sql CREATEs hits_external pointing at the parquet file, then +# INSERTs into the managed `hits` table — the ingested-to-Firebolt +# variant of the benchmark. 
+curl -sSf 'http://localhost:3473/?enable_multi_query_requests=true' \ + --data-binary 'DROP DATABASE IF EXISTS clickbench;CREATE DATABASE clickbench;' +curl -sSf 'http://localhost:3473/?database=clickbench&enable_multi_query_requests=true' \ + --data-binary @create.sql + +rm -f data/hits.parquet +sync diff --git a/firebolt/query b/firebolt/query new file mode 100755 index 0000000000..910591e6b8 --- /dev/null +++ b/firebolt/query @@ -0,0 +1,28 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it against the firebolt-core +# container via /?database=clickbench. +# Stdout: query result (firebolt's JSON_Compact format). +# Stderr: query runtime in fractional seconds on the last line, +# pulled from the response's `.statistics.elapsed`. +# Exit non-zero on error. +set -e + +query=$(cat) + +# Result + sub-result caches off so timings are real; output_format +# matches what firebolt's run.sh uses for the public benchmark. +PARAMS='database=clickbench&enable_result_cache=false&enable_subresult_cache=false&enable_scan_cache=false&output_format=JSON_Compact' + +resp=$(curl -sS --max-time 600 "http://localhost:3473/?${PARAMS}" \ + --data-binary "$query") + +# Firebolt returns a JSON object whether the query succeeded or not. +# A failed query has an "errors" key; a successful one carries +# "data" + "statistics". +if printf '%s' "$resp" | jq -e '.errors' >/dev/null 2>&1; then + printf '%s\n' "$resp" >&2 + exit 1 +fi + +printf '%s\n' "$resp" +printf '%s\n' "$resp" | jq -r '.statistics.elapsed' >&2 diff --git a/firebolt/run.sh b/firebolt/run.sh deleted file mode 100755 index 08bdfdbc18..0000000000 --- a/firebolt/run.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -if [ "$1" != "" ] && [ "$1" != "scan-cache" ]; then - echo "Error: command line argument must be one of {'', 'scan-cache'}" - exit 1 -fi - -SCAN_CACHE="false" -if [ "$1" == "scan-cache" ]; then - SCAN_CACHE="true" -fi - -# Disable the result and subresult caches. Enable the scan-cache. -QUERY_PARAMS="enable_result_cache=false&enable_subresult_cache=false&enable_scan_cache=${SCAN_CACHE}&output_format=JSON_Compact" - -cat queries.sql | while read -r query; do - # Firebolt is a database with local on-disk storage: drop the page cache before the first run of each query. - sync - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null - # Run the query three times. - # Extract the elapsed time from the response's statistics. - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo -n "[${ELAPSED}" - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo -n ",${ELAPSED}" - ELAPSED=$(curl -sS "http://localhost:3473/?database=clickbench&${QUERY_PARAMS}" --data-binary "$query" | jq '.statistics.elapsed') - echo ",${ELAPSED}]," -done diff --git a/firebolt/start b/firebolt/start new file mode 100755 index 0000000000..652146394e --- /dev/null +++ b/firebolt/start @@ -0,0 +1,67 @@ +#!/bin/bash +set -eu + +# Idempotent: if firebolt-core already answers SELECT 1, do nothing. +if curl -sS --max-time 5 'http://localhost:3473/' \ + --data-binary 'SELECT 1;' 2>/dev/null | grep -q '^1'; then + exit 0 +fi + +mkdir -p data fb-volume +# firebolt-core runs as UID/GID 1111 inside the container and refuses +# to start if its data dir is not writeable by that uid (the engine +# self-checks and aborts with "directory ... is not readable or +# writeable by the Firebolt Core process"). 
Set the host-side +# ownership accordingly so the bind-mounted dir is usable. +sudo chown 1111:1111 fb-volume + +# If the container exists (stopped from a prior agent pre-snapshot +# cycle), just start it back — the data lives on the bind-mounted +# fb-volume below, so the previously-created `clickbench` database +# is still there. Otherwise create the container fresh. +if sudo docker ps -a --format '{{.Names}}' | grep -qx firebolt-core; then + sudo docker start firebolt-core >/dev/null +else + # `firebolt-core` is the public self-hosted image. Container needs + # memlock 8 GiB and seccomp unconfined per upstream's run docs. + # /firebolt-core/clickbench: parquet source (read at load time). + # /firebolt-core/volume: engine data directory (must persist + # across the agent's pre-snapshot + # stop+start cycle or the snapshot + # ships an empty DB). + sudo docker run -dit --name firebolt-core \ + --ulimit memlock=8589934592:8589934592 \ + --security-opt seccomp=unconfined \ + -p 127.0.0.1:3473:3473 \ + -v "$(pwd)/fb-volume:/firebolt-core/volume" \ + -v "$(pwd)/data:/firebolt-core/clickbench" \ + ghcr.io/firebolt-db/firebolt-core:preview-rc >/dev/null +fi + +# Wait for the cluster to be "actually" ready. firebolt-core's HTTP +# port comes up immediately but returns +# {"errors":[{"description":"Cluster not yet healthy: ..."}]} +# at HTTP 200 until the engine threads have warmed; bench against a +# sentinel string instead of HTTP status to avoid that trap. +for _ in $(seq 1 600); do + if curl -sS --max-time 5 'http://localhost:3473/' \ + --data-binary "SELECT 'firebolt-ready';" 2>/dev/null \ + | grep -q 'firebolt-ready'; then + exit 0 + fi + sleep 1 +done +{ + echo "firebolt-core did not become healthy in 10 min" + echo "=== docker ps -a ===" + sudo docker ps -a 2>&1 + echo "=== docker inspect firebolt-core (state) ===" + sudo docker inspect firebolt-core --format '{{json .State}}' 2>&1 + echo "=== docker logs firebolt-core --tail 50 ===" + sudo docker logs firebolt-core --tail 50 2>&1 + echo "=== curl http://localhost:3473/ ===" + curl -sS --max-time 3 'http://localhost:3473/' --data-binary 'SELECT 1' 2>&1 + echo "=== ss listeners ===" + sudo ss -lntp 2>&1 | head -20 +} >&2 +exit 1 diff --git a/firebolt/stop b/firebolt/stop new file mode 100755 index 0000000000..ac1834f7d6 --- /dev/null +++ b/firebolt/stop @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# Plain stop — leave the container in place so its bind-mounted +# fb-volume keeps the loaded database for the next ./start. The +# container is removed and the volume re-initialised only on +# explicit re-provision. +sudo docker container stop firebolt-core >/dev/null 2>&1 || true diff --git a/kinetica/load b/kinetica/load index 523f581545..1097ee3866 100755 --- a/kinetica/load +++ b/kinetica/load @@ -9,11 +9,23 @@ CLI="./kisql --host localhost --user admin" # decompressed TSV. wget --continue --progress=dot:giga \ 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' -# Symlink rather than copy: hits.tsv.gz is 16 GB and we only read it once. -sudo ln -sf "$PWD/hits.tsv.gz" ./kinetica-persist/hits.tsv.gz +# Move (rename) rather than symlink: the kinetica daemon runs inside a +# docker container with ./kinetica-persist bind-mounted, so a symlink +# pointing at $PWD/hits.tsv.gz dangles inside the container and `LOAD +# INTO ... FROM FILE PATHS 'hits.tsv.gz'` returns +# Not_Found: No such file(s) (File(s):hits.tsv.gz) +# The persist dir and $PWD live on the same overlay filesystem, so +# mv is a rename — cheap. 
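+# (Hypothetical guard should the layout ever change: mv across +# filesystems silently degrades to a full 16 GB copy, so one could +# assert [ "$(stat -c %d .)" = "$(stat -c %d kinetica-persist)" ] +# before the move.)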
+sudo mv hits.tsv.gz ./kinetica-persist/ $CLI --file create.sql -$CLI --sql "ALTER TIER ram WITH OPTIONS ('capacity' = '27000000000');" +# Playground VMs have 16 GiB RAM total. The upstream 27 GB cap was +# sized for a host-mode benchmark machine; in the VM the RAM tier +# alone exceeds physical memory, kinetica's rank-1 worker gets +# OOM-killed mid-LOAD, and the load fails with +# [GPUdb]executeSql: Internal_Error: Rank 1 non-responsive +# Cap the RAM tier at 9 GB and rely on the on-disk tier for the rest. +$CLI --sql "ALTER TIER ram WITH OPTIONS ('capacity' = '9000000000');" $CLI --sql "load into hits from file paths 'hits.tsv.gz' format delimited text (INCLUDES HEADER=false, DELIMITER = '\t') WITH OPTIONS (NUM_TASKS_PER_RANK=16, ON ERROR=SKIP);" diff --git a/pandas/.preserve-state b/pandas/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/pandas/benchmark.sh b/pandas/benchmark.sh index 084c13353c..1369feb230 100755 --- a/pandas/benchmark.sh +++ b/pandas/benchmark.sh @@ -6,4 +6,9 @@ export BENCH_DURABLE=no # queries.sql holds those Python expressions, one per line, so the # default BENCH_QUERIES_FILE=queries.sql in lib/benchmark-common.sh # picks them up unchanged. +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/parseable/install b/parseable/install index 9fcb8ffa1d..2d61f7ead5 100755 --- a/parseable/install +++ b/parseable/install @@ -5,8 +5,15 @@ sudo apt-get update -y sudo apt-get install -y parallel pigz pv if [ ! -x ./parseable ]; then + # v2.5.12 has an Arrow type-inference bug under static-schema mode: + # bare JSON numbers get inferred as Float64, so every ingest of a + # row with an Int64-declared field returns 400 with "Fail to merge + # schema field 'X' because the from data_type = Float64 does not + # equal Int64". Net effect: 0 rows loaded, every query returns 0. + # v2.7.2 fixes the inference; verified locally end-to-end against + # the bundled static_schema.json and hits.json. wget --continue --progress=dot:giga \ - https://github.com/parseablehq/parseable/releases/download/v2.5.12/Parseable_OSS_x86_64-unknown-linux-gnu + https://github.com/parseablehq/parseable/releases/download/v2.7.2/Parseable_OSS_x86_64-unknown-linux-gnu mv Parseable_OSS_x86_64-unknown-linux-gnu parseable chmod +x parseable fi diff --git a/parseable/load b/parseable/load index 763fe364af..45a7fb541d 100755 --- a/parseable/load +++ b/parseable/load @@ -20,12 +20,25 @@ else pv -s "$FILE_SIZE" hits.json.gz | pigz -d > hits.json fi -# Create the stream first — ingest below needs it to exist. -curl --silent --location --request PUT 'http://localhost:8000/api/v1/logstream/hits' \ +# Create the stream first — ingest below needs it to exist. Loud +# error reporting on purpose: the previous --silent + ignored exit +# code masked a 400 here for the entire load (every /ingest then +# returned 400 because the stream didn't exist, and the only +# evidence was 100k+ curl 400 lines in the provision log). 
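+# (Design note: `curl -f` alone would exit non-zero on a 4xx but +# suppress the error body; capturing HTTP_CODE via -w while printing +# the body keeps both the failure and its reason in the log.)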
+echo "==> creating logstream hits" +resp=$(curl --silent --show-error --location --request PUT \ + -w '\nHTTP_CODE=%{http_code}\n' \ + 'http://localhost:8000/api/v1/logstream/hits' \ -H 'X-P-Static-Schema-Flag: true' \ -H 'Content-Type: application/json' \ -u "admin:admin" \ - --data-binary @static_schema.json >/dev/null + --data-binary @static_schema.json) || true +printf '%s\n' "$resp" +code=$(printf '%s' "$resp" | awk -F= '/^HTTP_CODE=/ {print $2}' | tail -1) +if [ "${code:-}" != "200" ] && [ "${code:-}" != "201" ]; then + echo "parseable logstream create failed (HTTP $code)" >&2 + exit 1 +fi # Wrap each block of LINES_PER_CHUNK NDJSON lines in [ ... ] and POST # directly to /api/v1/ingest. Inlined into parallel's command string diff --git a/pinot/.preserve-state b/pinot/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/pinot/benchmark.sh b/pinot/benchmark.sh index 1d14d0bb2c..2379871512 100755 --- a/pinot/benchmark.sh +++ b/pinot/benchmark.sh @@ -6,4 +6,9 @@ export BENCH_DURABLE=yes # inside one JVM and takes longer than the lib's 300 s default to be # query-ready on a cold instance. 900 s clears the observed cold start. export BENCH_CHECK_TIMEOUT=900 +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/playground/.gitignore b/playground/.gitignore new file mode 100644 index 0000000000..b6cf5f0391 --- /dev/null +++ b/playground/.gitignore @@ -0,0 +1,3 @@ +__pycache__/ +*.pyc +.env diff --git a/playground/INSTALL.md b/playground/INSTALL.md new file mode 100644 index 0000000000..867df08d14 --- /dev/null +++ b/playground/INSTALL.md @@ -0,0 +1,239 @@ +# ClickBench Playground — Installation + +End-to-end setup for a fresh Ubuntu 24.04 host. Everything lives under +`/opt/clickbench-playground/` once it's running. Total disk: ~7 TB at full +catalog (100 systems × multi-GB-per-system goldens, on btrfs with zstd +compression). + +## 0. Host prerequisites + +- Ubuntu 24.04 (noble), x86_64 +- `/dev/kvm` accessible (bare metal or a virt-enabled cloud instance — + `c6a.metal`, `m7i.metal-24xl`, `i4i.metal`, etc.) +- A dedicated block device for the playground state directory, plus + enough free space for ~7 TB of system goldens and ~200 GB of datasets. +- Outbound internet at install time (apt mirrors, GitHub releases, + Docker Hub, dataset downloads). +- Python 3.12+ on the host. + +## 1. Format the state volume (btrfs + transparent zstd) + +The playground depends on **reflink** (instant per-VM disk cloning) and +**transparent compression** (snapshots otherwise wouldn't fit). Btrfs gives +both. XFS works for reflink but lacks compression and fills the host at +~7 TB once all systems are provisioned. + +``` +sudo mkfs.btrfs -L cbplayground -f /dev/ +echo 'LABEL=cbplayground /opt/clickbench-playground btrfs \ + defaults,noatime,compress=zstd:1,nofail 0 2' | sudo tee -a /etc/fstab +sudo mkdir -p /opt/clickbench-playground +sudo mount /opt/clickbench-playground +``` + +## 2. Clone the repo + +``` +sudo apt-get update +sudo apt-get install -y git python3 python3-pip +cd /home/ubuntu +git clone https://github.com/ClickHouse/ClickBench +cd ClickBench +pip3 install --user -r playground/requirements.txt +``` + +## 3. 
Sudoers entry for the server + +The playground server runs as the unprivileged `ubuntu` user but needs to +call `sudo ip ...`, `sudo iptables ...`, `sudo mount`, `sudo cp`, `sudo +firecracker`, etc. Add a sudoers fragment so those calls don't prompt: + +``` +sudo tee /etc/sudoers.d/clickbench-playground >/dev/null <<'EOF' +ubuntu ALL=(root) NOPASSWD: /usr/sbin/ip, /usr/sbin/iptables, \ + /usr/bin/mount, /usr/bin/umount, /usr/bin/cp, /usr/bin/mv, \ + /usr/bin/chown, /usr/bin/chmod, /usr/bin/mkdir, /usr/bin/rm, \ + /usr/bin/dd, /usr/bin/truncate, /usr/sbin/mkfs.ext4, \ + /usr/sbin/losetup, /opt/clickbench-playground/bin/firecracker, \ + /opt/clickbench-playground/bin/jailer +EOF +sudo chmod 440 /etc/sudoers.d/clickbench-playground +``` + +Tighten the allowlist further if your security model demands it. + +## 4. Install Firecracker, kernel, host firewall, DNS, (optional) TLS + +``` +sudo playground/scripts/install-firecracker.sh +``` + +This script is idempotent. It: + +- Downloads `firecracker` + `jailer` (v1.13.1) into + `/opt/clickbench-playground/bin/`. +- Downloads the guest kernel (`vmlinux-6.1.141` from firecracker-ci) into + `/opt/clickbench-playground/kernel/vmlinux`. +- Sets `net.ipv4.conf.all.route_localnet=1` (needed by the SNI proxy + REDIRECT path). +- Installs `dnsmasq` and configures it as a UDP-only resolver on port 53 + for the per-VM TAPs. +- Sanity-checks that the state dir actually supports reflink. + +To enable TLS for the public API at the same time, set the domain first: + +``` +export PLAYGROUND_TLS_DOMAIN=clickbench-playground.example.com +export PLAYGROUND_TLS_EMAIL=ops@example.com # optional, defaults to ubuntu@$(hostname -d) +sudo -E playground/scripts/install-firecracker.sh +``` + +This invokes `certbot --standalone` to issue a cert, configures a deploy +hook so the `ssl-cert` group can read the renewed privkey, and adds the +operator user to the `ssl-cert` group. + +## 5. Download the datasets (~200 GB, slow) + +``` +playground/scripts/download-datasets.sh +``` + +Populates `/opt/clickbench-playground/datasets/` with: + +- `hits.parquet` — single-file parquet (~14 GB) +- `hits_partitioned/hits_0..99.parquet` — partitioned parquet +- `hits.tsv` — decompressed TSV (~75 GB) +- `hits.csv` — decompressed CSV (~75 GB) +- `hits.json` / `hits.json.gz` — JSON variants for parseable / + victorialogs + +The script uses `wget --continue` per format, so re-running picks up +where it left off. + +## 6. Build the read-only dataset image + +``` +playground/images/build-datasets-image.sh +``` + +rsyncs the `datasets/` directory into `datasets.ext4`, sized to fit, with +no journal and zero reserved blocks. This image is attached read-only to +every VM as `LABEL=cbdata`. + +## 7. Build the base rootfs + +``` +sudo playground/images/build-base-rootfs.sh +``` + +Starts from the official Ubuntu 24.04 cloud image and adds: + +- The in-VM agent at `/opt/clickbench-agent/agent.py` plus its systemd + unit. +- Forced iptables-legacy alternatives (Docker on the Firecracker kernel + needs them — `nf_tables` isn't compiled in). +- `/etc/docker/daemon.json` with `"iptables": false`, so Docker doesn't + try to manage the (missing) `raw` table. +- A preloaded kernel-module list (`overlay`, `br_netfilter`, `veth`, + `ip_tables`, `iptable_*`, `nf_conntrack`, `nf_nat`, `xt_MASQUERADE`, + `xt_conntrack`). +- `lib/download-hits-*` stubs that symlink from the read-only dataset + disk rather than `wget`ing from the public mirror. 
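+ +For illustration, such a stub might look like the following (hypothetical sketch; the real stubs are generated by `build-base-rootfs.sh`, and the in-VM mount point `/opt/clickbench/datasets_ro` is taken from the agent's defaults): + +``` +#!/bin/bash +# Hypothetical download-hits-parquet-single stub baked into the rootfs: +# link the file from the read-only dataset disk instead of fetching ~14 GB. +dest="${1:-data}" +mkdir -p "$dest" +ln -sf /opt/clickbench/datasets_ro/hits.parquet "$dest/hits.parquet" +```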
+ +Output: `/opt/clickbench-playground/base-rootfs.ext4` (a sparse 200 GB +ext4 image). + +## 8. ClickHouse Cloud credentials (request logging) + +The server appends every request and restart to a ClickHouse Cloud table. +Provide credentials either via `playground/.env`: + +``` +CLICKHOUSE_CLOUD_URL=https://your-host.clickhouse.cloud:8443 +CLICKHOUSE_CLOUD_USER=default +CLICKHOUSE_CLOUD_PASSWORD=... +CLICKHOUSE_CLOUD_DB=playground +``` + +…or by copying `playground/clickhouse.conf.example` to +`/opt/clickbench-playground/clickhouse.conf` and filling it in. + +If neither is configured, the server falls back to a local JSONL sink +under `/opt/clickbench-playground/logs/`. + +## 9. Start the server + +Foreground (for local development): + +``` +playground/scripts/run-server.sh +``` + +As a managed service (recommended for production): + +``` +sudo cp playground/clickbench-playground.service /etc/systemd/system/ +sudo systemctl daemon-reload +sudo systemctl enable --now clickbench-playground +``` + +The server listens on `:8000` (HTTP) and, if a TLS cert exists at +`/etc/letsencrypt/live/${PLAYGROUND_TLS_DOMAIN}/`, also on `:443`. + +## 10. Provision every system (long) + +The server doesn't auto-provision on first query — initial install/start/ +load/snapshot is opt-in. Kick the whole catalog: + +``` +playground/scripts/provision-all.sh +``` + +This walks every system in `/api/systems`, posts to +`/api/admin/provision/<system>`, and polls until each one is either +`snapshotted` or `down` with an error. Concurrency is bounded server-side +by `PLAYGROUND_PROVISION_CONCURRENCY` (default 32) and +`PLAYGROUND_BUILD_CONCURRENCY` (default 6). Expect 1–6 hours of wall +time depending on host throughput and Docker Hub rate-limit luck. + +Status: `curl http://localhost:8000/api/state | jq`. +Per-system log: `/opt/clickbench-playground/logs/provision-<system>.log`. + +## 11. (Optional) Tune concurrency / monitor thresholds + +Environment variables read at server startup: + +| Var | Default | What it does | |----------------------------------|--------------------------------------|-----------------------------------------------------------| | `PLAYGROUND_STATE_DIR` | `/opt/clickbench-playground` | host state root | | `PLAYGROUND_LISTEN` | `0.0.0.0:8000` | HTTP listener | | `PLAYGROUND_TLS_CERT/_KEY` | `/etc/letsencrypt/live/$DOMAIN/...` | TLS | | `PLAYGROUND_BUILD_CONCURRENCY` | 6 | parallel per-system rootfs builds | | `PLAYGROUND_PROVISION_CONCURRENCY` | 32 | parallel VM provisions | | `PLAYGROUND_SNAPSHOT_CONCURRENCY` | 6 | parallel Firecracker snapshot saves | | `CLICKBENCH_OUTPUT_LIMIT` | 262144 | per-query response cap (bytes) enforced inside the agent | | `VM_CPU_BUSY_THRESHOLD` | 0.97 | monitor: kill request-idle VMs whose CPU stays above this | | `VM_DISK_FULL_PCT` | 0.97 | monitor: kill VMs whose sysdisk passes this | + +## Smoke-testing a single system + +``` +playground/scripts/smoke-boot.sh clickhouse +``` + +Boots one system end-to-end (provision → snapshot → restore → /query), +prints timing, tears down. Use this to validate any change to +`base-rootfs.ext4` or the agent before re-kicking the full catalog. + +## Re-provisioning after agent or base-image changes + +`vm_manager` rebuilds the per-system rootfs+sysdisk automatically when +`base-rootfs.ext4` is newer than the existing `rootfs.ext4`.
So after +changing `playground/agent/agent.py` or anything baked into the base: + +``` +sudo playground/images/build-base-rootfs.sh # rebuild base +curl -X POST http://localhost:8000/api/admin/provision/<system> # re-kick +``` + +The new agent and new per-system scripts both land in the next provision. diff --git a/playground/README.md b/playground/README.md new file mode 100644 index 0000000000..95b0a85a6d --- /dev/null +++ b/playground/README.md @@ -0,0 +1,101 @@ +# ClickBench Playground + +A self-service playground that lets visitors run arbitrary SQL against any of the +80+ database systems documented in ClickBench, isolated inside a Firecracker +microVM per system. + +## How it works + +1. The dataset (hits, in all formats ClickBench uses) is downloaded once into a + single directory on the host and exposed read-only to every VM as a virtio-blk + device. +2. For each system, a Firecracker microVM is launched once with internet access + to run the system's `install`, `start`, and `load` scripts. +3. A snapshot (memory + disk) is taken and persisted. Subsequent restorations + run without internet — the only path in or out is the host↔VM control link. +4. A small in-VM **agent** exposes `POST /query` over HTTP. The host **API + server** proxies user queries to the agent, returns the raw output as + `application/octet-stream`, and puts the timing into response headers. +5. A **monitor** loop watches per-VM CPU/disk/memory and host totals, killing + misbehaving or oversized VMs. +6. Every request and every restart is appended to a ClickHouse Cloud table. + +## Layout + +``` +playground/ +├── server/ # aiohttp API server, VM manager, monitor, logging sink +├── agent/ # In-VM HTTP agent (runs as systemd unit inside each VM) +├── images/ # Scripts that build the base rootfs + per-system overlays +├── web/ # Vanilla-JS single-page app +├── scripts/ # Host-side install / dataset / network helpers +└── docs/ # Design notes +``` + +Host state lives under `/opt/clickbench-playground/`: + +``` +/opt/clickbench-playground/ +├── bin/ firecracker, jailer +├── kernel/vmlinux guest kernel +├── base-rootfs.ext4 pristine Ubuntu 24.04 rootfs (built once) +├── datasets/ hits.parquet, hits_*.parquet, hits.tsv, hits.csv +├── datasets.ext4 read-only image of datasets/ (attached to every VM) +├── systems/<system>/ per-system rootfs, snapshot, sockets, logs +├── vms/<system>.sock Firecracker API socket +└── logs/ local JSONL fallback when ClickHouse Cloud is off +``` + +## Networking + +Each VM gets its own `/30` subnet on a dedicated TAP: + +| Side | Address | Notes | |------|------------------|--------------------------------| | Host | `10.200.<id>.1` | TAP device `fc-tap-<id>` | | VM | `10.200.<id>.2` | reachable from host always | + +During the install phase, `iptables FORWARD` + MASQUERADE are enabled for the +TAP so the VM can `apt-get`/`curl`/etc. After the snapshot is taken, the +forwarding rules are removed; the host↔VM link still works but external traffic +is blackholed. + +## Configuration + +Environment variables (read by `server/config.py`): + +| Var | Purpose | |--------------------------------|-----------------------------------------------| | `CLICKHOUSE_CLOUD_URL` | HTTPS URL of CH Cloud (e.g.
`https://x.clickhouse.cloud:8443`) | | `CLICKHOUSE_CLOUD_USER` | username | | `CLICKHOUSE_CLOUD_PASSWORD` | password | | `PLAYGROUND_STATE_DIR` | defaults to `/opt/clickbench-playground` | | `PLAYGROUND_LISTEN` | defaults to `0.0.0.0:8000` | | `CLICKBENCH_OUTPUT_LIMIT` | per-query response body cap in bytes, enforced inside the in-VM agent (default 262144 = 256 KB) | + +## Lifecycle of a request + +``` +client ──HTTP──▶ api/query?system=clickhouse + │ + ▼ + vm_manager.ensure_ready("clickhouse") + ├─ already running and /health OK ──▶ proceed + ├─ not running ──▶ restore from snapshot + └─ unresponsive ──▶ kill, restore, retry once + │ + ▼ + agent ◀── POST /query ── body=SQL + agent runs ./query, captures stdout/stderr, returns: + Content-Type: application/octet-stream + X-Query-Time: 0.234 + X-Output-Truncated: 0|1 + X-Output-Bytes: 8042 + body: (up to 256 KB of raw output) + │ + ▼ + logger.write_request(...) + │ + ▼ + client +``` diff --git a/chdb-dataframe/.preserve-state b/playground/__init__.py similarity index 100% rename from chdb-dataframe/.preserve-state rename to playground/__init__.py diff --git a/playground/agent/agent.py b/playground/agent/agent.py new file mode 100644 index 0000000000..9f9a5b1b39 --- /dev/null +++ b/playground/agent/agent.py @@ -0,0 +1,960 @@ +#!/usr/bin/env python3 +""" +ClickBench in-VM agent. + +Runs inside the Firecracker microVM. Exposes a tiny HTTP API that the host +server hits to: + + GET /health quick liveness probe; cheap + GET /stats CPU/mem/disk snapshot + POST /provision run install -> start -> load for the bundled system + (only called once, before the host snapshots the VM) + POST /query read SQL from request body, exec ./query, return + output as application/octet-stream + timing headers + +The system's ClickBench scripts (install/start/load/query/check/stop/...) are +mounted at /opt/clickbench/system, with the system name in +/etc/clickbench-system. The dataset is mounted read-only at +/opt/clickbench/datasets_ro. + +Listens on 0.0.0.0:50080 by default (deliberately not 8080 — that port +is claimed by cockroach, spark UI, trino, presto, druid, and a long +tail of other JVM web consoles in the catalog). + +Stdlib-only — the rootfs ships python3 from the Ubuntu base; no pip needed. +""" + +from __future__ import annotations + +import contextlib +import http.server +import json +import os +import re +import shutil +import signal +import socket +import socketserver +import subprocess +import sys +import threading +import time +import urllib.parse +from pathlib import Path + +SYSTEM_DIR = Path(os.environ.get("CLICKBENCH_SYSTEM_DIR", "/opt/clickbench/system")) +DATASETS_DIR = Path(os.environ.get("CLICKBENCH_DATASETS_DIR", "/opt/clickbench/datasets_ro")) +STATE_DIR = Path(os.environ.get("CLICKBENCH_AGENT_STATE", "/var/lib/clickbench-agent")) +SYSTEM_NAME = ( + os.environ.get("CLICKBENCH_SYSTEM_NAME") + or (Path("/etc/clickbench-system").read_text().strip() + if Path("/etc/clickbench-system").exists() else SYSTEM_DIR.name) +) +# Port 8080 is wildly oversubscribed in this catalog (cockroach, spark UI, +# trino, presto, hive, druid, ...). Pick a port nothing realistic is going +# to want — IANA's user range tops out at 49151, and we want to stay above +# any well-known ephemeral range too. 50080 keeps a vague "HTTP-ish" feel. +LISTEN_PORT = int(os.environ.get("CLICKBENCH_AGENT_PORT", "50080")) +# 256 KB default cap, matching the CLICKBENCH_OUTPUT_LIMIT default documented +# in README and INSTALL. Configurable for testing.
+
+
+def _cap(b: bytes) -> tuple[bytes, bool]:
+    """Truncate to OUTPUT_LIMIT bytes; return (body, was_truncated)."""
+    if len(b) <= OUTPUT_LIMIT:
+        return b, False
+    return b[:OUTPUT_LIMIT], True
+
+
+def _read_body(handler: http.server.BaseHTTPRequestHandler) -> bytes:
+    n = int(handler.headers.get("Content-Length") or 0)
+    if n <= 0:
+        return b""
+    # Cap inbound bodies at 1 MB; queries are SQL, not bulk data.
+    return handler.rfile.read(min(n, 1 << 20))
+
+
+def _system_script(name: str) -> Path:
+    """Return path to a script in the system dir, or raise if missing/not executable."""
+    p = SYSTEM_DIR / name
+    if not p.exists():
+        raise FileNotFoundError(f"missing system script: {p}")
+    if not os.access(p, os.X_OK):
+        raise PermissionError(f"system script not executable: {p}")
+    return p
+
+
+def _read_proc_stat() -> tuple[int, int]:
+    """Return (total_jiffies, idle_jiffies) from /proc/stat."""
+    with open("/proc/stat") as f:
+        parts = f.readline().split()
+    nums = list(map(int, parts[1:]))
+    total = sum(nums)
+    idle = nums[3] + (nums[4] if len(nums) > 4 else 0)
+    return total, idle
+
+
+def _stats_snapshot() -> dict:
+    out: dict = {"system": SYSTEM_NAME, "ts": time.time()}
+    try:
+        out["loadavg"] = list(map(float, Path("/proc/loadavg").read_text().split()[:3]))
+    except Exception:
+        pass
+    try:
+        info = {k: v for k, v in (
+            l.split(":", 1) for l in Path("/proc/meminfo").read_text().splitlines() if ":" in l
+        )}
+        out["mem_total_kb"] = int(info.get("MemTotal", "0 kB").split()[0])
+        out["mem_avail_kb"] = int(info.get("MemAvailable", "0 kB").split()[0])
+    except Exception:
+        pass
+    try:
+        st = shutil.disk_usage("/")
+        out["disk_total"] = st.total
+        out["disk_free"] = st.free
+    except Exception:
+        pass
+    try:
+        t1, i1 = _read_proc_stat()
+        time.sleep(0.05)
+        t2, i2 = _read_proc_stat()
+        total = max(1, t2 - t1)
+        out["cpu_busy"] = 1.0 - (i2 - i1) / total
+    except Exception:
+        pass
+    out["provisioned"] = PROVISION_DONE.exists()
+    return out
+
+
+def _ensure_daemon_started() -> None:
+    """Bring the system's daemon up if it isn't already.
+
+    Called at the top of every /query handler. Normally the snapshot
+    ships a *running* daemon (provision restarts it just before the
+    host snapshots), so this is a cheap no-op. The real work happens
+    on a cold boot, or when the pre-snapshot ./start never passed
+    ./check — then nothing is listening on the daemon's port until we
+    explicitly run ./start. Subsequent calls are no-ops because
+    _daemon_started is set.
+
+    Wrapping ./start in a thread lock means only one /query in flight
+    pays the start cost, even if several arrive concurrently.
+    """
+    if _daemon_started.is_set():
+        return
+    with _daemon_lock:
+        if _daemon_started.is_set():
+            return
+        start = SYSTEM_DIR / "start"
+        if not start.exists() or not os.access(start, os.X_OK):
+            # No daemon to start (in-process system like chdb/polars).
+            _daemon_started.set()
+            return
+        subprocess.run([str(start)], cwd=str(SYSTEM_DIR),
+                       stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                       timeout=PROVISION_TIMEOUT, check=False)
+        # Wait for ./check to confirm before unblocking the /query.
+        # 10 min covers cold-starting Druid + the other JVM stacks
+        # we ship (Doris, Pinot, Trino). On a fast daemon this loop
+        # exits in well under a second.
+        check = SYSTEM_DIR / "check"
+        if check.exists():
+            for _ in range(1200):
+                rc = subprocess.run([str(check)], cwd=str(SYSTEM_DIR),
+                                    stdout=subprocess.DEVNULL,
+                                    stderr=subprocess.DEVNULL,
+                                    timeout=10, check=False).returncode
+                if rc == 0:
+                    break
+                time.sleep(0.5)
+        _daemon_started.set()
+
+
+def _run_query(sql: bytes) -> tuple[int, bytes, bytes, float]:
+    """
+    Invoke ./query with the SQL on stdin.
+ The query script's contract per lib/benchmark-common.sh: + stdout: result (whatever format the system uses) + stderr: timing in fractional seconds on the LAST numeric line + exit code: 0 on success + + Stops reading stdout once we've buffered OUTPUT_LIMIT+1 bytes (one + extra so _cap can detect the overflow) and kills the process group — + "SELECT * FROM hits" generates ~14 GB of output and we don't want + the agent to spin buffering it. Stderr is read on a background + thread so a chatty stderr can't deadlock the stdout pipe. + """ + import select + import threading + script = _system_script("query") + t0 = time.monotonic() + deadline = t0 + QUERY_TIMEOUT + cap = OUTPUT_LIMIT + 1 # +1 byte so _cap() can detect overflow + stdout_buf = bytearray() + stderr_buf = bytearray() + try: + p = subprocess.Popen( + [str(script)], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=str(SYSTEM_DIR), + preexec_fn=os.setsid, + ) + except Exception as e: + return 255, b"", f"agent: failed to invoke ./query: {e}\n".encode(), 0.0 + + def _drain_stderr() -> None: + for chunk in iter(lambda: p.stderr.read(8192), b""): + stderr_buf.extend(chunk) + err_thread = threading.Thread(target=_drain_stderr, daemon=True) + err_thread.start() + + try: + if sql: + p.stdin.write(sql) + p.stdin.close() + except BrokenPipeError: + pass + + killed_for = "" # "timeout", "cap", or "" + while True: + remaining = deadline - time.monotonic() + if remaining <= 0: + killed_for = "timeout" + break + if len(stdout_buf) >= cap: + killed_for = "cap" + break + r, _, _ = select.select([p.stdout], [], [], min(remaining, 0.5)) + if r: + chunk = p.stdout.read1(min(8192, cap - len(stdout_buf))) + if not chunk: + break # EOF + stdout_buf.extend(chunk) + elif p.poll() is not None: + break + + if killed_for: + with contextlib.suppress(ProcessLookupError): + os.killpg(p.pid, signal.SIGKILL) + + try: + rc = p.wait(timeout=5) + except subprocess.TimeoutExpired: + with contextlib.suppress(ProcessLookupError): + os.killpg(p.pid, signal.SIGKILL) + rc = -9 + + if killed_for == "timeout": + rc = -9 + err_thread.join(timeout=2) + with contextlib.suppress(Exception): + p.stdout.close() + with contextlib.suppress(Exception): + p.stderr.close() + return rc, bytes(stdout_buf), bytes(stderr_buf), time.monotonic() - t0 + + +def _recent_oom_messages() -> str: + """Return kernel OOM-killer messages from `dmesg`, or '' if nothing + relevant. Called when the query script exits non-zero with empty + stdout AND stderr — the daemon was likely OOM-killed and never + got a chance to write a real error message. + """ + try: + out = subprocess.run( + ["dmesg", "--ctime"], + capture_output=True, timeout=5, check=False, + ).stdout.decode(errors="replace") + except Exception: + return "" + needles = ("killed process", "out of memory", "oom-killer", + "invoked oom-killer") + lines = [ln for ln in out.splitlines() + if any(n in ln.lower() for n in needles)] + return "\n".join(lines[-20:]) + + +def _extract_script_timing(stderr: bytes) -> float | None: + """ + Pull fractional-seconds timing from the last numeric line of stderr, + matching the lib/benchmark-common.sh tail -n1 logic. + """ + # Handle the spark/pyspark carriage-return progress-bar case. 
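+    # e.g. stderr of b"[Stage 7] 50%\r100%\n0.234\n" becomes three lines
+    # ("[Stage 7] 50%", "100%", "0.234") and yields 0.234.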
+ text = stderr.decode("utf-8", errors="replace").replace("\r", "\n") + last = None + for line in text.splitlines(): + s = line.strip() + if not s: + continue + try: + v = float(s) + except ValueError: + continue + last = v + return last + + +def _provision() -> tuple[int, bytes]: + """ + Run install -> start -> wait-for-check -> load. Capture everything to + PROVISION_LOG. Idempotent: subsequent calls succeed-fast if PROVISION_DONE + is present. + """ + if PROVISION_DONE.exists(): + return 0, b"already provisioned\n" + + with _provision_lock: + if PROVISION_DONE.exists(): + return 0, b"already provisioned\n" + + # Use the same /lib/benchmark-common.sh helpers if they're around. But + # since this is the playground, we want a *minimal* version: install, + # start, wait for check, load, sync. No cold-cycle restart, no + # concurrent-QPS test, no query loop. + steps: list[tuple[str, list[str]]] = [ + ("install", [str(_system_script("install"))]), + ("start", [str(_system_script("start"))]), + ] + + log_lines: list[bytes] = [] + for name, cmd in steps: + t0 = time.monotonic() + log_lines.append(f"\n=== {name} ===\n".encode()) + r = subprocess.run( + cmd, cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + timeout=PROVISION_TIMEOUT, + ) + dt = time.monotonic() - t0 + log_lines.append(r.stdout or b"") + log_lines.append(f"=== {name} done rc={r.returncode} in {dt:.1f}s ===\n".encode()) + if r.returncode != 0: + PROVISION_LOG.write_bytes(b"".join(log_lines)) + return r.returncode, b"".join(log_lines) + + # Wait for ./check to succeed. Per-system override via + # BENCH_CHECK_TIMEOUT in benchmark.sh (same surface as the + # standalone bench driver); default 900 s, which covers + # Druid / Pinot / similar JVM-stack engines that need 5-10 min + # for Zookeeper / Coordinator / Broker / Historical to come up + # in sequence. Trino on a cold sysdisk has been observed + # pushing past 900 s, hence the override hook. + check = SYSTEM_DIR / "check" + check_budget = 900 + override = _bench_var("BENCH_CHECK_TIMEOUT") + if override.isdigit(): + check_budget = max(check_budget, int(override)) + ok = False + t0 = time.monotonic() + last_check: subprocess.CompletedProcess | None = None + while time.monotonic() - t0 < check_budget: + last_check = subprocess.run( + [str(check)], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + ) + if last_check.returncode == 0: + ok = True + break + time.sleep(1) + if not ok: + log_lines.append( + f"\n=== check did not succeed within {check_budget}s ===\n".encode()) + if last_check is not None: + log_lines.append(last_check.stderr or b"") + PROVISION_LOG.write_bytes(b"".join(log_lines)) + return 1, b"".join(log_lines) + log_lines.append(b"\n=== check ok ===\n") + + # Most datasets surface in cwd already: cwd is the overlay merged + # dir /opt/clickbench/system and the dataset disk's contents (the + # overlay's lower) sit at /opt/clickbench/datasets_ro at the + # filesystem root, so hits.parquet / hits.tsv / hits.csv are + # named exactly as the load scripts expect. + # + # Partitioned parquet is the exception: the upstream layout puts + # the 100 hits_N.parquet files under hits_partitioned/, and load + # scripts glob `hits_*.parquet` from cwd, not from a subdir. + # Materialize symlinks at cwd so the glob resolves. We do this in + # the agent rather than per-system to avoid 6+ systems each + # reimplementing the same staging step (which historically rotted + # — ClickBench upstream centralised this in lib/download-hits-*). 
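+        # e.g. /opt/clickbench/datasets_ro/hits_partitioned/hits_42.parquet
+        # gains a symlink /opt/clickbench/system/hits_42.parquet next to
+        # the scripts, so a `hits_*.parquet` glob from cwd finds it.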
+ hits_partitioned = DATASETS_DIR / "hits_partitioned" + if hits_partitioned.is_dir(): + for src in hits_partitioned.glob("hits_*.parquet"): + dst = SYSTEM_DIR / src.name + if not dst.exists(): + os.symlink(src, dst) + + # Run load. + t0 = time.monotonic() + log_lines.append(b"\n=== load ===\n") + r = subprocess.run( + [str(_system_script("load"))], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + timeout=PROVISION_TIMEOUT, + ) + dt = time.monotonic() - t0 + log_lines.append(r.stdout or b"") + log_lines.append(f"=== load done rc={r.returncode} in {dt:.1f}s ===\n".encode()) + if r.returncode != 0: + PROVISION_LOG.write_bytes(b"".join(log_lines)) + return r.returncode, b"".join(log_lines) + + # Pre-snapshot housekeeping. Order: + # 1) ./stop — drop the daemon's heap (merge arenas, query cache, + # mark cache, parquet ingest buffers, ...) so we can fstrim + # and drop_caches against a quiet system. + # 2) sync + drop_caches — flush dirty pages, evict the page + # cache, so init_on_free=1 zeroes everything that was + # cache. Snapshot then sees a mostly-zero free pool. + # 3) fstrim — DISCARD free blocks on the per-VM disks so the + # sparse backing file punches holes for bytes the load + # script `mv`'d in and `rm`'d (14-75 GB of dataset). + # 4) ./start + ./check — bring the daemon back up *into* the + # snapshot. Restore then resumes a daemon that's already + # serving, paying zero cold-start cost. + # Skip stop/start for systems without a real daemon (chdb, + # polars, duckdb): they're in-process tools with no separate + # process to manage. + # + # Also skip for daemons whose data lives only in their own + # process address space (daft, pandas, chdb-dataframe, ...). + # The default stop/restart wipes the loaded DataFrame and the + # restored snapshot serves queries against a daemon whose + # `hits = None`. A marker file in the system dir opts out. + stop = SYSTEM_DIR / "stop" + start = SYSTEM_DIR / "start" + check = SYSTEM_DIR / "check" + preserve_state = _bench_var("PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT") == "yes" + has_daemon = (stop.exists() and start.exists() and + check.exists() and os.access(stop, os.X_OK) and + os.access(start, os.X_OK) and + not preserve_state) + if preserve_state: + # The daemon is already running with state we want to keep + # (loaded DataFrame), so we don't restart it. The snapshot + # ships it as-is — mark /ready before snapshot so the host + # doesn't wait the full 600 s after restore. + _daemon_started.set() + if has_daemon: + log_lines.append(b"\n=== pre-snapshot stop ===\n") + r = subprocess.run([str(stop)], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + timeout=120, check=False) + log_lines.append(b"stop: rc=" + str(r.returncode).encode() + b"\n") + log_lines.append(r.stdout or b"") + # Wait for the daemon to actually exit (./check failing means + # it's gone). Tolerant if it never fails — we still proceed. + for _ in range(120): + rc = subprocess.run([str(check)], cwd=str(SYSTEM_DIR), + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=10, check=False).returncode + if rc != 0: + break + time.sleep(0.5) + log_lines.append(b"=== pre-snapshot stop done ===\n") + + # Drop the page+dentry+inode cache. With init_on_free=1 set in + # the guest kernel cmdline (see vm_manager._kernel_cmdline), every + # page the kernel frees gets zero-filled before going back on the + # free list, so what we snapshot is mostly-zero. 
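+        # ("3" frees the pagecache plus reclaimable slab (dentries and
+        # inodes); only clean pages can be dropped, hence the sync first.)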
+ subprocess.run(["sync"], check=False) + try: + Path("/proc/sys/vm/drop_caches").write_text("3\n") + except Exception: + pass + + # fstrim the per-VM disks so transient dataset bytes from + # `mv hits.parquet ... ; rm` don't end up in the golden disk. + for mnt in ("/opt/clickbench/sysdisk", "/"): + subprocess.run(["fstrim", mnt], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + timeout=300, check=False) + + # Restart the daemon so the snapshot captures it *running*. The + # restored VM then doesn't pay any cold-start cost; the daemon's + # process state, JIT/class-cache, connection pools, etc. all + # come back live. + if has_daemon: + log_lines.append(b"\n=== pre-snapshot start ===\n") + r = subprocess.run([str(start)], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + timeout=PROVISION_TIMEOUT, check=False) + log_lines.append(r.stdout or b"") + log_lines.append(b"start: rc=" + str(r.returncode).encode() + b"\n") + # Wait for ./check before snapshotting — we want the daemon + # actually accepting queries when the memory image is captured. + ok = False + t0 = time.monotonic() + while time.monotonic() - t0 < 900: + rc = subprocess.run([str(check)], cwd=str(SYSTEM_DIR), + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=10, check=False).returncode + if rc == 0: + ok = True + break + time.sleep(0.5) + if ok: + log_lines.append(b"=== pre-snapshot start ok ===\n") + _daemon_started.set() # the snapshot ships a running daemon + else: + log_lines.append(b"=== pre-snapshot start: check did not " + b"succeed in 900 s; snapshot will need a " + b"cold start on restore ===\n") + # Sync once more so any data the just-started daemon wrote + # (lock files, sockets, recovery markers) is on disk before + # the host snapshots the rootfs/sysdisk. + subprocess.run(["sync"], check=False) + + PROVISION_DONE.write_text(f"ok {time.time()}\n") + PROVISION_LOG.write_bytes(b"".join(log_lines)) + return 0, b"".join(log_lines) + + +class Handler(http.server.BaseHTTPRequestHandler): + server_version = "clickbench-agent/0.1" + + def log_message(self, fmt: str, *args) -> None: + sys.stderr.write("[agent] " + (fmt % args) + "\n") + + def _send(self, code: int, body: bytes, headers: dict | None = None) -> None: + self.send_response(code) + self.send_header("Content-Length", str(len(body))) + self.send_header("Content-Type", (headers or {}).pop("Content-Type", "application/json")) + for k, v in (headers or {}).items(): + self.send_header(k, v) + self.end_headers() + self.wfile.write(body) + + def _send_json(self, code: int, obj) -> None: + self._send(code, json.dumps(obj, default=str).encode() + b"\n", + {"Content-Type": "application/json"}) + + def do_GET(self) -> None: + if self.path == "/health": + self._send_json(200, {"ok": True, "system": SYSTEM_NAME, + "provisioned": PROVISION_DONE.exists()}) + return + if self.path == "/ready": + # True when the system's daemon is fully accepting queries. + # The host uses this at restore time to gate VM-state="ready" + # for slow daemons (Doris, Druid, Trino, etc.); without it + # the first user query arrives mid-start and times out. + # + # Check btime here too. The Python process state — including + # _daemon_started — survives a snapshot restore, so without + # this call /ready would happily report ready=true throughout + # a 5–10 minute post-restore daemon-rebuild window. 
+ _maybe_reconcile_for_restore() + ready = _daemon_started.is_set() + self._send_json(200 if ready else 503, + {"ready": ready, "system": SYSTEM_NAME}) + return + if self.path == "/check": + # Run the system's ./check script. 200 = daemon responds, + # 503 = it does not. The host calls this after a failed + # /query to decide whether to teardown the VM. + check = SYSTEM_DIR / "check" + if not check.exists() or not os.access(check, os.X_OK): + # No check script (in-process systems like chdb/duckdb). + # Treat as healthy — there's no separate daemon to fail. + self._send_json(200, {"ok": True, "no_check": True}) + return + rc = subprocess.run( + [str(check)], cwd=str(SYSTEM_DIR), + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=10, check=False, + ).returncode + self._send_json(200 if rc == 0 else 503, + {"ok": rc == 0, "rc": rc}) + return + if self.path == "/stats": + self._send_json(200, _stats_snapshot()) + return + if self.path == "/provision-log": + data = PROVISION_LOG.read_bytes() if PROVISION_LOG.exists() else b"" + self._send(200, data, {"Content-Type": "text/plain; charset=utf-8"}) + return + self._send_json(404, {"error": "not found", "path": self.path}) + + def do_POST(self) -> None: + if self.path == "/sync": + # Flush all dirty pages to the virtio-blk devices. The host + # calls this immediately before /snapshot/create so the + # on-disk image captured in the snapshot is consistent with + # what the in-memory page cache thinks is there. Without + # this, a long-running daemon's writeback may still be in + # flight when KVM pauses the vcpus, the snapshot freezes a + # mid-flush state, and post-restore reads see torn or + # checksum-mismatched data. + t0 = time.monotonic() + subprocess.run(["sync"], check=False) + self._send(200, f"{time.monotonic() - t0:.3f}\n".encode(), + {"Content-Type": "text/plain"}) + return + if self.path == "/provision": + rc, log = _provision() + self._send(200 if rc == 0 else 500, log[-OUTPUT_LIMIT:], + {"Content-Type": "text/plain; charset=utf-8", + "X-Provision-Status": "ok" if rc == 0 else f"err-{rc}"}) + return + if self.path == "/query": + if not PROVISION_DONE.exists(): + self._send_json(409, {"error": "not provisioned"}) + return + sql = _read_body(self) + if not sql.strip(): + self._send_json(400, {"error": "empty query"}) + return + # If /proc/stat's btime has shifted since the last call + # the VM was snapshot-restored and any docker daemon needs + # to be reconciled before we run the query script. + _maybe_reconcile_for_restore() + # First /query after a snapshot restore: start the daemon + # (it was stopped pre-snapshot to keep snapshots small). + # Subsequent calls are a near-instant no-op. + _ensure_daemon_started() + rc, out, err, wall = _run_query(sql) + script_t = _extract_script_timing(err) + body, truncated = _cap(out) + headers = { + "Content-Type": "application/octet-stream", + "X-Query-Wall-Time": f"{wall:.6f}", + "X-Output-Bytes": str(len(out)), + "X-Output-Truncated": "1" if truncated else "0", + "X-Exit-Code": str(rc), + "X-System": SYSTEM_NAME, + } + if script_t is not None: + headers["X-Query-Time"] = f"{script_t:.6f}" + # When _cap truncated the output the script was almost + # certainly killed mid-write — its rc is non-zero (SIGPIPE + # / SIGKILL) and stderr is full of "broken pipe"-style + # noise. That's not a real query failure, so don't surface + # it as an error: return 200 and let X-Output-Truncated=1 + # tell the UI to label the result accordingly. 
+ if rc != 0 and not truncated: + # Surface a snippet of stderr so the client sees *something*. + err_snip = err[-1024:].decode("utf-8", errors="replace") + # Both stdout and stderr empty usually means the + # daemon was OOM-killed mid-query. Pull the recent + # OOM-killer lines from dmesg so the UI shows a real + # cause instead of a blank error. + if not body.strip() and not err_snip.strip(): + oom = _recent_oom_messages() + if oom: + err_snip = "kernel OOM-killer:\n" + oom + # HTTP headers can't carry raw newlines, so URL-encode + # the (truncated) snippet. The UI decodes via + # decodeURIComponent so real \n survives end-to-end. + headers["X-Error"] = urllib.parse.quote(err_snip[-512:]) + self._send(200 if (rc == 0 or truncated) else 502, body, headers) + return + self._send_json(404, {"error": "not found", "path": self.path}) + + +class ReusableServer(socketserver.ThreadingTCPServer): + allow_reuse_address = True + daemon_threads = True + + +_last_seen_btime: int | None = None + + +def _proc_btime() -> int | None: + """Read /proc/stat btime (the Unix timestamp of the kernel's last + boot). Shifts on snapshot/restore because uptime is preserved + while wall-clock advances, so we use it to detect restores from + inside the (restored) agent process.""" + try: + for line in Path("/proc/stat").read_text().splitlines(): + if line.startswith("btime "): + return int(line.split()[1]) + except Exception: + return None + return None + + +def _maybe_reconcile_for_restore() -> None: + """If /proc/stat btime has shifted since the last call, the VM was + snapshot-restored. Reconcile docker (the kernel-side cgroup / + netfilter state diverged from dockerd's restored view of it) and + clear _daemon_started so /ready reflects the truth: the daemon may + be technically running post-restore but is often broken (Druid's + SQL endpoint, byconity's compose stack, ...), and we need to + rebuild it before serving queries. + + Idempotent: subsequent calls see the same btime and return cheaply. + + Called from BOTH the /ready handler and the btime-watcher thread + so the readiness probe accurately returns 503 throughout the + post-restore rebuild — without this, the agent's Python-process + state (including _daemon_started) survives the snapshot, /ready + returns 200 immediately, and the host sends /query right into the + middle of a 5–10 minute daemon recovery.""" + global _last_seen_btime + cur = _proc_btime() + if cur is None: + return + if _last_seen_btime is None: + _last_seen_btime = cur + return + if cur != _last_seen_btime: + sys.stderr.write( + f"[agent] btime shifted " + f"{_last_seen_btime} -> {cur}; reconciling docker\n") + _last_seen_btime = cur + _reconcile_docker_after_restore() + # docker daemon restart kills containers that aren't pinned + # via `restart: unless-stopped`; for compose-based systems + # like byconity that means the worker is dead until we re-run + # ./start. Clear the daemon-started gate so the very next + # _ensure_daemon_started() call brings the stack back up. + _daemon_started.clear() + # Some systems' ./start scripts short-circuit on a shallow + # health probe (e.g. byconity checks `SELECT 1` against the + # local server; quickwit checks `docker ps` for the container) + # and never touch the broken cluster-internal connections that + # firecracker's frozen-time snapshot stranded. For those, + # PLAYGROUND_RESTART_AFTER_RESTORE_SNAPSHOT=yes in benchmark.sh + # opts the system into a forced ./stop before ./start so the + # next bring-up is from a clean state. 
+ if _bench_var("PLAYGROUND_RESTART_AFTER_RESTORE_SNAPSHOT") == "yes": + stop = SYSTEM_DIR / "stop" + if stop.exists() and os.access(stop, os.X_OK): + sys.stderr.write( + "[agent] PLAYGROUND_RESTART_AFTER_RESTORE_SNAPSHOT: " + "force ./stop\n") + subprocess.run([str(stop)], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + timeout=300, check=False) + # Kick off the rebuild asynchronously. /ready (or whoever + # called us) returns promptly; the host's /ready poll then + # waits for _daemon_started to flip back to True. + threading.Thread(target=_ensure_daemon_started, daemon=True, + name="daemon-restart").start() + + +def _reconcile_docker_after_restore() -> None: + """Restart dockerd if it's active, to recover from snapshot-restore + skew. + + Why: after a Firecracker memory snapshot+restore, dockerd is resumed + in userspace but the (also-restored) kernel-side networking and cgroup + state is in flux. Symptom: `docker run` either fails or starts a + container that's unreachable on its mapped port (cedardb, byconity, + trino, etc.). `systemctl restart docker` reconciles the daemon to the + current kernel state. No-op on systems that don't use docker, and a + cheap ~2 s on initial provision (docker was just started anyway). + """ + rc = subprocess.run( + ["systemctl", "is-active", "--quiet", "docker"], + check=False, + ).returncode + if rc != 0: + return # docker isn't installed / not active + try: + subprocess.run(["sudo", "systemctl", "restart", "docker"], + check=False, timeout=60) + # Wait for the daemon to come back. + for _ in range(30): + r = subprocess.run(["sudo", "docker", "info"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=False, timeout=5).returncode + if r == 0: + return + time.sleep(1) + except Exception as e: + sys.stderr.write(f"[agent] docker reconcile failed: {e}\n") + + +def _kick_daemon_if_provisioned() -> None: + """On every agent boot, if the system has been provisioned, make sure + the daemon is also running. + + The rootfs is persistent across boots, so PROVISION_DONE survives a + cold restart of the VM. But the *process* doesn't — anything that was + in the snapshot's memory image goes away when the host takes a cold + boot (not a restore). Without this kick, a query would arrive at the + agent, the agent would see PROVISION_DONE and skip install/start, + and then ./query would hit a dead daemon and return "Connection + refused (localhost:9000)" forever. + + Run start asynchronously: blocking the agent's listen until the + daemon is ready would defeat /health, which the host uses to gate + snapshot creation and restore-wait timeouts. + """ + if not PROVISION_DONE.exists(): + return + start = SYSTEM_DIR / "start" + if not start.exists() or not os.access(start, os.X_OK): + return + + def _bg() -> None: + try: + # Slow daemons (Doris, Druid, Trino) can take >5 min to come + # up. The host's /ready poll has its own deadline; here we + # only need a generous upper bound to prevent an infinite + # hang. + subprocess.run([str(start)], cwd=str(SYSTEM_DIR), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + timeout=900, check=False) + check = SYSTEM_DIR / "check" + if check.exists(): + # Poll ./check until it succeeds — that's the daemon's + # own definition of "ready", and the host probes /ready + # for this flag. 
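+            # (240 polls x 0.5 s sleep = a 2-minute budget.)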
+            for _ in range(240):
+                rc = subprocess.run([str(check)], cwd=str(SYSTEM_DIR),
+                                    stdout=subprocess.DEVNULL,
+                                    stderr=subprocess.DEVNULL,
+                                    timeout=10, check=False).returncode
+                if rc == 0:
+                    break
+                time.sleep(0.5)
+            _daemon_started.set()
+        except Exception as e:
+            sys.stderr.write(f"[agent] daemon-kick failed: {e}\n")
+            # Still mark started so /query is unblocked even if the
+            # daemon never comes up — the query will fail with a real
+            # error rather than hang waiting for /ready forever.
+            _daemon_started.set()
+
+    threading.Thread(target=_bg, daemon=True, name="daemon-kick").start()
+
+
+def _activate_swap() -> None:
+    """If the host attached a dedicated swap block device (per
+    `NEEDS_SWAP` in playground/server/systems.py), mkswap and swapon it
+    before serving requests so the load script can rely on it. The
+    swap disk is the last virtio-blk device, sized in the hundreds of
+    GB, and ships with no filesystem header on first boot.
+
+    Idempotent: a device that already has `TYPE=swap` (i.e. survived a
+    snapshot/restore cycle) skips mkswap and is just swapon'd again.
+    """
+    candidates: list[tuple[str, int, str]] = []
+    for entry in sorted(Path("/sys/block").glob("vd*")):
+        name = entry.name
+        try:
+            sectors = int((entry / "size").read_text().strip())
+        except Exception:
+            continue
+        size_bytes = sectors * 512
+        # Below ~100 GB it isn't the playground's swap drive.
+        if size_bytes < 100 * (1 << 30):
+            continue
+        dev = f"/dev/{name}"
+        r = subprocess.run(
+            ["blkid", "-s", "TYPE", "-o", "value", dev],
+            capture_output=True, text=True,
+        )
+        fstype = r.stdout.strip()
+        candidates.append((dev, size_bytes, fstype))
+    # Prefer an already-mkswap'd device; only a fresh, empty device
+    # needs mkswap before swapon.
+    target = next((d for d, _, t in candidates if t == "swap"), None)
+    if target is None:
+        target = next((d for d, _, t in candidates if t == ""), None)
+        if target is None:
+            return
+        rc = subprocess.run(["mkswap", "-L", "cbswap", target]).returncode
+        if rc != 0:
+            print(f"agent: mkswap {target} rc={rc}", flush=True)
+            return
+    rc = subprocess.run(["swapon", target]).returncode
+    print(f"agent: swapon {target} rc={rc}", flush=True)
+
+
+def _btime_watcher() -> None:
+    """Background thread that polls btime and triggers reconcile the
+    moment a snapshot restore is detected — independent of whether any
+    /ready or /query has arrived yet. Without it, restore detection
+    is gated on a request landing, and the first /ready after restore
+    reports stale-true _daemon_started until that request lands."""
+    while True:
+        try:
+            _maybe_reconcile_for_restore()
+        except Exception as e:
+            sys.stderr.write(f"[agent] btime watcher error: {e!r}\n")
+        time.sleep(1)
+
+
+def main() -> None:
+    addr = ("0.0.0.0", LISTEN_PORT)
+    print(f"agent: system={SYSTEM_NAME} listen={addr[0]}:{addr[1]} "
+          f"dir={SYSTEM_DIR} data={DATASETS_DIR}", flush=True)
+    _activate_swap()
+    _reconcile_docker_after_restore()
+    # Capture btime *now*, before the snapshot is taken: the snapshot
+    # freezes this value into memory, and after restore the live
+    # /proc/stat btime will have shifted, so _maybe_reconcile_for_restore
+    # picks up the change on the first post-restore /query.
+ global _last_seen_btime + _last_seen_btime = _proc_btime() + _kick_daemon_if_provisioned() + threading.Thread(target=_btime_watcher, daemon=True, + name="btime-watcher").start() + with ReusableServer(addr, Handler) as srv: + srv.serve_forever() + + +if __name__ == "__main__": + main() diff --git a/playground/agent/clickbench-agent.service b/playground/agent/clickbench-agent.service new file mode 100644 index 0000000000..067b1cfdc8 --- /dev/null +++ b/playground/agent/clickbench-agent.service @@ -0,0 +1,28 @@ +[Unit] +Description=ClickBench in-VM playground agent +# Wait for clickbench-net.service to assign eth0's IP — without it the +# kernel-set IP (firecracker-ci kernel via CONFIG_IP_PNP) is a no-op on +# the Ubuntu generic kernel and we'd bind 0.0.0.0:50080 on an interface +# that doesn't have an IP yet. +After=local-fs.target clickbench-net.service +Wants=clickbench-net.service + +[Service] +Type=simple +Environment=PYTHONUNBUFFERED=1 +Environment=HOME=/root +# Several ClickBench install/load scripts (monetdb, ...) reference $USER +# and `set -u`-fail without it. systemd's default service environment +# has no USER/LOGNAME, so stamp them. We run as root in the VM (no +# multi-tenant separation inside a per-VM playground), so these are +# correct. +Environment=USER=root +Environment=LOGNAME=root +ExecStart=/usr/bin/python3 /opt/clickbench-agent/agent.py +Restart=on-failure +RestartSec=2 +KillMode=mixed +TimeoutStopSec=10 + +[Install] +WantedBy=multi-user.target diff --git a/playground/clickbench-playground.service b/playground/clickbench-playground.service new file mode 100644 index 0000000000..77ebd34529 --- /dev/null +++ b/playground/clickbench-playground.service @@ -0,0 +1,61 @@ +[Unit] +Description=ClickBench Playground API +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=ubuntu +# ssl-cert membership lets the process read the Let's Encrypt +# privkey under /etc/letsencrypt/{live,archive} (ownership set by +# the deploy hook in /etc/letsencrypt/renewal-hooks/deploy/). +SupplementaryGroups=ssl-cert +WorkingDirectory=/home/ubuntu/ClickBench +EnvironmentFile=-/home/ubuntu/ClickBench/playground/.env +ExecStart=/usr/bin/python3 -m playground.server.main +Restart=on-failure +RestartSec=3 + +# Grant the playground process CAP_NET_BIND_SERVICE so it can bind +# 443 as the unprivileged `ubuntu` user. We do NOT lock the +# capability bounding set: the server uses `sudo` to invoke +# iptables / ip tuntap / mount / firecracker etc., and the root +# child of sudo needs the full capability set to do that work. +AmbientCapabilities=CAP_NET_BIND_SERVICE + +# --- Hardening ----------------------------------------------------- +# +# The server runs as the unprivileged `ubuntu` user; privileged work +# (iptables, ip tuntap, mount, firecracker) is delegated to sudo with +# an operator-managed sudoers allowlist. These directives keep the +# systemd unit from regaining capabilities or filesystem write access +# if the python process is compromised. +# +# What we deliberately do NOT set: +# - NoNewPrivileges / RestrictSUIDSGID — both would break sudo, +# which the server uses to invoke iptables / ip tuntap / mount. +# The narrower defence is the constrained sudoers file. +# - ProtectSystem=strict — the sudo'd children (iptables-restore, +# mkfs.ext4, mount, ...) need to touch /etc, /run, etc. +# ProtectSystem=full is the practical maximum. +# - PrivateNetwork — the server needs the host network namespace +# to manage TAPs and the SNI proxy. 
+# - PrivateDevices — we use /dev/kvm, /dev/loop*, /dev/net/tun.
+#
+ProtectSystem=full
+ProtectHome=read-only
+ProtectKernelTunables=yes
+ProtectKernelModules=yes
+ProtectControlGroups=yes
+ProtectClock=yes
+PrivateTmp=yes
+RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX AF_NETLINK AF_PACKET
+LockPersonality=yes
+RestrictRealtime=yes
+RestrictNamespaces=yes
+# Explicit write allow-list: state dir for VM artifacts + Python's
+# bytecode cache. Everything else under ProtectSystem=full is RO.
+ReadWritePaths=/opt/clickbench-playground /home/ubuntu/.cache
+
+[Install]
+WantedBy=multi-user.target
diff --git a/playground/clickhouse.conf.example b/playground/clickhouse.conf.example
new file mode 100644
index 0000000000..7e36d7d47a
--- /dev/null
+++ b/playground/clickhouse.conf.example
@@ -0,0 +1,16 @@
+# ClickHouse credentials for the playground.
+#
+# Copy this file to <STATE_DIR>/clickhouse.conf (default:
+#     /opt/clickbench-playground/clickhouse.conf
+# ) and fill in your hostname / user / password. The playground server
+# reads it on startup for both the request-logging sink and any
+# shared-query feature that gets wired in later.
+#
+# Env vars (CLICKHOUSE_CLOUD_URL / _USER / _PASSWORD / _DB) take
+# precedence over this file so existing deployments keep working.
+
+[clickhouse]
+url = https://your-host.clickhouse.cloud:8443
+user = default
+password =
+db = playground
diff --git a/playground/docs/architecture.md b/playground/docs/architecture.md
new file mode 100644
index 0000000000..fd78304266
--- /dev/null
+++ b/playground/docs/architecture.md
@@ -0,0 +1,140 @@
+# ClickBench Playground architecture
+
+## Components
+
+```
+┌──────────────────────────────────────────────────────────────────────────┐
+│ Browser (vanilla JS)                                                     │
+│   picks a system, types SQL, POST /api/query                             │
+└────────────────────────────┬─────────────────────────────────────────────┘
+                             │ HTTP/1.1
+┌────────────────────────────▼─────────────────────────────────────────────┐
+│ Host API server (aiohttp)                                                │
+│  ┌─────────────────┐   ┌──────────────┐   ┌────────────────┐             │
+│  │ HTTP routes     │   │ VMManager    │   │ Monitor        │             │
+│  │  /api/systems   │   │  per-VM      │   │  1Hz polling   │             │
+│  │  /api/query     │──▶│  lifecycle   │◀──│  CPU/mem/disk  │             │
+│  │  /api/state     │   │  snapshots   │   │  watchdog      │             │
+│  └─────────────────┘   └──────┬───────┘   └────────────────┘             │
+│  ┌───────────────────────────▼───────────────────────────────────────┐   │
+│  │ LoggingSink: batched INSERT into ClickHouse Cloud + local JSONL   │   │
+│  └───────────────────────────────────────────────────────────────────┘   │
+└────────────────────────────┬─────────────────────────────────────────────┘
+                             │ HTTP over per-VM TAP /24
+┌────────────────────────────▼─────────────────────────────────────────────┐
+│ Firecracker microVM (Ubuntu 24.04, 4 vCPU / 16 GB / 200 GB sparse)       │
+│  ┌──────────────────────────────────────────────────────────────────┐    │
+│  │ in-VM agent (stdlib python HTTP server)                          │    │
+│  │   /health, /stats, /provision, /query                            │    │
+│  └────────────────────────┬─────────────────────────────────────────┘    │
+│                           ▼ runs                                         │
+│  ┌──────────────────────────────────────────────────────────────────┐    │
+│  │ /opt/clickbench/system/       — system's ClickBench scripts (RW) │    │
+│  │ /opt/clickbench/datasets_ro/  — shared dataset image (RO)        │    │
+│  └──────────────────────────────────────────────────────────────────┘    │
+└──────────────────────────────────────────────────────────────────────────┘
+```
+
+## State machine (per system)
+
+```
+      ┌───────┐   no snapshot   ┌───────────────┐
+      │ down  │────────────────▶│ provisioning  │
+      └───┬───┘
+                               └───────┬───────┘
+          │                            │ provision OK
+          │ snapshot ok                ▼
+          │                    ┌──────────────┐
+          ▼                    │ snapshotted  │
+    ┌───────────┐   restore    └──────┬───────┘
+    │   ready   │◀──────────────────  │
+    └─────┬─────┘                     │
+          │ watchdog / failed query   │
+          └───────────────────────────┘
+```
+
+`ready` is the only state that accepts /query. Any restart (watchdog or
+explicit kick) returns to `snapshotted`; the next /query restores from the
+on-disk snapshot.
+
+## Snapshots
+
+Created the first time a system is requested. Three artifacts:
+
+- `<STATE_DIR>/systems/<system>/snapshot.state` — Firecracker VM metadata
+- `<STATE_DIR>/systems/<system>/snapshot.bin` — guest memory dump
+  (mmap'd by Firecracker on restore — left uncompressed so restore is
+  O(1) host work; pages fault in lazily)
+- `<STATE_DIR>/systems/<system>/{rootfs,system}.golden.ext4` — frozen disk
+  state at snapshot time, reflink-cloned at restore
+
+The host filesystem at `<STATE_DIR>` **must support reflinks** (XFS or
+Btrfs). `_snapshot_disks` and `_restore_disks` both use
+`cp --reflink=always` so cloning the golden into a working disk is
+a constant-time extent-list copy regardless of how much data the system
+actually wrote. Without reflinks the playground still works, but every
+restore pays a full sparse-cp of the working set.
+
+Snapshots are taken with the daemon **running** (`./start` is invoked
+after the pre-snapshot `./stop` + `fstrim` + `drop_caches`), so a
+restored VM resumes with the daemon already serving — no cold-start
+cost on the first query.
+
+Drive paths in the snapshot are remapped to their current host locations
+on restore so we don't have to re-snapshot if the playground gets moved
+or rebooted.
+
+## Networking
+
+A `/24` per VM, with the host owning `.1` and the guest owning `.2`. Each
+TAP is `fc-tap-<slot>`, where `<slot>` is the deterministic per-system index
+assigned in `VMManager.__init__`.
+
+```
+host 10.200.<slot>.1/24 ◀── TAP ─▶ 10.200.<slot>.2/24 guest
+```
+
+**Datalake systems** (`*-datalake`, `*-datalake-partitioned`) need
+outbound access to S3 to serve queries, but unrestricted internet is
+overkill. The playground server runs an SNI-allowlist proxy
+(`playground/server/sni_proxy.py`) on the host. For these systems,
+`net.enable_filtered_internet(slot)` REDIRECTs the TAP's TCP 80/443
+to the proxy and DROPs everything else (DNS is allowed). The proxy
+parses the TLS ClientHello's SNI (or the HTTP Host header), checks it
+against an allowlist (`*.s3.*.amazonaws.com` and friends), then
+splices bytes upstream — the TLS session stays end-to-end between the
+VM and S3, so certs / pinning / ALPN all keep working untouched.
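+
+A minimal sketch of the allowlist decision (illustrative; `sni_proxy.py`
+itself is not shown in this PR, and the exact patterns are assumptions):
+
+```python
+import fnmatch
+
+ALLOWED_SNI = ["*.s3.*.amazonaws.com", "s3.*.amazonaws.com"]
+
+def sni_allowed(server_name: str) -> bool:
+    """True if the ClientHello's SNI may be spliced upstream."""
+    host = server_name.lower().rstrip(".")
+    return any(fnmatch.fnmatch(host, pat) for pat in ALLOWED_SNI)
+```
+
+During the provision phase only, iptables NAT/FORWARD rules are added so
+the guest can `apt-get` / `curl`. After the snapshot, those rules are
+deleted — outbound traffic is dropped, the host↔guest link remains.
+
+## Output truncation
+
+Truncation is applied **inside the agent**, before bytes leave the VM:
+
+- Stdout from the system's `./query` script is capped at
+  `CLICKBENCH_OUTPUT_LIMIT` bytes (default 256 KB).
+- The agent's response sets `X-Output-Truncated: 1` and
+  `X-Output-Bytes: <bytes>` so the client can show "this is a
+  partial result of N bytes."
+- The host API server passes the headers through unchanged.
+
+## Watchdog rules
+
+The `Monitor` thread samples every running Firecracker process once per
+second:
+
+- **CPU**: if per-VM CPU usage (utime+stime / wallclock / vcpus) stays
+  ≥ `VM_CPU_BUSY_THRESHOLD` (default 97%) for `VM_CPU_BUSY_WINDOW_SEC`
+  contiguous seconds (default 120), the VM is killed.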
+- **Disk**: if the sparse `rootfs.ext4` has allocated more than + `VM_DISK_FULL_PCT` (default 97%) of `VM_ROOTFS_SIZE_GB`, the VM is + killed. +- **Host RAM**: if `MemAvailable` drops below `HOST_MIN_FREE_RAM_GB` + (default 32 GiB), the watchdog kills the VM with the largest RSS. +- **Host disk**: if free space on `PLAYGROUND_STATE_DIR` drops below + `HOST_MIN_FREE_DISK_GB` (default 500 GiB), the watchdog kills the VM + with the largest allocated rootfs. + +A "kill" leaves the snapshot intact. The next user query restores from +snapshot, paying ~1 s of memory restore cost. diff --git a/playground/images/build-base-rootfs.sh b/playground/images/build-base-rootfs.sh new file mode 100755 index 0000000000..03f3a05669 --- /dev/null +++ b/playground/images/build-base-rootfs.sh @@ -0,0 +1,480 @@ +#!/bin/bash +# Build a base Ubuntu 24.04 rootfs for the Firecracker microVMs. +# +# Strategy: start from the official Ubuntu 24.04 cloud image (qcow2), convert +# to raw, mount it, install python3 + sudo + curl + iproute2, drop the agent in +# place, install a systemd unit that runs the agent on boot, and add a +# /etc/fstab line that mounts the dataset disk read-only. +# +# The resulting image is /opt/clickbench-playground/base-rootfs.ext4. Per-system +# images are produced by overlaying the system's ClickBench scripts onto a copy +# of this base. +# +# Idempotent: re-running just re-builds the file from scratch. + +set -euo pipefail + +STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" +TMP="${STATE_DIR}/tmp/base-build" +OUT="${STATE_DIR}/base-rootfs.ext4" +# Match the per-system rootfs cap (200 GB) so build-system-rootfs.sh can +# clone the base directly with `cp --sparse=always` and skip resize2fs. +# The image is sparse: mkfs.ext4 with lazy_itable_init writes only the +# superblocks (~50 MB) upfront, and clones inherit that sparseness. +SIZE_GB="${BASE_ROOTFS_SIZE_GB:-200}" +CLOUDIMG_URL="${UBUNTU_CLOUDIMG_URL:-https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-amd64.img}" +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +AGENT_DIR="${REPO_DIR}/playground/agent" + +echo "[base] state=$STATE_DIR out=$OUT size=${SIZE_GB}G" + +mkdir -p "$TMP" +mkdir -p "$STATE_DIR/cache" + +CLOUDIMG="$STATE_DIR/cache/noble-cloudimg.img" +if [ ! -f "$CLOUDIMG" ]; then + echo "[base] downloading cloud image" + curl -fsSL "$CLOUDIMG_URL" -o "${CLOUDIMG}.part" + mv "${CLOUDIMG}.part" "$CLOUDIMG" +fi + +# Plan: rather than grow the cloud image's partition (which involves +# sfdisk/growpart/resize2fs — all of which call `sync` and therefore stall +# whenever the host is under unrelated writeback pressure), we work in two +# fixed-size hops: +# +# 1. Loop-mount the cloud image's existing partition (2 GB) and use that +# as a read-only source. +# 2. Create a fresh, no-partition-table ext4 image of SIZE_GB and mount it +# as the build root. Copy the cloud image's content into it. The new +# image is what Firecracker boots from (it expects a flat ext4, no +# partition table). +# +# No growpart, no resize2fs, no waiting on the kernel to flush GBs of +# unrelated dirty pages just to update a partition table. 
+ +RAW="$TMP/base.raw" +echo "[base] converting cloud image to raw" +qemu-img convert -O raw "$CLOUDIMG" "$RAW" + +SRC_LOOP="$(sudo losetup --find --show --partscan "$RAW")" +trap 'sudo losetup -d "$SRC_LOOP" 2>/dev/null || true' EXIT +for i in $(seq 1 20); do + if [ -b "${SRC_LOOP}p1" ]; then break; fi + sleep 0.5 +done + +SRC_MNT="$TMP/src" +mkdir -p "$SRC_MNT" +sudo mount -o ro "${SRC_LOOP}p1" "$SRC_MNT" + +# Now build the *target* image: a plain ext4 file of SIZE_GB with no partition +# table. Firecracker boots root=/dev/vda directly off this. +echo "[base] mkfs.ext4 -> ${SIZE_GB}G no-partition flat image" +FLAT="$TMP/base.flat.ext4" +fallocate -l "${SIZE_GB}G" "$FLAT" +mkfs.ext4 -F -L cbroot -E lazy_itable_init=1,lazy_journal_init=1 "$FLAT" >/dev/null + +DST_LOOP="$(sudo losetup --find --show "$FLAT")" +MNT="$TMP/mnt" +mkdir -p "$MNT" +sudo mount "$DST_LOOP" "$MNT" +trap ' + sudo umount "'"$SRC_MNT"'" 2>/dev/null || true + sudo umount "'"$MNT"'" 2>/dev/null || true + sudo losetup -d "'"$SRC_LOOP"'" 2>/dev/null || true + sudo losetup -d "'"$DST_LOOP"'" 2>/dev/null || true +' EXIT + +# Stage the cloud image contents into the new rootfs. +echo "[base] copying cloud image content into flat rootfs" +sudo cp -a "$SRC_MNT"/. "$MNT"/ +sudo umount "$SRC_MNT" +sudo losetup -d "$SRC_LOOP" +trap ' + sudo umount "'"$MNT"'" 2>/dev/null || true + sudo losetup -d "'"$DST_LOOP"'" 2>/dev/null || true +' EXIT + +# Bind /dev /proc /sys for the chroot. Use `--rbind` so submounts (devpts, +# mqueue, hugepages, /sys/fs/cgroup, …) come along. Critically, mark each +# new mount `--make-rslave` immediately afterwards. Without that, a later +# `umount -lR` on the chroot's `/dev` propagates back through the shared +# mount group and tears down the *host's* `/dev/pts` — at which point sshd +# can't allocate a PTY and the operator gets locked out. +for d in dev proc sys; do + sudo mkdir -p "$MNT/$d" + sudo mount --rbind "/$d" "$MNT/$d" + sudo mount --make-rslave "$MNT/$d" +done +trap ' + for d in dev proc sys; do sudo umount -lR "'"$MNT"'/$d" 2>/dev/null || true; done + sudo umount "'"$MNT"'" 2>/dev/null || true + sudo losetup -d "'"$DST_LOOP"'" 2>/dev/null || true +' EXIT + +# Resolve DNS from host inside the chroot. The cloud image ships +# /etc/resolv.conf as a symlink into /run/systemd/resolve/ which is empty +# until systemd-resolved starts; we need a real file for the chroot's apt +# to work. +sudo rm -f "$MNT/etc/resolv.conf" +sudo install -m 0644 /etc/resolv.conf "$MNT/etc/resolv.conf" + +# Run system customization inside the chroot. +sudo tee "$MNT/tmp/customize.sh" >/dev/null <<'CUSTOMIZE' +#!/bin/bash +set -euxo pipefail +export DEBIAN_FRONTEND=noninteractive + +# Disable cloud-init's network configuration so eth0 just comes up via +# /etc/network/interfaces-style config we install below. +echo 'network: {config: disabled}' > /etc/cloud/cloud.cfg.d/99-disable-network-config.cfg + +# Keep the image small: turn off heavy services that we don't need on a +# query-serving microVM. 
+systemctl disable snapd.service snapd.socket snapd.seeded.service 2>/dev/null || true
+systemctl mask snapd.service snapd.socket snapd.seeded.service 2>/dev/null || true
+systemctl disable unattended-upgrades.service apt-daily.timer apt-daily-upgrade.timer 2>/dev/null || true
+systemctl mask unattended-upgrades.service apt-daily.timer apt-daily-upgrade.timer 2>/dev/null || true
+
+apt-get update -qq
+apt-get install -y --no-install-recommends \
+    python3 python3-yaml ca-certificates curl wget gnupg sudo less vim-tiny \
+    iproute2 iputils-ping net-tools openssh-server lsb-release \
+    htop sysstat strace ncdu pigz unzip xz-utils zstd \
+    build-essential netbase
+apt-get clean
+rm -rf /var/lib/apt/lists/*
+CUSTOMIZE
+sudo chmod +x "$MNT/tmp/customize.sh"
+sudo chroot "$MNT" /tmp/customize.sh
+sudo rm -f "$MNT/tmp/customize.sh"
+
+# Install Ubuntu's KVM-friendly kernel + its modules INTO the rootfs.
+# Firecracker doesn't use grub — we just need /lib/modules/<version>/
+# populated so the running kernel (Ubuntu generic, extracted from the
+# same .deb) can load overlay, veth, br_netfilter, iptable_nat etc. at
+# runtime. Without this, the in-VM mounts of /opt/clickbench/system
+# (overlay) and Docker's networking (iptables NAT, br_netfilter, veth)
+# silently fail.
+sudo cp /var/cache/apt/archives/linux-modules-7.0.0-15-generic_*.deb "$MNT/tmp/"
+sudo cp /var/cache/apt/archives/linux-image-7.0.0-15-generic_*.deb "$MNT/tmp/"
+sudo tee -a "$MNT/tmp/customize-modules.sh" >/dev/null <<'MODSCRIPT'
+#!/bin/bash
+set -euxo pipefail
+export DEBIAN_FRONTEND=noninteractive
+# Extract files from the modules deb without registering it in dpkg.
+# `dpkg --unpack` half-installs the package, leaving apt thinking there's
+# an unconfigured package with unmet dependencies and refusing subsequent
+# `apt-get install`s with "Unmet dependencies. Try 'apt --fix-broken
+# install'". Bypass dpkg entirely: dpkg-deb -x just unrolls the data
+# tarball into the rootfs.
+dpkg-deb -x /tmp/linux-modules-7.0.0-15-generic_*.deb /
+# Run depmod so the kernel can find modules by name at runtime.
+depmod 7.0.0-15-generic 2>&1 | tail -2 || true
+# Pre-load critical modules at boot — Docker needs overlay (storage),
+# veth + bridge (per-container netif), br_netfilter (iptables visibility
+# across the bridge), iptable_nat + ip_tables + nf_conntrack + nf_nat +
+# xt_MASQUERADE (the actual NAT chain for outbound container traffic).
+mkdir -p /etc/modules-load.d
+cat > /etc/modules-load.d/clickbench.conf <<MODULES
+overlay
+veth
+bridge
+br_netfilter
+ip_tables
+iptable_nat
+nf_conntrack
+nf_nat
+xt_MASQUERADE
+MODULES
+MODSCRIPT
+sudo chmod +x "$MNT/tmp/customize-modules.sh"
+sudo chroot "$MNT" /tmp/customize-modules.sh
+sudo rm -f "$MNT/tmp/customize-modules.sh" "$MNT"/tmp/linux-*.deb
+
+# Next customization pass: docker + guest networking configuration.
+sudo tee "$MNT/tmp/customize-docker.sh" >/dev/null <<'CUSTOMIZE'
+#!/bin/bash
+set -euxo pipefail
+export DEBIAN_FRONTEND=noninteractive
+
+# iptables backend: pin to legacy (xtables). Ubuntu 24.04 defaults to
+# the nft variant, but the Firecracker CI kernel (vmlinux-6.1.141)
+# does not have CONFIG_NF_TABLES, so any nft call returns
+# `Failed to initialize nft: Protocol not supported`. dockerd's
+# bridge-driver init does `iptables -t nat -N DOCKER` at startup;
+# the nft failure aborts dockerd → docker.service exits 1 → every
+# docker-based system fails at install time with
+# "Cannot connect to the Docker daemon".
+# The legacy backend uses x_tables/ip_tables/iptable_nat which the
+# firecracker kernel does compile in (see modules-load.d above).
+update-alternatives --set iptables /usr/sbin/iptables-legacy
+update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy
+
+# Turn off dockerd's iptables management entirely. Reasons:
+# 1. dockerd 28+ adds "DIRECT ACCESS FILTERING" which touches the
+#    iptables `raw` table; the Firecracker CI kernel doesn't compile
+#    in CONFIG_IP_NF_RAW, so every `docker run` on the default
+#    bridge fails with
+#        Unable to enable DIRECT ACCESS FILTERING - DROP rule:
+#        iptables ... can't initialize iptables table `raw`:
+#        Table does not exist
+#    The `default-network-opts.bridge.gateway_mode_ipv4=nat-unprotected`
+#    knob is supposed to skip those rules, but isn't honoured for
+#    the auto-created `bridge` network on this docker.io 29.x.
+# 2. The microVM only ever runs ONE container per system, and the
+#    container talks to 127.0.0.1:<port> via host-side port mapping
+#    (handled by docker-proxy, not iptables). The host-side
+#    net.enable_filtered_internet handles VM→outside masquerade.
+#
+# With iptables=false, dockerd doesn't add ANY iptables rules; port
+# forwarding goes through the userland docker-proxy.
+mkdir -p /etc/docker
+cat > /etc/docker/daemon.json <<EOF
+{
+    "iptables": false
+}
+EOF
+
+cat > /usr/local/sbin/clickbench-docker-nat <<'NATEOF'
+#!/bin/bash
+set -e
+# Idempotent — the systemd unit may fire on every boot, including after
+# a snapshot restore where the rule may already be there.
+if ! iptables -t nat -C POSTROUTING -s 172.17.0.0/16 ! -o docker0 \
+        -j MASQUERADE 2>/dev/null; then
+    iptables -t nat -A POSTROUTING -s 172.17.0.0/16 ! -o docker0 \
+        -j MASQUERADE
+fi
+NATEOF
+chmod +x /usr/local/sbin/clickbench-docker-nat
+
+cat > /etc/systemd/system/clickbench-docker-nat.service <<EOF
+[Unit]
+Description=Outbound MASQUERADE for docker containers
+After=docker.service
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=/usr/local/sbin/clickbench-docker-nat
+
+[Install]
+WantedBy=multi-user.target
+EOF
+systemctl enable clickbench-docker-nat.service >/dev/null || true
+systemctl disable systemd-resolved 2>/dev/null || true
+rm -f /etc/resolv.conf
+cat > /etc/resolv.conf <<EOF
+nameserver 8.8.8.8
+nameserver 1.1.1.1
+EOF
+
+cat > /usr/local/sbin/clickbench-net-up <<'NETUP'
+#!/bin/bash
+# Apply ip=<vm-ip>:<peer>:<gw>:<netmask>:<hostname>:eth0:off from /proc/cmdline.
+set -e
+ip_arg=$(awk '{for(i=1;i<=NF;i++) if($i ~ /^ip=/) print $i}' /proc/cmdline | sed 's/^ip=//')
+[ -z "$ip_arg" ] && exit 0
+IFS=':' read -r vm_ip _peer gw mask _hostname iface _autoconf <<<"$ip_arg"
+iface="${iface:-eth0}"
+ip link set "$iface" up
+ip addr add "$vm_ip/$(python3 -c "import ipaddress; print(ipaddress.IPv4Network('0.0.0.0/$mask').prefixlen)" 2>/dev/null || echo 24)" dev "$iface"
+[ -n "$gw" ] && ip route add default via "$gw" || true
+NETUP
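+# (The python3 one-liner above turns the kernel's dotted-quad netmask
+# into a CIDR prefix: ipaddress.IPv4Network('0.0.0.0/255.255.255.0')
+# .prefixlen evaluates to 24. The `|| echo 24` fallback covers a
+# missing or malformed mask.)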
+chmod +x /usr/local/sbin/clickbench-net-up
+
+cat > /etc/systemd/system/clickbench-net.service <<EOF
+[Unit]
+Description=ClickBench guest network bring-up (ip= from kernel cmdline)
+After=local-fs.target
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=/usr/local/sbin/clickbench-net-up
+
+[Install]
+WantedBy=multi-user.target
+EOF
+systemctl enable clickbench-net.service
+
+# Serial console: autologin root on ttyS0 so an operator attached to
+# the Firecracker console gets a shell without a password prompt.
+mkdir -p /etc/systemd/system/serial-getty@ttyS0.service.d
+cat > /etc/systemd/system/serial-getty@ttyS0.service.d/override.conf <<EOF
+[Service]
+ExecStart=
+ExecStart=-/sbin/agetty --autologin root --noclear %I linux
+EOF
+
+mkdir -p /opt/clickbench/lib
+# Stub out the repo's lib/download-hits-* helpers so the per-system load
+# scripts' calls to `../lib/download-hits-<fmt>` work unchanged. Symlinks instead
+# of copies save 14-75 GB of in-VM writes per system.
+cat > /opt/clickbench/lib/download-hits-parquet-single <<'EOF'
+#!/bin/bash
+set -e
+dir="${1:-.}"; mkdir -p "$dir"; cd "$dir"
+ln -sf /opt/clickbench/datasets_ro/hits.parquet hits.parquet
+EOF
+cat > /opt/clickbench/lib/download-hits-parquet-partitioned <<'EOF'
+#!/bin/bash
+# Partitioned parquet files live under datasets_ro/hits_partitioned/
+# on the read-only datasets disk (matching the
+# datasets/hits_partitioned/ layout build-datasets-image.sh rsyncs
+# from). Link them into cwd as a flat hits_*.parquet so the system
+# load scripts can glob `hits_*.parquet` exactly like in the
+# upstream `lib/download-hits-parquet-partitioned`.
+set -e
+dir="${1:-.}"; mkdir -p "$dir"; cd "$dir"
+for i in $(seq 0 99); do
+    ln -sf "/opt/clickbench/datasets_ro/hits_partitioned/hits_${i}.parquet" \
+           "hits_${i}.parquet"
+done
+EOF
+cat > /opt/clickbench/lib/download-hits-tsv <<'EOF'
+#!/bin/bash
+set -e
+dir="${1:-.}"; mkdir -p "$dir"; cd "$dir"
+ln -sf /opt/clickbench/datasets_ro/hits.tsv hits.tsv
+EOF
+cat > /opt/clickbench/lib/download-hits-csv <<'EOF'
+#!/bin/bash
+set -e
+dir="${1:-.}"; mkdir -p "$dir"; cd "$dir"
+ln -sf /opt/clickbench/datasets_ro/hits.csv hits.csv
+EOF
+cat > /opt/clickbench/lib/download-hits-json <<'EOF'
+#!/bin/bash
+# Pre-decompressed hits.json (~75 GB) on the read-only dataset disk.
+# Load scripts that previously did wget + gunzip into the VM (and ran
+# out of disk for the 75 GB decompressed copy) can just consume this
+# file directly.
+set -e
+dir="${1:-.}"; mkdir -p "$dir"; cd "$dir"
+ln -sf /opt/clickbench/datasets_ro/hits.json hits.json
+EOF
+cat > /opt/clickbench/lib/download-hits-json-gz <<'EOF'
+#!/bin/bash
+set -e
+dir="${1:-.}"; mkdir -p "$dir"; cd "$dir"
+ln -sf /opt/clickbench/datasets_ro/hits.json.gz hits.json.gz
+EOF
+chmod +x /opt/clickbench/lib/download-hits-*
+cat > /etc/fstab <<EOF
+LABEL=cbroot    /                            ext4     defaults  0 1
+LABEL=cbsystem  /opt/clickbench/sysdisk      ext4     defaults  0 2
+LABEL=cbdata    /opt/clickbench/datasets_ro  ext4     ro        0 0
+overlay  /opt/clickbench/system  overlay  lowerdir=/opt/clickbench/datasets_ro,upperdir=/opt/clickbench/sysdisk/upper,workdir=/opt/clickbench/sysdisk/work  0 0
+EOF
+cat > /etc/hosts <<EOF
+127.0.0.1   localhost
+::1         localhost ip6-localhost ip6-loopback
+EOF
+CUSTOMIZE
+sudo chmod +x "$MNT/tmp/customize-docker.sh"
+sudo chroot "$MNT" /tmp/customize-docker.sh
+sudo rm -f "$MNT/tmp/customize-docker.sh"
+
+# Tear down the chroot mounts. The rootfs was built directly into the
+# flat image, so there is no partitioned-image -> flat re-copy step.
+sudo umount -lR "$MNT/dev" "$MNT/proc" "$MNT/sys"
+sudo umount "$MNT"
+sudo losetup -d "$DST_LOOP"
+trap - EXIT
+
+mv "$FLAT" "$OUT"
+rm -rf "$TMP"
+
+# Final fsck: every per-system rootfs is cloned from this file and then
+# resize2fs'd, which requires the source filesystem to be clean. Doing
+# the fsck once here, while build-base-rootfs.sh has full I/O headroom,
+# is much cheaper than doing it 98 times during the parallel system
+# build phase.
+sudo e2fsck -fy "$OUT" >/dev/null 2>&1 || true
+
+echo "[base] done: $OUT ($(du -h "$OUT" | cut -f1) physical, $(du -h --apparent-size "$OUT" | cut -f1) apparent)"
diff --git a/playground/images/build-datasets-image.sh b/playground/images/build-datasets-image.sh
new file mode 100755
index 0000000000..1a66ea151e
--- /dev/null
+++ b/playground/images/build-datasets-image.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+# Build a single read-only ext4 image of /opt/clickbench-playground/datasets,
+# attached as virtio-blk to every per-system VM. This replaces the previous
+# scheme of copying the dataset into each per-system disk: one image on the
+# host vs N copies saves ~1-2 TB across the catalog.
+#
+# The VM-side fstab line (LABEL=cbdata ... ro) is provisioned by
+# build-base-rootfs.sh; the agent copies the needed files into the writable
+# system disk at provision time so load scripts that mv/chown can do so on
+# files they own.
+
+set -euo pipefail
+
+STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}"
+SRC="$STATE_DIR/datasets"
+OUT="$STATE_DIR/datasets.ext4"
+
+if [ ! -d "$SRC" ]; then
+    echo "no datasets dir: $SRC" >&2
+    exit 1
+fi
+
+bytes=$(du -sb "$SRC" | awk '{print $1}')
+# Add 8 GB headroom for ext4 metadata + ext4 mkfs reserved blocks.
+overhead=$(( 8 * 1024 * 1024 * 1024 ))
+size_mib=$(( (bytes + overhead + 1024*1024 - 1) / (1024*1024) ))
+
+echo "[datasets] payload=$bytes B image=${size_mib} MiB"
+
+# Idempotency: skip the rebuild if a present image is at least as large as
+# the source and was modified after the most-recent source file. The image
+# never holds anything but the dataset, so a same-or-larger size + a
+# fresher mtime is sufficient evidence that the contents are current.
+# Force-rebuild with REBUILD=1 to override.
+if [ -f "$OUT" ] && [ "${REBUILD:-}" != "1" ]; then + out_size=$(stat -c%s "$OUT" 2>/dev/null || echo 0) + out_mtime=$(stat -c%Y "$OUT" 2>/dev/null || echo 0) + src_newest=$(find "$SRC" -type f -printf '%T@\n' | sort -rn | head -1 | cut -d. -f1) + if [ "$out_size" -ge "$bytes" ] && [ "$out_mtime" -gt "${src_newest:-0}" ]; then + echo "[datasets] cached ($(du -h "$OUT" | cut -f1)); set REBUILD=1 to force" + ls -lh "$OUT" + exit 0 + fi +fi + +rm -f "$OUT" +truncate -s "${size_mib}M" "$OUT" +# Disable the journal (-O ^has_journal) and reserve 0 blocks for root +# (-m 0); both make sense for a read-only image. +mkfs.ext4 -F -L cbdata -m 0 -O ^has_journal \ + -E lazy_itable_init=1,lazy_journal_init=1 "$OUT" >/dev/null + +MNT="$(mktemp -d)" +trap 'sudo umount "'"$MNT"'" 2>/dev/null || true; rmdir "'"$MNT"'" 2>/dev/null || true' EXIT +sudo mount -o loop "$OUT" "$MNT" +sudo rsync -a "$SRC"/. "$MNT"/ +sudo umount "$MNT" +trap - EXIT + +# Mark the image read-only on the host too, so a misconfigured drive (RW +# attach by mistake) can't scribble. +chmod a-w "$OUT" + +echo "[datasets] done" +ls -lh "$OUT" diff --git a/playground/images/build-system-rootfs.sh b/playground/images/build-system-rootfs.sh new file mode 100755 index 0000000000..77c54057b8 --- /dev/null +++ b/playground/images/build-system-rootfs.sh @@ -0,0 +1,134 @@ +#!/bin/bash +# Build a per-system rootfs and "system disk" image for Firecracker. +# +# Outputs (under /opt/clickbench-playground/systems//): +# rootfs.ext4 CoW-ish copy of base-rootfs.ext4 (sparse 200 GB) +# system.ext4 ~2 GB ext4 holding ONLY the system's ClickBench +# scripts. The dataset is *not* copied in here — it +# comes from the host-side shared datasets.ext4 +# attached read-only to every VM (build-datasets- +# image.sh). The agent's /provision step copies +# only the bytes the load script actually needs. + +set -euo pipefail + +if [ $# -lt 1 ]; then + echo "usage: $0 " >&2 + exit 2 +fi +SYSTEM="$1" + +STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +BASE="$STATE_DIR/base-rootfs.ext4" +SRC="$REPO_DIR/$SYSTEM" +OUT_DIR="$STATE_DIR/systems/$SYSTEM" +ROOTFS="$OUT_DIR/rootfs.ext4" +SYSDISK="$OUT_DIR/system.ext4" + +ROOTFS_SIZE_GB="${VM_ROOTFS_SIZE_GB:-200}" +# Apparent size of the cbsystem disk. Every byte the load script writes +# (overlay copy-ups of the dataset, the database's own files — +# MergeTree parts, duckdb's hits.db, etc.) lands here. Some systems are +# heavy: tidb writes ~137 GB, postgres-indexed ~80 GB, druid ~50 GB. +# Match the rootfs cap (200 GB) so any single system has room. +# +# This is a SPARSE file: `truncate` reserves the apparent size but +# allocates no physical blocks. mkfs.ext4 only writes the small initial +# metadata. Real disk usage tracks the bytes the VM actually writes, +# and `cp --sparse=always` on the golden-disk path preserves that +# sparseness through snapshot+restore — snapshots of light systems +# stay light. +SYSDISK_SIZE_GB="${VM_SYSDISK_SIZE_GB:-200}" + +if [ ! -f "$BASE" ]; then + echo "base rootfs not found: $BASE — run build-base-rootfs.sh first" >&2 + exit 1 +fi +if [ ! -d "$SRC" ]; then + echo "no such system directory: $SRC" >&2 + exit 1 +fi +for f in install start load query check stop; do + if [ ! -x "$SRC/$f" ]; then + echo "system '$SYSTEM' missing executable $f — not playground-ready" >&2 + exit 1 + fi +done + +mkdir -p "$OUT_DIR" + +# 1. Rootfs: clone the base ext4 file block-level (sparse). 
The base is +# already sized at ROOTFS_SIZE_GB with mostly-empty ext4 metadata, so +# `cp --sparse=always` produces a sparse 200 GB image of the right size +# in seconds — no resize2fs, no e2fsck, no mount-and-rsync. +echo "[sys:$SYSTEM] rootfs.ext4 (sparse clone of base)" +rm -f "$ROOTFS" +cp --sparse=always "$BASE" "$ROOTFS" + +# Stamp the system name so the agent can identify itself. +# Note: no explicit `sync` — `umount` syncs the filesystem being unmounted. +# A global `sync` here would block until every dirty page on the host's +# disk is flushed, which under 98-way parallel builds means every build +# waits for everyone else's writeback before its own umount returns. +MNT="$(mktemp -d)" +trap 'sudo umount "'"$MNT"'" 2>/dev/null || true; rmdir "'"$MNT"'" 2>/dev/null || true' EXIT +sudo mount -o loop "$ROOTFS" "$MNT" +echo "$SYSTEM" | sudo tee "$MNT/etc/clickbench-system" >/dev/null +sudo umount "$MNT" +trap - EXIT + +# 2. System disk: ClickBench scripts only. Sized at SYSDISK_SIZE_GB (2 GB +# default). The agent populates the dataset files into this disk at +# provision time by copying from the shared read-only datasets disk. +echo "[sys:$SYSTEM] system.ext4 ${SYSDISK_SIZE_GB}G" +rm -f "$SYSDISK" +truncate -s "${SYSDISK_SIZE_GB}G" "$SYSDISK" +mkfs.ext4 -F -L cbsystem -E lazy_itable_init=1,lazy_journal_init=1 "$SYSDISK" >/dev/null + +SYS_MNT="$(mktemp -d)" +trap 'sudo umount "'"$SYS_MNT"'" 2>/dev/null || true; rmdir "'"$SYS_MNT"'" 2>/dev/null || true' EXIT +sudo mount -o loop "$SYSDISK" "$SYS_MNT" + +# The cbsystem disk is mounted at /opt/clickbench/sysdisk in the guest; +# the overlay points its upperdir at sysdisk/upper and its workdir at +# sysdisk/work. Pre-create that layout and drop the system's ClickBench +# scripts into upper. +sudo mkdir -p "$SYS_MNT/upper" "$SYS_MNT/work" +# Only `template.json` is playground-frontend metadata; everything +# else (mapping.json, schema.json, ingest.json, queries.json, ...) is +# a runtime artifact the load/query scripts read with -d@/--config. +# An over-broad `*.json` exclude broke elasticsearch, pinot, druid, +# parseable, quickwit. +sudo rsync -a --exclude 'results/' --exclude 'template.json' --exclude 'README*' \ + "$SRC"/ "$SYS_MNT/upper"/ + +# Systems whose load/install reference ../lib/download-hits-* pick those +# up from /opt/clickbench/lib in the base rootfs — see build-base-rootfs. +# Stubs there symlink from the shared RO dataset instead of wget'ing +# from datasets.clickhouse.com. + +# Discover the data format from benchmark.sh and stamp it in the upper; +# the agent uses this to decide which dataset symlinks to add for +# partitioned formats. 
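+# Illustrative mapping (mirrors the case statement below): +# "download-hits-parquet-single" -> parquet +# "download-hits-parquet-partitioned" -> parquet-partitioned +# no BENCH_DOWNLOAD_SCRIPT line in benchmark.sh -> none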
+download_script="$(set +e; unset BENCH_DOWNLOAD_SCRIPT; \ + eval "$(grep -E '^[[:space:]]*(export[[:space:]]+)?BENCH_DOWNLOAD_SCRIPT=' "$SRC/benchmark.sh" | head -1)"; \ + printf '%s' "${BENCH_DOWNLOAD_SCRIPT:-}")" +case "$download_script" in + *parquet-partitioned*) format=parquet-partitioned ;; + *parquet-single*) format=parquet ;; + *tsv*) format=tsv ;; + *csv*) format=csv ;; + "") format=none ;; + *) format=unknown ;; +esac +echo "$format" | sudo tee "$SYS_MNT/upper/.data-format" >/dev/null +echo "[sys:$SYSTEM] format=$format" + +sudo chown -R 0:0 "$SYS_MNT/upper" +sudo chmod -R u+rwX,go+rX "$SYS_MNT/upper" +sudo umount "$SYS_MNT" +trap - EXIT + +echo "[sys:$SYSTEM] done" +ls -lh "$OUT_DIR" diff --git a/playground/requirements.txt b/playground/requirements.txt new file mode 100644 index 0000000000..fc3fbd65b4 --- /dev/null +++ b/playground/requirements.txt @@ -0,0 +1,10 @@ +# Runtime dependencies for the playground host server. +# +# aiohttp >= 3.10 covers: +# GHSA-5h86-8mv2-jq9f static handler symlink path traversal (3.9.2) +# GHSA-q3qx-c6g2-7pw2 request smuggling (3.9.4) +# GHSA-pjjw-qhg8-p2p9 follow_symlinks default tightening (3.10.x) +# +# main.py asserts this minimum at startup; the pin here is for the +# pip-based install path. +aiohttp>=3.10 diff --git a/playground/scripts/agent-selftest.sh b/playground/scripts/agent-selftest.sh new file mode 100755 index 0000000000..afad1ca506 --- /dev/null +++ b/playground/scripts/agent-selftest.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# Spin up the agent in a local sandbox and hit its HTTP endpoints. Useful for +# iterating on agent.py without rebuilding a Firecracker image. +# +# The sandbox is just two temp directories that mimic the in-VM mounts: +# /tmp/clickbench-selftest/system — copy of the duckdb system dir +# /tmp/clickbench-selftest/datasets — empty +# +# We exercise: +# GET /health expects 200 with provisioned=false +# GET /stats expects 200 with cpu/mem/disk +# POST /provision expects 200 (will fail unless duckdb is locally installed) +# POST /query expects 200 with timing headers, output bytes capped +# +# Cleanup: kills the agent on exit. + +set -euo pipefail + +SANDBOX="${SANDBOX:-/tmp/clickbench-selftest}" +SYS="${SANDBOX}/system" +DATA="${SANDBOX}/datasets" +PORT="${PORT:-18080}" +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" + +rm -rf "$SANDBOX" +mkdir -p "$SYS" "$DATA" +cp -a "$REPO_DIR/duckdb"/. "$SYS"/ + +# A trivial "system" that doesn't need provisioning: replace install/start/load +# with no-ops so the smoke test focuses on the agent's HTTP path. +cat > "$SYS/install" <<'EOF' +#!/bin/bash +echo "fake install" +EOF +cat > "$SYS/start" <<'EOF' +#!/bin/bash +exit 0 +EOF +cat > "$SYS/check" <<'EOF' +#!/bin/bash +exit 0 +EOF +cat > "$SYS/load" <<'EOF' +#!/bin/bash +echo "fake load" +EOF +# A query script that echoes the request and reports 0.123s. +cat > "$SYS/query" <<'EOF' +#!/bin/bash +cat +echo "0.123" >&2 +EOF +chmod +x "$SYS"/{install,start,check,load,query} + +echo "selftest: starting agent on :$PORT" +CLICKBENCH_SYSTEM_DIR="$SYS" \ +CLICKBENCH_DATASETS_DIR="$DATA" \ +CLICKBENCH_AGENT_STATE="$SANDBOX/state" \ +CLICKBENCH_SYSTEM_NAME=selftest \ +CLICKBENCH_AGENT_PORT="$PORT" \ +CLICKBENCH_OUTPUT_LIMIT=64 \ +python3 "$REPO_DIR/playground/agent/agent.py" & +AGENT_PID=$! 
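+# Kill the backgrounded agent on any exit path; the EXIT trap fires on +# set -e aborts too, so a failed check can't leak the process.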
+trap 'kill $AGENT_PID 2>/dev/null || true' EXIT + +# wait for listen +for i in {1..30}; do + if curl -fsS "http://127.0.0.1:$PORT/health" >/dev/null 2>&1; then + break + fi + sleep 0.2 +done + +echo "--- /health ---" +curl -fsS "http://127.0.0.1:$PORT/health" +echo "--- /stats ---" +curl -fsS "http://127.0.0.1:$PORT/stats" +echo "--- POST /provision ---" +curl -fsS -X POST "http://127.0.0.1:$PORT/provision" | head -c 500; echo + +echo "--- POST /query (capped output) ---" +LONG_BODY="$(printf 'X%.0s' {1..2048})" # 2 KB of X +curl -sS -X POST --data-binary "$LONG_BODY" "http://127.0.0.1:$PORT/query" -D - -o /tmp/clickbench-selftest.out +echo +echo "Output size: $(wc -c < /tmp/clickbench-selftest.out) bytes (cap was 64)" +echo "First chars: $(head -c 32 /tmp/clickbench-selftest.out)" + +echo "--- POST /query (without provisioning state) ---" +rm -rf "$SANDBOX/state" +mkdir -p "$SANDBOX/state" +curl -sS -X POST --data-binary "SELECT 1" "http://127.0.0.1:$PORT/query" -D - -o /dev/null | head -3 + +echo "OK" diff --git a/playground/scripts/download-datasets.sh b/playground/scripts/download-datasets.sh new file mode 100755 index 0000000000..be0fb56fe5 --- /dev/null +++ b/playground/scripts/download-datasets.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# Eagerly download every ClickBench dataset format into the playground +# datasets dir. Run idempotent: each download script is `wget --continue`-based +# so re-running picks up where the previous run left off. +# +# Output: +# /opt/clickbench-playground/datasets/ +# hits.parquet single-file Athena parquet +# hits_partitioned/hits_0..99.parquet partitioned parquet +# hits.tsv decompressed TSV (~75 GB) +# hits.csv decompressed CSV (~75 GB) +# +# These files are read-only-mounted into every Firecracker VM via a virtio-blk +# device built by `build-datasets-image.sh`. + +set -e + +STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" +DATASETS="${STATE_DIR}/datasets" +LIB="$(cd "$(dirname "${BASH_SOURCE[0]}")"/../.. && pwd)/lib" + +mkdir -p "$DATASETS" +mkdir -p "$DATASETS/hits_partitioned" + +step() { echo "[$(date -u +%FT%TZ)] $*"; } + +step "parquet (single)" +if [ ! -f "$DATASETS/hits.parquet" ] || [ "$(stat -c%s "$DATASETS/hits.parquet" 2>/dev/null || echo 0)" -lt 14000000000 ]; then + "$LIB/download-hits-parquet-single" "$DATASETS" +else + step " cached" +fi + +step "parquet (partitioned)" +need=0 +for i in $(seq 0 99); do + f="$DATASETS/hits_partitioned/hits_${i}.parquet" + if [ ! -f "$f" ] || [ "$(stat -c%s "$f" 2>/dev/null || echo 0)" -lt 100000000 ]; then + need=1 + break + fi +done +if [ "$need" = "1" ]; then + "$LIB/download-hits-parquet-partitioned" "$DATASETS/hits_partitioned" +else + step " cached" +fi + +step "tsv" +if [ ! -f "$DATASETS/hits.tsv" ] || [ "$(stat -c%s "$DATASETS/hits.tsv" 2>/dev/null || echo 0)" -lt 70000000000 ]; then + "$LIB/download-hits-tsv" "$DATASETS" +else + step " cached" +fi + +step "csv" +if [ ! -f "$DATASETS/hits.csv" ] || [ "$(stat -c%s "$DATASETS/hits.csv" 2>/dev/null || echo 0)" -lt 70000000000 ]; then + "$LIB/download-hits-csv" "$DATASETS" +else + step " cached" +fi + +step "json.gz" +# Used by parseable. The full hits.json.gz is ~23 GB on +# datasets.clickhouse.com. +if [ ! 
-f "$DATASETS/hits.json.gz" ] || [ "$(stat -c%s "$DATASETS/hits.json.gz" 2>/dev/null || echo 0)" -lt 22000000000 ]; then + wget --continue --progress=dot:giga \ + -O "$DATASETS/hits.json.gz" \ + 'https://datasets.clickhouse.com/hits_compatible/hits.json.gz' +else + step " cached" +fi + +step "json (decompressed)" +# parseable / victorialogs decompress hits.json.gz at load time and +# blow out the 200 GB sysdisk; stage the decompressed copy on the +# read-only dataset disk so they can stream it without a temp file. +if [ ! -f "$DATASETS/hits.json" ] || [ "$(stat -c%s "$DATASETS/hits.json" 2>/dev/null || echo 0)" -lt 70000000000 ]; then + pigz -dk -c "$DATASETS/hits.json.gz" > "$DATASETS/hits.json" +else + step " cached" +fi + +step "done" +du -sh "$DATASETS"/* diff --git a/playground/scripts/install-firecracker.sh b/playground/scripts/install-firecracker.sh new file mode 100755 index 0000000000..749a051172 --- /dev/null +++ b/playground/scripts/install-firecracker.sh @@ -0,0 +1,139 @@ +#!/bin/bash +# Idempotent: download firecracker + jailer if they're not in +# /opt/clickbench-playground/bin/, and fetch the guest kernel. + +set -euo pipefail + +STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" +FC_VERSION="${FIRECRACKER_VERSION:-v1.13.1}" +KERNEL_URL="${GUEST_KERNEL_URL:-https://s3.amazonaws.com/spec.ccfc.min/firecracker-ci/v1.13/x86_64/vmlinux-6.1.141}" + +sudo mkdir -p "$STATE_DIR"/{bin,kernel,datasets,systems,vms,logs,run,snapshots,tmp,cache} +# Only chown the top-level subdirs we created. `chown -R` on $STATE_DIR +# would descend into any live mount underneath it — notably the loop- +# mounted rootfs that build-base-rootfs.sh keeps open under tmp/base-build +# while it's running — and flip /etc/sudoers inside the future VM image +# to uid 1000, breaking sudo on every subsequent provision. +sudo chown "$(id -u):$(id -g)" \ + "$STATE_DIR" \ + "$STATE_DIR"/{bin,kernel,datasets,systems,vms,logs,run,snapshots,tmp,cache} + +# The playground relies on reflink (cp --reflink=always) to clone +# 200 GB-apparent / multi-GB-real per-VM disks in milliseconds instead +# of seconds, and on transparent zstd compression to fit 100 system +# goldens on the host. Btrfs gives us both out of the box. Format the +# playground volume before running install-firecracker.sh: +# +# sudo mkfs.btrfs -L cbplayground -f /dev/ +# echo 'LABEL=cbplayground /opt/clickbench-playground btrfs \ +# defaults,noatime,compress=zstd:1,nofail 0 2' | sudo tee -a /etc/fstab +# sudo mount /opt/clickbench-playground +# +# (XFS also works for reflink but doesn't have transparent compression, +# so on XFS the host fills up at ~7 TB once every system is provisioned.) +# +# Sanity-check at install time so a missing reflink is loud: +if ! ( cd "$STATE_DIR" && tmp1="$(mktemp -p .)" && \ + tmp2="$(mktemp -p . -u)" && \ + cp --reflink=always "$tmp1" "$tmp2" 2>/dev/null; rc=$? ; \ + rm -f "$tmp1" "$tmp2"; exit "$rc" ); then + echo "[install] ERROR: $STATE_DIR does not support reflink. The" >&2 + echo "playground needs cp --reflink=always to clone per-VM disks" >&2 + echo "fast. Reformat the volume as XFS (or ext4 with shared_blocks)" >&2 + echo "and re-run this script. See the comment block above." >&2 + exit 1 +fi + +if [ ! 
-x "$STATE_DIR/bin/firecracker" ]; then + arch="$(uname -m)" + url="https://github.com/firecracker-microvm/firecracker/releases/download/${FC_VERSION}/firecracker-${FC_VERSION}-${arch}.tgz" + echo "[install] firecracker ${FC_VERSION}" + tmpdir="$(mktemp -d)" + curl -fsSL "$url" -o "$tmpdir/firecracker.tgz" + tar -C "$tmpdir" -xzf "$tmpdir/firecracker.tgz" --strip-components=1 + install -m 0755 "$tmpdir/firecracker-${FC_VERSION}-${arch}" "$STATE_DIR/bin/firecracker" + install -m 0755 "$tmpdir/jailer-${FC_VERSION}-${arch}" "$STATE_DIR/bin/jailer" + rm -rf "$tmpdir" +fi + +if [ ! -f "$STATE_DIR/kernel/vmlinux" ]; then + echo "[install] guest kernel" + curl -fsSL "$KERNEL_URL" -o "$STATE_DIR/kernel/vmlinux" +fi + +# IP forwarding for the per-VM TAPs. +sudo sysctl -w net.ipv4.ip_forward=1 >/dev/null +echo "net.ipv4.ip_forward=1" | sudo tee /etc/sysctl.d/99-clickbench-playground.conf >/dev/null + +# Datalake-style systems run with restricted outbound: the playground +# server hosts an SNI-allowlist proxy (playground/server/sni_proxy.py) +# bound on 0.0.0.0:8443 / :8080, and iptables REDIRECTs the VM TAP's +# 443/80 to those ports (see playground/server/net.enable_filtered_internet). +# Make sure local Linux conntrack can route the REDIRECT and the +# proxy's outbound connection back to the VM. Nothing else to install +# — the proxy is pure Python, the kernel needs route_localnet so +# REDIRECT can target a localhost listener from a non-local source. +sudo sysctl -w net.ipv4.conf.all.route_localnet=1 >/dev/null +echo "net.ipv4.conf.all.route_localnet=1" | sudo tee -a /etc/sysctl.d/99-clickbench-playground.conf >/dev/null + +# TLS for the playground API. We use certbot --standalone (binds 80 +# briefly for HTTP-01) to acquire / renew a Let's Encrypt cert for +# the public hostname. The unprivileged playground user reads the +# private key via the ssl-cert group; a deploy hook re-applies that +# ownership after every renewal so renewals don't lock us out. +# +# Skipped entirely if PLAYGROUND_TLS_DOMAIN isn't set — operators +# running the playground purely on a private network don't need +# the cert. +if [ -n "${PLAYGROUND_TLS_DOMAIN:-}" ]; then + sudo apt-get install -y certbot + getent group ssl-cert >/dev/null || sudo groupadd ssl-cert + sudo usermod -aG ssl-cert "${SUDO_USER:-ubuntu}" + if [ ! -d "/etc/letsencrypt/live/${PLAYGROUND_TLS_DOMAIN}" ]; then + sudo certbot certonly --standalone --non-interactive --agree-tos \ + -m "${PLAYGROUND_TLS_EMAIL:-${SUDO_USER:-ubuntu}@$(hostname -d 2>/dev/null || echo localhost)}" \ + -d "${PLAYGROUND_TLS_DOMAIN}" + fi + sudo tee /etc/letsencrypt/renewal-hooks/deploy/clickbench-ssl-cert.sh >/dev/null <<'HOOK' +#!/bin/bash +# Managed by playground/scripts/install-firecracker.sh. After every +# cert renewal, re-apply ssl-cert group ownership so the unprivileged +# playground user can keep reading the new privkey. +set -e +chgrp -R ssl-cert /etc/letsencrypt/live /etc/letsencrypt/archive +chmod 750 /etc/letsencrypt/live /etc/letsencrypt/archive +find /etc/letsencrypt/live /etc/letsencrypt/archive -type d -exec chmod 750 {} \; +find /etc/letsencrypt/archive -name "privkey*.pem" -exec chmod 640 {} \; +HOOK + sudo chmod 755 /etc/letsencrypt/renewal-hooks/deploy/clickbench-ssl-cert.sh + # Apply once now so the freshly issued cert is readable too. + sudo bash /etc/letsencrypt/renewal-hooks/deploy/clickbench-ssl-cert.sh +fi + +# Local DNS resolver for the VMs. enable_filtered_internet REDIRECTs +# the VM TAP's UDP/53 to the host's port 53. 
systemd-resolved binds +# only to 127.0.0.53 / .54, so REDIRECT'd traffic (dst=10.200.x.1:53) +# hits a closed port without a real listener. Dnsmasq fills that gap: +# bind every non-loopback address, forward upstream, UDP only from +# the VM side (iptables INPUT drops TCP/53 from VM addresses). +if ! command -v dnsmasq >/dev/null 2>&1; then + sudo apt-get install -y dnsmasq +fi +sudo tee /etc/dnsmasq.d/playground.conf >/dev/null <<'CONF' +# Managed by playground/scripts/install-firecracker.sh — do not edit. +port=53 +bind-interfaces +# systemd-resolved already owns 127.0.0.53/54 on loopback; leave it. +except-interface=lo +no-resolv +server=1.1.1.1 +server=8.8.8.8 +no-dhcp-interface= +log-queries=no +cache-size=2000 +CONF +sudo systemctl enable dnsmasq >/dev/null 2>&1 || true +sudo systemctl restart dnsmasq + +echo "[install] done" +"$STATE_DIR/bin/firecracker" --version diff --git a/playground/scripts/provision-all.sh b/playground/scripts/provision-all.sh new file mode 100755 index 0000000000..913f900fa2 --- /dev/null +++ b/playground/scripts/provision-all.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# Kick off /api/admin/provision/ for every playground-eligible system. +# The server's own semaphore in VMManager bounds the actual concurrency +# (PLAYGROUND_PROVISION_CONCURRENCY, default 32) — this script just fires +# the requests as fast as the host can accept them, then polls until the +# server reports every system as either snapshotted or down-with-error. + +set -euo pipefail + +BASE="${PLAYGROUND_BASE:-http://127.0.0.1:8000}" +STATUS_LOG="${STATUS_LOG:-/opt/clickbench-playground/logs/provision-all.status}" +SKIP_PROVISIONED="${SKIP_PROVISIONED:-yes}" + +# Fetch the catalog. +mapfile -t SYSTEMS < <( + curl -fsS "$BASE/api/systems" | + python3 -c 'import json,sys; [print(x["name"]) for x in json.load(sys.stdin)]' +) + +echo "$(date -Is) catalog: ${#SYSTEMS[@]} systems" + +# Kick off /provision for each system that isn't already snapshotted. +for sys in "${SYSTEMS[@]}"; do + if [ "$SKIP_PROVISIONED" = "yes" ]; then + state=$(curl -fsS "$BASE/api/system/$sys" | + python3 -c 'import json,sys; print(json.load(sys.stdin)["state"])') + if [ "$state" = "snapshotted" ] || [ "$state" = "ready" ]; then + echo " $sys: skip (already $state)" + continue + fi + fi + echo " $sys: kicking provision" + curl -fsS -X POST "$BASE/api/admin/provision/$sys" >/dev/null +done + +echo "$(date -Is) all kicked off; polling state..." + +# Poll until every system reaches a terminal state. Emit one line per +# transition. 
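+# A transition line looks like (timestamp and system name illustrative): +# 2025-06-01T12:34:56+00:00 duckdb: provisioning -> snapshotted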
+declare -A LAST_STATE +while true; do + in_flight=0 + succeeded=0 + failed=0 + : > "$STATUS_LOG.tmp" + for sys in "${SYSTEMS[@]}"; do + body=$(curl -fsS --max-time 5 "$BASE/api/system/$sys" 2>/dev/null || echo '{}') + state=$(echo "$body" | python3 -c 'import json,sys; d=json.load(sys.stdin); print(d.get("state","?"))' 2>/dev/null) + err=$(echo "$body" | python3 -c 'import json,sys; d=json.load(sys.stdin); print(d.get("last_error") or "")' 2>/dev/null) + echo "$sys $state $err" >> "$STATUS_LOG.tmp" + prev="${LAST_STATE[$sys]:-}" + if [ "$state" != "$prev" ]; then + ts=$(date -Is) + echo "$ts $sys: $prev -> $state${err:+ (err=$err)}" + LAST_STATE[$sys]=$state + fi + case "$state" in + snapshotted|ready) succeeded=$((succeeded+1)) ;; + down) [ -n "$err" ] && failed=$((failed+1)) || in_flight=$((in_flight+1)) ;; + provisioning) in_flight=$((in_flight+1)) ;; + *) in_flight=$((in_flight+1)) ;; + esac + done + mv "$STATUS_LOG.tmp" "$STATUS_LOG" + echo "$(date -Is) ok=$succeeded fail=$failed in_flight=$in_flight" + if [ "$in_flight" -eq 0 ]; then + echo "$(date -Is) done" + break + fi + sleep 30 +done + +# Final summary. +echo "" +echo "=== FINAL SUMMARY ===" +awk '{print $2}' "$STATUS_LOG" | sort | uniq -c +echo "" +echo "=== FAILED ===" +awk '$2 == "down" && NF > 2 {print}' "$STATUS_LOG" diff --git a/playground/scripts/run-server.sh b/playground/scripts/run-server.sh new file mode 100755 index 0000000000..b3bc56b959 --- /dev/null +++ b/playground/scripts/run-server.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Convenience wrapper to start the playground API server in the foreground. +# Looks for .env in the repo root for ClickHouse Cloud creds. + +set -euo pipefail + +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" + +if [ -f "$REPO_DIR/playground/.env" ]; then + # shellcheck disable=SC2046 + export $(grep -v '^#' "$REPO_DIR/playground/.env" | xargs) +fi + +cd "$REPO_DIR" +exec python3 -m playground.server.main diff --git a/playground/scripts/smoke-boot.sh b/playground/scripts/smoke-boot.sh new file mode 100755 index 0000000000..c65bec62f0 --- /dev/null +++ b/playground/scripts/smoke-boot.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# Boot a single Firecracker VM with the playground's base rootfs, attaching +# only the rootfs (no system disk, no dataset disk). Confirms the kernel + +# rootfs + agent path works end-to-end before we start asking it to install +# a database. Tears down on exit. +# +# Usage: smoke-boot.sh [slot] +# Logs go to /opt/clickbench-playground/logs/smoke-boot.log + +set -euo pipefail + +STATE_DIR="${PLAYGROUND_STATE_DIR:-/opt/clickbench-playground}" +SLOT="${1:-250}" # high slot to avoid clashing with the real registry +SOCK="$STATE_DIR/vms/smoke-boot.sock" +LOG="$STATE_DIR/logs/smoke-boot.log" +TAP="fc-tap-${SLOT}" +HOST_IP="10.200.${SLOT}.1" +GUEST_IP="10.200.${SLOT}.2" + +cleanup() { + echo "[smoke] cleanup" + pkill -f "firecracker.*${SOCK}" 2>/dev/null || true + sleep 0.3 + sudo ip link set "$TAP" down 2>/dev/null || true + sudo ip tuntap del dev "$TAP" mode tap 2>/dev/null || true + rm -f "$SOCK" +} +trap cleanup EXIT + +mkdir -p "$STATE_DIR/vms" "$STATE_DIR/logs" +rm -f "$SOCK" + +if ! ip link show "$TAP" >/dev/null 2>&1; then + sudo ip tuntap add dev "$TAP" mode tap +fi +sudo ip addr flush dev "$TAP" 2>/dev/null || true +sudo ip addr add "${HOST_IP}/24" dev "$TAP" +sudo ip link set "$TAP" up + +# Start Firecracker +"$STATE_DIR/bin/firecracker" --api-sock "$SOCK" --id smoke-boot >"$LOG" 2>&1 & +FC_PID=$! 
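+# Everything below drives the same Firecracker REST surface that +# playground/server/firecracker.py wraps, just via curl --unix-socket +# instead of the hand-rolled asyncio client.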
+echo "[smoke] firecracker pid=$FC_PID sock=$SOCK" + +# Wait for socket +for _ in $(seq 1 40); do + [ -S "$SOCK" ] && break + sleep 0.1 +done + +api() { + local m="$1" path="$2" body="${3:-}" + if [ -n "$body" ]; then + curl --unix-socket "$SOCK" -fsS -X "$m" "http://localhost$path" \ + -H 'Content-Type: application/json' --data "$body" + else + curl --unix-socket "$SOCK" -fsS -X "$m" "http://localhost$path" + fi +} + +api PUT /boot-source "$(cat </dev/null 2>&1; then + ok=1 + break + fi + sleep 1 +done + +if [ "$ok" = "1" ]; then + echo "[smoke] OK — agent responded after ${i}s" + curl -fsS "http://${GUEST_IP}:50080/health" | head -c 200; echo + echo "[smoke] /stats:" + curl -fsS "http://${GUEST_IP}:50080/stats" | head -c 400; echo +else + echo "[smoke] FAIL — agent never responded; firecracker log tail:" + tail -30 "$LOG" +fi diff --git a/daft-parquet-partitioned/.preserve-state b/playground/server/__init__.py similarity index 100% rename from daft-parquet-partitioned/.preserve-state rename to playground/server/__init__.py diff --git a/playground/server/clickhouse-bootstrap.sql b/playground/server/clickhouse-bootstrap.sql new file mode 100644 index 0000000000..e410270b47 --- /dev/null +++ b/playground/server/clickhouse-bootstrap.sql @@ -0,0 +1,57 @@ +-- ClickHouse bootstrap for the playground. +-- +-- Run as the default user on every server startup. Idempotent: CREATE +-- IF NOT EXISTS / CREATE OR REPLACE / ALTER USER ... IDENTIFIED. +-- +-- Parameter: +-- {db:Identifier} target database name (substituted in Python +-- before submit — CH doesn't substitute +-- Identifier params inside CREATE VIEW) + +-- =========================================================================== +-- Schema +-- =========================================================================== + +CREATE DATABASE IF NOT EXISTS {db:Identifier}; + +-- Request log + shared queries (same table). +-- ORDER BY ts so recent rows cluster (chronological retention / TTL friendly). +-- The `id` is a random 64-bit handle the API returns to the client; an +-- INDEX projection on `id` (ClickHouse 26.1 syntax) gives a fast +-- equality lookup without sorting the main part by id. +CREATE TABLE IF NOT EXISTS {db:Identifier}.requests ( + id UInt64, + ts DateTime64(6) DEFAULT now64(6), + client_addr String, + user_agent String, + system LowCardinality(String), + query String, + output String, + output_bytes UInt64, + output_truncated UInt8, + query_time Nullable(Float64), + wall_time Float64, + status UInt16, + error String, + PROJECTION by_id INDEX id TYPE basic +) ENGINE = MergeTree ORDER BY ts; + +-- Operational events (vm boot, oom-kick, watchdog teardown, ...). +CREATE TABLE IF NOT EXISTS {db:Identifier}.events ( + ts DateTime64(6) DEFAULT now64(6), + system LowCardinality(String), + kind LowCardinality(String), + detail String +) ENGINE = MergeTree ORDER BY ts; + +-- Parameterized view for the read-only public user. SQL SECURITY DEFINER +-- runs the SELECT as the view's owner (the default user), so the reader +-- doesn't need a direct grant on `requests` — just SELECT on the view. +-- The id projection makes this an O(log n) lookup even when the table +-- has billions of rows. 
+CREATE OR REPLACE VIEW {db:Identifier}.request_by_id +DEFINER = default +SQL SECURITY DEFINER +AS SELECT * FROM {db:Identifier}.requests + WHERE id = {q_id:UInt64} + LIMIT 1; diff --git a/playground/server/clickhouse_bootstrap.py b/playground/server/clickhouse_bootstrap.py new file mode 100644 index 0000000000..3c90b386f7 --- /dev/null +++ b/playground/server/clickhouse_bootstrap.py @@ -0,0 +1,215 @@ +"""ClickHouse bootstrap: schema + writer/reader users. + +Runs on server startup using the default-user credentials supplied in +/clickhouse.conf (or env vars). Idempotent: + +* Schema DDL (DB + tables + parameterized view) lives in the sibling + clickhouse-bootstrap.sql file — that file is the canonical source + of truth for the request/event tables and the request_by_id view. +* The two human users are created here in Python because CREATE USER + doesn't accept HTTP query parameters for the password / host clauses + and rotating those at bootstrap time is convenient. + +Generated credentials persist to /clickhouse-credentials.json +so the writer/reader users keep the same password across restarts; if +the file is missing, fresh random passwords are generated and the +users' passwords are reset to match. +""" +from __future__ import annotations + +import contextlib +import json +import logging +import secrets +from pathlib import Path +from typing import NamedTuple +from urllib.parse import urlencode + +import aiohttp + +from .config import Config + +log = logging.getLogger("clickhouse_bootstrap") + +_SQL_FILE = Path(__file__).parent / "clickhouse-bootstrap.sql" + + +class Credentials(NamedTuple): + url: str + db: str + writer_user: str + writer_password: str + # The reader's password is *always* empty — the user is created + # in CH with sha256_hash(""), and clients just pass their name + # with no password — so we don't keep it as a field. + reader_user: str + + +def _gen_pw(n: int = 32) -> str: + # URL-safe random string with at least one digit + one special char + # (CH Cloud's password policy requires both). + body = secrets.token_urlsafe(n) + return body + "!1" + + +# SHA-256 of the empty string. Lets us tell ClickHouse "no password" +# in a form the server understands (it stores sha256_hash users) while +# the operator-facing identity is plainly empty. +_SHA256_EMPTY = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + + +def _credentials_path(cfg: Config) -> Path: + return cfg.state_dir / "clickhouse-credentials.json" + + +def _load_or_make_credentials(cfg: Config) -> str: + """Return the writer's password. Persist on first run. + + The reader has no password — it's an unauthenticated public identity + that can only SELECT from the parameterized request_by_id view — + so we don't manage one here. + """ + path = _credentials_path(cfg) + if path.exists(): + with contextlib.suppress(Exception): + data = json.loads(path.read_text()) + return data["writer_password"] + creds = {"writer_password": _gen_pw()} + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(creds, indent=2)) + path.chmod(0o600) + return creds["writer_password"] + + +async def _ch_exec(session: aiohttp.ClientSession, + url: str, user: str, password: str, + sql: str, params: dict[str, str] | None = None) -> str: + """Run `sql` via HTTP and return the response body. Raises on + non-2xx.""" + qs = {f"param_{k}": v for k, v in (params or {}).items()} + full = url + ("?" 
+ urlencode(qs) if qs else "") + async with session.post( + full, data=sql, + auth=aiohttp.BasicAuth(user, password), + timeout=aiohttp.ClientTimeout(total=60), + ) as r: + body = await r.text() + if r.status >= 300: + raise RuntimeError(f"CH bootstrap {r.status}: {body[:500]} (sql={sql[:200]})") + return body + + +async def bootstrap(cfg: Config) -> Credentials | None: + """Run the bootstrap. Returns the credentials the runtime should + use for the writer (logging sink) and the reader (saved-query + lookups). Returns None if the bootstrap config isn't present + (CH integration disabled).""" + if not (cfg.ch_cloud_url and cfg.ch_cloud_user and cfg.ch_cloud_password): + return None + db = cfg.ch_cloud_db or "playground" + writer_pw = _load_or_make_credentials(cfg) + async with aiohttp.ClientSession() as session: + # The writer user is host-pinned to our public IP. Resolve it + # via api.ipify.org rather than asking CH (its various + # client-address functions vary by interface and version). + try: + async with session.get( + "https://api.ipify.org", + timeout=aiohttp.ClientTimeout(total=10), + ) as r: + writer_host = (await r.text()).strip() + except Exception: + writer_host = "0.0.0.0" # fallback: no IP restriction + log.info("writer host (public IP) = %s", writer_host) + + # Schema DDL from the .sql file. We substitute {db:Identifier} + # in Python rather than via HTTP params because the CREATE VIEW + # body contains a *view-time* parameter ({q_id:UInt64}) and + # ClickHouse skips HTTP param substitution for DDL when there + # are unbound placeholders — the result is the VIEW DDL going + # out with literal `{db:Identifier}` and the parser barking + # "Database `` does not exist". Python substitution is fine + # because the db name is our own (no SQL-injection vector). + sql_blob = _SQL_FILE.read_text().replace("{db:Identifier}", db) + statements = _split_sql_statements(sql_blob) + for stmt in statements: + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + stmt, + ) + + # Users — passwords + host clause go inline; ALTER on every + # bootstrap rotates / re-pins them. + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"CREATE USER IF NOT EXISTS playground_writer " + f"IDENTIFIED WITH sha256_password BY '{writer_pw}' " + f"HOST IP '{writer_host}'", + ) + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"ALTER USER playground_writer " + f"IDENTIFIED WITH sha256_password BY '{writer_pw}' " + f"HOST IP '{writer_host}'", + ) + # Strict scope: revoke everything then re-grant only INSERT. + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"REVOKE ALL ON *.* FROM playground_writer", + ) + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"GRANT INSERT ON {db}.requests TO playground_writer", + ) + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"GRANT INSERT ON {db}.events TO playground_writer", + ) + + # Reader: public, no password — SELECT-only on the parameterized + # view, with tight resource caps. The empty password is + # expressed as sha256_hash of the empty string so CH stores it + # in the same shape as any other sha256 user but the + # operator-facing identity is plainly "no password". 
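+ # (Sanity check for the hash constant: running + # python3 -c 'import hashlib; print(hashlib.sha256(b"").hexdigest())' + # prints exactly _SHA256_EMPTY.)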
+ await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"CREATE USER IF NOT EXISTS playground_reader " + f"IDENTIFIED WITH sha256_hash BY '{_SHA256_EMPTY}' " + f"DEFAULT DATABASE {db} " + f"SETTINGS readonly = 2, " + f"max_execution_time = 5, " + f"max_memory_usage = 100000000, " + f"max_result_rows = 1, " + f"max_rows_to_read = 1048576, " + f"max_threads = 2", + ) + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"ALTER USER playground_reader " + f"IDENTIFIED WITH sha256_hash BY '{_SHA256_EMPTY}'", + ) + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"REVOKE ALL ON *.* FROM playground_reader", + ) + await _ch_exec( + session, cfg.ch_cloud_url, cfg.ch_cloud_user, cfg.ch_cloud_password, + f"GRANT SELECT ON {db}.request_by_id TO playground_reader", + ) + + log.info("ClickHouse bootstrap complete (writer host=%s)", writer_host) + return Credentials( + url=cfg.ch_cloud_url, db=db, + writer_user="playground_writer", writer_password=writer_pw, + reader_user="playground_reader", + ) + + +def _split_sql_statements(blob: str) -> list[str]: + """Strip --line comments, split on top-level `;`. Naive — fine for + the bootstrap file which has no string literals or nested blocks.""" + stripped = "\n".join( + line for line in blob.splitlines() + if not line.lstrip().startswith("--") + ) + return [s.strip() for s in stripped.split(";") if s.strip()] diff --git a/playground/server/config.py b/playground/server/config.py new file mode 100644 index 0000000000..571b186d64 --- /dev/null +++ b/playground/server/config.py @@ -0,0 +1,149 @@ +"""Central configuration for the playground server. + +All knobs are read from environment variables so a single systemd unit can drop +them in. Falls back to sensible defaults for local development. + +ClickHouse credentials (for the logging sink and any future shared-query +feature) can also be supplied via an INI file at +`/clickhouse.conf`. Env vars, if set, take precedence over the +file so existing deployments keep working unchanged. +""" +from __future__ import annotations + +import configparser +import os +from dataclasses import dataclass +from pathlib import Path + + +def _env_int(name: str, default: int) -> int: + v = os.environ.get(name) + if not v: + return default + try: + return int(v) + except ValueError: + return default + + +@dataclass(frozen=True) +class Config: + # Where on the host disk we keep VM artifacts and dataset images. + state_dir: Path + repo_dir: Path + # HTTP listen target for the playground API server. Plain host:port string; + # aiohttp parses it. + listen_host: str + listen_port: int + # TLS. When tls_cert + tls_key are both set, the server binds on + # tls_port with TLS *in addition to* listen_port (which becomes the + # plain-HTTP redirect listener). Empty strings disable TLS. + tls_cert: str + tls_key: str + tls_port: int + # Per-VM resources. + vm_vcpus: int + vm_mem_mib: int + vm_rootfs_size_gb: int + # Watchdog thresholds. + cpu_busy_window_sec: int + cpu_busy_threshold: float + # Cumulative CPU-seconds (across all vCPUs) a VM may burn between + # restore and now. Anything past this is presumably a runaway and + # the watchdog kicks the VM. Counts only "ready" state — provision + # is allowed to use as much CPU as it wants. + vm_cpu_total_seconds_cap: int + # Seconds since the last /query a "ready" VM is allowed to linger + # before the monitor tears it down. 
Snapshot is preserved; the + # next /query restores in seconds. Keeps the kernel's KVM + # async_pf_execute workqueue from accumulating idle VMs and + # slowing unrelated services (sshd in particular). + idle_kick_after_sec: int + host_min_free_ram_gb: int + host_min_free_disk_gb: int + # Per-system disk full check. + vm_disk_pct_kill_threshold: float + # ClickHouse Cloud logging. + ch_cloud_url: str + ch_cloud_user: str + ch_cloud_password: str + ch_cloud_db: str + + @property + def kernel_path(self) -> Path: return self.state_dir / "kernel" / "vmlinux" + @property + def base_rootfs(self) -> Path: return self.state_dir / "base-rootfs.ext4" + @property + def datasets_image(self) -> Path: return self.state_dir / "datasets.ext4" + @property + def systems_dir(self) -> Path: return self.state_dir / "systems" + @property + def vms_dir(self) -> Path: return self.state_dir / "vms" + @property + def logs_dir(self) -> Path: return self.state_dir / "logs" + @property + def firecracker_bin(self) -> Path: return self.state_dir / "bin" / "firecracker" + + +def _load_clickhouse_conf(state_dir: Path) -> dict[str, str]: + """Parse /clickhouse.conf. Format is INI with a single + [clickhouse] section: + + [clickhouse] + url = https://your-host.clickhouse.cloud:8443 + user = default + password = ... + db = playground + + Missing file / parse errors return {} silently — the env-var path + still works and the logging sink just stays disabled. + """ + path = state_dir / "clickhouse.conf" + if not path.exists(): + return {} + parser = configparser.ConfigParser() + try: + parser.read(path) + except configparser.Error: + return {} + if "clickhouse" not in parser: + return {} + return {k: v for k, v in parser["clickhouse"].items()} + + +def load() -> Config: + state_dir = Path(os.environ.get("PLAYGROUND_STATE_DIR", "/opt/clickbench-playground")) + repo_dir = Path(os.environ.get("PLAYGROUND_REPO_DIR", "/home/ubuntu/ClickBench")) + listen = os.environ.get("PLAYGROUND_LISTEN", "0.0.0.0:8000") + host, _, port = listen.rpartition(":") + ch_conf = _load_clickhouse_conf(state_dir) + return Config( + state_dir=state_dir, + repo_dir=repo_dir, + listen_host=host or "0.0.0.0", + listen_port=int(port or 8000), + tls_cert=os.environ.get("PLAYGROUND_TLS_CERT", ""), + tls_key=os.environ.get("PLAYGROUND_TLS_KEY", ""), + tls_port=_env_int("PLAYGROUND_TLS_PORT", 443), + vm_vcpus=_env_int("VM_VCPUS", 4), + # 16 GB. DataFrame-style engines (chdb-dataframe, duckdb-dataframe, + # daft-*, polars-dataframe) would need >100 GB to load the full + # hits dataset and don't fit the playground's model; they're + # disabled in systems.py instead of bumping VM RAM for everyone. + vm_mem_mib=_env_int("VM_MEM_MIB", 16 * 1024), + vm_rootfs_size_gb=_env_int("VM_ROOTFS_SIZE_GB", 200), + cpu_busy_window_sec=_env_int("VM_CPU_BUSY_WINDOW_SEC", 120), + cpu_busy_threshold=float(os.environ.get("VM_CPU_BUSY_THRESHOLD", "0.97")), + vm_cpu_total_seconds_cap=_env_int("VM_CPU_TOTAL_SECONDS_CAP", 3600), + # 10 minutes default. Cold restore is ~5-30 s for most engines, + # so a user returning within 10 min finds a warm VM; longer + # gaps cost a single fresh restore. 
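+ # e.g. VM_IDLE_KICK_AFTER_SEC=1800 keeps an untouched ready VM warm + # for 30 minutes before the monitor reclaims it.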
+ idle_kick_after_sec=_env_int("VM_IDLE_KICK_AFTER_SEC", 600), + host_min_free_ram_gb=_env_int("HOST_MIN_FREE_RAM_GB", 32), + host_min_free_disk_gb=_env_int("HOST_MIN_FREE_DISK_GB", 100), + vm_disk_pct_kill_threshold=float(os.environ.get("VM_DISK_FULL_PCT", "0.97")), + ch_cloud_url=os.environ.get("CLICKHOUSE_CLOUD_URL", ch_conf.get("url", "")), + ch_cloud_user=os.environ.get("CLICKHOUSE_CLOUD_USER", ch_conf.get("user", "")), + ch_cloud_password=os.environ.get("CLICKHOUSE_CLOUD_PASSWORD", ch_conf.get("password", "")), + ch_cloud_db=os.environ.get("CLICKHOUSE_CLOUD_DB", ch_conf.get("db", "playground")), + ) diff --git a/playground/server/firecracker.py b/playground/server/firecracker.py new file mode 100644 index 0000000000..62aba74dca --- /dev/null +++ b/playground/server/firecracker.py @@ -0,0 +1,117 @@ +"""Thin async wrapper around Firecracker's REST API (Unix socket). + +We talk to the Firecracker process through its API socket, not the JSON config +file, because that's the only way to drive snapshot create/load and to mutate +runtime state. + +The HTTP layer is hand-rolled (single-shot HTTP/1.1 over Unix socket) so we +don't pull in extra deps just to send a few PUTs. Each call opens a new +connection — Firecracker's API socket is single-threaded and that's fine. +""" +from __future__ import annotations + +import asyncio +import json +from typing import Any + + +class FirecrackerError(RuntimeError): + pass + + +async def _request(socket_path: str, method: str, path: str, body: Any = None, + timeout: float = 30.0) -> tuple[int, bytes]: + payload = b"" + if body is not None: + payload = json.dumps(body).encode() + req_lines = [ + f"{method} {path} HTTP/1.1", + "Host: localhost", + "Accept: application/json", + "Connection: close", + ] + if payload: + req_lines.append("Content-Type: application/json") + req_lines.append(f"Content-Length: {len(payload)}") + req_lines.append("") + req_lines.append("") + head = "\r\n".join(req_lines).encode() + + reader, writer = await asyncio.wait_for( + asyncio.open_unix_connection(socket_path), timeout=timeout + ) + try: + writer.write(head + payload) + await writer.drain() + # Read response head line-by-line until the blank line that ends the + # header block. Don't `read(-1)` — Firecracker keeps the connection + # open after small responses (204s in particular), so EOF-based reads + # block until our timeout despite the response being fully on the + # wire. Once we have headers we know the Content-Length and can read + # exactly that many body bytes. + head_lines: list[bytes] = [] + while True: + line = await asyncio.wait_for(reader.readline(), timeout=timeout) + if not line: + # Server closed the connection mid-headers. 
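+ # Fall through with whatever arrived; if nothing was read at + # all, the empty-head_lines check below raises FirecrackerError.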
+ break + head_lines.append(line) + if line == b"\r\n" or line == b"\n": + break + + if not head_lines: + raise FirecrackerError(f"no response from firecracker for {method} {path}") + status_line = head_lines[0].rstrip(b"\r\n").decode("ascii", errors="replace") + parts = status_line.split(" ", 2) + if len(parts) < 2: + raise FirecrackerError(f"bad status line: {status_line!r}") + code = int(parts[1]) + + content_length = 0 + for raw_h in head_lines[1:]: + h = raw_h.rstrip(b"\r\n") + if not h: + continue + name, _, value = h.partition(b":") + if name.strip().lower() == b"content-length": + try: + content_length = int(value.strip()) + except ValueError: + content_length = 0 + + body_b = b"" + if content_length > 0: + body_b = await asyncio.wait_for( + reader.readexactly(content_length), timeout=timeout + ) + finally: + try: + writer.close() + await writer.wait_closed() + except Exception: + pass + return code, body_b + + +async def put(socket_path: str, path: str, body: Any = None, timeout: float = 30.0) -> None: + code, b = await _request(socket_path, "PUT", path, body, timeout) + if code >= 300: + raise FirecrackerError(f"PUT {path} -> {code}: {b!r}") + + +async def patch(socket_path: str, path: str, body: Any = None, timeout: float = 30.0) -> None: + code, b = await _request(socket_path, "PATCH", path, body, timeout) + if code >= 300: + raise FirecrackerError(f"PATCH {path} -> {code}: {b!r}") + + +async def get(socket_path: str, path: str, timeout: float = 30.0) -> dict: + code, b = await _request(socket_path, "GET", path, timeout=timeout) + if code >= 300: + raise FirecrackerError(f"GET {path} -> {code}: {b!r}") + if not b: + return {} + try: + return json.loads(b) + except Exception as e: + raise FirecrackerError(f"GET {path} -> non-JSON body: {b!r}") from e diff --git a/playground/server/logging_sink.py b/playground/server/logging_sink.py new file mode 100644 index 0000000000..4f46b506f4 --- /dev/null +++ b/playground/server/logging_sink.py @@ -0,0 +1,143 @@ +"""Batched, async logger that writes events to ClickHouse Cloud over HTTPS. + +Schema + users are bootstrapped on server startup by +`clickhouse_bootstrap.bootstrap()` — see clickhouse-bootstrap.sql for the +canonical DDL of the `requests` (request log + shared queries) and +`events` (operational events) tables and the `request_by_id` +parameterized view. This module only writes; it uses the writer user +issued by the bootstrap, NOT the default user. + +When CLICKHOUSE_CLOUD_URL is unset, both tables are mirrored to +/opt/clickbench-playground/logs/requests.jsonl and events.jsonl so the +service still has an audit trail in dev. 
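+ +Typical producer calls (field names follow the bootstrap DDL; values are +illustrative): + + sink.write_request(system="duckdb", query="SELECT 1", status=200, wall_time=0.42) + sink.write_event(system="duckdb", kind="vm-boot", detail="slot=7")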
+""" +from __future__ import annotations + +import asyncio +import contextlib +import json +import logging +import time +from pathlib import Path +from typing import Any + +import aiohttp + +from .config import Config +from .clickhouse_bootstrap import Credentials + +log = logging.getLogger("logging_sink") + + +class LoggingSink: + def __init__(self, cfg: Config, creds: Credentials | None): + self.cfg = cfg + self._creds = creds + self._queue: asyncio.Queue[tuple[str, dict]] = asyncio.Queue(maxsize=10000) + self._task: asyncio.Task | None = None + self._session: aiohttp.ClientSession | None = None + self._local_files: dict[str, Path] = {} + self._enabled = creds is not None + + async def start(self) -> None: + self.cfg.logs_dir.mkdir(parents=True, exist_ok=True) + self._local_files = { + "requests": self.cfg.logs_dir / "requests.jsonl", + "events": self.cfg.logs_dir / "events.jsonl", + } + if self._enabled: + self._session = aiohttp.ClientSession() + self._task = asyncio.create_task(self._flusher(), name="logging-sink") + + async def stop(self) -> None: + if self._task: + self._task.cancel() + with contextlib.suppress(asyncio.CancelledError): + await self._task + if self._session: + await self._session.close() + + def write_request(self, **row: Any) -> None: + self._enqueue("requests", row) + + def write_event(self, **row: Any) -> None: + self._enqueue("events", row) + + def _enqueue(self, table: str, row: dict) -> None: + row.setdefault("ts", _now_dt64()) + try: + self._queue.put_nowait((table, row)) + except asyncio.QueueFull: + # Backpressure: drop oldest log lines first so we never block the + # query path on the audit trail. + try: + self._queue.get_nowait() + self._queue.put_nowait((table, row)) + except Exception: + pass + + async def _exec_ch(self, sql: str) -> None: + assert self._session is not None and self._creds is not None + async with self._session.post( + self._creds.url, + data=sql, + auth=aiohttp.BasicAuth(self._creds.writer_user, + self._creds.writer_password), + timeout=aiohttp.ClientTimeout(total=30), + ) as r: + if r.status >= 300: + txt = await r.text() + raise RuntimeError(f"CH error {r.status}: {txt[:500]}") + + async def _insert_ch(self, table: str, rows: list[dict]) -> None: + if not rows or self._creds is None: + return + body = "\n".join(json.dumps(r, default=str) for r in rows) + sql = f"INSERT INTO {self._creds.db}.{table} FORMAT JSONEachRow\n{body}" + await self._exec_ch(sql) + + async def _flusher(self) -> None: + buf: dict[str, list[dict]] = {"requests": [], "events": []} + last_flush = time.monotonic() + try: + while True: + timeout = 1.0 + try: + table, row = await asyncio.wait_for(self._queue.get(), timeout=timeout) + buf[table].append(row) + except asyncio.TimeoutError: + pass + # Flush every 1s or when batch >= 256 rows for any table + now = time.monotonic() + full = any(len(v) >= 256 for v in buf.values()) + if full or now - last_flush > 1.0: + await self._do_flush(buf) + for k in buf: + buf[k] = [] + last_flush = now + except asyncio.CancelledError: + await self._do_flush(buf) + raise + + async def _do_flush(self, buf: dict[str, list[dict]]) -> None: + for table, rows in buf.items(): + if not rows: + continue + # Always write to the local JSONL too — gives us a tail for + # debugging and a buffer if CH Cloud rejects. 
+ try: + with open(self._local_files[table], "ab") as f: + for r in rows: + f.write((json.dumps(r, default=str) + "\n").encode()) + except Exception: + pass + if self._enabled: + try: + await self._insert_ch(table, rows) + except Exception as e: + log.warning("CH insert failed (%r); rows preserved in JSONL", e) + + +def _now_dt64() -> str: + t = time.time() + return time.strftime("%Y-%m-%d %H:%M:%S.", time.gmtime(t)) + f"{int((t % 1) * 1e6):06d}" diff --git a/playground/server/main.py b/playground/server/main.py new file mode 100644 index 0000000000..f786acdaaa --- /dev/null +++ b/playground/server/main.py @@ -0,0 +1,594 @@ +"""Playground HTTP API + static UI server. + +Endpoints: + + GET / redirects to /ui/ + GET /ui/... static-serves files from ../web/ + GET /api/systems JSON list of all playground-eligible systems + GET /api/state JSON snapshot of every VM's state + GET /api/system/{name} detail for a single system + POST /api/query?system=X body is the SQL; returns application/octet-stream + with timing in headers + GET /api/provision-log/{name} the system's most recent provision log + POST /api/admin/provision/{name} + manual trigger for first-time provision; convenient + for warming a system before the first user query + +The /api/query path tries once, then on failure tears down + restores from +snapshot and retries exactly once, matching the spec. +""" +from __future__ import annotations + +import asyncio +import base64 +import contextlib +import logging +import signal +import time +import urllib.parse + + +def _id_to_b64url(n: int) -> str: + """64-bit unsigned int -> 11-char URL-safe base64 (no padding). + Symmetric counterpart to _b64url_to_id. The same number can travel + as a UInt64 inside ClickHouse and as a tidy URL handle.""" + return base64.urlsafe_b64encode( + n.to_bytes(8, "big"), + ).rstrip(b"=").decode("ascii") + + +def _b64url_to_id(s: str) -> int: + pad = "=" * (-len(s) % 4) + return int.from_bytes(base64.urlsafe_b64decode(s + pad), "big") +from pathlib import Path + +import collections +import threading + +import aiohttp +from aiohttp import web + +# --- Per-IP rate limiting ------------------------------------------ +# +# /api/query and /api/warmup are unauthenticated; a single bad actor +# can wedge the playground by spamming restores or kicking heavy +# queries against snapshotted systems. Bound the damage with two +# rolling windows per source IP: +# 200 requests / minute +# 3000 requests / hour +# In-memory, since restarts are infrequent and per-IP state across +# restarts isn't load-bearing for this use case. +_RATE_PER_MINUTE = 200 +_RATE_PER_HOUR = 3000 +# Above this many distinct IPs in the dict, do a full O(N) sweep on +# the next request to drop entries whose newest timestamp is > 1h +# old (or whose deque is empty). Bounds the dict at ~threshold * +# 24 KB-per-entry-worst-case after a sweep. +_RATE_GC_THRESHOLD = 4096 +_rate_lock = threading.Lock() +_rate_hits: dict[str, collections.deque[float]] = {} + + +def _client_ip(req: web.Request) -> str: + """TCP peer address — never the X-Forwarded-For header. Honoring + XFF without an authenticated reverse proxy in front would let any + caller spoof their IP and bypass the rate limit by rotating the + header value.""" + return req.remote or "?" + + +def _rate_check(req: web.Request) -> web.Response | None: + """Return a 429 Response if the caller has exceeded either window, + else None. 
Increments the counter on a pass.""" + ip = _client_ip(req) + now = time.monotonic() + hour_ago = now - 3600 + minute_ago = now - 60 + with _rate_lock: + # Bulk GC. Drop any IP whose newest hit fell outside the + # 1-hour window (or whose deque is empty for whatever + # reason). Without this, one-shot source IPs would + # accumulate forever and grow _rate_hits unboundedly. Only + # fires when the dict has grown past the threshold, so the + # cost is amortized O(1) per request. + if len(_rate_hits) > _RATE_GC_THRESHOLD: + for stale in [k for k, d in _rate_hits.items() + if not d or d[-1] < hour_ago]: + _rate_hits.pop(stale, None) + + dq = _rate_hits.get(ip) + if dq is None: + dq = collections.deque() + _rate_hits[ip] = dq + # Trim timestamps older than 1 hour. The deque is sorted + # because we only ever append `now`, so popping from the left + # is O(1) per stale entry. + while dq and dq[0] < hour_ago: + dq.popleft() + if len(dq) >= _RATE_PER_HOUR: + retry = max(1, int(dq[0] + 3600 - now)) + return web.json_response( + {"error": "rate limit (hour)", + "limit": _RATE_PER_HOUR, "retry_after": retry}, + status=429, headers={"Retry-After": str(retry)}, + ) + recent = sum(1 for t in dq if t >= minute_ago) + if recent >= _RATE_PER_MINUTE: + # Find oldest sample inside the 1-minute window to suggest + # a reasonable retry-after. + oldest_in_min = next(t for t in dq if t >= minute_ago) + retry = max(1, int(oldest_in_min + 60 - now)) + return web.json_response( + {"error": "rate limit (minute)", + "limit": _RATE_PER_MINUTE, "retry_after": retry}, + status=429, headers={"Retry-After": str(retry)}, + ) + dq.append(now) + return None + + +# Refuse to start on aiohttp versions vulnerable to the static-handler +# path traversal (GHSA-5h86-8mv2-jq9f, fixed in 3.9.2) and the HTTP +# request-smuggling fixes that landed in 3.9.x / 3.10.x. We mitigate +# follow_symlinks at the call site too, but a runtime guard catches +# any future regression where someone re-enables it under an old lib. +_AIOHTTP_MIN = (3, 10, 0) +_aiohttp_v = tuple(int(p) for p in aiohttp.__version__.split(".")[:3] + if p.isdigit()) +if _aiohttp_v < _AIOHTTP_MIN: + raise RuntimeError( + f"aiohttp {aiohttp.__version__} is too old; " + f"require >= {'.'.join(str(x) for x in _AIOHTTP_MIN)} " + "(GHSA-5h86-8mv2-jq9f and request-smuggling fixes)" + ) + +from . import config as config_mod +from . import net +from . import systems as systems_mod +from .logging_sink import LoggingSink +from .monitor import Monitor +from .vm_manager import VMManager + +log = logging.getLogger("playground") + + +class App: + def __init__(self) -> None: + self.cfg = config_mod.load() + self.systems = systems_mod.discover(self.cfg.repo_dir) + self.vmm = VMManager(self.cfg, self.systems) + # CH credentials are populated by on_startup after the + # bootstrap runs. None means CH integration is disabled and + # the sink falls back to JSONL. + self.ch_creds = None + self.sink = LoggingSink(self.cfg, None) + self.monitor = Monitor(self.cfg, self.vmm, self.sink) + + async def on_startup(self, _app: web.Application) -> None: + from . import clickhouse_bootstrap + try: + self.ch_creds = await clickhouse_bootstrap.bootstrap(self.cfg) + except Exception as e: + log.warning("ClickHouse bootstrap failed (%r); CH integration disabled", e) + self.ch_creds = None + # Replace the placeholder sink with one wired to the bootstrap's + # writer credentials. 
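+ # (The monitor keeps its own reference, so it is re-pointed below.)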
+ self.sink = LoggingSink(self.cfg, self.ch_creds) + self.monitor.sink = self.sink + await self.sink.start() + await self.monitor.start() + # SNI-allowlist proxy that mediates outbound HTTP/HTTPS for + # *-datalake systems (see net.enable_filtered_internet). + from . import sni_proxy + self.sni_servers = await sni_proxy.start( + https_port=net.PROXY_HTTPS_PORT, + http_port=net.PROXY_HTTP_PORT, + ) + # Lock the proxy + local DNS resolver to internal/VM traffic + # only. The proxy binds 0.0.0.0 so iptables PREROUTING REDIRECT + # from each TAP can find it; without these INPUT rules it + # would be an open S3 allowlist relay on the public internet. + await net.setup_host_firewall() + + async def on_cleanup(self, _app: web.Application) -> None: + await self.monitor.stop() + await self.sink.stop() + for s in getattr(self, "sni_servers", []): + s.close() + with contextlib.suppress(Exception): + await s.wait_closed() + + # ── handlers ───────────────────────────────────────────────────────── + + async def handle_systems(self, _r: web.Request) -> web.Response: + return web.json_response([s.asdict() for s in self.systems.values()]) + + async def handle_state(self, _r: web.Request) -> web.Response: + return web.json_response(self.vmm.list_all()) + + async def handle_system(self, req: web.Request) -> web.Response: + name = req.match_info["name"] + if name not in self.systems: + raise web.HTTPNotFound(reason=f"unknown system: {name}") + vm = self.vmm.vms[name] + return web.json_response({ + **self.systems[name].asdict(), + "state": vm.state, + "has_snapshot": vm.snapshot_bin.exists(), + "provisioned_at": vm.provisioned_at, + "last_used": vm.last_used, + "ready_since": vm.ready_since, + "last_error": vm.last_error, + "agent_url": self.vmm.agent_url(vm), + }) + + async def handle_queries(self, req: web.Request) -> web.Response: + """Return example queries for a system from its queries.sql. + + Splits by lines: one example per non-empty, non-comment line. + ClickBench's benchmark files are uniformly one-query-per-line — + SQL ones end with `;`, dataframe ones (pandas/polars/daft) and + LogsQL ones (victorialogs) don't, and a `;\n` splitter collapsed + the whole pandas file into one entry. Truncates to 200 entries. + """ + name = req.match_info["name"] + if name not in self.systems: + raise web.HTTPNotFound() + path = self.cfg.repo_dir / name / "queries.sql" + if not path.exists(): + return web.json_response([]) + text = path.read_text(errors="replace") + out = [] + for line in text.splitlines(): + q = line.strip() + if not q or q.startswith("--") or q.startswith("#"): + continue + out.append(q) + if len(out) >= 200: + break + return web.json_response(out) + + async def handle_provision_log(self, req: web.Request) -> web.Response: + name = req.match_info["name"] + if name not in self.systems: + raise web.HTTPNotFound() + log_path = self.cfg.logs_dir / f"firecracker-{name}.log" + if not log_path.exists(): + return web.Response(text="", content_type="text/plain") + try: + # Tail at most 64 KB so the browser doesn't choke. + data = log_path.read_bytes()[-64 * 1024:] + except Exception as e: + data = f"(failed to read: {e})".encode() + return web.Response(body=data, content_type="text/plain") + + async def handle_admin_provision(self, req: web.Request) -> web.Response: + # Heavy operation (re-runs install/start/load, can take hours on + # the postgres-indexed-class systems): only callable from the + # host itself. The public UI must never be able to trigger this. 
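+        # Illustrative host-side trigger (system name and listen port are
+        # placeholders — the port comes from config):
+        #   curl -X POST http://127.0.0.1:<listen_port>/api/admin/provision/duckdb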
+ # Trust the TCP peer address — we don't honor X-Forwarded-For + # here, because the server is meant to listen on the same host + # the operator drives the curls from. If you put it behind a + # reverse proxy, the proxy itself becomes the peer and the + # check still passes (which is fine: the proxy is part of the + # admin trust boundary). + peer = req.transport.get_extra_info("peername") if req.transport else None + peer_ip = peer[0] if peer else "" + if peer_ip not in ("127.0.0.1", "::1"): + raise web.HTTPForbidden(reason="admin endpoint, loopback only") + name = req.match_info["name"] + if name not in self.systems: + raise web.HTTPNotFound() + # Fire-and-forget; the client polls /api/system/{name} for state. + asyncio.create_task(self._provision_bg(name)) + return web.json_response({"started": True, "system": name}) + + async def _provision_bg(self, name: str) -> None: + try: + await self.vmm.provision_now(name) + except Exception as e: + log.exception("background provision failed for %s", name) + self.sink.write_event(system=name, kind="provision-failed", detail=repr(e)) + + async def handle_warmup(self, req: web.Request) -> web.Response: + """Trigger snapshot restore for a system without running a query. + + The UI calls this on system-select so the restore (~30 s for + cold ones, near-zero with reflink+live-daemon) overlaps the + time the user is typing their query, and Run query lands on a + VM that's already serving. Refuses to initial-provision; if no + snapshot exists, returns 409 and the user has to /admin/provision. + """ + if (resp := _rate_check(req)) is not None: + return resp + name = req.match_info["name"] + if name not in self.systems: + raise web.HTTPNotFound() + vm = self.vmm.vms[name] + if vm.state == "ready": + return web.json_response({"already_ready": True, "system": name}) + if vm.state == "provisioning": + return web.json_response({"in_flight": True, "system": name}) + if not self.vmm.vms[name].snapshot_bin.exists() and \ + not (self.cfg.systems_dir / name / "rootfs.golden.ext4").exists(): + return web.json_response( + {"error": "no snapshot; POST /api/admin/provision first"}, + status=409, + ) + asyncio.create_task(self._warmup_bg(name)) + return web.json_response({"started": True, "system": name}) + + async def _warmup_bg(self, name: str) -> None: + try: + await self.vmm.ensure_ready_for_query(name) + except Exception as e: + log.warning("warmup failed for %s: %r", name, e) + + async def handle_query(self, req: web.Request) -> web.StreamResponse: + if (resp := _rate_check(req)) is not None: + return resp + system_name = req.query.get("system", "") + if system_name not in self.systems: + return web.json_response({"error": f"unknown system: {system_name!r}"}, + status=400) + sql = await req.read() + if not sql.strip(): + return web.json_response({"error": "empty SQL"}, status=400) + + client_addr = req.headers.get("X-Forwarded-For", req.remote or "?") + ua = req.headers.get("User-Agent", "") + wall_t0 = time.monotonic() + status = 500 + body = b"" + headers: dict[str, str] = {} + err: str | None = None + # Random 64-bit handle returned to the client as a base64url + # string (X-Query-Id) AND persisted to the requests table. + # The same id is the key the browser uses to permalink the + # result via /api/saved/. 
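+        # Worked example: _id_to_b64url(0x0123456789ABCDEF) == "ASNFZ4mrze8",
+        # and _b64url_to_id("ASNFZ4mrze8") round-trips the same integer.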
+ import secrets + query_id = secrets.randbits(64) + try: + body, headers, status = await self._dispatch_query(system_name, sql) + except Exception as e: + err = repr(e) + log.exception("[%s] query dispatch failed", system_name) + finally: + wall = time.monotonic() - wall_t0 + try: + self.sink.write_request( + id=query_id, + client_addr=client_addr, user_agent=ua, + system=system_name, + query=sql.decode("utf-8", errors="replace")[:65536], + output=body.decode("utf-8", errors="replace")[:1 << 20], + output_bytes=int(headers.get("X-Output-Bytes", "0") or 0), + output_truncated=int(headers.get("X-Output-Truncated", "0") or 0), + query_time=(float(headers["X-Query-Time"]) + if "X-Query-Time" in headers else None), + wall_time=wall, + status=status, + # The agent URL-encodes X-Error so newlines survive + # the HTTP header. Decode before logging so the + # ClickHouse log stores the raw multi-line error. + error=err or urllib.parse.unquote( + headers.get("X-Error", "")), + ) + except Exception: + log.exception("logging request failed") + + resp = web.Response(body=body, status=status, + content_type="application/octet-stream") + for k, v in headers.items(): + resp.headers[k] = v + resp.headers["X-Wall-Time"] = f"{wall:.6f}" + resp.headers["X-Query-Id"] = _id_to_b64url(query_id) + if err and "X-Error" not in resp.headers: + # URL-encode to match the agent path so the client always + # decodes uniformly. err is usually a one-liner, so this + # is a no-op in practice; but keeps the contract simple. + resp.headers["X-Error"] = urllib.parse.quote(err[:512]) + return resp + + async def handle_saved(self, req: web.Request) -> web.Response: + """Look up a previously-saved query+result by its base64url id. + Returns a JSON object with output, error, timing — the browser + replays it as if the query just ran. + """ + if self.ch_creds is None: + raise web.HTTPServiceUnavailable(reason="shared queries disabled (no CH)") + b64 = req.match_info["b64"] + try: + qid = _b64url_to_id(b64) + except Exception: + raise web.HTTPBadRequest(reason="malformed id") + # Read through the parameterized view as the reader user. The + # view has SQL SECURITY DEFINER so the reader doesn't need a + # direct grant on the requests table. + sql = (f"SELECT * FROM {self.ch_creds.db}.request_by_id(q_id={qid}) " + f"FORMAT JSONEachRow") + async with aiohttp.ClientSession() as s: + async with s.post( + self.ch_creds.url, data=sql, + auth=aiohttp.BasicAuth(self.ch_creds.reader_user), + timeout=aiohttp.ClientTimeout(total=10), + ) as r: + text = await r.text() + if r.status >= 300: + raise web.HTTPBadGateway(reason=f"ch {r.status}: {text[:300]}") + text = text.strip() + if not text: + raise web.HTTPNotFound(reason="no saved query with that id") + return web.Response(text=text, content_type="application/json") + + async def _dispatch_query(self, system_name: str, sql: bytes + ) -> tuple[bytes, dict[str, str], int]: + """Run the query once. On low-level failure (VM unreachable, transport + error) tear down and retry once. Higher-level errors (non-2xx from the + agent itself, e.g. 
a SQL syntax error) are NOT retried — they're real + results.""" + last_exc: Exception | None = None + for attempt in (1, 2): + try: + vm = await self.vmm.ensure_ready_for_query(system_name) + except Exception as e: + last_exc = e + if attempt == 1: + self.sink.write_event(system=system_name, kind="ensure-failed", + detail=f"attempt {attempt}: {e!r}") + await asyncio.sleep(0.5) + continue + raise + url = self.vmm.agent_url(vm) + "/query" + try: + async with aiohttp.ClientSession() as s: + async with s.post(url, data=sql, + timeout=aiohttp.ClientTimeout(total=60)) as r: + body = await r.read() + headers = {k: r.headers[k] for k in r.headers if k.startswith("X-")} + headers.setdefault("X-Output-Bytes", str(len(body))) + if r.status >= 400: + # ANY error tears the VM down so the next + # request restores from snapshot. The + # playground accepts destructive SQL + # (DROP TABLE hits, TRUNCATE, ...) — once + # an error happens we can't be sure the + # daemon's state is still consistent, so + # the safe move is always to reset. + self.sink.write_event( + system=system_name, kind="post-query-error", + detail=f"attempt {attempt}: status={r.status}", + ) + await self.vmm.kick(system_name, + "post-query-error") + return body, headers, r.status + except Exception as e: + last_exc = e + self.sink.write_event(system=system_name, kind="agent-error", + detail=f"attempt {attempt}: {e!r}") + if attempt == 1: + # Hard kill, will trigger snapshot restore on next ensure. + await self.vmm.kick(system_name, "agent-error-retry") + await asyncio.sleep(0.5) + continue + raise + # unreachable, but keep mypy happy + raise RuntimeError(str(last_exc)) + +def build_app() -> web.Application: + obj = App() + app = web.Application(client_max_size=4 * 1024 * 1024) + app.on_startup.append(obj.on_startup) + app.on_cleanup.append(obj.on_cleanup) + + app.router.add_get("/api/systems", obj.handle_systems) + app.router.add_get("/api/state", obj.handle_state) + app.router.add_get("/api/system/{name}", obj.handle_system) + app.router.add_get("/api/queries/{name}", obj.handle_queries) + app.router.add_get("/api/provision-log/{name}", obj.handle_provision_log) + app.router.add_post("/api/admin/provision/{name}", obj.handle_admin_provision) + app.router.add_post("/api/warmup/{name}", obj.handle_warmup) + app.router.add_post("/api/query", obj.handle_query) + app.router.add_get("/api/saved/{b64}", obj.handle_saved) + + # Static UI + web_dir = Path(__file__).resolve().parent.parent / "web" + + async def root_redirect(_r: web.Request) -> web.Response: + raise web.HTTPFound("/ui/") + + async def ui_index(_r: web.Request) -> web.FileResponse: + resp = web.FileResponse(web_dir / "index.html") + resp.headers["Cache-Control"] = "no-store" + return resp + + @web.middleware + async def no_cache_static(request: web.Request, handler): + resp = await handler(request) + if request.path.startswith("/ui/"): + resp.headers["Cache-Control"] = "no-store" + return resp + + @web.middleware + async def cors(request: web.Request, handler): + # Permit the index.html opened from file:// (or any other origin) + # to call /api/* directly. The browser sends Origin: null in that + # case and refuses the response without ACAO. Reflecting the + # request's Origin keeps credentials-less CORS working in every + # browser. Preflight OPTIONS gets a synthetic 204 here. 
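+        # Illustrative exchange for a file:// page (Origin: null):
+        #   OPTIONS /api/query  ->  204, Access-Control-Allow-Origin: null
+        #   POST    /api/query  ->  200, Access-Control-Allow-Origin: null,
+        #                           plus the X-* timing headers exposed below.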
+        origin = request.headers.get("Origin", "*")
+        if request.method == "OPTIONS":
+            return web.Response(status=204, headers={
+                "Access-Control-Allow-Origin": origin,
+                "Access-Control-Allow-Methods": "GET, POST, OPTIONS",
+                "Access-Control-Allow-Headers":
+                    request.headers.get("Access-Control-Request-Headers", "*"),
+                "Access-Control-Max-Age": "86400",
+            })
+        resp = await handler(request)
+        resp.headers["Access-Control-Allow-Origin"] = origin
+        resp.headers["Access-Control-Expose-Headers"] = (
+            "X-Query-Time, X-Wall-Time, X-Query-Wall-Time, "
+            "X-Output-Bytes, X-Output-Truncated, X-Exit-Code, "
+            "X-System, X-Error, X-Query-Id"
+        )
+        return resp
+
+    app.middlewares.append(no_cache_static)
+    app.middlewares.append(cors)
+    app.router.add_get("/", root_redirect)
+    app.router.add_get("/ui/", ui_index)
+    app.router.add_get("/ui", ui_index)
+    # follow_symlinks=False (the default, passed explicitly here) —
+    # GHSA-5h86-8mv2-jq9f covers a path-traversal in aiohttp's static
+    # handler that's only reachable when symlinks are followed. The
+    # repo's web/ tree has no symlinks anyway.
+    app.router.add_static("/ui/", path=str(web_dir), show_index=False,
+                          follow_symlinks=False)
+
+    return app
+
+
+def main() -> None:
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(name)s %(levelname)s %(message)s",
+    )
+    cfg = config_mod.load()
+    app = build_app()
+    # Wire signals to a clean shutdown.
+    runner = web.AppRunner(app)
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    loop.run_until_complete(runner.setup())
+
+    # Always bind the plain port.
+    site = web.TCPSite(runner, cfg.listen_host, cfg.listen_port)
+    loop.run_until_complete(site.start())
+    log.info("playground listening on http://%s:%d",
+             cfg.listen_host, cfg.listen_port)
+
+    # If TLS is configured, also bind the TLS port. The unit needs
+    # CAP_NET_BIND_SERVICE to bind 443 as an unprivileged user; see
+    # clickbench-playground.service.
+    if cfg.tls_cert and cfg.tls_key:
+        import ssl
+        sslctx = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
+        sslctx.load_cert_chain(cfg.tls_cert, cfg.tls_key)
+        # Disable client-cert request (we serve over TLS, we don't
+        # mutually authenticate).
+        sslctx.verify_mode = ssl.CERT_NONE
+        tls_site = web.TCPSite(runner, cfg.listen_host, cfg.tls_port,
+                               ssl_context=sslctx)
+        loop.run_until_complete(tls_site.start())
+        log.info("playground listening on https://%s:%d",
+                 cfg.listen_host, cfg.tls_port)
+
+    stop = asyncio.Event()
+    for sig in (signal.SIGTERM, signal.SIGINT):
+        loop.add_signal_handler(sig, stop.set)
+    loop.run_until_complete(stop.wait())
+    loop.run_until_complete(runner.cleanup())
+    loop.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/playground/server/monitor.py b/playground/server/monitor.py
new file mode 100644
index 0000000000..be3cf9b076
--- /dev/null
+++ b/playground/server/monitor.py
@@ -0,0 +1,250 @@
+"""Background watchdog.
+
+Runs alongside the API server. Once per second:
+
+  * For every running VM, sample CPU% (from /proc/<pid>/stat), RSS, and the
+    rootfs file's current physical size (via stat). Update the VM record.
+  * If a VM has been at >= cpu_busy_threshold for cpu_busy_window_sec
+    contiguous seconds, restart it.
+  * If a VM's rootfs is filled past vm_disk_pct_kill_threshold of its nominal
+    cap (200 GB) — i.e. the sparse file is using more than that fraction —
+    restart it.
+  * Sample host free memory / free disk on the state_dir filesystem. If under
+    threshold, find the largest live VM (by RSS for memory pressure, by
+    rootfs_used_bytes for disk pressure) and kick it. 
+
+`kick` is implemented via vm_manager.kick(name, reason), which leaves the
+snapshot intact. A subsequent /query will trigger a restore.
+"""
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import shutil
+import time
+from pathlib import Path
+
+from .config import Config
+from .logging_sink import LoggingSink
+from .vm_manager import VM, VMManager, _read_proc_jiffies
+
+log = logging.getLogger("monitor")
+
+
+class Monitor:
+    def __init__(self, cfg: Config, vmm: VMManager, sink: LoggingSink):
+        self.cfg = cfg
+        self.vmm = vmm
+        self.sink = sink
+        self._cpu_history: dict[str, tuple[int, int, float]] = {}  # name -> (utime, stime, ts)
+        self._task: asyncio.Task | None = None
+
+    async def start(self) -> None:
+        self._task = asyncio.create_task(self._loop(), name="monitor")
+
+    async def stop(self) -> None:
+        if self._task:
+            self._task.cancel()
+            try:
+                await self._task
+            except asyncio.CancelledError:
+                pass
+
+    async def _loop(self) -> None:
+        try:
+            while True:
+                await self._tick()
+                await asyncio.sleep(1.0)
+        except asyncio.CancelledError:
+            raise
+        except Exception:
+            log.exception("monitor loop crashed; restarting in 5s")
+            await asyncio.sleep(5)
+            self._task = asyncio.create_task(self._loop(), name="monitor")
+
+    async def _tick(self) -> None:
+        # Per-VM sampling
+        for name, vm in self.vmm.vms.items():
+            if vm.pid is None or not _pid_alive(vm.pid):
+                self._cpu_history.pop(name, None)
+                vm.cpu_busy_since = None
+                continue
+            cpu_pct = self._sample_cpu(name, vm.pid)
+            vm.rss_bytes = _rss(vm.pid)
+            rootfs = self.cfg.systems_dir / name / "rootfs.ext4"
+            try:
+                st = rootfs.stat()
+                vm.rootfs_used_bytes = st.st_blocks * 512  # actual allocated bytes
+            except FileNotFoundError:
+                vm.rootfs_used_bytes = 0
+            await self._check_per_vm(vm, cpu_pct)
+
+        # Host-wide checks
+        await self._check_host_pressure()
+
+    def _sample_cpu(self, name: str, pid: int) -> float | None:
+        """Return ratio of CPU used since last sample, normalized by vcpu count."""
+        stat_path = Path(f"/proc/{pid}/stat")
+        try:
+            stat = stat_path.read_text()
+        except FileNotFoundError:
+            self._cpu_history.pop(name, None)
+            return None
+        # The comm field can contain spaces — split around the last ')'
+        end = stat.rfind(")")
+        parts = stat[end + 2:].split()
+        utime = int(parts[11])
+        stime = int(parts[12])
+        now = time.monotonic()
+        prev = self._cpu_history.get(name)
+        self._cpu_history[name] = (utime, stime, now)
+        if prev is None:
+            return None
+        dt = now - prev[2]
+        d_jiffies = (utime + stime) - (prev[0] + prev[1])
+        clk = os.sysconf("SC_CLK_TCK")
+        if dt <= 0 or clk <= 0:
+            return None
+        # Normalize by the number of vCPUs the VM was allocated.
+        cpu_seconds = d_jiffies / clk
+        return cpu_seconds / (dt * self.cfg.vm_vcpus)
+
+    async def _check_per_vm(self, vm: VM, cpu_pct: float | None) -> None:
+        # Idle reaper. A "ready" VM that hasn't seen a /query in
+        # idle_kick_after_sec is consuming KVM threads + memory
+        # mappings + a TAP for no reason. The kernel's async_pf_execute
+        # workqueue starts hogging CPU when too many VMs idle-spin in
+        # parallel (see dmesg), which slows down unrelated services
+        # (sshd accept loop, in particular). Tear down idle ones; the
+        # snapshot is preserved and the next /query restores in seconds. 
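+        # E.g. with idle_kick_after_sec=900 (a placeholder — the real value
+        # comes from config), a VM last queried at time T is torn down at
+        # T+900s; its snapshot stays on disk for the next restore.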
+ if (vm.state == "ready" and vm.last_used > 0 + and time.time() - vm.last_used >= self.cfg.idle_kick_after_sec): + self.sink.write_event( + system=vm.system.name, kind="idle-reaper", + detail=f"idle for {int(time.time() - vm.last_used)}s " + f"(threshold {self.cfg.idle_kick_after_sec}s)", + ) + await self.vmm.kick(vm.system.name, "idle-reaper") + return + + # CPU saturation watchdog + if cpu_pct is None: + vm.cpu_busy_since = None + elif cpu_pct >= self.cfg.cpu_busy_threshold: + if vm.cpu_busy_since is None: + vm.cpu_busy_since = time.monotonic() + elif time.monotonic() - vm.cpu_busy_since > self.cfg.cpu_busy_window_sec: + self.sink.write_event( + system=vm.system.name, kind="cpu-watchdog", + detail=f"sustained CPU >= {self.cfg.cpu_busy_threshold:.0%} for " + f"{self.cfg.cpu_busy_window_sec}s", + ) + await self.vmm.kick(vm.system.name, "cpu-watchdog") + vm.cpu_busy_since = None + return + else: + vm.cpu_busy_since = None + + # Cumulative CPU-cap watchdog. Only checked once we've passed + # the post-provision boundary (vm.state == "ready"); the + # cpu_baseline_jiffies was captured at that transition, so the + # delta below isolates query-serving CPU from boot/restore. + if vm.state == "ready" and vm.pid is not None and vm.cpu_baseline_jiffies: + jiffies = _read_proc_jiffies(vm.pid) + if jiffies > 0: + clk = os.sysconf("SC_CLK_TCK") or 100 + delta_s = (jiffies - vm.cpu_baseline_jiffies) / clk + if delta_s >= self.cfg.vm_cpu_total_seconds_cap: + self.sink.write_event( + system=vm.system.name, kind="cpu-cap", + detail=f"cumulative CPU {delta_s:.0f}s >= " + f"{self.cfg.vm_cpu_total_seconds_cap}s", + ) + await self.vmm.kick(vm.system.name, "cpu-cap") + return + + # Disk usage watchdog + cap = self.cfg.vm_rootfs_size_gb * (1 << 30) + if vm.rootfs_used_bytes and vm.rootfs_used_bytes / cap >= self.cfg.vm_disk_pct_kill_threshold: + self.sink.write_event( + system=vm.system.name, kind="disk-watchdog", + detail=f"rootfs used {vm.rootfs_used_bytes}/{cap}", + ) + await self.vmm.kick(vm.system.name, "disk-watchdog") + + async def _check_host_pressure(self) -> None: + # Memory pressure + info = _meminfo() + free_ram_gb = info.get("MemAvailable", 0) / (1024 * 1024) # MemAvailable is in KB + if free_ram_gb < self.cfg.host_min_free_ram_gb: + target = self._largest_running(by="rss") + if target: + self.sink.write_event( + system=target.system.name, kind="oom-kick", + detail=f"host free RAM {free_ram_gb:.1f}G < {self.cfg.host_min_free_ram_gb}G; " + f"largest is {target.system.name} ({target.rss_bytes/1e9:.1f}G)", + ) + await self.vmm.kick(target.system.name, "host-mem-pressure") + + # Disk pressure on the state dir + st = shutil.disk_usage(self.cfg.state_dir) + free_disk_gb = st.free / (1 << 30) + if free_disk_gb < self.cfg.host_min_free_disk_gb: + target = self._largest_running(by="disk") + if target: + self.sink.write_event( + system=target.system.name, kind="disk-kick", + detail=f"host free disk {free_disk_gb:.1f}G < {self.cfg.host_min_free_disk_gb}G; " + f"largest is {target.system.name} ({target.rootfs_used_bytes/1e9:.1f}G)", + ) + await self.vmm.kick(target.system.name, "host-disk-pressure") + + def _largest_running(self, *, by: str) -> VM | None: + running = [v for v in self.vmm.vms.values() + if v.pid is not None and _pid_alive(v.pid)] + if not running: + return None + key = (lambda v: v.rss_bytes) if by == "rss" else (lambda v: v.rootfs_used_bytes) + return max(running, key=key) + + +def _pid_alive(pid: int) -> bool: + try: + os.kill(pid, 0) + return True + except ProcessLookupError: + return False 
+    except PermissionError:
+        return True
+
+
+def _rss(pid: int) -> int:
+    try:
+        text = Path(f"/proc/{pid}/status").read_text()
+    except FileNotFoundError:
+        return 0
+    for line in text.splitlines():
+        if line.startswith("VmRSS:"):
+            parts = line.split()
+            return int(parts[1]) * 1024  # KB -> bytes
+    return 0
+
+
+def _meminfo() -> dict[str, int]:
+    out: dict[str, int] = {}
+    try:
+        text = Path("/proc/meminfo").read_text()
+    except FileNotFoundError:
+        return out
+    for line in text.splitlines():
+        if ":" not in line:
+            continue
+        k, v = line.split(":", 1)
+        parts = v.split()
+        if parts:
+            try:
+                out[k.strip()] = int(parts[0])
+            except ValueError:
+                continue
+    return out
diff --git a/playground/server/net.py b/playground/server/net.py
new file mode 100644
index 0000000000..78a0baa19e
--- /dev/null
+++ b/playground/server/net.py
@@ -0,0 +1,269 @@
+"""Per-VM TAP networking setup for Firecracker.
+
+Each VM gets its own /24 subnet on a dedicated TAP device:
+
+    fc-tap-<slot>    host: 10.200.<slot>.1/24    vm: 10.200.<slot>.2
+
+Where <slot> is a small integer derived from the system slot (1..N). The /24 has
+plenty of headroom but only two addresses are used — one /24 per VM keeps the
+host's routing trivial: no shared bridge, no ARP nonsense, no collisions.
+
+During the *provision* phase we masquerade outbound traffic from the VM so it
+can apt-get / curl. After the snapshot we drop the FORWARD rules; the VM can
+still talk to the host (and therefore the agent endpoint) but cannot reach the
+internet.
+"""
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import re
+
+# The /16 we hand out from. 10.200.0.0/16 -> 256 /24 subnets, plenty for our use.
+_BASE = "10.200"
+
+
+def addr_for(slot: int) -> tuple[str, str, str]:
+    """Return (host_ip, vm_ip, cidr) for the given slot id."""
+    if not 1 <= slot <= 250:
+        raise ValueError(f"slot out of range: {slot}")
+    return f"{_BASE}.{slot}.1", f"{_BASE}.{slot}.2", f"{_BASE}.{slot}.0/24"
+
+
+def tap_name(slot: int) -> str:
+    return f"fc-tap-{slot}"
+
+
+def mac_for(slot: int) -> str:
+    # Locally administered, unicast, deterministic by slot.
+    return f"02:fc:00:00:{slot // 256:02x}:{slot % 256:02x}"
+
+
+async def _run(*args: str, check: bool = True) -> tuple[int, bytes, bytes]:
+    p = await asyncio.create_subprocess_exec(
+        *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+    )
+    o, e = await p.communicate()
+    if check and p.returncode != 0:
+        raise RuntimeError(f"cmd failed: {' '.join(args)}: {e.decode(errors='replace')}")
+    return p.returncode or 0, o, e
+
+
+async def ensure_tap(slot: int) -> None:
+    """Create the TAP device and assign the host-side address. Idempotent."""
+    tap = tap_name(slot)
+    host_ip, _, _ = addr_for(slot)
+    # Does the device already exist?
+    rc, out, _ = await _run("ip", "-br", "link", "show", "dev", tap, check=False)
+    if rc != 0:
+        await _run("sudo", "ip", "tuntap", "add", "dev", tap, "mode", "tap")
+    # Make sure the IP is there
+    rc, addrs, _ = await _run("ip", "-br", "addr", "show", "dev", tap, check=False)
+    if rc != 0 or host_ip not in addrs.decode(errors="replace"):
+        # Strip any old IPs then add the canonical one. 
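+        # Equivalent shell for slot 7 (illustrative):
+        #   sudo ip addr flush dev fc-tap-7
+        #   sudo ip addr add 10.200.7.1/24 dev fc-tap-7
+        #   sudo ip link set fc-tap-7 up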
+ await _run("sudo", "ip", "addr", "flush", "dev", tap, check=False) + await _run("sudo", "ip", "addr", "add", f"{host_ip}/24", "dev", tap) + await _run("sudo", "ip", "link", "set", tap, "up") + + +async def teardown_tap(slot: int) -> None: + tap = tap_name(slot) + with contextlib.suppress(Exception): + await _run("sudo", "ip", "link", "set", tap, "down", check=False) + with contextlib.suppress(Exception): + await _run("sudo", "ip", "tuntap", "del", "dev", tap, "mode", "tap", check=False) + + +_NAT_RULE_PAT = re.compile(r"^-A POSTROUTING.*-o\s+(\S+).*-j\s+MASQUERADE", re.MULTILINE) + + +async def _host_default_iface() -> str: + """Return the host's default outbound interface (e.g. eth0).""" + rc, out, _ = await _run("ip", "-o", "-4", "route", "show", "default") + text = out.decode(errors="replace") + # "default via 1.2.3.4 dev eth0 ..." + parts = text.split() + for i, p in enumerate(parts): + if p == "dev" and i + 1 < len(parts): + return parts[i + 1] + raise RuntimeError(f"could not find default route: {text!r}") + + +async def _strip_slot(slot: int) -> None: + """Remove every iptables rule that mentions this slot's TAP or CIDR. + + Each enable/disable function calls this before installing its own + rules. That removes the previous mode's rules cleanly and avoids + the rule-order trap where, e.g., a 'disable_internet' catch-all + DROP added earlier sits ABOVE the RELATED-ESTABLISHED ACCEPT a + later 'enable_filtered_internet' wants to add — which would + silently block all reply traffic to the VM. + """ + tap = tap_name(slot) + _, _, cidr = addr_for(slot) + needle_tap = f" {tap} " # match -i/-o flags' value with surrounding spaces + needle_cidr = f" {cidr} " + + for table, chain in (("filter", "FORWARD"), + ("nat", "POSTROUTING"), + ("nat", "PREROUTING")): + rc, out, _ = await _run("sudo", "iptables", "-t", table, "-S", chain, + check=False) + if rc != 0: + continue + for line in out.decode(errors="replace").splitlines(): + if not line.startswith("-A "): + continue + padded = " " + line + " " + if needle_tap not in padded and needle_cidr not in padded: + continue + # Convert "-A CHAIN ..." into "-D CHAIN ..." for deletion. + args = line.split() + args[0] = "-D" + await _run("sudo", "iptables", "-t", table, *args, check=False) + + +async def enable_internet(slot: int) -> None: + """Allow the VM to reach the outside world via MASQUERADE + FORWARD.""" + await _strip_slot(slot) + iface = await _host_default_iface() + tap = tap_name(slot) + _, _, cidr = addr_for(slot) + await _run("sudo", "iptables", "-t", "nat", "-A", "POSTROUTING", + "-s", cidr, "-o", iface, "-j", "MASQUERADE") + for rule in ( + ("-i", tap, "-o", iface, "-j", "ACCEPT"), + ("-i", iface, "-o", tap, "-m", "state", "--state", + "RELATED,ESTABLISHED", "-j", "ACCEPT"), + ): + await _run("sudo", "iptables", "-A", "FORWARD", *rule) + + +# Ports the SNI-filtering proxy listens on (see sni_proxy.py). Kept in +# sync with the values in main.py. +PROXY_HTTPS_PORT = 8443 +PROXY_HTTP_PORT = 8080 + +# /16 we hand TAP addresses out of — used to scope INPUT firewall rules. +_INTERNAL_CIDR = f"{_BASE}.0.0/16" + + +async def setup_host_firewall() -> None: + """Install INPUT rules so the SNI proxy + local DNS resolver are + only reachable from the per-VM TAPs (10.200.0.0/16) and loopback. + Run once at server startup. + + Why this matters: sni_proxy.py binds 0.0.0.0:{8443,8080} so the + iptables PREROUTING REDIRECT from the VM's TAP can find it + regardless of which TAP IP the kernel routes the redirected + packet to. 
Without these INPUT rules the proxy would be an + open, unauthenticated S3 allowlist relay reachable from the + public internet. Same logic for the host's UDP/53 resolver. + + Per-protocol source allowlists: + TCP 8080 / 8443 (SNI proxy): internal CIDR + loopback. + UDP 53 (DNS): internal CIDR + loopback. + TCP 53 (DNS): loopback only — VMs must use UDP. + Big-payload DNS-over-TCP is a + classic exfiltration channel. + """ + # (proto, dport, allowed_sources) + ports = ( + ("tcp", str(PROXY_HTTPS_PORT), (_INTERNAL_CIDR, "127.0.0.0/8")), + ("tcp", str(PROXY_HTTP_PORT), (_INTERNAL_CIDR, "127.0.0.0/8")), + ("udp", "53", (_INTERNAL_CIDR, "127.0.0.0/8")), + # TCP/53 explicitly loopback-only: VMs are not allowed to use + # DNS-over-TCP. enable_filtered_internet's FORWARD DROP already + # covers the routed path; this closes the alternate path where + # a VM addresses the host's TAP IP directly. + ("tcp", "53", ("127.0.0.0/8",)), + ) + for proto, dport, sources in ports: + for src in sources: + allow = ("-p", proto, "--dport", dport, "-s", src, "-j", "ACCEPT") + rc, _, _ = await _run("sudo", "iptables", "-C", "INPUT", + *allow, check=False) + if rc != 0: + # Insert at the top so we override any permissive default. + await _run("sudo", "iptables", "-I", "INPUT", "1", *allow) + drop = ("-p", proto, "--dport", dport, "-j", "DROP") + rc, _, _ = await _run("sudo", "iptables", "-C", "INPUT", + *drop, check=False) + if rc != 0: + await _run("sudo", "iptables", "-A", "INPUT", *drop) + + +async def enable_filtered_internet(slot: int) -> None: + """Allow the VM to reach the *allowlisted* outside world only. + + PREROUTING REDIRECTs: + - TCP 443/80 → the host's SNI-filtering proxy. + - UDP 53 → the host's local DNS resolver (operator must run + a UDP-only resolver on the host — see + playground/scripts/install-firecracker.sh). + TCP 53 is dropped entirely (no big-payload DNS, the classic + exfiltration channel — see GHSA / RFC1918 advisories cited in + the security review). Every other outbound port from the VM is + DROPped at FORWARD. + + No POSTROUTING MASQUERADE here: the SNI proxy on the host opens + its OWN outbound socket to the allowlisted upstream, so the + host's normal egress path handles the source rewrite. The VM's + only legitimate outbound traffic now goes via REDIRECT to a + local listener; nothing on the VM's CIDR ever reaches the + outside interface directly. + """ + await _strip_slot(slot) + tap = tap_name(slot) + iface = await _host_default_iface() + + # NAT redirects: TCP 443/80 → SNI proxy, UDP 53 → host DNS resolver. + for match in ( + ("-i", tap, "-p", "tcp", "--dport", "443", + "-j", "REDIRECT", "--to-ports", str(PROXY_HTTPS_PORT)), + ("-i", tap, "-p", "tcp", "--dport", "80", + "-j", "REDIRECT", "--to-ports", str(PROXY_HTTP_PORT)), + ("-i", tap, "-p", "udp", "--dport", "53", + "-j", "REDIRECT", "--to-ports", "53"), + ): + await _run("sudo", "iptables", "-t", "nat", "-A", "PREROUTING", *match) + + # FORWARD: drop TCP/53 + UDP/53 (DNS-over-TCP is a classic exfil + # channel; UDP/53 is REDIRECTed above, this is a belt-and-braces + # for a downed resolver). Allow established replies for the SNI + # proxy's outbound to upstream. Catch-all DROP at the end. 
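+    # Resulting FORWARD entries for slot 3, host iface assumed eth0
+    # (illustrative `iptables -S FORWARD` output):
+    #   -A FORWARD -i fc-tap-3 -p udp --dport 53 -j DROP
+    #   -A FORWARD -i fc-tap-3 -p tcp --dport 53 -j DROP
+    #   -A FORWARD -i eth0 -o fc-tap-3 -m state --state RELATED,ESTABLISHED -j ACCEPT
+    #   -A FORWARD -i fc-tap-3 -j DROP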
+    for rule in (
+        ("-i", tap, "-p", "udp", "--dport", "53", "-j", "DROP"),
+        ("-i", tap, "-p", "tcp", "--dport", "53", "-j", "DROP"),
+        ("-i", iface, "-o", tap, "-m", "state", "--state",
+         "RELATED,ESTABLISHED", "-j", "ACCEPT"),
+        ("-i", tap, "-j", "DROP"),
+    ):
+        await _run("sudo", "iptables", "-A", "FORWARD", *rule)
+
+
+async def disable_internet(slot: int) -> None:
+    """Isolate the VM: remove every per-slot rule and install per-slot
+    catch-all DROPs (both directions) so the VM cannot reach the
+    outside world via FORWARD's default policy.
+
+    Why the explicit DROPs are necessary: the host's FORWARD policy
+    is ACCEPT (Docker would flip it but we disable Docker's iptables
+    management, and we don't want to flip the global policy ourselves
+    — it would break unrelated forwarding on the host). With just the
+    per-slot ACCEPTs removed, a 'disabled' VM still has clear egress
+    because every FORWARD packet falls through to the default ACCEPT.
+    Notably this lets a VM reach 169.254.169.254 (EC2 IMDS) — even
+    without our MASQUERADE rule the AWS hypervisor responds to the
+    VM's RFC1918 source, and the reply gets forwarded back the same
+    way. Any system exposing arbitrary code execution to the
+    benchmark consumer (pandas, polars, dataframe variants) could
+    then pivot to the host's IAM role.
+    """
+    await _strip_slot(slot)
+    tap = tap_name(slot)
+    for rule in (
+        ("-i", tap, "-j", "DROP"),
+        ("-o", tap, "-j", "DROP"),
+    ):
+        await _run("sudo", "iptables", "-A", "FORWARD", *rule)
diff --git a/playground/server/sni_proxy.py b/playground/server/sni_proxy.py
new file mode 100644
index 0000000000..0d1a0bbe1a
--- /dev/null
+++ b/playground/server/sni_proxy.py
@@ -0,0 +1,285 @@
+"""SNI-aware allowlist proxy for sandboxed VMs.
+
+Two listeners:
+- HTTPS (default :8443): peeks the TLS ClientHello to read the SNI
+  hostname, allowlists it, then bidirectionally splices the raw bytes
+  to the real server. No TLS termination, no certificate spoofing —
+  end-to-end TLS between VM and S3 is preserved untouched.
+- HTTP (default :8080): reads the first request's Host header and
+  applies the same allowlist.
+
+Used as a target for iptables REDIRECT rules in net.enable_filtered_internet.
+VMs that should only reach s3.amazonaws.com get tcp dpt 443 / 80 from
+their TAP REDIRECTed to these ports; the proxy enforces the allowlist
+before opening the upstream connection.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import fnmatch
+import logging
+import re
+
+log = logging.getLogger("sni_proxy")
+
+
+# Hostname globs allowed through. Case-insensitive.
+DEFAULT_ALLOW: tuple[str, ...] = (
+    "s3.amazonaws.com",
+    "*.s3.amazonaws.com",
+    "s3.*.amazonaws.com",
+    "*.s3.*.amazonaws.com",
+    # AWS also serves S3 traffic under its transfer-acceleration and
+    # static-website endpoint names.
+    "*.s3-accelerate.amazonaws.com",
+    "*.s3-website-*.amazonaws.com",
+)
+
+
+def _allowed(host: str, patterns: tuple[str, ...] | list[str]) -> bool:
+    h = host.lower()
+    return any(fnmatch.fnmatchcase(h, p.lower()) for p in patterns)
+
+
+def _parse_sni(data: bytes) -> str | None:
+    """Pull SNI hostname out of a TLS ClientHello record (if any).
+
+    Returns None if the bytes aren't a recognizable ClientHello or no
+    server_name extension is present.
+    """
+    # TLS record header: type(1)=0x16, version(2), length(2).
+    if len(data) < 5 or data[0] != 0x16:
+        return None
+    p = 5
+    # Handshake header: msg_type(1)=0x01 (ClientHello), length(3). 
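+    # Offsets walked below (illustrative ClientHello prefix):
+    #   16 03 01 LL LL   record header (handshake)
+    #   01 LL LL LL      ClientHello handshake header
+    #   03 03            client_version
+    #   <32 bytes>       random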
+ if p + 4 > len(data) or data[p] != 0x01: + return None + p += 4 + # client_version(2) + p += 2 + # random(32) + p += 32 + # session_id + if p >= len(data): + return None + sid_len = data[p] + p += 1 + sid_len + # cipher_suites + if p + 2 > len(data): + return None + cs_len = (data[p] << 8) | data[p + 1] + p += 2 + cs_len + # compression_methods + if p >= len(data): + return None + cm_len = data[p] + p += 1 + cm_len + # extensions + if p + 2 > len(data): + return None + ext_total = (data[p] << 8) | data[p + 1] + p += 2 + end = min(p + ext_total, len(data)) + while p + 4 <= end: + ext_type = (data[p] << 8) | data[p + 1] + ext_len = (data[p + 2] << 8) | data[p + 3] + p += 4 + if ext_type == 0x00 and p + 2 <= end: + # server_name list length(2), then list of (type(1),len(2),name). + sn_list_len = (data[p] << 8) | data[p + 1] + q = p + 2 + list_end = min(q + sn_list_len, len(data)) + while q + 3 <= list_end: + name_type = data[q] + name_len = (data[q + 1] << 8) | data[q + 2] + q += 3 + if name_type == 0x00 and q + name_len <= list_end: + return data[q:q + name_len].decode("ascii", errors="ignore") + q += name_len + p += ext_len + return None + + +_HOST_HEADER_RE = re.compile( + rb"\r\nHost:\s*([^\r\n:]+)", re.IGNORECASE, +) + + +def _parse_http_host(data: bytes) -> str | None: + m = _HOST_HEADER_RE.search(data) + if not m: + return None + return m.group(1).decode("ascii", errors="ignore").strip() + + +async def _read_preamble(reader: asyncio.StreamReader, + want: int = 4096, + timeout: float = 5.0, + early_match=None) -> bytes: + """Read up to `want` bytes of the connection start — enough to + cover a TLS ClientHello or the start of an HTTP request line + + headers. Returns as soon as `early_match(buf)` finds the marker + we're looking for (SNI or HTTP host), so we don't block waiting + for bytes that may never arrive.""" + buf = bytearray() + deadline = asyncio.get_event_loop().time() + timeout + while len(buf) < want: + remaining = deadline - asyncio.get_event_loop().time() + if remaining <= 0: + break + try: + chunk = await asyncio.wait_for( + reader.read(min(4096, want - len(buf))), + timeout=remaining, + ) + except asyncio.TimeoutError: + break + if not chunk: + break + buf.extend(chunk) + # For HTTP, stop at the end of headers. + if b"\r\n\r\n" in buf: + break + # For HTTPS / generic, early-out once the caller's matcher + # finds what it needs. + if early_match is not None and early_match(bytes(buf)) is not None: + break + return bytes(buf) + + +# DNS cache so the proxy doesn't synchronously resolve the same S3 +# hostname for every connection. Each S3 client makes dozens of +# requests; without this, getaddrinfo runs in the default threadpool +# (8 threads) and serializes under the partitioned-parquet workload. +# TTL is short so we still pick up upstream changes promptly. 
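+# Shape of a cache entry (illustrative):
+#   _DNS_CACHE["s3.amazonaws.com"] == (loop_time_at_resolve,
+#       [(family, socktype, proto, canonname, sockaddr), ...])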
+_DNS_CACHE: dict[str, tuple[float, list]] = {} +_DNS_TTL_SECONDS = 300.0 + + +async def _resolve_cached(host: str, port: int) -> list: + import socket + now = asyncio.get_event_loop().time() + cached = _DNS_CACHE.get(host) + if cached and now - cached[0] < _DNS_TTL_SECONDS: + return cached[1] + loop = asyncio.get_event_loop() + infos = await loop.getaddrinfo(host, port, type=socket.SOCK_STREAM) + _DNS_CACHE[host] = (now, infos) + return infos + + +async def _pipe(reader: asyncio.StreamReader, + writer: asyncio.StreamWriter) -> None: + try: + while True: + data = await reader.read(65536) + if not data: + break + writer.write(data) + await writer.drain() + except (ConnectionResetError, BrokenPipeError, OSError, asyncio.CancelledError): + pass + finally: + with contextlib.suppress(Exception): + writer.write_eof() + with contextlib.suppress(Exception): + writer.close() + + +async def _open_upstream(host: str, port: int) -> tuple[asyncio.StreamReader, asyncio.StreamWriter]: + """Open a connection to (host, port) using the DNS cache. + + Falls back to asyncio.open_connection if resolution returns nothing + useful. Tries each resolved address in order — first success wins. + """ + import socket + infos = await _resolve_cached(host, port) + last_exc: Exception | None = None + for family, socktype, proto, _canon, sockaddr in infos: + try: + sock = socket.socket(family, socktype, proto) + sock.setblocking(False) + loop = asyncio.get_event_loop() + await loop.sock_connect(sock, sockaddr) + return await asyncio.open_connection(sock=sock) + except Exception as e: + last_exc = e + with contextlib.suppress(Exception): + sock.close() + continue + if last_exc is not None: + raise last_exc + return await asyncio.open_connection(host, port) + + +async def _handle(reader: asyncio.StreamReader, + writer: asyncio.StreamWriter, + *, https: bool, allow: tuple[str, ...] | list[str], + upstream_port: int) -> None: + peer = writer.get_extra_info("peername") + try: + matcher = _parse_sni if https else _parse_http_host + first = await _read_preamble(reader, want=4096, timeout=5.0, + early_match=matcher) + host = matcher(first) + if not host or not _allowed(host, allow): + log.info("blocked %s -> %r", peer, host) + writer.close() + return + try: + upr, upw = await asyncio.wait_for( + _open_upstream(host, upstream_port), + timeout=10.0, + ) + except Exception as e: + log.warning("upstream %s:%d failed: %r", host, upstream_port, e) + writer.close() + return + log.info("forwarding %s -> %s:%d", peer, host, upstream_port) + # Replay what we already read. + upw.write(first) + with contextlib.suppress(Exception): + await upw.drain() + await asyncio.gather( + _pipe(reader, upw), + _pipe(upr, writer), + return_exceptions=True, + ) + except (asyncio.TimeoutError, OSError): + with contextlib.suppress(Exception): + writer.close() + + +async def start(*, bind_addr: str = "0.0.0.0", + https_port: int = 8443, + http_port: int = 8080, + allow: tuple[str, ...] | list[str] = DEFAULT_ALLOW + ) -> list[asyncio.AbstractServer]: + """Start the proxy listeners. 
Returns the asyncio servers so the + caller can keep them alive / close them on shutdown.""" + async def https_h(r, w): await _handle(r, w, https=True, allow=allow, upstream_port=443) + async def http_h(r, w): await _handle(r, w, https=False, allow=allow, upstream_port=80) + s_https = await asyncio.start_server(https_h, bind_addr, https_port) + s_http = await asyncio.start_server(http_h, bind_addr, http_port) + log.info("SNI-filter proxy: https=:%d http=:%d allow=%s", + https_port, http_port, list(allow)) + return [s_https, s_http] + + +if __name__ == "__main__": + # Standalone runner for testing: + # python3 -m playground.server.sni_proxy [https_port] [http_port] + import sys + logging.basicConfig(level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s") + hp = int(sys.argv[1]) if len(sys.argv) > 1 else 8443 + pp = int(sys.argv[2]) if len(sys.argv) > 2 else 8080 + async def main(): + servers = await start(https_port=hp, http_port=pp) + try: + await asyncio.gather(*(s.serve_forever() for s in servers)) + finally: + for s in servers: + s.close() + asyncio.run(main()) diff --git a/playground/server/systems.py b/playground/server/systems.py new file mode 100644 index 0000000000..e7c1743f58 --- /dev/null +++ b/playground/server/systems.py @@ -0,0 +1,230 @@ +"""Registry of ClickBench systems that can be exposed through the playground. + +A system is *playground-eligible* if its directory contains the canonical +unified script set (install/start/load/query/check/stop) AND there is no +external service required (no `aurora-*`, `redshift*`, `bigquery`, `snowflake`, +etc. — those need API keys and live on someone else's infra). + +The registry is built by scanning the repo at startup. Each `System` carries: + + * name the directory name (also the URL-safe identifier) + * display_name pulled from template.json "system" field if present + * tags from template.json + * download_script from `BENCH_DOWNLOAD_SCRIPT=` line in benchmark.sh + * data_format inferred from download_script (parquet / parquet-partitioned / tsv / csv / none) + * durable BENCH_DURABLE=yes/no (default yes) + * restartable BENCH_RESTARTABLE=yes/no (default yes) +""" +from __future__ import annotations + +import json +import re +from dataclasses import dataclass, field +from pathlib import Path + +# Systems we explicitly skip — they all require external infrastructure +# (managed cloud DBs / API keys) we can't run inside an isolated microVM. +# Local-only systems (umbra, hyper, cedardb, etc.) stay in the catalog +# even though some need a free-trial license at install time — those +# scripts fetch the binary themselves and we don't second-guess them. +_EXTERNAL = { + # Managed cloud services / require API keys / external infra. + "alloydb", "athena", "athena-partitioned", "aurora-mysql", + "aurora-postgresql", "bigquery", "brytlytdb", "bytehouse", "chyt", + "clickhouse-cloud", "clickhouse-tencent", + "crunchy-bridge-for-analytics", "databricks", "exasol", + "gravitons", "hologres", "hydrolix", + "motherduck", "pgpro_tam", "redshift", "redshift-serverless", + "s3select", "singlestore", "snowflake", "supabase", + "tembo-olap", "timescale-cloud", "tinybird", "velodb", + "vertica", "ydb", + # duckdb-memory runs duckdb with the database in :memory:. Even with + # a generous swap drive, the 100M-row hits set blows past anything + # reasonable here; duckdb has an on-disk fallback that we use via + # the regular `duckdb` entry, so disable the memory variant. 
+ "duckdb-memory", + # sirius is a GPU-accelerated DuckDB extension that links against + # CUDA at runtime. Our microVMs have no GPU; install compiles fine + # (~35 min from source) but ./check times out because the daemon + # can't initialize a CUDA context. Disabled — we'd need GPU passthrough. + "sirius", + # oxla's only public docker image, public.ecr.aws/oxla/release, + # was de-listed (the ECR public gallery no longer surfaces the + # repository at all). No replacement on Docker Hub or GitHub + # Releases. Drop until upstream publishes a new image source. + "oxla", + # Upstream is broken, asks for credentials we don't have, or + # the engine can't survive a 16 GB cap. + # - paradedb-partitioned: install script aborts ("pg_lakehouse was + # removed from ParadeDB after 0.10.x"); historical benchmark only. + # - paradedb: postgres backend crashes during index VACUUM under + # 16 GB RAM; not investigable without bumping VM RAM. + # - pg_duckdb-motherduck: requires MOTHERDUCK_TOKEN (cloud creds). + "paradedb", "paradedb-partitioned", "pg_duckdb-motherduck", +} + +# Systems that need outbound access at query time get routed through +# the SNI-allowlist proxy on the host (see sni_proxy.py + +# net.enable_filtered_internet). Only HTTPS to the S3 hosts in +# sni_proxy.DEFAULT_ALLOW survives; everything else is dropped. The +# ClickHouse-family engines used to live in a separate +# `TRUSTED_INTERNET` set that gave them unrestricted egress (so an +# arbitrary user SQL could `url('http://169.254.169.254/...')` or +# reach internal hosts) — that set is gone; they all now use this +# filtered path too. +DATALAKE_FILTERED: frozenset[str] = frozenset({ + "chdb", + "chdb-parquet", + "chdb-parquet-partitioned", + "clickhouse", + "clickhouse-datalake", + "clickhouse-datalake-partitioned", + "clickhouse-parquet", + "clickhouse-parquet-partitioned", + # clickhouse-web ATTACHes the table to a remote web disk pointed at + # https://clickhouse-public-datasets.s3.amazonaws.com/web/ — every + # query pulls parts on demand, so it needs post-snapshot S3 access. + "clickhouse-web", + "duckdb-datalake", + "duckdb-datalake-partitioned", + "presto-datalake", + "presto-datalake-partitioned", + "trino-datalake", + "trino-datalake-partitioned", +}) + +# DataFrame / in-process engines load the full 100M-row hits set into a +# single in-process structure. Observed working set can reach 250 GB, +# well past the playground's 16 GB VM RAM cap. Give each of these a +# dedicated raw swap block device (mkswap + swapon at agent startup); +# the swap disk is reflink-snapshotted alongside rootfs/sysdisk so a +# restored VM resumes with the same swap pages it had at snapshot time. +NEEDS_SWAP: frozenset[str] = frozenset({ + "chdb-dataframe", + "duckdb-dataframe", + "polars-dataframe", + "daft-parquet", + "daft-parquet-partitioned", + "pandas", + # Umbra OOMs during load on the 16 GB cap (`psql:create.sql:109: + # ERROR: unable to allocate memory` after ~70M rows of COPY). + # The docker container has no memory.swap.max set, so the guest + # kernel will swap it the same as any process. + "umbra", +}) + +# Sparse size of the swap.raw block device handed to NEEDS_SWAP systems. +# 256 GiB matches the upper bound we've seen these engines hit on the +# partitioned-parquet set. +SWAP_SIZE_GB: int = 256 + +# Per-system sysdisk-size override (apparent size, in GiB). Default is +# 200 GiB as set by build-system-rootfs.sh. 
The image is sparse so the +# apparent size doesn't cost physical bytes upfront — only what the +# guest actually writes. Rootfs is intentionally not overridable here: +# the build script clones the base ext4 image via sparse-cp without +# resize2fs, so a bigger rootfs would require a deeper change. +SYSDISK_OVERRIDES_GB: dict[str, int] = { + # postgresql-orioledb's COPY blew through 200 GiB before reaching + # the end of hits.tsv: + # PANIC: could not write buffer to file orioledb_undo/0000000319page: + # No space left on device (line 69,533,798 of hits.tsv) + # The orioledb extension keeps a per-statement undo log inside + # PGDATA that roughly doubles the write footprint of the base + # table. The install script parks PGDATA on the sysdisk + # specifically so this override actually helps. + "postgresql-orioledb": 400, +} + + +@dataclass(frozen=True) +class System: + name: str + display_name: str + tags: tuple[str, ...] + download_script: str + data_format: str # parquet / parquet-partitioned / tsv / csv / none + durable: bool + restartable: bool + + def asdict(self) -> dict: + return { + "name": self.name, + "display_name": self.display_name, + "tags": list(self.tags), + "download_script": self.download_script, + "data_format": self.data_format, + "durable": self.durable, + "restartable": self.restartable, + } + + +def _read_template(p: Path) -> dict: + tpl = p / "template.json" + if not tpl.exists(): + return {} + try: + return json.loads(tpl.read_text()) + except Exception: + return {} + + +def _parse_benchmark_sh(p: Path) -> dict: + """Best-effort parse of `export FOO=bar` lines in benchmark.sh.""" + bench = p / "benchmark.sh" + if not bench.exists(): + return {} + out: dict[str, str] = {} + pat = re.compile(r'^\s*export\s+([A-Z_]+)=("([^"]*)"|([^\s]+))', re.MULTILINE) + text = bench.read_text(errors="replace") + for m in pat.finditer(text): + key = m.group(1) + out[key] = m.group(3) if m.group(3) is not None else m.group(4) + return out + + +def _data_format(download_script: str) -> str: + if not download_script: + return "none" + if "parquet-partitioned" in download_script: + return "parquet-partitioned" + if "parquet-single" in download_script: + return "parquet" + if "tsv" in download_script: + return "tsv" + if "csv" in download_script: + return "csv" + return "unknown" + + +def _is_playground_eligible(p: Path) -> bool: + if p.name in _EXTERNAL: + return False + for f in ("install", "start", "load", "query", "check", "stop"): + s = p / f + if not s.exists(): + return False + return True + + +def discover(repo_dir: Path) -> dict[str, System]: + """Walk the repo and return name -> System.""" + out: dict[str, System] = {} + for child in sorted(repo_dir.iterdir()): + if not child.is_dir(): + continue + if not _is_playground_eligible(child): + continue + tpl = _read_template(child) + env = _parse_benchmark_sh(child) + download = env.get("BENCH_DOWNLOAD_SCRIPT", "") + out[child.name] = System( + name=child.name, + display_name=tpl.get("system", child.name), + tags=tuple(tpl.get("tags", []) or []), + download_script=download, + data_format=_data_format(download), + durable=env.get("BENCH_DURABLE", "yes") != "no", + restartable=env.get("BENCH_RESTARTABLE", "yes") != "no", + ) + return out diff --git a/playground/server/vm_manager.py b/playground/server/vm_manager.py new file mode 100644 index 0000000000..8ab1b8e1dd --- /dev/null +++ b/playground/server/vm_manager.py @@ -0,0 +1,946 @@ +"""Per-system Firecracker microVM lifecycle. 
+
+For each ClickBench system we manage a VM with this lifecycle:
+
+  [DOWN] --build_images--> [DOWN(images-ready)]
+         --first_boot--> [PROVISIONING]   (internet ON, /provision called)
+         --snapshot--> [SNAPSHOTTED(internet OFF)]
+         --restore--> [READY]   (handles /query requests)
+         --idle / OOM / disk full / forced--> kill -> [SNAPSHOTTED]
+
+State transitions are gated by `VM.lock`. The public API
+`ensure_ready_for_query(system)` returns a `VM` ready to take a POST /query
+(its agent endpoint comes from `agent_url(vm)`), doing whatever transitions
+are needed.
+
+We avoid the jailer for now: the playground host already runs as a dedicated
+user; the chroot/cgroups layer would complicate dataset disk attach and
+the value-add over a vanilla firecracker process is small for our threat
+model (untrusted SQL but cooperatively-built rootfs).
+"""
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import dataclasses
+import json
+import logging
+import os
+import shutil
+import signal
+import time
+from pathlib import Path
+from typing import Optional
+
+import aiohttp
+
+from . import firecracker as fc
+from . import net
+from .config import Config
+from .systems import (DATALAKE_FILTERED, NEEDS_SWAP, SWAP_SIZE_GB,
+                      SYSDISK_OVERRIDES_GB, System)
+
+log = logging.getLogger("vm_manager")
+
+
+# Lifecycle states for VM.state:
+#   "down"          no firecracker process for this system
+#   "provisioning"  firecracker is up, install/start/load running inside
+#   "ready"         firecracker is up, snapshotted at least once, /query OK
+#   "snapshotted"   firecracker process is down, but snapshot.bin exists
+@dataclasses.dataclass
+class VM:
+    system: System
+    slot: int
+    # Firecracker config
+    api_sock: Path
+    log_sock: Path  # we just point this at /dev/null actually
+    pid: Optional[int] = None
+    # Keep the asyncio.subprocess.Process handle for the running firecracker.
+    # Without holding it, Python eventually garbage-collects the wrapper and
+    # the underlying child sits as a zombie until the host server
+    # exits — the kernel keeps the zombie's open TAP fd around with it, and a
+    # subsequent restore for the same slot then fails to open the TAP with
+    # "Resource busy". Holding the handle lets us `await proc.wait()` on
+    # shutdown and reap immediately.
+    proc: Optional[asyncio.subprocess.Process] = None
+    state: str = "down"
+    # Snapshot artifacts
+    snapshot_bin: Path = dataclasses.field(default_factory=lambda: Path())
+    snapshot_state: Path = dataclasses.field(default_factory=lambda: Path())
+    # Provision metadata
+    provisioned_at: Optional[float] = None
+    last_used: float = 0.0
+    # Set when state transitions to "ready" (after restore or initial
+    # provision). Reset on teardown. Used by the UI to show uptime.
+    ready_since: Optional[float] = None
+    last_error: Optional[str] = None
+    lock: asyncio.Lock = dataclasses.field(default_factory=asyncio.Lock)
+    # Runtime stats refreshed by the monitor
+    cpu_busy_since: Optional[float] = None
+    rss_bytes: int = 0
+    rootfs_used_bytes: int = 0
+    # Cumulative (utime+stime) jiffies of the firecracker process at
+    # the moment this VM transitioned to "ready" (after restore). The
+    # CPU-cap watchdog uses (current - baseline) / SC_CLK_TCK to bill
+    # only the time spent serving queries, not the boot/resume cost.
+    # Cleared on teardown. 
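+    #   (e.g. billed_cpu_s = (_read_proc_jiffies(pid) - cpu_baseline_jiffies)
+    #                        / os.sysconf("SC_CLK_TCK") — see monitor.py)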
+    cpu_baseline_jiffies: int = 0
+
+
+class VMManager:
+    """Owns the registry of per-system VMs."""
+
+    def __init__(self, config: Config, systems: dict[str, System]):
+        self.cfg = config
+        self.systems = systems
+        self.vms: dict[str, VM] = {}
+        # Bound the number of system-disk builds running concurrently. Each
+        # build copies up to ~88 GB of dataset (for tsv/csv systems) — doing
+        # 98 in parallel would thrash the host's NVMe. 6 is enough to keep
+        # the disk busy without hitting writeback stalls.
+        self._build_sem = asyncio.Semaphore(int(os.environ.get(
+            "PLAYGROUND_BUILD_CONCURRENCY", "6")))
+        # Cap on simultaneous in-flight provisions. Each one needs 4 vCPU +
+        # apt-get downloads from the public internet; running 98 concurrently
+        # gets rate-limited by Ubuntu mirrors and we have to retry. The host
+        # has plenty of headroom for 32, which still finishes the catalog
+        # in one pass.
+        self._provision_sem = asyncio.Semaphore(int(os.environ.get(
+            "PLAYGROUND_PROVISION_CONCURRENCY", "32")))
+        # Independently cap how many VMs are inside /snapshot/create at once.
+        # Each snapshot writes 16 GB of memory to disk; running 30 of them
+        # simultaneously serializes on the host NVMe and pushes individual
+        # snapshots past 30 min, causing host-side timeouts on VMs that had
+        # already finished install+load. 6 snapshots in parallel keeps each
+        # one's write window under ~5 minutes on a single fast SSD.
+        self._snapshot_sem = asyncio.Semaphore(int(os.environ.get(
+            "PLAYGROUND_SNAPSHOT_CONCURRENCY", "6")))
+        # Stable slot allocation. Each system gets a slot id (used as
+        # the TAP name fc-tap-<slot> and the /24 IP block 10.200.<slot>.0/24);
+        # snapshot.state has the TAP name baked in, so once a snapshot
+        # exists we MUST keep handing the same slot back to the same
+        # system or restore fails with
+        #   "Open tap device failed: Operation not permitted (os error 1).
+        #    Invalid TUN/TAP Backend provided by fc-tap-<slot>"
+        # Persist the map so removing a system (e.g. sirius from
+        # _EXTERNAL) doesn't shift every later alphabetical neighbor.
+        slot_map_path = config.state_dir / "slot-assignments.json"
+        slot_map: dict[str, int] = {}
+        if slot_map_path.exists():
+            with contextlib.suppress(Exception):
+                slot_map = json.loads(slot_map_path.read_text())
+        used = set(slot_map.values())
+        next_slot = 1
+        for name in sorted(systems.keys()):
+            if name in slot_map:
+                continue
+            while next_slot in used:
+                next_slot += 1
+            slot_map[name] = next_slot
+            used.add(next_slot)
+        with contextlib.suppress(Exception):
+            slot_map_path.write_text(json.dumps(slot_map, indent=2, sort_keys=True))
+        for name in sorted(systems.keys()):
+            sys = systems[name]
+            slot = slot_map[name]
+            sys_state_dir = config.systems_dir / name
+            sys_state_dir.mkdir(parents=True, exist_ok=True)
+            vm = VM(
+                system=sys,
+                slot=slot,
+                api_sock=config.vms_dir / f"{name}.sock",
+                log_sock=config.vms_dir / f"{name}.log.sock",
+                snapshot_bin=sys_state_dir / "snapshot.bin",
+                snapshot_state=sys_state_dir / "snapshot.state",
+            )
+            # If snapshot artifacts survived a previous server run, initialize
+            # to "snapshotted" so the provisioner doesn't redo install/load.
+            # /api/query restores lazily.
+            if _has_snapshot(vm):
+                vm.state = "snapshotted"
+                # Restore the last persisted error so /api/state shows the
+                # real failure reason even after a server restart instead
+                # of an empty `last_error`. 
+                err_path = sys_state_dir / "last_error.txt"
+                if err_path.exists():
+                    with contextlib.suppress(Exception):
+                        vm.last_error = err_path.read_text(errors="replace").strip() or None
+            self.vms[name] = vm
+
+    # ── public API ───────────────────────────────────────────────────────
+
+    async def provision_now(self, system: str) -> None:
+        """Force a full initial provision. Only called by
+        /api/admin/provision; the /query path never lands here.
+        """
+        if system not in self.vms:
+            raise KeyError(system)
+        vm = self.vms[system]
+        async with vm.lock:
+            if vm.state == "provisioning":
+                raise RuntimeError(f"{system}: provisioning already in flight")
+            # Bring everything down so _initial_provision starts fresh.
+            with contextlib.suppress(Exception):
+                await self._teardown(vm, "admin-provision")
+            vm.state = "down"
+            self._set_last_error(vm, None)
+            await self._initial_provision(vm)
+
+    async def ensure_ready_for_query(self, system: str) -> VM:
+        """Make sure system is up and responsive to /query. Boot/resume as needed.
+
+        On success the returned VM is in state "ready" and vm.last_used has
+        been touched.
+        """
+        if system not in self.vms:
+            raise KeyError(system)
+        vm = self.vms[system]
+        async with vm.lock:
+            if vm.state == "ready" and vm.pid and await self._agent_healthy(vm):
+                vm.last_used = time.time()
+                return vm
+            # The state machine: drive to "ready" by the cheapest available path.
+            if vm.state == "ready":
+                # Process is gone or unresponsive. Treat as snapshotted.
+                vm.state = "snapshotted"
+            if vm.state == "down":
+                if not _has_snapshot(vm):
+                    # No snapshot yet, and /query is not a provisioning
+                    # trigger — the operator has to /api/admin/provision
+                    # explicitly. Refuse here so a stray query doesn't
+                    # spin up a 30-min initial install.
+                    raise RuntimeError(
+                        f"{system}: no snapshot — POST /api/admin/provision"
+                        f"/{system} to build one")
+                await self._restore_snapshot(vm)
+            elif vm.state == "snapshotted":
+                await self._restore_snapshot(vm)
+            elif vm.state == "provisioning":
+                raise RuntimeError(f"{system}: provisioning in progress")
+            vm.last_used = time.time()
+            return vm
+
+    async def kick(self, system: str, reason: str) -> None:
+        """Forcibly tear down the VM.
Caller (monitor) is responsible for logging.""" + vm = self.vms.get(system) + if vm is None: + return + async with vm.lock: + await self._teardown(vm, reason) + + def list_all(self) -> list[dict]: + out = [] + for name, vm in self.vms.items(): + out.append({ + "name": name, + "system": vm.system.display_name, + "state": vm.state, + "slot": vm.slot, + "agent_url": self.agent_url(vm), + "provisioned_at": vm.provisioned_at, + "last_used": vm.last_used, + "ready_since": vm.ready_since, + "tags": list(vm.system.tags), + "data_format": vm.system.data_format, + "last_error": vm.last_error, + "rss_bytes": vm.rss_bytes, + "rootfs_used_bytes": vm.rootfs_used_bytes, + "has_snapshot": vm.snapshot_bin.exists(), + }) + return out + + def _set_last_error(self, vm: VM, err: Optional[str]) -> None: + """Update vm.last_error AND persist to disk so /api/state shows + the real reason even after a server restart.""" + vm.last_error = err + path = self.cfg.systems_dir / vm.system.name / "last_error.txt" + with contextlib.suppress(Exception): + if err is None: + path.unlink(missing_ok=True) + else: + path.write_text(err) + + def agent_url(self, vm: VM) -> str: + _, vm_ip, _ = net.addr_for(vm.slot) + return f"http://{vm_ip}:50080" + + # ── boot / shutdown ────────────────────────────────────────────────── + + async def _spawn_firecracker(self, vm: VM) -> None: + """Start a fresh firecracker process listening on vm.api_sock.""" + with contextlib.suppress(FileNotFoundError): + vm.api_sock.unlink() + vm.api_sock.parent.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env["RUST_BACKTRACE"] = "1" + + log_path = self.cfg.logs_dir / f"firecracker-{vm.system.name}.log" + log_path.parent.mkdir(parents=True, exist_ok=True) + # Append to the existing log so prior runs are kept for postmortems. + log_fh = open(log_path, "ab", buffering=0) + + # Firecracker's --id accepts only [A-Za-z0-9-]; pg_* systems + # crash with `Invalid instance ID: InvalidChar('_')` otherwise. + fc_id = vm.system.name.replace("_", "-") + proc = await asyncio.create_subprocess_exec( + str(self.cfg.firecracker_bin), + "--api-sock", str(vm.api_sock), + "--id", fc_id, + stdout=log_fh, stderr=log_fh, env=env, start_new_session=True, + ) + vm.proc = proc + vm.pid = proc.pid + # Wait for the API socket to exist + for _ in range(80): + if vm.api_sock.exists(): + break + await asyncio.sleep(0.05) + if not vm.api_sock.exists(): + raise RuntimeError("firecracker did not create API socket in time") + + def _kernel_cmdline(self, vm: VM) -> str: + # console=ttyS0 so we get a serial-attached login (in case we drop a + # console socket for debugging); reboot=k for clean halt-on-panic. + # The kernel's built-in IP autoconfig statically assigns the VM's + # /24 from its slot, sidestepping any DHCP/networkd in userland. + # + # init_on_free=1: makes the kernel zero every page as it goes back + # on the free list. Without it, freed pages keep whatever the last + # writer put there — and Firecracker's snapshot dumps *all* RAM, + # so 8-12 GB of stale-but-freed daemon heap end up in snapshot.bin + # looking random to zstd. With it on, the pre-snapshot daemon + # shutdown leaves the guest's free pool genuinely zero-filled, and + # zstd compresses the snapshot ~300:1. The cost is a small write + # overhead on every free (~negligible vs the snapshot size win). 
+ host_ip, vm_ip, _ = net.addr_for(vm.slot) + return ( + "console=ttyS0 reboot=k panic=1 pci=off init_on_free=1 " + f"ip={vm_ip}::{host_ip}:255.255.255.0::eth0:off " + "root=/dev/vda rw " + "init=/lib/systemd/systemd " + ) + + async def _initial_provision(self, vm: VM) -> None: + """First-time boot: build per-system images, boot with internet, run + agent /provision, snapshot, shut down.""" + if vm.state != "down": + raise RuntimeError(f"unexpected state for initial provision: {vm.state}") + + # Bound the heavy I/O phases: + # _build_images_if_needed: each call does a `cp -a /base /rootfs` + # that writes ~8 GB of base content. Running 98 in parallel + # saturates the host's NVMe writeback. + # _call_agent_provision: each spawn does `apt-get install` + # against Ubuntu mirrors and pulls 100s of MB. 98 at once gets + # rate-limited by the mirror. + # Use distinct semaphores so disk and network are bounded + # independently. + log.info("[%s] initial provision begin", vm.system.name) + vm.state = "provisioning" + try: + async with self._build_sem: + await self._build_images_if_needed(vm) + async with self._provision_sem: + await net.ensure_tap(vm.slot) + await net.enable_internet(vm.slot) + await self._boot(vm, restore_snapshot=False) + await self._wait_for_agent(vm, timeout=180) + await self._call_agent_provision(vm) + await self._snapshot(vm) + await self._shutdown(vm) + if vm.system.name in DATALAKE_FILTERED: + await net.enable_filtered_internet(vm.slot) + else: + await net.disable_internet(vm.slot) + vm.state = "snapshotted" + vm.provisioned_at = time.time() + log.info("[%s] initial provision complete", vm.system.name) + except Exception as e: + self._set_last_error(vm, f"provision: {e!r}") + log.exception("[%s] provision failed", vm.system.name) + await self._teardown(vm, "provision-failed") + raise + + async def _build_images_if_needed(self, vm: VM) -> None: + sys_dir = self.cfg.systems_dir / vm.system.name + rootfs = sys_dir / "rootfs.ext4" + sysdisk = sys_dir / "system.ext4" + swap = sys_dir / "swap.raw" + base = self.cfg.state_dir / "base-rootfs.ext4" + # If we're (re-)provisioning a system whose rootfs already has + # /var/lib/clickbench-agent/provisioned set, drop just the rootfs so + # the agent reruns the full install/start/load flow on the next + # boot. The system.ext4 (scripts + ~14 GB of dataset) is preserved — + # rebuilding it copies 14 GB unnecessarily. + if rootfs.exists() and not _has_snapshot(vm): + log.info("[%s] rootfs exists but no snapshot — dropping it for " + "a fresh agent state", vm.system.name) + rootfs.unlink() + # If base-rootfs has been rebuilt since the per-system rootfs was + # cloned (typically because we updated the in-VM agent or one of + # the lib/download-* stubs), drop the stale rootfs and the system + # disk too — the system disk's upper layer holds the scripts + # rsynced from the repo, so a stale agent and stale per-system + # scripts both come from here. Without this check, every code + # change to playground/agent/agent.py silently fails to reach + # already-provisioned systems on re-provision: vm_manager finds + # rootfs.ext4 + system.ext4 already present and skips the rebuild. 
+ if rootfs.exists() and base.exists() and \ + rootfs.stat().st_mtime < base.stat().st_mtime: + log.info("[%s] base-rootfs is newer than rootfs — dropping " + "rootfs + sysdisk for a fresh agent + scripts", + vm.system.name) + rootfs.unlink() + with contextlib.suppress(FileNotFoundError): + sysdisk.unlink() + # For memory-bound dataframe systems, also (re)create a sparse + # swap.raw block device that the in-VM agent mkswaps + swapons. + # Sized to the worst-case working set we've seen; sparse so the + # host pays only for the bytes the guest actually pages out. + if vm.system.name in NEEDS_SWAP and not _has_snapshot(vm): + with contextlib.suppress(FileNotFoundError): + swap.unlink() + if vm.system.name in NEEDS_SWAP and not swap.exists(): + log.info("[%s] creating swap.raw (%d GiB sparse)", + vm.system.name, SWAP_SIZE_GB) + p = await asyncio.create_subprocess_exec( + "truncate", "-s", f"{SWAP_SIZE_GB}G", str(swap), + stderr=asyncio.subprocess.PIPE, + ) + _, err = await p.communicate() + if p.returncode != 0: + raise RuntimeError( + f"truncate swap.raw failed: {err.decode(errors='replace')[-400:]}") + if rootfs.exists() and sysdisk.exists(): + return + log.info("[%s] building rootfs + system disk", vm.system.name) + script = self.cfg.repo_dir / "playground" / "images" / "build-system-rootfs.sh" + build_env = {**os.environ, + "PLAYGROUND_STATE_DIR": str(self.cfg.state_dir)} + # Per-system sysdisk-size override for engines whose load blew + # through the 200 GiB default (postgresql-orioledb, ...). + sysdisk_override = SYSDISK_OVERRIDES_GB.get(vm.system.name) + if sysdisk_override: + build_env["VM_SYSDISK_SIZE_GB"] = str(sysdisk_override) + p = await asyncio.create_subprocess_exec( + "bash", str(script), vm.system.name, + stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT, + env=build_env, + ) + out, _ = await p.communicate() + if p.returncode != 0: + raise RuntimeError(f"build-system-rootfs failed: {out.decode(errors='replace')[-2000:]}") + + async def _boot(self, vm: VM, *, restore_snapshot: bool) -> None: + """Configure and start a Firecracker instance. If restore_snapshot is + True, we load from the snapshot files; else we cold-boot from kernel + + rootfs.""" + await self._spawn_firecracker(vm) + try: + await self._configure_boot(vm, restore_snapshot=restore_snapshot) + except Exception: + # If config fails partway, the firecracker process still owns the + # TAP fd; without reaping it, the next attempt sees "Resource + # busy" because the kernel hasn't released the TAP. Kill + + # wait() before propagating. + await self._shutdown(vm) + raise + + async def _configure_boot(self, vm: VM, *, restore_snapshot: bool) -> None: + sock = str(vm.api_sock) + + rootfs = self.cfg.systems_dir / vm.system.name / "rootfs.ext4" + sysdisk = self.cfg.systems_dir / vm.system.name / "system.ext4" + + if restore_snapshot: + # Firecracker's rule: `PUT /snapshot/load` must be the *first* + # configuring action — no boot-source, no drives, no network + # interfaces, no machine-config beforehand. The snapshot itself + # encodes all of that. We just need the same TAP available on + # the host with the same name (host_ensure_tap below handles + # this). + await fc.put(sock, "/snapshot/load", { + "snapshot_path": str(vm.snapshot_state), + "mem_backend": {"backend_type": "File", "backend_path": str(vm.snapshot_bin)}, + "enable_diff_snapshots": False, + "resume_vm": True, + }, timeout=120.0) + return + + # Cold boot. 
+        await fc.put(sock, "/network-interfaces/eth0", {
+            "iface_id": "eth0",
+            "guest_mac": net.mac_for(vm.slot),
+            "host_dev_name": net.tap_name(vm.slot),
+        })
+        await fc.put(sock, "/boot-source", {
+            "kernel_image_path": str(self.cfg.kernel_path),
+            "boot_args": self._kernel_cmdline(vm),
+        })
+        await fc.put(sock, "/drives/rootfs", {
+            "drive_id": "rootfs",
+            "path_on_host": str(rootfs),
+            "is_root_device": True,
+            "is_read_only": False,
+        })
+        await fc.put(sock, "/drives/system", {
+            "drive_id": "system",
+            "path_on_host": str(sysdisk),
+            "is_root_device": False,
+            "is_read_only": False,
+        })
+        # Shared dataset disk, attached read-only to every VM (LABEL=cbdata
+        # mount in the guest fstab). Saves ~1-2 TB of host storage compared
+        # to embedding the dataset into each per-system disk.
+        datasets_img = self.cfg.datasets_image
+        if datasets_img.exists():
+            await fc.put(sock, "/drives/datasets", {
+                "drive_id": "datasets",
+                "path_on_host": str(datasets_img),
+                "is_root_device": False,
+                "is_read_only": True,
+            })
+        # Per-VM swap disk for memory-bound dataframe systems. The in-VM
+        # agent finds it by scanning /dev/vd? for an unformatted block
+        # device, then mkswaps + swapons it. Reflink-snapshotted along
+        # with rootfs + sysdisk so a restored VM resumes with the same
+        # paged-out pages it had at snapshot time.
+        if vm.system.name in NEEDS_SWAP:
+            swap_path = self.cfg.systems_dir / vm.system.name / "swap.raw"
+            if swap_path.exists():
+                await fc.put(sock, "/drives/swap", {
+                    "drive_id": "swap",
+                    "path_on_host": str(swap_path),
+                    "is_root_device": False,
+                    "is_read_only": False,
+                })
+        await fc.put(sock, "/machine-config", {
+            "vcpu_count": self.cfg.vm_vcpus,
+            "mem_size_mib": self.cfg.vm_mem_mib,
+            "smt": False,
+        })
+        await fc.put(sock, "/actions", {"action_type": "InstanceStart"})
+
+    async def _snapshot(self, vm: VM) -> None:
+        # Flush the guest's dirty pages to the virtio-blk devices before we
+        # pause the vcpus. Without an explicit sync here, KVM can freeze
+        # the guest mid-flush — the snapshot then captures memory that
+        # references on-disk blocks that haven't actually landed yet, and
+        # the next read after restore sees a checksum mismatch / torn
+        # write on whatever was being written at the moment of pause.
+        await self._sync_guest(vm)
+
+        sock = str(vm.api_sock)
+        # Bound concurrent snapshots. /snapshot/create writes ~16 GB of
+        # memory to disk and ~30 simultaneous ones serialize on a single
+        # NVMe long enough to time out individual VMs.
+        async with self._snapshot_sem:
+            await fc.patch(sock, "/vm", {"state": "Paused"})
+            try:
+                # 60 min. Even with snapshot_sem bounding to 6 concurrent
+                # snapshots, the rest of the host's I/O (install/load
+                # writes from 30+ other VMs in the apt/pip phase) competes
+                # for the same NVMe and stretches /snapshot/create well
+                # past 30 min in the long tail.
+                await fc.put(sock, "/snapshot/create", {
+                    "snapshot_type": "Full",
+                    "snapshot_path": str(vm.snapshot_state),
+                    "mem_file_path": str(vm.snapshot_bin),
+                }, timeout=3600.0)
+                # Capture the *disk* state while the VM is still paused —
+                # the memory snapshot has in-flight references to specific
+                # inodes / file positions / mmap'd ranges on the rootfs and
+                # system disks, and any post-pause writes (journal commits,
+                # atime updates, etc.) by Firecracker on resume would tear
+                # the golden disk relative to the memory image and surface
+                # as ext4 EBADMSG on restore for whichever file's metadata
+                # got dirtied. Reflink-clone keeps the working disks live
+                # for the clean shutdown that follows.
+                await self._snapshot_disks(vm)
+            finally:
+                # Try to resume so we can shut down cleanly; ignore failures.
+                with contextlib.suppress(Exception):
+                    await fc.patch(sock, "/vm", {"state": "Resumed"})
+
+    # We no longer compress the memory dump. Firecracker mmaps
+    # snapshot.bin on restore, so leaving it uncompressed means a
+    # restore is O(1) for memory (the kernel page-faults pages in
+    # lazily). The cost is disk: ~16 GB nominal per system. Sparse-
+    # write + init_on_free=1 + pre-snapshot drop_caches+fstrim keep
+    # the actual on-disk size to ~5-10% of the apparent size for
+    # most systems. snapshot.state stays as-is; it's tiny (~60 KB).
+
+    async def _compress_snapshot(self, vm: VM) -> None:
+        bin_path = vm.snapshot_bin
+        zst_path = vm.snapshot_bin.with_suffix(".bin.zst")
+        if not bin_path.exists():
+            return
+        log.info("[%s] zstd -T0 -3 snapshot.bin (%s)",
+                 vm.system.name, _fmt_size(bin_path.stat().st_size))
+        t0 = time.monotonic()
+        # Stream from snapshot.bin to .zst, multi-threaded. `--long=27`
+        # widens the matching window to 128 MB which helps with repetitive
+        # zero-region patterns common in guest RAM.
+        proc = await asyncio.create_subprocess_exec(
+            "zstd", "-T0", "-3", "--long=27", "-q", "-f",
+            str(bin_path), "-o", str(zst_path),
+        )
+        rc = await proc.wait()
+        dt = time.monotonic() - t0
+        if rc != 0:
+            log.warning("[%s] zstd compression failed rc=%d; keeping raw .bin",
+                        vm.system.name, rc)
+            zst_path.unlink(missing_ok=True)
+            return
+        new = zst_path.stat().st_size
+        log.info("[%s] zstd done in %.1fs: %s -> %s (%.1fx)",
+                 vm.system.name, dt,
+                 _fmt_size(bin_path.stat().st_size), _fmt_size(new),
+                 bin_path.stat().st_size / max(1, new))
+        # The raw .bin can go; restore re-decompresses it to .bin on demand.
+        bin_path.unlink(missing_ok=True)
+
+    async def _decompress_snapshot(self, vm: VM) -> None:
+        """If the snapshot lives as .bin.zst, decompress to .bin in place.
+        Idempotent: a no-op if .bin already exists.
+        """
+        bin_path = vm.snapshot_bin
+        zst_path = vm.snapshot_bin.with_suffix(".bin.zst")
+        if bin_path.exists():
+            return
+        if not zst_path.exists():
+            return
+        log.info("[%s] unzstd snapshot.bin.zst (%s)",
+                 vm.system.name, _fmt_size(zst_path.stat().st_size))
+        t0 = time.monotonic()
+        proc = await asyncio.create_subprocess_exec(
+            "zstd", "-T0", "-d", "-q", "-f", "--long=27",
+            str(zst_path), "-o", str(bin_path),
+        )
+        rc = await proc.wait()
+        dt = time.monotonic() - t0
+        if rc != 0:
+            raise RuntimeError(f"zstd decompress failed rc={rc}")
+        log.info("[%s] unzstd done in %.1fs -> %s",
+                 vm.system.name, dt, _fmt_size(bin_path.stat().st_size))
+
+    async def _restore_snapshot(self, vm: VM) -> None:
+        log.info("[%s] restore from snapshot", vm.system.name)
+        # Restore is the only auto-recovery path from a user /query. If
+        # the on-disk snapshot is gone (manual wipe, half-built artifact,
+        # ...) we fail loudly here; the operator has to kick a fresh
+        # provision via /api/admin/provision/<system>.
+        if not _has_snapshot(vm):
+            vm.state = "down"
+            raise RuntimeError(
+                f"[{vm.system.name}] snapshot on disk is missing; "
+                f"POST /api/admin/provision/{vm.system.name} to rebuild")
+        # Always boot from a *fresh copy* of the golden disks captured at
+        # snapshot time. Restore #N inherits zero state from restore #N-1,
+        # which is what makes the playground safe to expose to arbitrary
+        # SQL: the worst a user query can do is dirty the working copy,
+        # which we throw away on the next /teardown.
+ await self._restore_disks(vm) + # If we only have the zstd-compressed memory dump, expand it before + # Firecracker tries to mmap it. + await self._decompress_snapshot(vm) + await net.ensure_tap(vm.slot) + # Systems that read live S3 at query time get the SNI-allowlist + # proxy. Everything else stays fully offline post-snapshot. + if vm.system.name in DATALAKE_FILTERED: + await net.enable_filtered_internet(vm.slot) + await self._boot(vm, restore_snapshot=True) + await self._wait_for_agent(vm, timeout=60) + # Block here until the system's daemon reports ready, so the + # first user query doesn't time out mid-startup. Big upper bound + # for slow JVMs (Doris/Druid/Trino). + await self._wait_for_daemon_ready(vm, timeout=600) + vm.state = "ready" + vm.ready_since = time.time() + # Baseline the firecracker's current jiffy counter so the + # per-VM CPU-cap watchdog can bill only post-ready CPU time. + vm.cpu_baseline_jiffies = _read_proc_jiffies(vm.pid) if vm.pid else 0 + + def _golden_paths(self, vm: VM) -> tuple[Path, Path, Path, Path]: + """(working rootfs, working sysdisk, golden rootfs, golden sysdisk).""" + sys_dir = self.cfg.systems_dir / vm.system.name + return ( + sys_dir / "rootfs.ext4", + sys_dir / "system.ext4", + sys_dir / "rootfs.golden.ext4", + sys_dir / "system.golden.ext4", + ) + + def _swap_paths(self, vm: VM) -> tuple[Path, Path] | None: + """(working swap.raw, golden swap.raw) — or None for systems + that don't need swap.""" + if vm.system.name not in NEEDS_SWAP: + return None + sys_dir = self.cfg.systems_dir / vm.system.name + return sys_dir / "swap.raw", sys_dir / "swap.golden.raw" + + async def _snapshot_disks(self, vm: VM) -> None: + rootfs, sysdisk, rootfs_gold, sysdisk_gold = self._golden_paths(vm) + # Reflink-clone the working images into the golden slot. We can't + # rename: the working file stays bound to Firecracker's open + # virtio-blk fd through the post-snapshot resume + shutdown, and + # any writes during that window would leak into the golden (we + # observed restored systems hitting ext4 EBADMSG on small files + # like duckdb's hits.db.wal and a venv activate script). With + # reflink the snapshot is near-instant; the working file's + # post-snapshot writes diverge into its own extents and don't + # touch the golden. The disk is btrfs with compress=zstd so + # the goldens occupy ~30-50% of their apparent size on disk + # transparently — no application-level compression needed. 
+ async def _clone(src: Path, dst: Path) -> None: + if dst.exists(): + dst.unlink() + proc = await asyncio.create_subprocess_exec( + "cp", "--reflink=always", str(src), str(dst), + stderr=asyncio.subprocess.PIPE, + ) + _, err = await proc.communicate() + if proc.returncode != 0: + raise RuntimeError( + f"reflink snapshot cp {src} -> {dst} failed: " + f"{err.decode(errors='replace')[-400:]}") + clones = [ + _clone(rootfs, rootfs_gold), + _clone(sysdisk, sysdisk_gold), + ] + swap_pair = self._swap_paths(vm) + if swap_pair is not None and swap_pair[0].exists(): + clones.append(_clone(swap_pair[0], swap_pair[1])) + await asyncio.gather(*clones) + sizes = [_fmt_size(rootfs_gold.stat().st_size), + _fmt_size(sysdisk_gold.stat().st_size)] + if swap_pair is not None and swap_pair[1].exists(): + sizes.append(_fmt_size(swap_pair[1].stat().st_size)) + log.info("[%s] golden disks saved (%s)", vm.system.name, + ", ".join(sizes)) + + async def _restore_disks(self, vm: VM) -> None: + rootfs, sysdisk, rootfs_gold, sysdisk_gold = self._golden_paths(vm) + if not rootfs_gold.exists() or not sysdisk_gold.exists(): + raise RuntimeError( + f"[{vm.system.name}] missing golden disks; cannot restore") + # Reflink-clone the goldens into fresh working copies. The host + # filesystem is btrfs with `compress=zstd` enabled; reflink is + # O(1) (extent-list copy) and the engine transparently + # decompresses on read, so restore latency is dominated by + # firecracker boot, not disk I/O. Both clones run concurrently; + # they touch disjoint files. + async def _clone(src: Path, dst: Path) -> None: + if dst.exists(): + dst.unlink() + proc = await asyncio.create_subprocess_exec( + "cp", "--reflink=always", str(src), str(dst), + stderr=asyncio.subprocess.PIPE, + ) + _, err = await proc.communicate() + if proc.returncode != 0: + raise RuntimeError( + f"reflink cp {src} -> {dst} failed: " + f"{err.decode(errors='replace')[-400:]}") + clones = [ + _clone(rootfs_gold, rootfs), + _clone(sysdisk_gold, sysdisk), + ] + swap_pair = self._swap_paths(vm) + if swap_pair is not None and swap_pair[1].exists(): + clones.append(_clone(swap_pair[1], swap_pair[0])) + await asyncio.gather(*clones) + log.info("[%s] working disks reflink-cloned from golden", + vm.system.name) + + async def _shutdown(self, vm: VM) -> None: + """Best-effort clean shutdown of the firecracker process. + + Always reap the asyncio.subprocess.Process handle so the kernel + releases its open file descriptors (notably the TAP — without this + the next /restore for the same slot fails with `Resource busy`). + """ + if not vm.pid and not vm.proc: + return + with contextlib.suppress(Exception): + await fc.put(str(vm.api_sock), "/actions", {"action_type": "SendCtrlAltDel"}) + # Wait briefly for graceful exit. + for _ in range(50): + if vm.pid is None or not _pid_alive(vm.pid): + break + await asyncio.sleep(0.1) + if vm.pid is not None and _pid_alive(vm.pid): + with contextlib.suppress(ProcessLookupError): + os.kill(vm.pid, signal.SIGKILL) + # Reap the process. asyncio.Process.wait() drains the exit status so + # the kernel can release the resources (TAP fd, memory mappings). 
+ if vm.proc is not None: + with contextlib.suppress(Exception): + await asyncio.wait_for(vm.proc.wait(), timeout=5.0) + vm.proc = None + vm.pid = None + with contextlib.suppress(FileNotFoundError): + vm.api_sock.unlink() + + async def _teardown(self, vm: VM, reason: str) -> None: + log.warning("[%s] teardown: %s", vm.system.name, reason) + with contextlib.suppress(Exception): + await self._shutdown(vm) + vm.state = "snapshotted" if _has_snapshot(vm) else "down" + vm.ready_since = None + vm.cpu_baseline_jiffies = 0 + # Drop the decompressed snapshot.bin if we still have the .zst — it's + # ~16 GB of redundancy on disk. Keep .zst as the canonical artifact. + zst = vm.snapshot_bin.with_suffix(".bin.zst") + if vm.snapshot_bin.exists() and zst.exists(): + with contextlib.suppress(FileNotFoundError): + vm.snapshot_bin.unlink() + # Discard the working disks. Any changes the daemon scribbled into + # them during this session (background merges, log writes, /tmp + # churn) die with them; the next restore will clone fresh copies + # from the golden disks, so user N+1 sees the same starting state + # as user N. + if _has_snapshot(vm): + rootfs, sysdisk, _, _ = self._golden_paths(vm) + to_unlink = [rootfs, sysdisk] + swap_pair = self._swap_paths(vm) + if swap_pair is not None: + to_unlink.append(swap_pair[0]) + for p in to_unlink: + with contextlib.suppress(FileNotFoundError): + p.unlink() + + # ── agent helpers ──────────────────────────────────────────────────── + + async def _agent_healthy(self, vm: VM) -> bool: + if not vm.pid or not _pid_alive(vm.pid): + return False + url = self.agent_url(vm) + "/health" + try: + async with aiohttp.ClientSession() as s: + async with s.get(url, timeout=aiohttp.ClientTimeout(total=2)) as r: + return r.status == 200 + except Exception: + return False + + async def _wait_for_agent(self, vm: VM, *, timeout: float) -> None: + url = self.agent_url(vm) + "/health" + t0 = time.monotonic() + last_err: Exception | None = None + async with aiohttp.ClientSession() as s: + while time.monotonic() - t0 < timeout: + try: + async with s.get(url, timeout=aiohttp.ClientTimeout(total=2)) as r: + if r.status == 200: + return + except Exception as e: + last_err = e + await asyncio.sleep(0.5) + raise RuntimeError(f"agent unreachable after {timeout}s: {last_err!r}") + + async def _wait_for_daemon_ready(self, vm: VM, *, timeout: float) -> None: + """Wait for the system's daemon to start serving (post-restore). + + Slow JVM daemons (Doris, Druid, Trino) can take several minutes to + come up after a snapshot restore. The agent's daemon-kick thread + runs ./start + ./check in the background; /ready flips to 200 once + that completes. Without this gate, the first user query lands + mid-start and times out at the host's 60 s query budget. + """ + url = self.agent_url(vm) + "/ready" + t0 = time.monotonic() + async with aiohttp.ClientSession() as s: + while time.monotonic() - t0 < timeout: + try: + async with s.get(url, timeout=aiohttp.ClientTimeout(total=2)) as r: + if r.status == 200: + return + except Exception: + pass + await asyncio.sleep(1.0) + log.warning("[%s] daemon not ready after %s s; serving queries anyway", + vm.system.name, timeout) + + async def _call_agent_provision(self, vm: VM) -> None: + url = self.agent_url(vm) + "/provision" + # No fast idle check — /provision is a single POST that returns + # only when install+load is fully done. The TCP connection sits + # idle (no body streaming) for the entire run. 
Some systems take
+        # many hours to load 100 M rows; we just set a generous total
+        # deadline so a genuinely stuck call eventually breaks.
+        async with aiohttp.ClientSession() as s:
+            async with s.post(
+                url,
+                timeout=aiohttp.ClientTimeout(
+                    total=7 * 86400, sock_connect=30,
+                ),
+            ) as r:
+                body = await r.read()
+        # Stash the full body — the 2000-byte tail we surface in
+        # the exception only covers the install epilogue; the real
+        # failure often happens later (start, check, load).
+        dump = self.cfg.logs_dir / f"provision-{vm.system.name}.log"
+        with contextlib.suppress(Exception):
+            dump.write_bytes(body)
+        if r.status >= 300:
+            raise RuntimeError(
+                f"agent /provision failed: {r.status}: "
+                f"{body[-2000:].decode(errors='replace')} "
+                f"(full output: {dump})")
+
+    async def _sync_guest(self, vm: VM) -> None:
+        url = self.agent_url(vm) + "/sync"
+        try:
+            async with aiohttp.ClientSession() as s:
+                async with s.post(url, timeout=aiohttp.ClientTimeout(total=300)) as r:
+                    body = (await r.read()).decode("utf-8", errors="replace").strip()
+                    log.info("[%s] guest sync: %s", vm.system.name, body)
+        except Exception as e:
+            log.warning("[%s] guest sync failed (%r); proceeding anyway", vm.system.name, e)
+
+
+def _has_snapshot(vm: VM) -> bool:
+    """A snapshot is complete only when *both* the memory image and the
+    golden disks have been captured. A half-built snapshot (memory present
+    but goldens missing, or vice versa) is treated as no snapshot at all
+    so the next ensure_ready_for_query re-provisions cleanly.
+    """
+    mem_ok = (vm.snapshot_bin.exists() or
+              vm.snapshot_bin.with_suffix(".bin.zst").exists())
+    sys_dir = vm.snapshot_bin.parent
+    disks_ok = ((sys_dir / "rootfs.golden.ext4").exists() and
+                (sys_dir / "system.golden.ext4").exists())
+    return mem_ok and disks_ok
+
+
+def _fmt_size(n: float) -> str:
+    for u in ("B", "KiB", "MiB", "GiB", "TiB"):
+        if n < 1024:
+            return f"{n:.1f}{u}"
+        n /= 1024
+    return f"{n:.1f}PiB"
+
+
+def _pid_alive(pid: int) -> bool:
+    try:
+        os.kill(pid, 0)
+        return True
+    except ProcessLookupError:
+        return False
+    except PermissionError:
+        return True
+
+
+def _read_proc_jiffies(pid: int) -> int:
+    """Return (utime+stime) for `pid` in jiffies, or 0 if unreadable."""
+    try:
+        stat = Path(f"/proc/{pid}/stat").read_text()
+    except (FileNotFoundError, PermissionError):
+        return 0
+    end = stat.rfind(")")
+    parts = stat[end + 2:].split()
+    try:
+        return int(parts[11]) + int(parts[12])
+    except (IndexError, ValueError):
+        return 0
diff --git a/playground/web/app.js b/playground/web/app.js
new file mode 100644
index 0000000000..dc75c54396
--- /dev/null
+++ b/playground/web/app.js
@@ -0,0 +1,825 @@
+// ClickBench Playground — minimal vanilla-JS client.
+//
+// Talks to the host API.
+//  1. On load, fetch /api/systems for the catalog and /api/state for live
+//     states. Render systems as a vertical list, colored by current state.
+//  2. Re-poll /api/state every 2 s and re-color the list. The currently
+//     selected system also re-renders its status JSON blob below.
+//  3. On click of a system row, select it. On "Run query", POST the SQL to
+//     /api/query?system=<name> and render output as plain text in a <pre>.
+
+const $ = (sel) => document.querySelector(sel);
+// When the page is served by the playground over HTTP, relative URLs
+// work. When it's opened from disk (file://), relative fetches resolve
+// against file:// and fail; rewrite to an absolute localhost URL.
+// CORS is handled by the server's middleware (Access-Control-Allow-Origin: *).
+const API = location.protocol === "file:" ? "http://localhost:8000" : "";
+
+const listEl = $("#system-list");
+const queryEl = $("#query");
+const runBtn = $("#run");
+const outEl = $("#output");
+const outLabelEl = $("#output-label");
+const timeEl = $("#time");
+const stateBlob = $("#state-blob");
+const lastErrorEl = $("#last-error");
+const exampleSel = $("#example");
+// #ui-stats and #ui-output are toggled independently by
+// showResult/runQuery — only visible once there's a result for the
+// selected system.
+const uiActive = ["#ui-active", "#ui-query"].map($);
+const uiStats = $("#ui-stats");
+const uiOutput = $("#ui-output");
+const uiDown = $("#ui-down");
+
+let catalog = [];          // [{name, display_name, data_format, ...}]
+let stateByName = {};      // {name: {state, ...}}
+let selected = null;       // selected system name
+let pollTimer = null;
+let resultsByName = {};    // {name: {output, time, wall, bytes, truncated, exit}}
+let queriesByName = {};    // {name: [q1, q2, ...]}
+// The exact string we last auto-populated the textarea with (from an
+// example). If the current textarea still equals it, the user hasn't
+// edited it and we're free to swap in the next system's example.
+let pristineQuery = "";
+
+async function loadCatalog() {
+    const r = await fetch(API + "/api/systems");
+    catalog = await r.json();
+    catalog.sort((a, b) => a.display_name.localeCompare(b.display_name));
+    renderList();
+    const hash = (location.hash || "").slice(1);
+    if (hash && catalog.some(s => s.name === hash)) {
+        select(hash);
+    } else if (catalog.some(s => s.name === "clickhouse")) {
+        select("clickhouse");
+    } else if (catalog.length) {
+        select(catalog[0].name);
+    }
+}
+
+function renderList() {
+    listEl.innerHTML = "";
+    for (const s of catalog) {
+        const sObj = stateByName[s.name];
+        const st = (sObj && sObj.state) || "down";
+        const row = document.createElement("div");
+        row.className = `system-item state-${st}` + (s.name === selected ? " selected" : "");
+        row.dataset.name = s.name;
+        row.textContent = s.display_name;
+        row.dataset.tooltip = tooltipFor(sObj, st);
+        row.addEventListener("click", () => onSlabClick(s.name));
+        // In competition mode, hovering a slab highlights its row in
+        // the leaderboard so the user can scan from picker to result
+        // without losing context.
+        row.addEventListener("mouseenter", () => _setRailHover(s.name));
+        row.addEventListener("mouseleave", () => _setRailHover(null));
+        listEl.appendChild(row);
+    }
+}
+
+function tooltipFor(sObj, st) {
+    if (st === "ready") {
+        const since = sObj && sObj.ready_since;
+        if (since) {
+            const ago = Math.max(0, Math.floor(Date.now() / 1000 - since));
+            return "up " + formatDuration(ago);
+        }
+        return "up";
+    }
+    if (st === "snapshotted") return "ready";
+    if (st === "provisioning") return "provisioning";
+    if (st === "down") return "failed";
+    return st;
+}
+
+function formatDuration(secs) {
+    if (secs < 60) return `${secs} second${secs === 1 ? "" : "s"}`;
+    if (secs < 3600) {
+        const m = Math.floor(secs / 60);
+        return `${m} minute${m === 1 ? "" : "s"}`;
+    }
+    if (secs < 86400) {
+        const h = Math.floor(secs / 3600);
+        return `${h} hour${h === 1 ? "" : "s"}`;
+    }
+    const d = Math.floor(secs / 86400);
+    return `${d} day${d === 1 ? "" : "s"}`;
+}
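+
+// Worked example (hypothetical inputs): the formatter always floors to the
+// single largest unit, so 3599 s reads as "59 minutes", never "59 m 59 s".
+console.assert(formatDuration(1) === "1 second");
+console.assert(formatDuration(3599) === "59 minutes");
+console.assert(formatDuration(90000) === "1 day");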
+
+function onSlabClick(name) {
+    // Systems that are mid-install/load aren't queryable yet; ignore
+    // clicks so the user doesn't get a stranded selection.
+    const st = stateByName[name] && stateByName[name].state;
+    if (st === "provisioning") return;
+    // Click on the already-selected system = shortcut to run the
+    // current query, as long as that system is in a queryable state.
+    if (name === selected) {
+        if (st && st !== "down") runQuery();
+        return;
+    }
+    select(name);
+}
+
+function select(name) {
+    selected = name;
+    location.hash = name;
+    for (const row of listEl.children) {
+        row.classList.toggle("selected", row.dataset.name === name);
+    }
+    if (stateByName[name]) {
+        stateBlob.textContent = JSON.stringify(stateByName[name], null, 2);
+    }
+    showResult(resultsByName[name]);
+    loadExamples(name);
+    refreshDownUI();
+    // Kick the restore in the background so the VM is hopefully ready
+    // by the time the user presses Run query. No-op if the system is
+    // already ready / provisioning / has no snapshot.
+    maybeWarmup(name);
+}
+
+function maybeWarmup(name) {
+    const s = stateByName[name];
+    if (!s || s.state !== "snapshotted") return;
+    fetch(`${API}/api/warmup/${encodeURIComponent(name)}`, {method: "POST"})
+        .catch(() => {});  // fire-and-forget
+}
+
+async function loadExamples(name) {
+    let qs = queriesByName[name];
+    if (!qs) {
+        try {
+            const r = await fetch(`${API}/api/queries/${encodeURIComponent(name)}`);
+            qs = r.ok ? await r.json() : [];
+        } catch (e) {
+            qs = [];
+        }
+        queriesByName[name] = qs;
+    }
+    if (selected !== name) return;  // user moved on
+    // Preserve the example index across system switches: if the user
+    // had Q5 selected for system A, switching to B keeps Q5.
+    const prevIndex = parseInt(exampleSel.value, 10);
+    exampleSel.innerHTML = "";
+    if (!qs.length) {
+        const o = document.createElement("option");
+        o.textContent = "(no examples)";
+        o.disabled = true;
+        exampleSel.appendChild(o);
+    } else {
+        // Unselected placeholder so the first real change(...) the
+        // user picks always counts as "different from current value"
+        // and fires the change handler — even if they're re-picking
+        // the option that was selected before they edited the
+        // textarea. The textarea-edit handler below resets us back
+        // to this entry.
+        const placeholder = document.createElement("option");
+        placeholder.value = "";
+        placeholder.textContent = "—";
+        placeholder.disabled = true;
+        exampleSel.appendChild(placeholder);
+        for (let i = 0; i < qs.length; i++) {
+            const o = document.createElement("option");
+            o.value = String(i);
+            const label = qs[i].replace(/\s+/g, " ").slice(0, 90);
+            o.textContent = `Q${i + 1}: ${label}`;
+            exampleSel.appendChild(o);
+        }
+        // Clamp prevIndex into range; default to 0.
+        let idx = 0;
+        if (!isNaN(prevIndex) && prevIndex >= 0 && prevIndex < qs.length) {
+            idx = prevIndex;
+        }
+        exampleSel.value = String(idx);
+        refreshRunAllVisibility();
+        // Replace the textarea with this system's example at the same
+        // index, but only if the user hasn't edited the current text
+        // (i.e., it still matches whatever example we last set, or
+        // it's empty).
+        const isPristine = queryEl.value === pristineQuery
+            || !queryEl.value.trim();
+        if (isPristine) {
+            queryEl.value = qs[idx];
+            pristineQuery = qs[idx];
+        }
+    }
+}
+
+let lastDownShownName = null;
+
+function refreshDownUI() {
+    const s = stateByName[selected];
+    const isDown = s && s.state === "down";
+    for (const el of uiActive) {
+        if (el) el.style.display = isDown ? "none" : "";
+    }
+    if (isDown) {
+        uiOutput.style.display = "none";
+        uiStats.style.display = "none";
+    }
+    uiDown.style.display = isDown ? "" : "none";
+    if (isDown) {
+        // Render the last error once per selection. If poll picks up a
+        // new last_error for the same system later, leave the UI alone
+        // — the user is reading the text, we shouldn't move it under
+        // their eyes.
+        if (lastDownShownName !== selected) {
+            const raw = (s && s.last_error) || "(no error recorded)";
+            lastErrorEl.textContent = raw
+                .replace(/\\n/g, "\n")
+                .replace(/\\t/g, "\t")
+                .replace(/\\r/g, "");
+            lastDownShownName = selected;
+        }
+    } else {
+        lastDownShownName = null;
+    }
+}
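+
+// The literal-escape unescaping above mirrors how the host records errors:
+// vm_manager persists `f"provision: {e!r}"`, i.e. Python repr() text in
+// which a real newline arrives as the two characters "\" + "n". A quick
+// illustration with a hypothetical persisted value:
+const _demoErr = "provision: RuntimeError('line 1\\nline 2')";
+console.assert(_demoErr.replace(/\\n/g, "\n").includes("line 1\nline 2"));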
+
+function _maybePrettyJson(s) {
+    // If the body parses as JSON (entire string), re-emit it with
+    // 2-space indentation. Otherwise return the original — most
+    // engines print plain tables and we shouldn't touch them.
+    if (!s || s.length < 2) return s;
+    // Cheap pre-filter: only attempt JSON.parse when the first
+    // non-whitespace byte looks like '{' or '['. Avoids pushing every
+    // large plain-text result through a JSON.parse try/catch.
+    let i = 0;
+    while (i < s.length && (s.charCodeAt(i) <= 32)) i++;
+    const c = s.charCodeAt(i);
+    if (c !== 123 /* { */ && c !== 91 /* [ */) return s;
+    try {
+        const parsed = JSON.parse(s);
+        // Only pretty-print structured values; bare numbers/strings/
+        // booleans shouldn't be re-serialized.
+        if (parsed === null || typeof parsed !== "object") return s;
+        return JSON.stringify(parsed, null, 2);
+    } catch {
+        return s;
+    }
+}
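+
+// Sanity checks (hypothetical outputs): only whole-string JSON objects and
+// arrays are re-indented; scalars and plain tables pass through untouched.
+console.assert(_maybePrettyJson('{"a":1}') === '{\n  "a": 1\n}');
+console.assert(_maybePrettyJson("7") === "7");
+console.assert(_maybePrettyJson("col1\tcol2\n1\t2") === "col1\tcol2\n1\t2");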
+
+function showResult(r) {
+    if (!r) {
+        outEl.textContent = "";
+        timeEl.textContent = "—";
+        outLabelEl.textContent = "Output";
+        uiOutput.style.display = "none";
+        uiStats.style.display = "none";
+        return;
+    }
+    outEl.textContent = _maybePrettyJson(r.output);
+    timeEl.textContent = r.time;
+    outLabelEl.textContent = r.truncated === "yes" ? "Output (truncated)" : "Output";
+    uiOutput.style.display = "";
+    uiStats.style.display = "";
+}
+
+async function pollState() {
+    try {
+        const r = await fetch(API + "/api/state");
+        if (!r.ok) throw new Error(`HTTP ${r.status}`);
+        const arr = await r.json();
+        stateByName = {};
+        for (const s of arr) stateByName[s.name] = s;
+        // Update each row's color + state badge without rebuilding the DOM
+        for (const row of listEl.children) {
+            const s = stateByName[row.dataset.name];
+            const st = (s && s.state) || "down";
+            row.className = `system-item state-${st}` +
+                (row.dataset.name === selected ? " selected" : "");
+            row.dataset.tooltip = tooltipFor(s, st);
+        }
+        if (selected && stateByName[selected]) {
+            stateBlob.textContent = JSON.stringify(stateByName[selected], null, 2);
+        }
+        refreshDownUI();
+    } catch (e) {
+        stateBlob.textContent = String(e);
+    }
+}
+
+// X-Error is URL-encoded on the wire because HTTP headers can't carry
+// raw \n. Decode here so real newlines survive end-to-end. Falls back
+// to the raw value if the header isn't actually encoded (older agents).
+function _decodeXError(s) {
+    if (!s) return s;
+    try { return decodeURIComponent(s); }
+    catch { return s; }
+}
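+
+// Round-trip sketch, assuming the host percent-encodes the same set of
+// characters encodeURIComponent does: the wire form carries no raw CR/LF
+// (which would be illegal in a header), and decoding restores the newlines.
+const _wire = encodeURIComponent("syntax error\nline 2");
+console.assert(!/[\r\n]/.test(_wire));
+console.assert(_decodeXError(_wire) === "syntax error\nline 2");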
+
+async function runQuery() {
+    if (!selected) return;
+    const sql = queryEl.value;
+    if (!sql.trim()) return;
+    // Running a single query takes us out of competition view.
+    hideRunAll();
+    runBtn.disabled = true;
+    outEl.textContent = "(running …)";
+    timeEl.textContent = "…";
+    outLabelEl.textContent = "Output";
+    uiOutput.style.display = "";
+    uiStats.style.display = "";
+
+    const target = selected;  // capture in case the user switches mid-flight
+    const t0 = performance.now();
+    let payload = null;
+    try {
+        const r = await fetch(`${API}/api/query?system=${encodeURIComponent(target)}`, {
+            method: "POST",
+            body: sql,
+            headers: {"Content-Type": "application/octet-stream"},
+        });
+        const body = await r.arrayBuffer();
+        const txt = bytesToText(body) || "(no output)";
+        const h = (k) => r.headers.get(k);
+        const qt = h("X-Query-Time");
+        const wt = h("X-Wall-Time");
+        let output = txt;
+        if (r.status >= 400) {
+            const err = _decodeXError(h("X-Error"));
+            if (err) {
+                const trailer = `\n\n(error)\n${err}`;
+                output = (txt === "(no output)" ? "" : txt) + trailer;
+            }
+        }
+        payload = {
+            output,
+            time: qt ? `${parseFloat(qt).toFixed(3)} s` : "—",
+            wall: wt ? `${parseFloat(wt).toFixed(3)} s` : `${((performance.now() - t0) / 1000).toFixed(3)} s`,
+            bytes: h("X-Output-Bytes") || String(body.byteLength),
+            truncated: h("X-Output-Truncated") === "1" ? "yes" : "no",
+            exit: h("X-Exit-Code") || String(r.status),
+        };
+        // Permalink: the server returns a base64url 64-bit id; drop
+        // it in the URL bar so reload/share keeps the result.
+        const qid = h("X-Query-Id");
+        if (qid) {
+            const u = new URL(window.location.href);
+            u.searchParams.set("q", qid);
+            window.history.replaceState({}, "", u.toString());
+        }
+    } catch (e) {
+        payload = {
+            output: `(client error)\n${e}`,
+            time: "—", wall: "—", bytes: "—", truncated: "—", exit: "err",
+        };
+    } finally {
+        runBtn.disabled = false;
+    }
+    resultsByName[target] = payload;
+    if (selected === target) showResult(payload);
+}
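+
+// Assumed shape of X-Query-Id, going by the comment above: base64url over
+// a 64-bit value is 11 chars of [A-Za-z0-9_-] once padding is dropped.
+// Illustrative only; the real id always comes from the server.
+function _demoQueryId() {
+    const bytes = crypto.getRandomValues(new Uint8Array(8));
+    return btoa(String.fromCharCode(...bytes))
+        .replace(/\+/g, "-").replace(/\//g, "_").replace(/=+$/, "");
+}
+console.assert(/^[A-Za-z0-9_-]{11}$/.test(_demoQueryId()));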
+
+function bytesToText(buf) {
+    try {
+        // Lossy UTF-8 decode; invalid sequences become U+FFFD instead of
+        // throwing ({fatal: false}).
+        return new TextDecoder("utf-8", {fatal: false}).decode(buf);
+    } catch {
+        // TextDecoder missing (very old browsers): fall back to a
+        // byte-per-char (latin-1) decode so the output stays readable.
+        return [...new Uint8Array(buf)].map(b => String.fromCharCode(b)).join("");
+    }
+}
+
+runBtn.addEventListener("click", runQuery);
+function applyExampleIdx(i) {
+    const qs = queriesByName[selected];
+    if (!qs || isNaN(i) || i < 0 || i >= qs.length) return;
+    queryEl.value = qs[i];
+    pristineQuery = qs[i];
+}
+
+function applyCurrentExample() {
+    applyExampleIdx(parseInt(exampleSel.value, 10));
+}
+
+exampleSel.addEventListener("change", () => {
+    applyCurrentExample();
+    refreshRunAllVisibility();
+    hideRunAll();
+});
+// When the user types in the textarea, mark the select as
+// "unselected" (the disabled placeholder option). That way a
+// subsequent click on whatever they had picked before counts as a
+// real change and re-applies the example — no more blur-listener
+// hack.
+queryEl.addEventListener("input", () => {
+    if (queryEl.value !== pristineQuery) {
+        exampleSel.value = "";
+    }
+    refreshRunAllVisibility();
+});
+queryEl.addEventListener("keydown", (e) => {
+    if ((e.metaKey || e.ctrlKey) && e.key === "Enter") runQuery();
+});
+
+async function maybeLoadShared() {
+    // ?q= permalink — fetch the saved query+result and
+    // replay it as if we just ran it.
+    const u = new URL(window.location.href);
+    const qid = u.searchParams.get("q");
+    if (!qid) return;
+    try {
+        const r = await fetch(`${API}/api/saved/${encodeURIComponent(qid)}`);
+        if (!r.ok) return;
+        const row = await r.json();
+        // The CH parameterized view returns JSONEachRow → one object.
+        const sys = row.system;
+        if (sys && stateByName[sys]) {
+            select(sys);
+        }
+        queryEl.value = row.query || "";
+        pristineQuery = queryEl.value;
+        const payload = {
+            output: row.output || "(no output)",
+            time: row.query_time != null ? `${row.query_time.toFixed(3)} s` : "—",
+            wall: row.wall_time != null ? `${row.wall_time.toFixed(3)} s` : "—",
+            bytes: String(row.output_bytes ?? ""),
+            truncated: row.output_truncated ? "yes" : "no",
+            exit: String(row.status ?? ""),
+        };
+        if (row.error) {
+            payload.output = (payload.output === "(no output)" ? "" : payload.output)
+                + `\n\n(error)\n${row.error}`;
+        }
+        resultsByName[selected] = payload;
+        showResult(payload);
+    } catch (e) {
+        console.error("failed to load shared query", e);
+    }
+}
+
+(async function init() {
+    // Treat the HTML default ("SELECT COUNT(*) FROM hits;") as pristine
+    // so first-system selection is free to swap it for the first
+    // example.
+    pristineQuery = queryEl.value;
+    await loadCatalog();
+    await pollState();
+    await maybeLoadShared();
+    pollTimer = setInterval(pollState, 2000);
+})();
+
+// ─── Competition mode ────────────────────────────────────────────────
+// "Run all" fires the SAME example index against every snapshotted
+// system in parallel, each using its OWN translation of that query
+// (pandas runs hits.count(), polars runs hits.select(pl.len())..., etc.).
+// Results are rendered in a table that re-sorts on every update:
+// completed (fastest first), then failed (alphabetical), then running
+// (alphabetical).
+const runAllBtn = $("#run-all");
+const runAllSection = $("#ui-runall");
+const runAllTable = $("#runall-table");
+
+function refreshRunAllVisibility() {
+    // Always available when there's *something* to run: a picked
+    // example index OR a non-empty custom query in the textarea.
+    const haveExample = exampleSel.value !== ""
+        && !isNaN(parseInt(exampleSel.value, 10));
+    const haveCustom = queryEl.value.trim() !== "";
+    runAllBtn.style.display = (haveExample || haveCustom) ? "" : "none";
+}
+
+function _setRailHover(name) {
+    // Toggle a `.slab-hover` class on the matching tr in the
+    // competition table. No-op when the rail isn't visible.
+    if (runAllSection.style.display === "none") return;
+    const tbody = runAllTable.querySelector("tbody");
+    if (!tbody) return;
+    for (const tr of tbody.children) {
+        tr.classList.toggle("slab-hover", tr.dataset.name === name);
+    }
+}
+
+function hideRunAll() {
+    // Picking a different example or pressing Run are signals that
+    // the user's attention has moved off the competition rail.
+    // Collapse it so the right pane retakes the full width. Editing
+    // the textarea does *not* trigger this — the user may want to
+    // tweak the query, see the results refresh, and still compare
+    // against the rail.
+    if (runAllSection.style.display === "none") return;
+    runAllSection.style.display = "none";
+    uiSplit.classList.remove("split");
+}
+
+async function ensureQueriesLoaded(name) {
+    if (queriesByName[name]) return queriesByName[name];
+    try {
+        const r = await fetch(`${API}/api/queries/${encodeURIComponent(name)}`);
+        queriesByName[name] = r.ok ? await r.json() : [];
+    } catch {
+        queriesByName[name] = [];
+    }
+    return queriesByName[name];
+}
+
+const uiSplit = $("#ui-split");
+
+function _measureSplitOffset() {
+    // Pin the split row's height so the <aside> scrolls inside its
+    // own track. Read the distance from the page top to #ui-split
+    // and store it in a CSS var; the layout rule subtracts it from
+    // 100 vh.
+    const top = uiSplit.getBoundingClientRect().top + window.scrollY;
+    document.documentElement.style.setProperty("--ui-split-offset", `${top + 20}px`);
+}
+window.addEventListener("resize", _measureSplitOffset);
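+// The stylesheet side is an assumption here, but per the comment above it
+// boils down to something like:
+//   #ui-split.split aside { height: calc(100vh - var(--ui-split-offset)); }
+// The current value can be read back for debugging with
+// getComputedStyle(document.documentElement).getPropertyValue("--ui-split-offset").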
+
+async function runAll() {
+    const idx = parseInt(exampleSel.value, 10);
+    const useExampleIndex = !isNaN(idx);
+    const customQuery = queryEl.value;
+    if (!useExampleIndex && !customQuery.trim()) return;
+    // Track which mode the competition was launched in. pickFromRunAll
+    // uses this to decide whether to update pristineQuery on rail
+    // clicks: in example-mode, each row click should re-baseline
+    // pristineQuery to the new system's example translation so it
+    // tracks the visible textarea cleanly; in custom-mode the
+    // original pristineQuery is intentionally stale (different from
+    // the textarea) so the user's edit isn't treated as pristine and
+    // clobbered by loadExamples.
+    runAllExampleMode = useExampleIndex;
+    runAllBtn.disabled = true;
+    runAllSection.style.display = "";
+    uiSplit.classList.add("split");
+    _measureSplitOffset();
+    runAllSection.focus();
+
+    // Collect candidate systems. With an example picked, each system
+    // runs its OWN translation of the example at the same index
+    // (the apples-to-apples ClickBench format). With a custom query
+    // in the textarea, every system runs the exact same string —
+    // the systems whose query language doesn't accept it will just
+    // show up in the failed bucket.
+    const candidates = Object.values(stateByName)
+        .filter(s => s.state === "snapshotted" || s.state === "ready");
+    const targets = [];
+    for (const s of candidates) {
+        if (useExampleIndex) {
+            const qs = await ensureQueriesLoaded(s.name);
+            if (qs && idx < qs.length) targets.push({name: s.name, query: qs[idx]});
+        } else {
+            targets.push({name: s.name, query: customQuery});
+        }
+    }
+
+    const status = {};
+    for (const t of targets) {
+        status[t.name] = {
+            state: "running",
+            runs: [{state: "running"}, {state: "pending"}, {state: "pending"}],
+            query: t.query,
+        };
+    }
+    // Reset the flash-diff cache so the first render seeds 'running'
+    // for every row without animating them all at once.
+    runAllLast = {};
+    runAllSelected = null;
+    runAllStatus = status;
+    renderRunAll(status);
+
+    // Random order: keep the table sorted but fire requests in a
+    // shuffled sequence so no single system gets a systematic head
+    // start.
+    const shuffled = targets.slice();
+    for (let i = shuffled.length - 1; i > 0; i--) {
+        const j = Math.floor(Math.random() * (i + 1));
+        [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
+    }
+
+    async function _runOne(t) {
+        const t0 = performance.now();
+        try {
+            const r = await fetch(`${API}/api/query?system=${encodeURIComponent(t.name)}`, {
+                method: "POST",
+                body: t.query,
+                headers: {"Content-Type": "application/octet-stream"},
+            });
+            const body = await r.arrayBuffer();
+            const txt = bytesToText(body) || "(no output)";
+            const h = (k) => r.headers.get(k);
+            const qid = h("X-Query-Id");
+            if (r.status >= 400) {
+                const err = _decodeXError(h("X-Error")) || `HTTP ${r.status}`;
+                return {
+                    ok: false, note: err, qid,
+                    payload: {
+                        output: `(error)\n${err}`,
+                        time: "—", wall: "—", bytes: "—",
+                        truncated: "—", exit: String(r.status),
+                    },
+                };
+            }
+            const qt = h("X-Query-Time");
+            const wt = h("X-Wall-Time");
+            const tsec = qt != null && qt !== ""
+                ? parseFloat(qt)
+                : (wt != null && wt !== "" ? parseFloat(wt) : (performance.now() - t0) / 1000);
+            return {
+                ok: true, time: tsec, qid,
+                payload: {
+                    output: txt,
+                    time: qt ? `${parseFloat(qt).toFixed(3)} s` : "—",
+                    wall: wt ? `${parseFloat(wt).toFixed(3)} s` : `${tsec.toFixed(3)} s`,
+                    bytes: h("X-Output-Bytes") || String(body.byteLength),
+                    truncated: h("X-Output-Truncated") === "1" ? "yes" : "no",
+                    exit: h("X-Exit-Code") || String(r.status),
+                },
+            };
+        } catch (e) {
+            return {ok: false, note: String(e)};
+        }
+    }
+
+    function _recordRun(name, idx, res, query) {
+        const s = status[name];
+        if (!s) return;
+        s.runs[idx] = res.ok
+            ? {state: "done", time: res.time}
+            : {state: "failed", note: res.note};
+        // Cache the first successful run's payload for click-to-show.
+        if (res.ok && !s.payload) {
+            s.payload = res.payload;
+            s.qid = res.qid;
+            s.query = query;
+        } else if (!res.ok && !s.payload && idx === 0) {
+            s.payload = res.payload || {
+                output: `(error)\n${res.note || ""}`,
+                time: "—", wall: "—", bytes: "—", truncated: "—", exit: "err",
+            };
+            s.query = query;
+        }
+        // Overall: failed if any run failed; done when all 3 are done;
+        // running otherwise. bestTime tracks the min of whatever runs
+        // have completed so far so partial-done systems can sort into
+        // the done group instead of sitting in running until the last
+        // round lands.
+        const doneRuns = s.runs.filter(r => r.state === "done");
+        s.bestTime = doneRuns.length
+            ? Math.min(...doneRuns.map(r => r.time))
+            : undefined;
+        if (s.runs.some(r => r.state === "failed")) {
+            s.state = "failed";
+            s.note = s.runs.find(r => r.state === "failed").note;
+        } else if (s.runs.every(r => r.state === "done")) {
+            s.state = "done";
+            s.time = s.bestTime;
+        } else {
+            s.state = "running";
+        }
+        runAllStatus = status;
+        renderRunAll(status);
+    }
+
+    // Run all three rounds in the shuffled order; each later round
+    // fires only if that system's previous round succeeded.
+    await Promise.all(shuffled.map(async (t) => {
+        const r1 = await _runOne(t);
+        _recordRun(t.name, 0, r1, t.query);
+        if (!r1.ok) return;
+        const r2 = await _runOne(t);
+        _recordRun(t.name, 1, r2, t.query);
+        if (!r2.ok) return;
+        const r3 = await _runOne(t);
+        _recordRun(t.name, 2, r3, t.query);
+    }));
+    runAllBtn.disabled = false;
+}
+
+let runAllStatus = {};
+let runAllSelected = null;
+let runAllExampleMode = false;
+
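+// A competition row was clicked: make that system the active one
+// across the whole UI: picker highlight, query box, result pane, URL.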
+function pickFromRunAll(name) {
+    const entry = runAllStatus[name];
+    if (!entry) return;
+    runAllSelected = name;
+    // Switch the system list highlight + state panel to this system.
+    if (stateByName[name]) select(name);
+    // Rewrite the query textarea + result pane to this system's run.
+    // pristineQuery handling depends on the competition's mode. In
+    // example mode, entry.query IS that system's example translation,
+    // so re-baseline pristineQuery to it: later rail clicks then see
+    // the textarea as pristine, and loadExamples can cleanly swap in
+    // the next system's translation. In custom mode, entry.query is
+    // the user's edited string; leaving pristineQuery untouched means
+    // loadExamples still sees the textarea as edited and won't
+    // clobber it with that system's example.
+    if (entry.query) {
+        queryEl.value = entry.query;
+        if (runAllExampleMode) pristineQuery = entry.query;
+    }
+    if (entry.payload) {
+        resultsByName[name] = entry.payload;
+        showResult(entry.payload);
+    }
+    // Update URL: prefer the X-Query-Id for a shareable permalink;
+    // without one, drop any stale q param so the URL stays
+    // system-scoped and a reload at least reopens the right system.
+    const u = new URL(window.location.href);
+    if (entry.qid) {
+        u.searchParams.set("q", entry.qid);
+    } else {
+        u.searchParams.delete("q");
+    }
+    window.history.replaceState({}, "", u.toString());
+    renderRunAll(runAllStatus);  // re-paint to highlight the selected row
+}
+
+let runAllLast = {};
+
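+// Stable fingerprint of a row's visible status, e.g. "running|d:1.234|p|p"
+// for one finished run and two pending ones; used to decide which rows
+// changed since the previous render.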
+function _runAllRowKey(s) {
+    const runs = (s.runs || []).map(r => {
+        if (!r) return "-";
+        if (r.state === "done") return `d:${r.time}`;
+        if (r.state === "failed") return `f`;
+        if (r.state === "pending") return `p`;
+        return r.state;
+    }).join("|");
+    return `${s.state}|${runs}`;
+}
+
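+// Rebuild the competition table: bucket rows by state, sort each
+// bucket, and flash only the rows whose status actually changed.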
+function renderRunAll(status) {
+    const done = [], failed = [], running = [];
+    for (const [name, s] of Object.entries(status)) {
+        if (s.state === "failed") failed.push({name, ...s});
+        // Include partial-done systems in the timed group so the
+        // table sorts as soon as the first run lands rather than
+        // waiting for all three rounds.
+        else if (s.bestTime != null) done.push({name, ...s});
+        else running.push({name, ...s});
+    }
+    done.sort((a, b) => a.bestTime - b.bestTime);
+    failed.sort((a, b) => a.name.localeCompare(b.name));
+    running.sort((a, b) => a.name.localeCompare(b.name));
+    // Diff against the previous render so only rows whose status
+    // string actually changed get the flash animation; otherwise the
+    // mere act of re-sorting would re-animate everyone every tick.
+    const changed = new Set();
+    for (const [name, s] of Object.entries(status)) {
+        const key = _runAllRowKey(s);
+        if (runAllLast[name] !== undefined && runAllLast[name] !== key) {
+            changed.add(name);
+        }
+        runAllLast[name] = key;
+    }
+    const tbody = runAllTable.querySelector("tbody");
+    tbody.innerHTML = "";
+    const fragment = document.createDocumentFragment();
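+    // Display order: timed systems fastest first, then failures and
+    // still-running systems, both alphabetical.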
+    const all = [...done, ...failed, ...running];
+    for (let i = 0; i < all.length; i++) {
+        const row = all[i];
+        const tr = document.createElement("tr");
+        const cls = [row.state];
+        if (changed.has(row.name)) cls.push("flash");
+        if (runAllSelected === row.name) cls.push("selected");
+        tr.className = cls.join(" ");
+        tr.dataset.name = row.name;
+        tr.addEventListener("click", () => pickFromRunAll(row.name));
+        const td1 = document.createElement("td");
+        td1.textContent = row.name;
+        tr.appendChild(td1);
+        const runs = row.runs || [{state: row.state, time: row.time, note: row.note}];
+        for (let k = 0; k < 3; k++) {
+            const td = document.createElement("td");
+            td.className = "time";
+            const r = runs[k];
+            if (!r || r.state === "pending") {
+                td.textContent = "—";
+                td.classList.add("pending");
+            } else if (r.state === "done") {
+                td.textContent = `${r.time.toFixed(3)} s`;
+            } else if (r.state === "failed") {
+                td.textContent = "failed";
+                td.title = r.note || "";
+            } else {
+                td.textContent = "running";
+            }
+            tr.appendChild(td);
+        }
+        fragment.appendChild(tr);
+    }
+    tbody.appendChild(fragment);
+    runAllOrder = all.map(r => r.name);
+}
+
+// Up/down navigation through the rail. The aside is focusable
+// (tabindex=0) so the user can tab into it; arrow keys then walk
+// the current sort order and pick the next/prev row.
+let runAllOrder = [];
+runAllSection.addEventListener("keydown", (e) => {
+    if (e.key !== "ArrowDown" && e.key !== "ArrowUp") return;
+    if (!runAllOrder.length) return;
+    e.preventDefault();
+    let i = runAllOrder.indexOf(runAllSelected);
+    if (i === -1) i = e.key === "ArrowDown" ? -1 : runAllOrder.length;
+    const step = e.key === "ArrowDown" ? 1 : -1;
+    const next = runAllOrder[Math.max(0, Math.min(runAllOrder.length - 1, i + step))];
+    pickFromRunAll(next);
+    // Keep the picked row in view inside the scrollable rail.
+    const sel = runAllTable.querySelector("tr.selected");
+    if (sel) sel.scrollIntoView({block: "nearest"});
+});
+
+runAllBtn.addEventListener("click", runAll);
diff --git a/playground/web/index.html b/playground/web/index.html
new file mode 100644
index 0000000000..b8622bf238
--- /dev/null
+++ b/playground/web/index.html
@@ -0,0 +1,69 @@
+[index.html markup lost in extraction; the 69 added lines are not
+ reconstructable here. Surviving text: the page title and <h1>
+ "ClickBench Playground — run SQL against 90+ databases", plus a
+ "System status" panel with a "loading…" placeholder.]
+ + + + diff --git a/playground/web/style.css b/playground/web/style.css new file mode 100644 index 0000000000..ae1aaba0e8 --- /dev/null +++ b/playground/web/style.css @@ -0,0 +1,258 @@ +:root { + --fg: black; + --muted: #6e7681; + --border: black; + --bg: #fff8f0; + --bg-alt: white; + --fg-output: lightgray; + --bg-output: black; + --bg-error: #f88; + --accent: yellow; + --accent-fg: black; + --good: black; + --bad: #cf222e; + --warn: #9a6700; + --info: #0969da; +} + +* { box-sizing: border-box; } +html, body { margin: 0; padding: 0; background: var(--bg); color: var(--fg); } +body { font: 14px sans-serif; } +header, main, footer { max-width: 100%; margin: 0; padding: 0 1em; } + +header { padding: 1em 1em; } +header h1 { margin: 0; font-size: 22px; font-weight: 600; } +.muted { color: var(--muted); font-weight: normal; } + +section { margin: 12px 0; } + +label { display: block; font-weight: 600; font-size: 12px; text-transform: uppercase; + letter-spacing: 0.04em; color: var(--muted); margin-bottom: 4px; } + +textarea, pre, input { + font-family: monospace; + font-size: 14px; + border: 1px solid var(--border); + background: var(--bg-alt); + color: var(--fg); + border-radius: 0; + max-height: 50vh; +} + +textarea { width: 100%; padding: 0.5em; resize: vertical; } + +pre#output { + padding: 10px; + color: var(--fg-output); + background: var(--bg-output); + margin: 0; + overflow: auto; + white-space: pre; + line-height: 1; +} + +pre#last-error { + padding: 10px; + background: var(--bg-error); + margin: 0; + overflow: auto; + white-space: pre-wrap; + word-break: break-word; + line-height: 1; +} + +pre#state-blob { + padding: 10px; + background: var(--bg-alt); + margin: 0; + overflow: auto; + white-space: pre-wrap; + word-break: break-word; +} + +button { + background: var(--accent); color: var(--accent-fg); + border: 1px solid var(--border); border-radius: 0; padding: 6px 16px; + font-weight: 600; cursor: pointer; +} +button:disabled { opacity: 0.6; cursor: not-allowed; } +button:hover:not(:disabled) { filter: brightness(0.95); } + +/* "Run all" — faint secondary button, pushed to the right end of the + buttons row via margin-left:auto so it sits opposite the primary + Run/Example controls. */ +button.run-all { + background: transparent; + color: var(--muted); + border-color: var(--muted); + font-weight: normal; + padding: 4px 10px; + margin-left: auto; +} +button.run-all:hover:not(:disabled) { + background: var(--accent); + color: var(--accent-fg); + border-color: var(--border); + filter: none; +} + +/* Competition-mode table. */ +#runall-table { + width: 100%; + border-collapse: collapse; + border: 1px solid var(--border); + background: var(--bg-alt); +} +#runall-table td { + padding: 4px 8px; + border-bottom: 1px solid #eee; + font-family: monospace; +} +#runall-table td.time { text-align: right; } +#runall-table tr.done td.time { color: var(--good); font-weight: 600; } +#runall-table tr.failed td.time { color: var(--bad); } +#runall-table tr.running td.time { color: var(--muted); } +#runall-table tr { cursor: pointer; } +#runall-table tr:hover td { background: #f5f5f5; } +#runall-table tr.selected td { background: white; color: var(--fg); font-weight: 600; } +#runall-table tr.selected:hover td { background: white; } +/* Hovering the matching slab in the system picker highlights the + competition row, so the user can move between picker and result + without losing track of which system they're looking at. 
*/ +#runall-table tr.slab-hover td { background: #fff4d6; } +#runall-table tr.selected.slab-hover td { background: #fff4d6; } +#runall-table td.time.pending { color: var(--muted); } + +/* Default view: #ui-main fills the row, the (hidden) aside takes no + space. The .split modifier is added by JS only when competition + mode is active; that's when the grid actually splits into rail + + main and the rail shrinks to max-content. */ +#ui-split { margin: 12px 0; } +#ui-split.split { + display: grid; + grid-template-columns: max-content minmax(0, 1fr); + gap: 16px; + align-items: stretch; + /* Take the rest of the viewport (whatever's left below the + buttons row) so the rail can scroll independently rather than + stretching the page. */ + height: calc(100vh - var(--ui-split-offset, 200px)); +} +#ui-main { min-width: 0; overflow: auto; } +#ui-main > section:first-child { margin-top: 0; } +aside#ui-runall { + margin: 0; + overflow-y: auto; + /* Reserve scrollbar gutter even when content fits — without this + the rail's width fluctuates as rows finish, briefly making the + right pane overflow and the page grow a horizontal scrollbar. */ + scrollbar-gutter: stable; + height: 100%; + border: 1px solid var(--border); + background: var(--bg-alt); + outline: none; +} +aside#ui-runall #runall-table { + width: max-content; + border: 0; +} +@media (max-width: 800px) { + #ui-split.split { grid-template-columns: 1fr; height: auto; } +} + +/* Flash on every state change in competition mode: yellow that + fades out over a second. .flash is added by JS only to rows whose + stringified status differs from the previous render. */ +@keyframes runall-flash { + from { background-color: var(--accent); } + to { background-color: transparent; } +} +#runall-table tr.flash td { + animation: runall-flash 1s ease-out; +} + +.row { display: flex; align-items: center; gap: 12px; flex-wrap: wrap; } +.row label { margin: 0; } +.stats { font-family: monospace; } +.stats span { color: var(--fg); } +#time { font-weight: bold; } + +/* System slabs — horizontal flex-wrap row of chiclets like the main + ClickBench results page. Each slab is a clickable button, background + colored by current state. */ +.system-list { + display: flex; + flex-wrap: wrap; + gap: 0.3em; +} +.system-item { + display: inline-block; + padding: 4px 8px; + cursor: pointer; + font-family: monospace; + font-size: 12px; + border: 1px solid var(--border); + background: var(--bg-alt); + color: var(--fg); + line-height: 1.2; + white-space: nowrap; + position: relative; + user-select: none; + -webkit-user-select: none; +} + +.system-item::after { + content: attr(data-tooltip); + position: absolute; + bottom: calc(100% + 8px); + left: 50%; + transform: translateX(-50%); + background: #000; + color: #fff; + padding: 3px 6px; + font-size: 11px; + line-height: 1.2; + white-space: nowrap; + pointer-events: none; + visibility: hidden; + z-index: 10; +} + +.system-item::before { + content: ""; + position: absolute; + /* Tip touches the slab; body fills the gap to the tooltip box. 
*/ + bottom: 100%; + left: 50%; + transform: translateX(-50%); + border: 8px solid transparent; + border-top-color: #000; + border-bottom-width: 0; + pointer-events: none; + visibility: hidden; + z-index: 10; +} + +.system-item:hover::after, +.system-item:hover::before { + visibility: visible; +} +.system-item:hover { filter: brightness(0.95); } +.system-item.selected { + outline: 2px solid #000; + outline-offset: -2px; +} + +.system-item.state-snapshotted { background: #c8f0d4; color: var(--good); } +.system-item.state-ready { background: #a4e6b6; color: var(--good); font-weight: 600; } +.system-item.state-provisioning { + background: #eaeef2; + color: var(--muted); + cursor: not-allowed; +} +.system-item.state-down { background: #f6d1ce; color: var(--bad); } + +select { padding: 0.5em; border-radius: 0; } + +a { color: var(--accent); text-decoration: none; } +a:hover { text-decoration: underline; } diff --git a/polars-dataframe/.preserve-state b/polars-dataframe/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/polars-dataframe/benchmark.sh b/polars-dataframe/benchmark.sh index 90bdcf07e3..a5a40fb651 100755 --- a/polars-dataframe/benchmark.sh +++ b/polars-dataframe/benchmark.sh @@ -6,4 +6,9 @@ export BENCH_DURABLE=no # queries.sql holds those Python expressions, one per line, so the # default BENCH_QUERIES_FILE=queries.sql in lib/benchmark-common.sh # picks them up unchanged. +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/polars/benchmark.sh b/polars/benchmark.sh index 6bf667e4f0..4148581913 100755 --- a/polars/benchmark.sh +++ b/polars/benchmark.sh @@ -2,4 +2,9 @@ # Thin shim — actual flow is in lib/benchmark-common.sh. export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" export BENCH_DURABLE=no +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/polars/query b/polars/query index 9129884cf7..3aa96790e7 100755 --- a/polars/query +++ b/polars/query @@ -1,6 +1,7 @@ #!/bin/bash -# Reads a SQL query from stdin, dispatches to the running polars server. -# Stdout: server response JSON. +# Reads a polars expression from stdin, dispatches to the running +# polars server. +# Stdout: rendered result (the eval'd value as a string). # Stderr: query runtime in fractional seconds on the last line. # Exit non-zero on error. set -e @@ -19,5 +20,14 @@ if [ "$status" != "200" ]; then exit 1 fi -echo "$body" -echo "$body" | python3 -c 'import json,sys; print(json.load(sys.stdin)["elapsed"])' >&2 +# Server returns {"elapsed": , "result": ""}. +# Print the rendered result to stdout, elapsed to stderr. 
+printf '%s\n' "$body" | python3 -c ' +import json, sys +o = json.load(sys.stdin) +r = o.get("result", "") +sys.stdout.write(r) +if r and not r.endswith("\n"): + sys.stdout.write("\n") +sys.stderr.write(str(o["elapsed"]) + "\n") +' diff --git a/polars/server.py b/polars/server.py index 73f5413795..8f01385202 100755 --- a/polars/server.py +++ b/polars/server.py @@ -64,9 +64,17 @@ async def query(request: Request): except SyntaxError as e: raise HTTPException(status_code=400, detail=f"syntax error: {e}") start = timeit.default_timer() - eval(compiled, {"hits": hits, "pl": pl, "date": date}) + value = eval(compiled, {"hits": hits, "pl": pl, "date": date}) elapsed = round(timeit.default_timer() - start, 3) - return {"elapsed": elapsed} + # Render the eval result so the playground UI shows something + # instead of just a timing line. polars DataFrames / Series / + # LazyFrames have a useful __str__; everything else (scalar, + # tuple, dict, ...) falls through repr. + if isinstance(value, (pl.DataFrame, pl.Series, pl.LazyFrame)): + result = str(value) + else: + result = repr(value) + return {"elapsed": elapsed, "result": result} @app.get("/data-size") diff --git a/presto-partitioned/start b/presto-partitioned/start index 92bbe10997..125a4ca19d 100755 --- a/presto-partitioned/start +++ b/presto-partitioned/start @@ -17,4 +17,5 @@ sudo docker run -d --name presto \ -v "$PWD/etc/jvm.config:/opt/presto-server/etc/jvm.config:ro" \ -v "$PWD/etc/config.properties:/opt/presto-server/etc/config.properties:ro" \ -v "$PWD/data:/clickbench" \ + -v "/opt/clickbench/datasets_ro:/opt/clickbench/datasets_ro:ro" \ prestodb/presto:${PRESTO_VERSION} diff --git a/quickwit/benchmark.sh b/quickwit/benchmark.sh index 298de0454d..cb593204ba 100755 --- a/quickwit/benchmark.sh +++ b/quickwit/benchmark.sh @@ -5,4 +5,10 @@ export BENCH_DOWNLOAD_SCRIPT="" export BENCH_DURABLE=yes export BENCH_QUERIES_FILE="queries.json" +# After firecracker snapshot+restore the cluster's +# internal connections (brpc/gossip) are stale; ./start's +# shallow health probe doesn't notice and short-circuits. +# Tell the playground agent to ./stop the cluster before +# ./start so the next bring-up is from a clean state. +export PLAYGROUND_RESTART_AFTER_RESTORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/siglens/README.md b/siglens/README.md index 074b89ff98..0c43bf91f0 100644 --- a/siglens/README.md +++ b/siglens/README.md @@ -3,4 +3,4 @@ This document outlines the process for running a benchmark on SigLens, a observa Note about queries: - SigLens does not support SQL but supports Splunk Query Language (SPL). The SQL queries used by the benchmark have been translated into the splunk query language. - To ensure the accuracy of the translated Splunk Query Language queries, each SQL query was executed against the same dataset in ClickHouse. The responses from SigLens and ClickHouse were compared, and all results were identical. -- Some of the original queries are not supported and not run by the benchmark. The corresponding results have been recorded as null in `queries.spl` and `results.csv` respectively. +- Some of the original queries are not supported and not run by the benchmark. The corresponding results have been recorded as null in `queries.sql` and `results.csv` respectively. 
diff --git a/siglens/benchmark.sh b/siglens/benchmark.sh index 1eb2f016c9..d57ece0bc8 100755 --- a/siglens/benchmark.sh +++ b/siglens/benchmark.sh @@ -4,5 +4,5 @@ export BENCH_DOWNLOAD_SCRIPT="" export BENCH_DURABLE=yes # queries are SPL/Splunk QL, not SQL. -export BENCH_QUERIES_FILE="queries.spl" +export BENCH_QUERIES_FILE="queries.sql" exec ../lib/benchmark-common.sh diff --git a/siglens/queries.spl b/siglens/queries.sql similarity index 100% rename from siglens/queries.spl rename to siglens/queries.sql diff --git a/siglens/query b/siglens/query index 1ca8b5a918..a1fe18284a 100755 --- a/siglens/query +++ b/siglens/query @@ -7,7 +7,7 @@ set -e querytxt=$(cat) -# A "null" query in queries.spl means "not supported"; emit null timing. +# A "null" query in queries.sql means "not supported"; emit null timing. if [ "$querytxt" = "null" ]; then echo "{}" echo "null" >&2 diff --git a/tidb/.preserve-state b/tidb/.preserve-state deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tidb/benchmark.sh b/tidb/benchmark.sh index 107b9dbb65..73f1c4ad83 100755 --- a/tidb/benchmark.sh +++ b/tidb/benchmark.sh @@ -3,4 +3,9 @@ # TiDB Lightning loads from ..csv files; we use the CSV download. export BENCH_DOWNLOAD_SCRIPT="download-hits-csv" export BENCH_DURABLE=yes +# Skip the pre-snapshot ./stop+./start cycle: the loaded +# state lives only in the daemon's process memory (in-process +# DataFrame, JVM heap caches) and stopping wipes it. The +# playground agent reads this and snapshots the running daemon. +export PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT=yes exec ../lib/benchmark-common.sh diff --git a/trino-datalake-partitioned/benchmark.sh b/trino-datalake-partitioned/benchmark.sh index 63f96c9b51..64f93a400d 100755 --- a/trino-datalake-partitioned/benchmark.sh +++ b/trino-datalake-partitioned/benchmark.sh @@ -3,4 +3,6 @@ # Datalake variant: Parquet is read directly from public S3, no download. export BENCH_DOWNLOAD_SCRIPT="" export BENCH_DURABLE=yes +# Trino bootstrap on a cold sysdisk pushes past the 300s default. +export BENCH_CHECK_TIMEOUT=3600 exec ../lib/benchmark-common.sh diff --git a/trino-datalake/benchmark.sh b/trino-datalake/benchmark.sh index 63f96c9b51..64f93a400d 100755 --- a/trino-datalake/benchmark.sh +++ b/trino-datalake/benchmark.sh @@ -3,4 +3,6 @@ # Datalake variant: Parquet is read directly from public S3, no download. export BENCH_DOWNLOAD_SCRIPT="" export BENCH_DURABLE=yes +# Trino bootstrap on a cold sysdisk pushes past the 300s default. +export BENCH_CHECK_TIMEOUT=3600 exec ../lib/benchmark-common.sh diff --git a/trino-partitioned/start b/trino-partitioned/start index da87d704b4..b07580d780 100755 --- a/trino-partitioned/start +++ b/trino-partitioned/start @@ -13,4 +13,5 @@ sudo docker run -d --name trino \ -p 8080:8080 \ -v "$PWD/etc/catalog/hive.properties:/etc/trino/catalog/hive.properties:ro" \ -v "$PWD/data:/clickbench" \ + -v "/opt/clickbench/datasets_ro:/opt/clickbench/datasets_ro:ro" \ trinodb/trino:latest diff --git a/umbra/start b/umbra/start index eafb1ebcb4..097802346f 100755 --- a/umbra/start +++ b/umbra/start @@ -32,11 +32,44 @@ sudo docker run -d --name umbradb \ -v "$(pwd)/data:/data" \ -p 5432:5432 \ --ulimit nofile=1048576:1048576 \ - --ulimit memlock=8388608:8388608 \ + --ulimit memlock=-1:-1 \ umbradb/umbra:latest >/dev/null # Container needs a moment before psql can connect. 
for _ in $(seq 1 60); do - PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1 && exit 0 + if PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres \ + -c 'SELECT 1' >/dev/null 2>&1; then + # Diagnostic dump so a future OOM during load lands with the + # memory/swap state of the VM in the provision log. Previously + # silent — every "unable to allocate memory" failure looked + # the same and we couldn't tell whether the agent's mkswap+ + # swapon ran, whether the container saw the swap, or whether + # the sysctl tweaks above stuck. + echo "=== umbra: VM memory state ===" + free -h || true + echo "=== umbra: swap state ===" + swapon --show=NAME,SIZE,USED,PRIO --bytes || true + echo "=== umbra: sysctl ===" + for k in vm.overcommit_memory vm.swappiness vm.max_map_count \ + vm.overcommit_ratio; do + echo " $k = $(sysctl -n $k 2>/dev/null)" + done + echo "=== umbra: container memory cgroup ===" + sudo docker inspect umbradb --format \ + 'memory={{.HostConfig.Memory}} memory-swap={{.HostConfig.MemorySwap}}' || true + echo "=== umbra: container memlock ulimit ===" + sudo docker exec umbradb sh -c 'ulimit -l' 2>&1 || true + cgpath=$(sudo docker inspect umbradb --format '{{.State.Pid}}' 2>/dev/null | \ + xargs -I{} cat /proc/{}/cgroup 2>/dev/null | awk -F: '{print $NF}') + if [ -n "$cgpath" ]; then + for f in memory.max memory.swap.max memory.swap.current; do + p="/sys/fs/cgroup${cgpath}/$f" + [ -r "$p" ] && echo " $f = $(cat "$p")" + done + fi + echo "=== umbra: container procs ===" + sudo docker top umbradb -eo pid,vsz,rss,comm 2>&1 | head -10 + exit 0 + fi sleep 1 done
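The PLAYGROUND_* flags exported by the benchmark shims above are read by the playground agent, which lives outside this repo and is not part of this patch. A minimal sketch of how an agent could honor both flags around the snapshot lifecycle; the helper names (take_vm_snapshot, restore_vm_snapshot) and the grep-the-shim trick are illustrative assumptions, not the real agent:

#!/bin/bash
# Hypothetical agent-side handling of the PLAYGROUND_* flags;
# illustrative sketch only, not the real playground agent.
set -e
bench_dir=$1

# Pull just the PLAYGROUND_* export lines out of the system's shim
# without running the benchmark itself.
eval "$(grep -E '^export PLAYGROUND_' "$bench_dir/benchmark.sh" || true)"

snapshot_system() {
    if [ "${PLAYGROUND_SKIP_RESTART_BEFORE_SNAPSHOT:-no}" != yes ]; then
        # Default: cycle the service so the snapshot captures cleanly
        # restartable on-disk state rather than a warm process image.
        (cd "$bench_dir" && ./stop && ./start)
    fi
    take_vm_snapshot "$bench_dir"     # hypothetical helper
}

restore_system() {
    restore_vm_snapshot "$bench_dir"  # hypothetical helper
    if [ "${PLAYGROUND_RESTART_AFTER_RESTORE_SNAPSHOT:-no}" = yes ]; then
        # Clustered systems come back with stale internal connections;
        # force a full stop/start instead of trusting ./start's
        # shallow health probe.
        (cd "$bench_dir" && ./stop && ./start)
    fi
}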