Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@ data/manifest.json
data/docs_selected.jsonl
.mypy_cache/
.venv
logs/
.venv-vastai/
deploy_runs/
logs/
23 changes: 23 additions & 0 deletions deploy/vast/Dockerfile.amd64
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Image for remote runs, layered on a PyTorch base image.
# BASE_IMAGE can be overridden at build time with --build-arg BASE_IMAGE=...
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.12-py3
# The platform is pinned so the result is linux/amd64 regardless of the build host.
FROM --platform=linux/amd64 ${BASE_IMAGE}

# Run the following RUN steps through a bash login shell.
SHELL ["/bin/bash", "-lc"]

# OS-level tooling; the apt lists are removed in the same layer to keep it small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    git \
    jq \
    bc \
    procps \
    rsync \
    openssh-client \
  && rm -rf /var/lib/apt/lists/*

COPY deploy/vast/requirements.remote.txt /tmp/requirements.remote.txt

# The venv can see the system site-packages (presumably so the PyTorch shipped
# in the base image stays importable - confirm against the base image layout).
# --no-cache-dir is used on every pip call so no download cache ends up in the layer.
RUN python3 -m venv --system-site-packages /opt/parameter-golf-venv \
  && /opt/parameter-golf-venv/bin/pip install --no-cache-dir --upgrade pip wheel setuptools \
  && /opt/parameter-golf-venv/bin/pip install --no-cache-dir -r /tmp/requirements.remote.txt

# Put the venv first on PATH so python/pip resolve to it by default.
ENV PATH=/opt/parameter-golf-venv/bin:${PATH}
WORKDIR /workspace/parameter-golf
158 changes: 158 additions & 0 deletions deploy/vast/build_amd64_image.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#!/usr/bin/env bash
# Builds deploy/vast/Dockerfile.amd64 for linux/amd64 with docker buildx and
# either pushes or loads the result.
#
# Environment:
#   PG_IMAGE_TAG   - image reference to build (required; checked further down)
#   PG_BASE_IMAGE  - value passed as the BASE_IMAGE build arg
#   PG_PUSH_IMAGE  - "1" (default) pushes the image; any other value loads it
set -euo pipefail

# Repository root: two directories above the directory containing this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# Empty when PG_IMAGE_TAG is unset.
IMAGE_TAG="${PG_IMAGE_TAG:-}"
BASE_IMAGE="${PG_BASE_IMAGE:-nvcr.io/nvidia/pytorch:25.12-py3}"
PUSH_FLAG="${PG_PUSH_IMAGE:-1}"

#######################################
# Split an image reference into registry / namespace / repo.
#
# The digest ("@sha256:...") and tag (":tag") are dropped first. A leading
# path component counts as a registry host when it contains "." or ":" or is
# exactly "localhost"; this is checked BEFORE assuming Docker Hub, so a ref
# such as "localhost:5000/app" is not mistaken for a Docker Hub
# "<namespace>/<repo>" pair.
#
# Arguments: $1 - image reference, e.g. docker.io/<namespace>/<repo>:<tag>
# Outputs:   one JSON object on stdout with keys "registry", "namespace" and
#            "repo" ("namespace" is "" for a non-Docker-Hub "<host>/<repo>")
# Returns:   non-zero when the reference has no namespace that can be
#            resolved (single-component refs, "docker.io/<repo>", or empty
#            path components)
#######################################
docker_hub_repo_json() {
  local image_ref="$1"
  python3 - "$image_ref" <<'PY'
import json
import sys

DOCKER_HUB_HOSTS = {"docker.io", "index.docker.io"}

# Drop the digest, then the tag. A ":" only marks a tag when it comes after
# the last "/", otherwise it belongs to a host:port prefix.
ref = sys.argv[1].split("@", 1)[0]
tag_sep = ref.rfind(":")
if tag_sep > ref.rfind("/"):
    ref = ref[:tag_sep]

parts = ref.split("/")
if not all(parts):
    # Empty reference or an empty path component ("a//b", "a/").
    raise SystemExit(1)

first_is_host = len(parts) >= 2 and (
    "." in parts[0] or ":" in parts[0] or parts[0] == "localhost"
)

if first_is_host and len(parts) >= 3:
    registry = parts[0]
    namespace = parts[1]
    repo = "/".join(parts[2:])
elif first_is_host and parts[0] not in DOCKER_HUB_HOSTS:
    # "<host>/<repo>" on a non-Docker-Hub registry: there is no namespace.
    registry, namespace, repo = parts[0], "", parts[1]
elif len(parts) == 2 and not first_is_host:
    registry = "docker.io"
    namespace, repo = parts
else:
    raise SystemExit(1)

print(json.dumps({"registry": registry, "namespace": namespace, "repo": repo}))
PY
}

#######################################
# Abort unless the currently selected buildx builder lists linux/amd64.
#
# The output of `docker buildx inspect` is captured in a local variable
# rather than a fixed file under /tmp, so nothing is left behind and no
# predictable temp path is written.
#
# Outputs: on failure, the inspect output plus an error line on stderr
# Exits:   1 when inspect fails or linux/amd64 is not in its output
#######################################
require_current_builder_supports_amd64() {
  local inspect_output

  if ! inspect_output="$(docker buildx inspect 2>&1)"; then
    printf '%s\n' "$inspect_output" >&2
    echo "error: docker buildx inspect failed; select or create a buildx builder first" >&2
    exit 1
  fi
  if [[ "$inspect_output" != *linux/amd64* ]]; then
    printf '%s\n' "$inspect_output" >&2
    echo "error: current buildx builder does not advertise linux/amd64 support" >&2
    exit 1
  fi
}

#######################################
# Pre-flight checks before pushing an image to Docker Hub.
#
# For references that resolve to docker.io / index.docker.io this verifies
# (1) that ~/.docker/config.json shows some Docker Hub credential source
# (an "auths" entry, a "credsStore", or a matching "credHelpers" entry) and
# (2) that the target repository answers HTTP 200 on the public Hub API.
# References on any other registry are accepted without checks.
#
# Arguments: $1 - image reference (see docker_hub_repo_json)
# Outputs:   error and hint lines on stderr
# Returns:   0 when the checks pass or do not apply
# Exits:     1 (terminates the script) when a check fails
#######################################
require_docker_hub_push_prereqs() {
  local image_ref="$1"
  local repo_json registry namespace repo
  local rc=0

  repo_json="$(docker_hub_repo_json "$image_ref")" || {
    echo "error: expected a fully-qualified image tag like docker.io/<namespace>/<repo>:<tag>" >&2
    exit 1
  }
  registry="$(python3 -c 'import json,sys; print(json.loads(sys.stdin.read())["registry"])' <<<"$repo_json")"
  namespace="$(python3 -c 'import json,sys; print(json.loads(sys.stdin.read())["namespace"])' <<<"$repo_json")"
  repo="$(python3 -c 'import json,sys; print(json.loads(sys.stdin.read())["repo"])' <<<"$repo_json")"

  # Only Docker Hub is checked; other registries are left to the push itself.
  if [[ "$registry" != "docker.io" && "$registry" != "index.docker.io" ]]; then
    return 0
  fi

  if ! python3 - <<'PY'
import json
import os
import sys

cfg_path = os.path.expanduser("~/.docker/config.json")
try:
    with open(cfg_path, "r", encoding="utf-8") as fh:
        cfg = json.load(fh)
except FileNotFoundError:
    sys.exit(1)

auths = cfg.get("auths", {})
targets = {
    "https://index.docker.io/v1/",
    "index.docker.io",
    "docker.io",
    "registry-1.docker.io",
}

if any(key in auths for key in targets):
    sys.exit(0)
if cfg.get("credsStore"):
    sys.exit(0)
if any(key in cfg.get("credHelpers", {}) for key in targets):
    sys.exit(0)
sys.exit(1)
PY
  then
    echo "error: no Docker Hub login is visible from this WSL/docker CLI environment" >&2
    echo "hint: run 'docker login -u $namespace' in WSL before building" >&2
    exit 1
  fi

  # Capture the probe's exit status with `|| rc=$?`. Reading $? inside the
  # `then` branch of `if ! cmd` yields the status of the negation (always 0),
  # which would make the rc == 2 branch below unreachable.
  # Probe exit codes: 0 = repo found, 2 = HTTP 404, 3 = other HTTP/network
  # error, 4 = unexpected non-200 success status.
  # NOTE(review): the request is unauthenticated, so a private repo may also
  # answer 404 - confirm against the Docker Hub API behaviour.
  python3 - "$namespace" "$repo" <<'PY' || rc=$?
import sys
import urllib.error
import urllib.request

namespace, repo = sys.argv[1], sys.argv[2]
url = f"https://hub.docker.com/v2/repositories/{namespace}/{repo}/"
try:
    with urllib.request.urlopen(url, timeout=20) as resp:
        if resp.status == 200:
            sys.exit(0)
except urllib.error.HTTPError as exc:
    if exc.code == 404:
        sys.exit(2)
    print(f"HTTP {exc.code} from {url}", file=sys.stderr)
    sys.exit(3)
except Exception as exc:
    print(f"request to {url} failed: {exc}", file=sys.stderr)
    sys.exit(3)
sys.exit(4)
PY
  if ((rc != 0)); then
    if ((rc == 2)); then
      echo "error: Docker Hub repo $namespace/$repo does not exist yet" >&2
      echo "hint: create https://hub.docker.com/repository/docker/$namespace/$repo first" >&2
    else
      echo "error: could not verify Docker Hub repo $namespace/$repo" >&2
    fi
    exit 1
  fi
}

# --- Entry point: validate configuration, run pre-flight checks, build. ---

# An image tag is mandatory; bail out early with a usage hint otherwise.
[[ -n "$IMAGE_TAG" ]] || {
  echo "error: set PG_IMAGE_TAG, e.g. ghcr.io/you/parameter-golf-vast:latest" >&2
  exit 1
}

# The Dockerfile path and build context below are relative to the repo root.
cd "$ROOT_DIR"
require_current_builder_supports_amd64

# PUSH_FLAG == "1" selects --push (after the Docker Hub pre-flight checks);
# any other value selects --load.
case "$PUSH_FLAG" in
  1)
    require_docker_hub_push_prereqs "$IMAGE_TAG"
    output_flag=(--push)
    ;;
  *)
    output_flag=(--load)
    ;;
esac

CMD=(
  docker buildx build
  --platform linux/amd64
  --build-arg "BASE_IMAGE=$BASE_IMAGE"
  -f deploy/vast/Dockerfile.amd64
  -t "$IMAGE_TAG"
  "${output_flag[@]}"
  .
)

echo "Running: ${CMD[*]}"
"${CMD[@]}"
22 changes: 22 additions & 0 deletions deploy/vast/create_vast_template.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash
# Creates a Vast.ai template through the vastai CLI.
#
# Environment (all optional):
#   VAST_BIN                   - vastai executable (default: <repo>/.venv-vastai/bin/vastai)
#   PG_TEMPLATE_NAME           - template name
#   PG_TEMPLATE_IMAGE          - container image for the template
#   PG_TEMPLATE_DISK_GB        - disk size; also used in the default search filter
#   PG_TEMPLATE_DESC           - template description
#   PG_TEMPLATE_SEARCH_PARAMS  - full offer search filter (overrides the default)
#   PG_TEMPLATE_ENV            - value passed to --env
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
VAST_BIN="${VAST_BIN:-$ROOT_DIR/.venv-vastai/bin/vastai}"
TEMPLATE_NAME="${PG_TEMPLATE_NAME:-parameter-golf-8xh100}"
IMAGE="${PG_TEMPLATE_IMAGE:-nvcr.io/nvidia/pytorch:25.12-py3}"
DISK_SPACE="${PG_TEMPLATE_DISK_GB:-300}"
DESC="${PG_TEMPLATE_DESC:-Parameter Golf 8xH100 SSH template with direct mode and shared memory configured.}"
# The default filter's disk_space bound follows DISK_SPACE, so overriding
# PG_TEMPLATE_DISK_GB cannot select offers smaller than the template's disk.
SEARCH_PARAMS="${PG_TEMPLATE_SEARCH_PARAMS:-gpu_name in [H100_SXM,H100_NVL] num_gpus=8 reliability>=0.98 static_ip=True direct_port_count>=2 cpu_arch=amd64 disk_space>=${DISK_SPACE} rented=False}"
ENV_FLAGS="${PG_TEMPLATE_ENV:---shm-size=64g -p 22:22}"

# Fail with a clear message when the CLI is missing. `command -v` accepts
# both a path and a bare command name found on PATH.
if ! command -v "$VAST_BIN" >/dev/null 2>&1; then
  echo "error: vastai CLI not found at '$VAST_BIN'; install it or set VAST_BIN" >&2
  exit 1
fi

cd "$ROOT_DIR"
"$VAST_BIN" create template \
  --name "$TEMPLATE_NAME" \
  --image "$IMAGE" \
  --env "$ENV_FLAGS" \
  --search_params "$SEARCH_PARAMS" \
  --disk_space "$DISK_SPACE" \
  --desc "$DESC" \
  --ssh \
  --direct
40 changes: 40 additions & 0 deletions deploy/vast/ddp_smoke.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from __future__ import annotations

import os
import socket

import torch
import torch.distributed as dist


def main() -> None:
    """Run a one-shot NCCL all_reduce smoke test on this process's GPU.

    Reads ``RANK``, ``LOCAL_RANK`` and ``WORLD_SIZE`` from the environment
    (presumably set by a launcher such as torchrun), joins an NCCL process
    group, and sums ``rank + 1`` across all ranks. The expected total is
    ``1 + 2 + ... + world_size = world_size * (world_size + 1) / 2``.

    Each rank prints one ``ddp_smoke_rank`` line describing its device;
    rank 0 additionally prints ``ddp_smoke:ok`` after the final barrier.

    Raises:
        KeyError: if one of the required environment variables is missing.
        RuntimeError: if the reduced value differs from the expected sum.
    """
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")
    try:
        tensor = torch.tensor([rank + 1.0], device="cuda")
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM)

        expected = world_size * (world_size + 1) / 2
        reduced = float(tensor.item())
        if abs(reduced - expected) > 1e-5:
            raise RuntimeError(f"Unexpected all_reduce result {tensor.item()} != {expected}")

        props = torch.cuda.get_device_properties(local_rank)
        print(
            "ddp_smoke_rank "
            f"rank={rank} local_rank={local_rank} host={socket.gethostname()} "
            f"device={torch.cuda.get_device_name(local_rank)!r} "
            f"cc={props.major}.{props.minor} "
            f"mem_total={props.total_memory}"
        )
        dist.barrier()
        if rank == 0:
            print(f"ddp_smoke:ok world_size={world_size} reduced_sum={tensor.item():.1f}")
    finally:
        # Tear the process group down on failure as well as on success.
        dist.destroy_process_group()


if __name__ == "__main__":
    main()
Loading