From d7c4f27b6e2bbcc632a5b77a08f42ab9bf32b0c2 Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sat, 28 Feb 2026 07:28:15 -0600 Subject: [PATCH 01/16] start Co-Authored-By: Claude Opus 4.6 --- cosmos_curate/README.md | 96 +++++++++ cosmos_curate/all_nodes_init_script.py | 36 ++++ cosmos_curate/cosmos_curate_tokens.yaml | 4 + cosmos_curate/docker/anyscale.Dockerfile | 76 +++++++ .../docker/build_and_push_anyscale.sh | 35 +++ cosmos_curate/docker/build_cosmos.sh | 10 + cosmos_curate/docker/cosmos-curate.Dockerfile | 200 ++++++++++++++++++ cosmos_curate/docker/generate_dockerfile.sh | 8 + cosmos_curate/hello_world.yaml | 25 +++ cosmos_curate/model_download_worker.py | 24 +++ cosmos_curate/reference_pipeline.yaml | 26 +++ cosmos_curate/write_s3_creds_file.sh | 7 + 12 files changed, 547 insertions(+) create mode 100644 cosmos_curate/README.md create mode 100644 cosmos_curate/all_nodes_init_script.py create mode 100644 cosmos_curate/cosmos_curate_tokens.yaml create mode 100644 cosmos_curate/docker/anyscale.Dockerfile create mode 100755 cosmos_curate/docker/build_and_push_anyscale.sh create mode 100755 cosmos_curate/docker/build_cosmos.sh create mode 100644 cosmos_curate/docker/cosmos-curate.Dockerfile create mode 100755 cosmos_curate/docker/generate_dockerfile.sh create mode 100644 cosmos_curate/hello_world.yaml create mode 100644 cosmos_curate/model_download_worker.py create mode 100644 cosmos_curate/reference_pipeline.yaml create mode 100755 cosmos_curate/write_s3_creds_file.sh diff --git a/cosmos_curate/README.md b/cosmos_curate/README.md new file mode 100644 index 0000000..31322e5 --- /dev/null +++ b/cosmos_curate/README.md @@ -0,0 +1,96 @@ +# Cosmos Curate + +* awscli in image +* remove `COSMOS_S3_PROFILE_PATH` use default +* bake in PIXI for workspaces +* 1 layer for pixi again? +* can we eliminate code from being in the image at all for clarity? +* validate py_modules vs. baked in vs. 
entrypoint pixi run ref + +* ideally you can easily configure the (1) models downloaded (2) pipeline you run with options. entrypoint for (2), but (1) is hidden in script in a script +* ideally just a local cred file would be sufficient - just broadcast to nodes +* ideally do not think about s3 at all +* model download output should stream to stdout + +This repository has example Anyscale Jobs for the `cosmos-curate` Hello World & Reference Video Pipelines. + +Running these on Anyscale looks like: +``` +anyscale job submit -f hello_world.yaml +``` + +or: +``` +anyscale job submit -f reference_pipeline.yaml +``` + +The `entrypoint:` in each job will run: + +1. **python all_nodes_init_script.py** + +This runs the same script on all nodes to initialize the state of the cluster. In particular: + +(a) We have to set the hardcoded **/cosmos_curate/config/cosmos_curate.yaml** on each node from the shared storage `/mnt/user_storage/`. This is how `cosmos-curate` does API and model authentication. + +(b) We need to write our S3 creds to the path given by `COSMOS_S3_PROFILE_PATH` (configurable; the default path is `/dev/shm/s3_creds_file`). + +(c) We run the `pixi run -e model-download python -m cosmos_curate.core.managers.model_cli download --models gpt2` commands to download models for the job. + +2. **pixi run python -m cosmos_curate.pipelines.examples.hello_world_pipeline** + +This is the actual pipeline entrypoint command. The `pixi run` depends on `PIXI_PROJECT_MANIFEST` being properly set to match what was built into the image.
+ +Where in turn + +``` +name: custom-image-cosmos +image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/wagner-west-2:15 +ray_version: 2.48.0 +entrypoint: > + python all_nodes_init_script.py + && pixi run python -m cosmos_curate.pipelines.examples.hello_world_pipeline +py_modules: ["/Users/davidwagner/git/davidwagnerkc/cosmos-curate"] +compute_config: + head_node: + instance_type: m5.2xlarge + resources: + CPU: 8 + GPU: 0 + flags: {} + worker_nodes: + - instance_type: g6e.2xlarge + flags: {} + min_nodes: 1 + max_nodes: 1 + market_type: ON_DEMAND +working_dir: "." +max_retries: 0 +env_vars: + PIXI_PROJECT_MANIFEST: /opt/cosmos-curate/pixi.toml +``` + +Key state of the `cosmos-curate` setup: + +**/cosmos_curate/config/cosmos_curate.yaml**: + +Where the + +To run `cosmos-curate` on Anyscale you need: + +1. Anyscale compatible Docker image. + +2. File based authentication + +# Building Anyscale Compatible Docker Image + +You can skip steps (0) and (1) as the `cosmos-curate.Dockerfile` is committed (and modified to layer per `pixi` env for faster pulling). This is how other `cosmos-curate` build configurations can be built. If you already have an image built you can start at (3) updating the image name and tag to build an Anyscale compatible image on top. + +Inside the `docker/` folder: + +0. `pip install -e .` inside of the **cosmos-curate/** repo to make the `cosmos-curate` cli command available + +1. `./generate_dockerfile.sh` to create **cosmos-curate.Dockerfile** + +2. `./build_cosmos.sh` to produce `cosmos-curate:1` image to build Anyscale image on. + +3. `./build_and_push_anyscale.sh` to build and push `anyscale-cosmos-curate:1` image.
diff --git a/cosmos_curate/all_nodes_init_script.py b/cosmos_curate/all_nodes_init_script.py new file mode 100644 index 0000000..7f8950b --- /dev/null +++ b/cosmos_curate/all_nodes_init_script.py @@ -0,0 +1,36 @@ +import ray +import subprocess +from time import perf_counter as pc + +SCRIPT = """ +set -e +cp /mnt/user_storage/cosmos-config.yaml /cosmos_curate/config/cosmos_curate.yaml +# Hello World +pixi run -e model-download python -m cosmos_curate.core.managers.model_cli download --models gpt2 +# Reference Video Pipeline +pixi run -e model-download python -m cosmos_curate.core.managers.model_cli download --models qwen2.5_vl,transnetv2,internvideo2_mm,bert +""" + +@ray.remote(num_cpus=0) +def run_init(): + try: + return subprocess.check_output(SCRIPT, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Init script failed (exit code {e.returncode}):\n{e.output.decode()}") from None + +if __name__ == "__main__": + t = pc() + ray.init(address="auto") + nodes = [n for n in ray.nodes() if n["Alive"]] + tasks = [ + run_init.options( + scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( + node_id=n["NodeID"], soft=False + ) + ).remote() + for n in nodes + ] + print(f"Downloading models on {len(tasks)} nodes...") + ray.get(tasks) + dur = pc() - t + print(f"Done. ({dur:0.1f}s)") diff --git a/cosmos_curate/cosmos_curate_tokens.yaml b/cosmos_curate/cosmos_curate_tokens.yaml new file mode 100644 index 0000000..808febe --- /dev/null +++ b/cosmos_curate/cosmos_curate_tokens.yaml @@ -0,0 +1,4 @@ +huggingface: + user: "" + api_key: "" + diff --git a/cosmos_curate/docker/anyscale.Dockerfile b/cosmos_curate/docker/anyscale.Dockerfile new file mode 100644 index 0000000..2bb2dca --- /dev/null +++ b/cosmos_curate/docker/anyscale.Dockerfile @@ -0,0 +1,76 @@ +# syntax=docker/dockerfile:1.3-labs +# Anyscale-compatible image built on top of the Cosmos Curate image. 
+# Build with: docker build --platform linux/amd64 -f docker/anyscale.Dockerfile -t cosmos-curate:anyscale . + +ARG COSMOS_TAG=2 +FROM cosmos-curate:${COSMOS_TAG} + +SHELL ["/bin/bash", "-c"] +ENV DEBIAN_FRONTEND=noninteractive + +# Install required system packages. +RUN set -euxo pipefail \ + && apt-get update -y \ + && apt-get install -y --no-install-recommends \ + sudo \ + tzdata \ + openssh-client \ + openssh-server \ + rsync \ + zip \ + unzip \ + git \ + gdb \ + curl \ + vim \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* \ + && mkdir -p /var/run/sshd + +# Rename ubuntu -> ray and align uid/gid with Anyscale requirements. +RUN set -euxo pipefail \ + && groupmod -n ray users \ + && usermod -l ray -d /home/ray -m ubuntu \ + && usermod -u 1000 -g 100 ray \ + && usermod -aG sudo ray \ + && echo 'ray ALL=NOPASSWD: ALL' >> /etc/sudoers \ + && chown -R ray:ray /home/ray + + +# Install required Python packages into the default Pixi env. +# Note: Jupyter is optional; uncomment if needed for workspaces. +RUN set -euxo pipefail \ + && pixi run -e default pip install --no-cache-dir \ + anyscale \ + packaging \ + boto3 \ + google \ + google-cloud-storage \ + terminado \ + && pixi run -e default pip install --no-cache-dir jupyterlab + +# Workspace dependencies (optional but safe for all images). 
+RUN set -euxo pipefail \ + && echo 'PROMPT_COMMAND="history -a"' >> /home/ray/.bashrc \ + && echo '[ -e ~/.workspacerc ] && source ~/.workspacerc' >> /home/ray/.bashrc \ + && chown ray:ray /home/ray/.bashrc + +RUN mkdir -p /cosmos_curate/config /config \ + && chown -R ray /cosmos_curate /config +ENV COSMOS_S3_PROFILE_PATH=/mnt/user_storage/s3_creds_file +ENV PATH=/opt/cosmos-curate/.pixi/envs/default/bin:$PATH +ENV HOME=/home/ray +WORKDIR /home/ray +USER ray + +RUN sudo mkdir -p /anyscale/init +RUN sudo chown -R ray /anyscale/init +RUN <<'EOF' +sudo cat >/anyscale/init/init.sh <<'EOC' +ls -halrt /mnt/user_storage/ > /tmp/init_ls.log 2>&1 || true +EOC +EOF + +# Do not inherit base entrypoint; default to an interactive shell. +ENTRYPOINT [] +CMD ["bash"] diff --git a/cosmos_curate/docker/build_and_push_anyscale.sh b/cosmos_curate/docker/build_and_push_anyscale.sh new file mode 100755 index 0000000..4a590bf --- /dev/null +++ b/cosmos_curate/docker/build_and_push_anyscale.sh @@ -0,0 +1,35 @@ +TAG=${1:-2} +COSMOS_TAG=${2:-2} +REGISTRY=aws +REPO_ROOT=$HOME/git/cosmos-curate +IMAGE=anyscale-cosmos-curate + +docker build \ + --ulimit nofile=65536 \ + --progress=auto \ + --network=host \ + --build-arg COSMOS_TAG=${COSMOS_TAG} \ + -f $REPO_ROOT/docker/anyscale.Dockerfile \ + -t ${IMAGE}:$TAG \ + -t ${IMAGE}:latest \ + $REPO_ROOT + +SRC=${IMAGE}:${TAG} + +if [ "$REGISTRY" = "aws" ]; then + AWS_ACCOUNT=367974485317 + AWS_REGION=us-west-2 + AWS_REPO=wagner-west-2 + DST_BASE=${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/${AWS_REPO} + aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com +else + PROJECT_ID=troubleshootingorg-gcp-pub + REGION=us-central1 + REPO=wagner-docker + DST_BASE=${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPO}/${IMAGE} +fi + +docker tag ${SRC} ${DST_BASE}:${TAG} +docker push ${DST_BASE}:${TAG} +docker tag ${SRC} ${DST_BASE}:latest +docker push 
${DST_BASE}:latest diff --git a/cosmos_curate/docker/build_cosmos.sh b/cosmos_curate/docker/build_cosmos.sh new file mode 100755 index 0000000..d605989 --- /dev/null +++ b/cosmos_curate/docker/build_cosmos.sh @@ -0,0 +1,10 @@ +TAG=1 +REPO_ROOT=$HOME/git/cosmos-curate +docker build \ + --ulimit nofile=65536 \ + --progress=auto \ + --network=host \ + -f $REPO_ROOT/docker/cosmos-curate.Dockerfile \ + -t cosmos-curate:$TAG \ + -t cosmos-curate:latest \ + $REPO_ROOT diff --git a/cosmos_curate/docker/cosmos-curate.Dockerfile b/cosmos_curate/docker/cosmos-curate.Dockerfile new file mode 100644 index 0000000..db1be97 --- /dev/null +++ b/cosmos_curate/docker/cosmos-curate.Dockerfile @@ -0,0 +1,200 @@ +# syntax=docker/dockerfile:1.3 +# Dockerfile template for cosmos-curate +# +# The dockerfile is templated so that we can provide different conda env information. +# Docs on docker best practices: +# - https://linuxhandbook.com/dockerize-python-apps/ +# - https://uwekorn.com/2021/03/01/deploying-conda-environments-in-docker-how-to-do-it-right.html +# - https://cloud.google.com/architecture/best-practices-for-building-containers + +ARG DEBIAN_FRONTEND=noninteractive + +FROM nvcr.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 AS main + +SHELL ["/bin/bash", "-c"] +ENV NVIDIA_DRIVER_CAPABILITIES=compute,video,utility +ENV TZ=America/Los_Angeles +# Get system level packages +RUN apt-get update \ + && apt-get install -y \ + # Needed for opencv + libsm6 libxext6 \ + # Needed because the certs age out sometimes? 
+ ca-certificates \ + # Needed for installing pixi \ + wget \ + # Needed for pip install \ + git \ + # Needed for cuda profiling \ + nsight-systems-2025.3.2 \ + --option=Dpkg::Options::=--force-confdef \ + # Needed to copy model weights using rsync + rsync \ + && update-ca-certificates \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# GPU-accelerated ffmpeg (also needed for opencv) +ENV FFMPEG_VERSION=7.0.1 \ + NVCODEC_VERSION=12.1.14.0 +RUN mkdir -p /tmp && chmod 1777 /tmp && \ + apt-get update && \ + apt-get install -y \ + libcrypt-dev \ + autoconf \ + automake \ + build-essential \ + cmake \ + libaom-dev \ + libass-dev \ + libdav1d-dev \ + libdrm-dev \ + libfreetype6-dev \ + libgnutls28-dev \ + libnuma-dev \ + libopenh264-dev \ + libtool \ + libva-dev \ + libvorbis-dev \ + libvpx-dev \ + libwebp-dev \ + pkg-config \ + texinfo \ + vainfo \ + yasm \ + zlib1g-dev && \ + wget -O /tmp/nv-codec-headers.tar.gz https://github.com/FFmpeg/nv-codec-headers/releases/download/n${NVCODEC_VERSION}/nv-codec-headers-${NVCODEC_VERSION}.tar.gz && \ + tar xzvf /tmp/nv-codec-headers.tar.gz -C /tmp/ && \ + cd /tmp/nv-codec-headers-${NVCODEC_VERSION} && \ + make && \ + make install && \ + wget -O /tmp/ffmpeg-snapshot.tar.bz2 https://www.ffmpeg.org/releases/ffmpeg-${FFMPEG_VERSION}.tar.bz2 && \ + tar xjvf /tmp/ffmpeg-snapshot.tar.bz2 -C /tmp/ && \ + cd /tmp/ffmpeg-${FFMPEG_VERSION} && \ + PATH="/usr/local/cuda/bin:$PATH" \ + ./configure \ + --prefix=/usr/local \ + --enable-nonfree \ + --enable-cuda-nvcc \ + --enable-libnpp \ + --enable-libopenh264 \ + --enable-libaom \ + --enable-libdav1d \ + --enable-libvorbis \ + --enable-libvpx \ + --enable-libwebp \ + --enable-vaapi \ + --extra-cflags=-I/usr/local/cuda/include \ + --extra-ldflags=-L/usr/local/cuda/lib64 \ + --extra-libs=-lpthread \ + --extra-libs=-lm \ + --disable-static \ + --enable-shared \ + --disable-doc \ + --disable-debug && \ + make -j$(nproc) && \ + make install && \ + ldconfig && \ + # Clean up + cd / && \ + rm 
-rf /tmp/ffmpeg* && \ + rm -rf /tmp/nv-codec-headers* && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Install pixi +RUN wget -qO- https://pixi.sh/install.sh | PIXI_HOME=/usr/local PIXI_NO_PATH_UPDATE=1 sh + +# Common ENV variables needed by some ML libs +ENV AM_I_DOCKER=True \ + BUILD_WITH_CUDA=True \ + TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0;10.0+PTX" \ + CUDA_HOME="/usr/local/cuda" \ + XFORMERS_IGNORE_FLASH_VERSION_CHECK="1" \ + VLLM_WORKER_MULTIPROC_METHOD="spawn" \ + VLLM_USE_V1="1" + +# Disable Ray log dedup +ENV RAY_DEDUP_LOGS=0 \ + RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES=1 \ + RAY_MAX_LIMIT_FROM_API_SERVER=40000 \ + RAY_MAX_LIMIT_FROM_DATA_SOURCE=40000 \ + RAY_DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES=800000000000 \ + RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION=0.4 \ + RAY_gcs_rpc_server_connect_timeout_s=30 \ + RAY_gcs_rpc_server_reconnect_timeout_s=180 \ + RAY_WARN_BLOCKING_GET_INSIDE_ASYNC=0 \ + XENNA_RAY_METRICS_PORT=9002 + +# boto3 & pbss +ENV AWS_REQUEST_CHECKSUM_CALCULATION='when_required' + +# Set a bunch of env vars so that we cache weights in a workspace +ENV DEFAULT_WORKSPACE_LOC="/config/default_workspace" +ENV HF_HOME="${DEFAULT_WORKSPACE_LOC}/weights/hf_home/" \ + LAION_CACHE_HOME="${DEFAULT_WORKSPACE_LOC}/weights/laion_cache/" + +# Set permissive umask so all files in /opt/cosmos-curate are world-readable/writable. +# This avoids expensive recursive chown in downstream Dockerfiles. +RUN umask 0000 && mkdir -p /opt/cosmos-curate + +# Set up pixi environments +COPY --chmod=666 pixi.toml pixi.lock /opt/cosmos-curate/ + +# Install each pixi environment in a separate layer to reduce individual layer sizes +# and improve pull/streaming performance. Cache mounts keep rattler cache outside layers. 
+RUN --mount=type=cache,target=/root/.cache/rattler \ + umask 0000 && cd /opt/cosmos-curate && \ + export CONDA_OVERRIDE_CUDA=12.9.1 && \ + pixi install -e default --frozen + +RUN --mount=type=cache,target=/root/.cache/rattler \ + umask 0000 && cd /opt/cosmos-curate && \ + export CONDA_OVERRIDE_CUDA=12.9.1 && \ + pixi install -e legacy-transformers --frozen + +RUN --mount=type=cache,target=/root/.cache/rattler \ + umask 0000 && cd /opt/cosmos-curate && \ + export CONDA_OVERRIDE_CUDA=12.9.1 && \ + pixi install -e model-download --frozen + +RUN --mount=type=cache,target=/root/.cache/rattler \ + umask 0000 && cd /opt/cosmos-curate && \ + export CONDA_OVERRIDE_CUDA=12.9.1 && \ + pixi install -e transformers --frozen + +RUN --mount=type=cache,target=/root/.cache/rattler \ + umask 0000 && cd /opt/cosmos-curate && \ + export CONDA_OVERRIDE_CUDA=12.9.1 && \ + pixi install -e unified --frozen + +# RUN --mount=type=cache,target=/root/.cache/rattler \ +# umask 0000 && cd /opt/cosmos-curate && \ +# export CONDA_OVERRIDE_CUDA=12.9.1 && \ +# pixi install -e cuml --frozen + + +# Run any hacky post-install script for each environment +COPY package/cosmos_curate/envs/ /tmp/cosmos_curate_build_envs + + +# For cosmos-xenna development +# For every environment, uninstall cosmos-xenna and then reinstall from local build. 
+ + +# Copy the video pipeline code +COPY cosmos_curate /opt/cosmos-curate/cosmos_curate +COPY tests /opt/cosmos-curate/tests +COPY pytest.ini .coveragerc /opt/cosmos-curate/ + +# Copy additional code paths into the container + + +# Debug env vars +# ENV PYTHON_LOG=debug RUST_LOG=debug VLLM_LOG_LEVEL=DEBUG + +# Expose port for FastAPI & Ray +EXPOSE 8000 6379 + +WORKDIR /opt/cosmos-curate + +CMD ["pixi", "run", "python", "cosmos_curate/scripts/onto_nvcf.py", "--helm", "True"] diff --git a/cosmos_curate/docker/generate_dockerfile.sh b/cosmos_curate/docker/generate_dockerfile.sh new file mode 100755 index 0000000..04344a5 --- /dev/null +++ b/cosmos_curate/docker/generate_dockerfile.sh @@ -0,0 +1,8 @@ +REPO_ROOT=$HOME/git/cosmos-curate +cosmos-curate image build \ + --curator-path "${REPO_ROOT}" \ + --image-name cosmos-curate \ + --image-tag 1 \ + --dry-run \ + --verbose \ + --dockerfile-output-path "${REPO_ROOT}/docker/cosmos-curate.Dockerfile" diff --git a/cosmos_curate/hello_world.yaml b/cosmos_curate/hello_world.yaml new file mode 100644 index 0000000..97eccc9 --- /dev/null +++ b/cosmos_curate/hello_world.yaml @@ -0,0 +1,25 @@ +name: custom-image-cosmos +image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/wagner-west-2:15 +ray_version: 2.48.0 +entrypoint: > + python all_nodes_init_script.py + && pixi run python -m cosmos_curate.pipelines.examples.hello_world_pipeline +py_modules: ["/Users/davidwagner/git/davidwagnerkc/cosmos-curate"] +compute_config: + head_node: + instance_type: m5.2xlarge + resources: + CPU: 8 + GPU: 0 + flags: {} + worker_nodes: + - instance_type: g6e.2xlarge + flags: {} + min_nodes: 1 + max_nodes: 1 + market_type: ON_DEMAND +working_dir: "." 
+max_retries: 0 +env_vars: + PIXI_PROJECT_MANIFEST: /opt/cosmos-curate/pixi.toml + COSMOS_S3_PROFILE_PATH: /mnt/user_storage/s3_creds_file diff --git a/cosmos_curate/model_download_worker.py b/cosmos_curate/model_download_worker.py new file mode 100644 index 0000000..6db7ee7 --- /dev/null +++ b/cosmos_curate/model_download_worker.py @@ -0,0 +1,24 @@ +import ray +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy +import cosmos_curate + +@ray.remote(runtime_env={"py_executable": "pixi run -e model-download python", "excludes": ["./pixi/"], "env_vars": {"PIXI_PROJECT_MANIFEST": "/opt/cosmos-curate/pixi.toml"}}) +def download_model(): + # Only works from /opt/cosmos-curate. Not importable otherwise from package. + import cosmos_curate.core.managers.model_cli as cli + cli.main(["download", "--models", "qwen2.5_vl,transnetv2,internvideo2_mm,bert"]) + +if __name__ == "__main__": + ray.init(runtime_env={"env_vars": {"PIXI_PROJECT_MANIFEST": "/opt/cosmos-curate/pixi.toml"}, "py_modules": [cosmos_curate]}) + refs = [] + for n in ray.nodes(): + if not n["Alive"]: + continue + ref = ( + download_model + .options(scheduling_strategy=NodeAffinitySchedulingStrategy(node_id=n["NodeID"], soft=False)) + .remote() + ) + refs.append(ref) + ray.get(refs) + diff --git a/cosmos_curate/reference_pipeline.yaml b/cosmos_curate/reference_pipeline.yaml new file mode 100644 index 0000000..f9e8c02 --- /dev/null +++ b/cosmos_curate/reference_pipeline.yaml @@ -0,0 +1,26 @@ +name: custom-image-cosmos +image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/wagner-west-2:15 +ray_version: 2.48.0 +entrypoint: > + python all_nodes_init_script.py + && pixi run python -m cosmos_curate.pipelines.video.run_pipeline split + --input-video-path "s3://ray-example-data/videos/Hollywood2-actions-videos/Hollywood2/AVIClips/" + --output-clip-path "/mnt/user_storage/output_clips3/" +py_modules: ["/Users/davidwagner/git/cosmos-curate"] +compute_config: + head_node: + instance_type: 
m5.2xlarge + resources: + CPU: 8 + GPU: 0 + flags: {} + worker_nodes: + - instance_type: g6e.4xlarge + flags: {} + min_nodes: 4 + max_nodes: 4 + market_type: ON_DEMAND +working_dir: "." +max_retries: 0 +env_vars: + PIXI_PROJECT_MANIFEST: /opt/cosmos-curate/pixi.toml diff --git a/cosmos_curate/write_s3_creds_file.sh b/cosmos_curate/write_s3_creds_file.sh new file mode 100755 index 0000000..7d0e9db --- /dev/null +++ b/cosmos_curate/write_s3_creds_file.sh @@ -0,0 +1,7 @@ +eval $(aws configure export-credentials --format env) +cat > /mnt/user_storage/s3_creds_file < Date: Sat, 28 Feb 2026 18:25:55 +0000 Subject: [PATCH 02/16] New anyscale image --- .../docker/anyscale-cosmos-curate.Dockerfile | 209 ++++++++++++++++++ cosmos_curate/docker/build_anyscale.sh | 12 + 2 files changed, 221 insertions(+) create mode 100644 cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile create mode 100755 cosmos_curate/docker/build_anyscale.sh diff --git a/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile b/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile new file mode 100644 index 0000000..d52c1b1 --- /dev/null +++ b/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile @@ -0,0 +1,209 @@ +# Dockerfile template for cosmos-curate +# +# The dockerfile is templated so that we can provide different conda env information. +# Docs on docker best practices: +# - https://linuxhandbook.com/dockerize-python-apps/ +# - https://uwekorn.com/2021/03/01/deploying-conda-environments-in-docker-how-to-do-it-right.html +# - https://cloud.google.com/architecture/best-practices-for-building-containers + +ARG DEBIAN_FRONTEND=noninteractive + +FROM nvcr.io/nvidia/cuda:12.9.1-devel-ubuntu24.04 AS main + +SHELL ["/bin/bash", "-c"] +ENV NVIDIA_DRIVER_CAPABILITIES=compute,video,utility +ENV TZ=America/Los_Angeles +# Get system level packages +RUN apt-get update \ + && apt-get install -y \ + # Needed for opencv + libsm6 libxext6 \ + # Needed because the certs age out sometimes? 
+ ca-certificates \ + # Needed for installing pixi \ + wget \ + # Needed for pip install \ + git \ + # Needed for cuda profiling \ + nsight-systems-2025.3.2 \ + --option=Dpkg::Options::=--force-confdef \ + # Needed to copy model weights using rsync + rsync \ + && update-ca-certificates \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# GPU-accelerated ffmpeg (also needed for opencv) +ENV FFMPEG_VERSION=7.0.1 \ + NVCODEC_VERSION=12.1.14.0 +RUN mkdir -p /tmp && chmod 1777 /tmp && \ + apt-get update && \ + apt-get install -y \ + libcrypt-dev \ + autoconf \ + automake \ + build-essential \ + cmake \ + libaom-dev \ + libass-dev \ + libdav1d-dev \ + libdrm-dev \ + libfreetype6-dev \ + libgnutls28-dev \ + libnuma-dev \ + libopenh264-dev \ + libtool \ + libva-dev \ + libvorbis-dev \ + libvpx-dev \ + libwebp-dev \ + pkg-config \ + texinfo \ + vainfo \ + yasm \ + zlib1g-dev && \ + wget -O /tmp/nv-codec-headers.tar.gz https://github.com/FFmpeg/nv-codec-headers/releases/download/n${NVCODEC_VERSION}/nv-codec-headers-${NVCODEC_VERSION}.tar.gz && \ + tar xzvf /tmp/nv-codec-headers.tar.gz -C /tmp/ && \ + cd /tmp/nv-codec-headers-${NVCODEC_VERSION} && \ + make && \ + make install && \ + wget -O /tmp/ffmpeg-snapshot.tar.bz2 https://www.ffmpeg.org/releases/ffmpeg-${FFMPEG_VERSION}.tar.bz2 && \ + tar xjvf /tmp/ffmpeg-snapshot.tar.bz2 -C /tmp/ && \ + cd /tmp/ffmpeg-${FFMPEG_VERSION} && \ + PATH="/usr/local/cuda/bin:$PATH" \ + ./configure \ + --prefix=/usr/local \ + --enable-nonfree \ + --enable-cuda-nvcc \ + --enable-libnpp \ + --enable-libopenh264 \ + --enable-libaom \ + --enable-libdav1d \ + --enable-libvorbis \ + --enable-libvpx \ + --enable-libwebp \ + --enable-vaapi \ + --extra-cflags=-I/usr/local/cuda/include \ + --extra-ldflags=-L/usr/local/cuda/lib64 \ + --extra-libs=-lpthread \ + --extra-libs=-lm \ + --disable-static \ + --enable-shared \ + --disable-doc \ + --disable-debug && \ + make -j$(nproc) && \ + make install && \ + ldconfig && \ + # Clean up + cd / && \ + rm 
-rf /tmp/ffmpeg* && \ + rm -rf /tmp/nv-codec-headers* && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Install pixi +RUN wget -qO- https://pixi.sh/install.sh | PIXI_HOME=/usr/local PIXI_NO_PATH_UPDATE=1 sh + +# Common ENV variables needed by some ML libs +ENV AM_I_DOCKER=True \ + BUILD_WITH_CUDA=True \ + TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0;10.0+PTX" \ + CUDA_HOME="/usr/local/cuda" \ + XFORMERS_IGNORE_FLASH_VERSION_CHECK="1" \ + VLLM_WORKER_MULTIPROC_METHOD="spawn" \ + VLLM_USE_V1="1" + +# Disable Ray log dedup +ENV RAY_DEDUP_LOGS=0 \ + RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES=1 \ + RAY_MAX_LIMIT_FROM_API_SERVER=40000 \ + RAY_MAX_LIMIT_FROM_DATA_SOURCE=40000 \ + RAY_DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES=800000000000 \ + RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION=0.4 \ + RAY_gcs_rpc_server_connect_timeout_s=30 \ + RAY_gcs_rpc_server_reconnect_timeout_s=180 \ + RAY_WARN_BLOCKING_GET_INSIDE_ASYNC=0 \ + XENNA_RAY_METRICS_PORT=9002 + +# boto3 & pbss +ENV AWS_REQUEST_CHECKSUM_CALCULATION='when_required' + +# Set a bunch of env vars so that we cache weights in a workspace +ENV DEFAULT_WORKSPACE_LOC="/config/default_workspace" +ENV HF_HOME="${DEFAULT_WORKSPACE_LOC}/weights/hf_home/" \ + LAION_CACHE_HOME="${DEFAULT_WORKSPACE_LOC}/weights/laion_cache/" + +# Set up pixi environments +COPY pixi.toml pixi.lock /opt/cosmos-curate/ + +# ========================================================================== +# Anyscale compatibility layer +# Ref: https://docs.anyscale.com/container-image/image-requirement.md +# +# Everything above runs as root. Everything below runs as ray. Critically the pixi envs need to be ray owned. 
+# ========================================================================== + +# Anyscale system packages +RUN set -euxo pipefail \ + && apt-get update -y \ + && apt-get install -y --no-install-recommends \ + sudo \ + tzdata \ + openssh-client \ + openssh-server \ + zip \ + unzip \ + gdb \ + curl \ + vim \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* \ + && mkdir -p /var/run/sshd + +# Rename ubuntu (uid 1000) -> ray and align with Anyscale requirements +# (uid 1000, gid 100, passwordless sudo). +RUN set -euxo pipefail \ + && groupmod -n ray users \ + && usermod -l ray -d /home/ray -m ubuntu \ + && usermod -u 1000 -g 100 ray \ + && usermod -aG sudo ray \ + && echo 'ray ALL=NOPASSWD: ALL' >> /etc/sudoers \ + && chown -R ray:ray /home/ray \ + && chown ray:ray /opt/cosmos-curate + +USER ray + +# ---------- pixi environments (owned by ray, no chown needed) ---------- +# If we install all the environments in a single layer, it's over 20GB and will cause slurm/NVCF to timeout pulling the +# layer. Since the cuml environment is large and needs non-overlapping RAPIDS packages, we install it separately. +RUN cd /opt/cosmos-curate && \ + export CONDA_OVERRIDE_CUDA=12.9.1 && \ + pixi install -e default -e model-download -e transformers -e unified --frozen && \ + pixi clean cache -y + +# ---------- Anyscale Python packages ---------- +# This also validates ray has write access to the pixi environments. +RUN set -euxo pipefail \ + && cd /opt/cosmos-curate \ + && pixi run -e default pip install --no-cache-dir \ + anyscale \ + packaging \ + boto3 \ + google \ + google-cloud-storage \ + terminado \ + && pixi run -e default pip install --no-cache-dir jupyterlab + +# Workspace shell setup (Anyscale workspace requirement). 
+RUN set -euxo pipefail \ + && echo 'PROMPT_COMMAND="history -a"' >> /home/ray/.bashrc \ + && echo '[ -e ~/.workspacerc ] && source ~/.workspacerc' >> /home/ray/.bashrc + +RUN sudo mkdir -p /cosmos_curate/config /config /anyscale/init \ + && sudo chown -R ray:ray /cosmos_curate /config /anyscale/init + +ENV PATH=/opt/cosmos-curate/.pixi/envs/default/bin:$PATH \ + HOME=/home/ray +WORKDIR /home/ray + +ENTRYPOINT [] +CMD ["bash"] diff --git a/cosmos_curate/docker/build_anyscale.sh b/cosmos_curate/docker/build_anyscale.sh new file mode 100755 index 0000000..3b10ee9 --- /dev/null +++ b/cosmos_curate/docker/build_anyscale.sh @@ -0,0 +1,12 @@ +TAG=${1:-1} +REPO_ROOT=$HOME/git/cosmos-curate +IMAGE=anyscale-cosmos-curate + +docker build \ + --ulimit nofile=65536 \ + --progress=auto \ + --network=host \ + -f anyscale-cosmos-curate.Dockerfile \ + -t ${IMAGE}:$TAG \ + -t ${IMAGE}:latest \ + $REPO_ROOT From b4b2a0d7c403cb6459fb82b99dcdda7891c43a7c Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sat, 28 Feb 2026 18:32:55 +0000 Subject: [PATCH 03/16] New --- .../docker/anyscale-cosmos-curate.Dockerfile | 6 +++++ cosmos_curate/docker/push_anyscale.sh | 22 +++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100755 cosmos_curate/docker/push_anyscale.sh diff --git a/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile b/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile index d52c1b1..fa3902f 100644 --- a/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile +++ b/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile @@ -193,6 +193,12 @@ RUN set -euxo pipefail \ terminado \ && pixi run -e default pip install --no-cache-dir jupyterlab +RUN cd /tmp \ + && curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o awscliv2.zip \ + && unzip -q awscliv2.zip \ + && sudo ./aws/install \ + && rm -rf aws awscliv2.zip + # Workspace shell setup (Anyscale workspace requirement). 
RUN set -euxo pipefail \ && echo 'PROMPT_COMMAND="history -a"' >> /home/ray/.bashrc \ diff --git a/cosmos_curate/docker/push_anyscale.sh b/cosmos_curate/docker/push_anyscale.sh new file mode 100755 index 0000000..847bd90 --- /dev/null +++ b/cosmos_curate/docker/push_anyscale.sh @@ -0,0 +1,22 @@ +TAG=${1:-1} +REGISTRY=${2:-aws} +IMAGE=anyscale-cosmos-curate +SRC=${IMAGE}:${TAG} + +if [ "$REGISTRY" = "aws" ]; then + AWS_ACCOUNT=367974485317 + AWS_REGION=us-west-2 + AWS_REPO=wagner-west-2 + DST_BASE=${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/${IMAGE} + aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com +else + PROJECT_ID=troubleshootingorg-gcp-pub + REGION=us-central1 + REPO=wagner-docker + DST_BASE=${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPO}/${IMAGE} +fi + +docker tag ${SRC} ${DST_BASE}:${TAG} +docker push ${DST_BASE}:${TAG} +docker tag ${SRC} ${DST_BASE}:latest +docker push ${DST_BASE}:latest From c18544805a59f92230077f1395a22d11fe5d1e3b Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sat, 28 Feb 2026 18:33:38 +0000 Subject: [PATCH 04/16] up --- cosmos_curate/docker/anyscale.Dockerfile | 76 ------------------- .../docker/build_and_push_anyscale.sh | 35 --------- cosmos_curate/docker/build_cosmos.sh | 3 +- 3 files changed, 1 insertion(+), 113 deletions(-) delete mode 100644 cosmos_curate/docker/anyscale.Dockerfile delete mode 100755 cosmos_curate/docker/build_and_push_anyscale.sh diff --git a/cosmos_curate/docker/anyscale.Dockerfile b/cosmos_curate/docker/anyscale.Dockerfile deleted file mode 100644 index 2bb2dca..0000000 --- a/cosmos_curate/docker/anyscale.Dockerfile +++ /dev/null @@ -1,76 +0,0 @@ -# syntax=docker/dockerfile:1.3-labs -# Anyscale-compatible image built on top of the Cosmos Curate image. -# Build with: docker build --platform linux/amd64 -f docker/anyscale.Dockerfile -t cosmos-curate:anyscale . 
- -ARG COSMOS_TAG=2 -FROM cosmos-curate:${COSMOS_TAG} - -SHELL ["/bin/bash", "-c"] -ENV DEBIAN_FRONTEND=noninteractive - -# Install required system packages. -RUN set -euxo pipefail \ - && apt-get update -y \ - && apt-get install -y --no-install-recommends \ - sudo \ - tzdata \ - openssh-client \ - openssh-server \ - rsync \ - zip \ - unzip \ - git \ - gdb \ - curl \ - vim \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* \ - && mkdir -p /var/run/sshd - -# Rename ubuntu -> ray and align uid/gid with Anyscale requirements. -RUN set -euxo pipefail \ - && groupmod -n ray users \ - && usermod -l ray -d /home/ray -m ubuntu \ - && usermod -u 1000 -g 100 ray \ - && usermod -aG sudo ray \ - && echo 'ray ALL=NOPASSWD: ALL' >> /etc/sudoers \ - && chown -R ray:ray /home/ray - - -# Install required Python packages into the default Pixi env. -# Note: Jupyter is optional; uncomment if needed for workspaces. -RUN set -euxo pipefail \ - && pixi run -e default pip install --no-cache-dir \ - anyscale \ - packaging \ - boto3 \ - google \ - google-cloud-storage \ - terminado \ - && pixi run -e default pip install --no-cache-dir jupyterlab - -# Workspace dependencies (optional but safe for all images). -RUN set -euxo pipefail \ - && echo 'PROMPT_COMMAND="history -a"' >> /home/ray/.bashrc \ - && echo '[ -e ~/.workspacerc ] && source ~/.workspacerc' >> /home/ray/.bashrc \ - && chown ray:ray /home/ray/.bashrc - -RUN mkdir -p /cosmos_curate/config /config \ - && chown -R ray /cosmos_curate /config -ENV COSMOS_S3_PROFILE_PATH=/mnt/user_storage/s3_creds_file -ENV PATH=/opt/cosmos-curate/.pixi/envs/default/bin:$PATH -ENV HOME=/home/ray -WORKDIR /home/ray -USER ray - -RUN sudo mkdir -p /anyscale/init -RUN sudo chown -R ray /anyscale/init -RUN <<'EOF' -sudo cat >/anyscale/init/init.sh <<'EOC' -ls -halrt /mnt/user_storage/ > /tmp/init_ls.log 2>&1 || true -EOC -EOF - -# Do not inherit base entrypoint; default to an interactive shell. 
-ENTRYPOINT [] -CMD ["bash"] diff --git a/cosmos_curate/docker/build_and_push_anyscale.sh b/cosmos_curate/docker/build_and_push_anyscale.sh deleted file mode 100755 index 4a590bf..0000000 --- a/cosmos_curate/docker/build_and_push_anyscale.sh +++ /dev/null @@ -1,35 +0,0 @@ -TAG=${1:-2} -COSMOS_TAG=${2:-2} -REGISTRY=aws -REPO_ROOT=$HOME/git/cosmos-curate -IMAGE=anyscale-cosmos-curate - -docker build \ - --ulimit nofile=65536 \ - --progress=auto \ - --network=host \ - --build-arg COSMOS_TAG=${COSMOS_TAG} \ - -f $REPO_ROOT/docker/anyscale.Dockerfile \ - -t ${IMAGE}:$TAG \ - -t ${IMAGE}:latest \ - $REPO_ROOT - -SRC=${IMAGE}:${TAG} - -if [ "$REGISTRY" = "aws" ]; then - AWS_ACCOUNT=367974485317 - AWS_REGION=us-west-2 - AWS_REPO=wagner-west-2 - DST_BASE=${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/${AWS_REPO} - aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com -else - PROJECT_ID=troubleshootingorg-gcp-pub - REGION=us-central1 - REPO=wagner-docker - DST_BASE=${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPO}/${IMAGE} -fi - -docker tag ${SRC} ${DST_BASE}:${TAG} -docker push ${DST_BASE}:${TAG} -docker tag ${SRC} ${DST_BASE}:latest -docker push ${DST_BASE}:latest diff --git a/cosmos_curate/docker/build_cosmos.sh b/cosmos_curate/docker/build_cosmos.sh index d605989..bf7a9b6 100755 --- a/cosmos_curate/docker/build_cosmos.sh +++ b/cosmos_curate/docker/build_cosmos.sh @@ -2,9 +2,8 @@ TAG=1 REPO_ROOT=$HOME/git/cosmos-curate docker build \ --ulimit nofile=65536 \ - --progress=auto \ --network=host \ - -f $REPO_ROOT/docker/cosmos-curate.Dockerfile \ + -f cosmos-curate.Dockerfile \ -t cosmos-curate:$TAG \ -t cosmos-curate:latest \ $REPO_ROOT From 7bc88d36f6bc7f2062e5fecd2a8523090d778a79 Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sat, 28 Feb 2026 18:35:26 +0000 Subject: [PATCH 05/16] Orig --- cosmos_curate/docker/cosmos-curate.Dockerfile | 45 ++++--------------- 
cosmos_curate/docker/generate_dockerfile.sh | 7 ++- 2 files changed, 13 insertions(+), 39 deletions(-) diff --git a/cosmos_curate/docker/cosmos-curate.Dockerfile b/cosmos_curate/docker/cosmos-curate.Dockerfile index db1be97..910c3c6 100644 --- a/cosmos_curate/docker/cosmos-curate.Dockerfile +++ b/cosmos_curate/docker/cosmos-curate.Dockerfile @@ -1,4 +1,3 @@ -# syntax=docker/dockerfile:1.3 # Dockerfile template for cosmos-curate # # The dockerfile is templated so that we can provide different conda env information. @@ -133,44 +132,16 @@ ENV DEFAULT_WORKSPACE_LOC="/config/default_workspace" ENV HF_HOME="${DEFAULT_WORKSPACE_LOC}/weights/hf_home/" \ LAION_CACHE_HOME="${DEFAULT_WORKSPACE_LOC}/weights/laion_cache/" -# Set permissive umask so all files in /opt/cosmos-curate are world-readable/writable. -# This avoids expensive recursive chown in downstream Dockerfiles. -RUN umask 0000 && mkdir -p /opt/cosmos-curate - # Set up pixi environments -COPY --chmod=666 pixi.toml pixi.lock /opt/cosmos-curate/ - -# Install each pixi environment in a separate layer to reduce individual layer sizes -# and improve pull/streaming performance. Cache mounts keep rattler cache outside layers. 
-RUN --mount=type=cache,target=/root/.cache/rattler \ - umask 0000 && cd /opt/cosmos-curate && \ - export CONDA_OVERRIDE_CUDA=12.9.1 && \ - pixi install -e default --frozen - -RUN --mount=type=cache,target=/root/.cache/rattler \ - umask 0000 && cd /opt/cosmos-curate && \ - export CONDA_OVERRIDE_CUDA=12.9.1 && \ - pixi install -e legacy-transformers --frozen - -RUN --mount=type=cache,target=/root/.cache/rattler \ - umask 0000 && cd /opt/cosmos-curate && \ - export CONDA_OVERRIDE_CUDA=12.9.1 && \ - pixi install -e model-download --frozen - -RUN --mount=type=cache,target=/root/.cache/rattler \ - umask 0000 && cd /opt/cosmos-curate && \ - export CONDA_OVERRIDE_CUDA=12.9.1 && \ - pixi install -e transformers --frozen - -RUN --mount=type=cache,target=/root/.cache/rattler \ - umask 0000 && cd /opt/cosmos-curate && \ +COPY pixi.toml pixi.lock /opt/cosmos-curate/ +# If we install all the environments in a single layer, it's over 20GB and will cause slurm/NVCF to timeout pulling the +# layer. Since the cuml environment is large and needs non-overlapping RAPIDS packages, we install it separately. +RUN cd /opt/cosmos-curate && \ export CONDA_OVERRIDE_CUDA=12.9.1 && \ - pixi install -e unified --frozen + pixi install -e default -e model-download -e transformers -e unified --frozen && \ + pixi clean cache -y -# RUN --mount=type=cache,target=/root/.cache/rattler \ -# umask 0000 && cd /opt/cosmos-curate && \ -# export CONDA_OVERRIDE_CUDA=12.9.1 && \ -# pixi install -e cuml --frozen +# Install the cuml environment separately if requested. 
# Run any hacky post-install script for each environment @@ -197,4 +168,4 @@ EXPOSE 8000 6379 WORKDIR /opt/cosmos-curate -CMD ["pixi", "run", "python", "cosmos_curate/scripts/onto_nvcf.py", "--helm", "True"] +CMD ["pixi", "run", "python", "cosmos_curate/scripts/onto_nvcf.py", "--helm", "True"] \ No newline at end of file diff --git a/cosmos_curate/docker/generate_dockerfile.sh b/cosmos_curate/docker/generate_dockerfile.sh index 04344a5..7031689 100755 --- a/cosmos_curate/docker/generate_dockerfile.sh +++ b/cosmos_curate/docker/generate_dockerfile.sh @@ -1,8 +1,11 @@ +# drop cuml,legacy-transformers from default envs built +CWD=$(pwd) REPO_ROOT=$HOME/git/cosmos-curate +cd $REPO_ROOT cosmos-curate image build \ --curator-path "${REPO_ROOT}" \ --image-name cosmos-curate \ --image-tag 1 \ --dry-run \ - --verbose \ - --dockerfile-output-path "${REPO_ROOT}/docker/cosmos-curate.Dockerfile" + --envs transformers,unified \ + --dockerfile-output-path "${CWD}/cosmos-curate.Dockerfile" From 83eb4bef0a74576e783bdfb8517c3e208edbe2e6 Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sat, 28 Feb 2026 18:46:43 +0000 Subject: [PATCH 06/16] Bake --- cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile b/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile index fa3902f..f1ac078 100644 --- a/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile +++ b/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile @@ -208,7 +208,8 @@ RUN sudo mkdir -p /cosmos_curate/config /config /anyscale/init \ && sudo chown -R ray:ray /cosmos_curate /config /anyscale/init ENV PATH=/opt/cosmos-curate/.pixi/envs/default/bin:$PATH \ - HOME=/home/ray + HOME=/home/ray \ + PIXI_PROJECT_MANIFEST=/opt/cosmos-curate/pixi.toml WORKDIR /home/ray ENTRYPOINT [] From c2a2cac1c1d25d2ca66f90b243aed920dd5b751a Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sat, 28 Feb 2026 12:49:47 -0600 
Subject: [PATCH 07/16] ups --- cosmos_curate/all_nodes_init_script.py | 23 +++++++++++++++-------- cosmos_curate/hello_world.yaml | 9 +++------ cosmos_curate/reference_pipeline.yaml | 10 ++++------ cosmos_curate/write_s3_creds_file.sh | 2 +- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/cosmos_curate/all_nodes_init_script.py b/cosmos_curate/all_nodes_init_script.py index 7f8950b..93ec81f 100644 --- a/cosmos_curate/all_nodes_init_script.py +++ b/cosmos_curate/all_nodes_init_script.py @@ -1,24 +1,31 @@ +import sys import ray import subprocess from time import perf_counter as pc SCRIPT = """ set -e -cp /mnt/user_storage/cosmos-config.yaml /cosmos_curate/config/cosmos_curate.yaml -# Hello World -pixi run -e model-download python -m cosmos_curate.core.managers.model_cli download --models gpt2 -# Reference Video Pipeline -pixi run -e model-download python -m cosmos_curate.core.managers.model_cli download --models qwen2.5_vl,transnetv2,internvideo2_mm,bert +echo '---------------------------------------' +echo '---------------------------------------' +pwd +ls -hlart +bash write_s3_creds_file.sh +cp cosmos_curate_tokens.yaml /cosmos_curate/config/cosmos_curate.yaml +pixi run -e model-download python -m cosmos_curate.core.managers.model_cli download --models {models} +echo '---------------------------------------' +echo '---------------------------------------' """ @ray.remote(num_cpus=0) -def run_init(): +def run_init(script): try: - return subprocess.check_output(SCRIPT, shell=True, stderr=subprocess.STDOUT) + return subprocess.check_output(script, shell=True, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: raise RuntimeError(f"Init script failed (exit code {e.returncode}):\n{e.output.decode()}") from None if __name__ == "__main__": + models = sys.argv[1] + script = SCRIPT.format(models=models) t = pc() ray.init(address="auto") nodes = [n for n in ray.nodes() if n["Alive"]] @@ -27,7 +34,7 @@ def run_init(): 
scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( node_id=n["NodeID"], soft=False ) - ).remote() + ).remote(script) for n in nodes ] print(f"Downloading models on {len(tasks)} nodes...") diff --git a/cosmos_curate/hello_world.yaml b/cosmos_curate/hello_world.yaml index 97eccc9..08d27ed 100644 --- a/cosmos_curate/hello_world.yaml +++ b/cosmos_curate/hello_world.yaml @@ -1,8 +1,8 @@ name: custom-image-cosmos -image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/wagner-west-2:15 +image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/anyscale-cosmos-curate:3 ray_version: 2.48.0 entrypoint: > - python all_nodes_init_script.py + python all_nodes_init_script.py gpt2 && pixi run python -m cosmos_curate.pipelines.examples.hello_world_pipeline py_modules: ["/Users/davidwagner/git/davidwagnerkc/cosmos-curate"] compute_config: @@ -13,13 +13,10 @@ compute_config: GPU: 0 flags: {} worker_nodes: - - instance_type: g6e.2xlarge + - instance_type: g4dn.xlarge flags: {} min_nodes: 1 max_nodes: 1 market_type: ON_DEMAND working_dir: "." 
max_retries: 0 -env_vars: - PIXI_PROJECT_MANIFEST: /opt/cosmos-curate/pixi.toml - COSMOS_S3_PROFILE_PATH: /mnt/user_storage/s3_creds_file diff --git a/cosmos_curate/reference_pipeline.yaml b/cosmos_curate/reference_pipeline.yaml index f9e8c02..ee327db 100644 --- a/cosmos_curate/reference_pipeline.yaml +++ b/cosmos_curate/reference_pipeline.yaml @@ -1,12 +1,12 @@ name: custom-image-cosmos -image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/wagner-west-2:15 +image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/anyscale-cosmos-curate:2 ray_version: 2.48.0 entrypoint: > - python all_nodes_init_script.py + python all_nodes_init_script.py qwen2.5_vl,transnetv2,internvideo2_mm,bert && pixi run python -m cosmos_curate.pipelines.video.run_pipeline split --input-video-path "s3://ray-example-data/videos/Hollywood2-actions-videos/Hollywood2/AVIClips/" - --output-clip-path "/mnt/user_storage/output_clips3/" -py_modules: ["/Users/davidwagner/git/cosmos-curate"] + --output-clip-path "/mnt/user_storage/output_clips/" +py_modules: ["/Users/davidwagner/git/davidwagnerkc/cosmos-curate"] compute_config: head_node: instance_type: m5.2xlarge @@ -22,5 +22,3 @@ compute_config: market_type: ON_DEMAND working_dir: "." 
max_retries: 0 -env_vars: - PIXI_PROJECT_MANIFEST: /opt/cosmos-curate/pixi.toml diff --git a/cosmos_curate/write_s3_creds_file.sh b/cosmos_curate/write_s3_creds_file.sh index 7d0e9db..b0d6285 100755 --- a/cosmos_curate/write_s3_creds_file.sh +++ b/cosmos_curate/write_s3_creds_file.sh @@ -1,5 +1,5 @@ eval $(aws configure export-credentials --format env) -cat > /mnt/user_storage/s3_creds_file < /dev/shm/s3_creds_file < Date: Sat, 28 Feb 2026 21:40:37 +0000 Subject: [PATCH 08/16] legacy --- cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile | 4 ++++ cosmos_curate/docker/cosmos-curate.Dockerfile | 2 +- cosmos_curate/docker/generate_dockerfile.sh | 4 ++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile b/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile index f1ac078..5ab7b89 100644 --- a/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile +++ b/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile @@ -135,6 +135,7 @@ ENV HF_HOME="${DEFAULT_WORKSPACE_LOC}/weights/hf_home/" \ # Set up pixi environments COPY pixi.toml pixi.lock /opt/cosmos-curate/ + # ========================================================================== # Anyscale compatibility layer # Ref: https://docs.anyscale.com/container-image/image-requirement.md @@ -207,6 +208,9 @@ RUN set -euxo pipefail \ RUN sudo mkdir -p /cosmos_curate/config /config /anyscale/init \ && sudo chown -R ray:ray /cosmos_curate /config /anyscale/init +# Model registry needed by cosmos-curate at import time +COPY cosmos_curate/configs/all_models.json /opt/cosmos-curate/cosmos_curate/configs/all_models.json + ENV PATH=/opt/cosmos-curate/.pixi/envs/default/bin:$PATH \ HOME=/home/ray \ PIXI_PROJECT_MANIFEST=/opt/cosmos-curate/pixi.toml diff --git a/cosmos_curate/docker/cosmos-curate.Dockerfile b/cosmos_curate/docker/cosmos-curate.Dockerfile index 910c3c6..779b449 100644 --- a/cosmos_curate/docker/cosmos-curate.Dockerfile +++ 
b/cosmos_curate/docker/cosmos-curate.Dockerfile @@ -138,7 +138,7 @@ COPY pixi.toml pixi.lock /opt/cosmos-curate/ # layer. Since the cuml environment is large and needs non-overlapping RAPIDS packages, we install it separately. RUN cd /opt/cosmos-curate && \ export CONDA_OVERRIDE_CUDA=12.9.1 && \ - pixi install -e default -e model-download -e transformers -e unified --frozen && \ + pixi install -e default -e legacy-transformers -e model-download -e transformers -e unified --frozen && \ pixi clean cache -y # Install the cuml environment separately if requested. diff --git a/cosmos_curate/docker/generate_dockerfile.sh b/cosmos_curate/docker/generate_dockerfile.sh index 7031689..9d0dd42 100755 --- a/cosmos_curate/docker/generate_dockerfile.sh +++ b/cosmos_curate/docker/generate_dockerfile.sh @@ -1,4 +1,4 @@ -# drop cuml,legacy-transformers from default envs built +# drop cuml from default envs built CWD=$(pwd) REPO_ROOT=$HOME/git/cosmos-curate cd $REPO_ROOT @@ -7,5 +7,5 @@ cosmos-curate image build \ --image-name cosmos-curate \ --image-tag 1 \ --dry-run \ - --envs transformers,unified \ + --envs legacy-transformers,transformers,unified \ --dockerfile-output-path "${CWD}/cosmos-curate.Dockerfile" From a8c89edfe59d8d3ae5f0396b5f76041f6a168806 Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sat, 28 Feb 2026 21:41:12 +0000 Subject: [PATCH 09/16] leg --- cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile b/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile index 5ab7b89..7cf9132 100644 --- a/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile +++ b/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile @@ -178,7 +178,7 @@ USER ray # layer. Since the cuml environment is large and needs non-overlapping RAPIDS packages, we install it separately. 
RUN cd /opt/cosmos-curate && \ export CONDA_OVERRIDE_CUDA=12.9.1 && \ - pixi install -e default -e model-download -e transformers -e unified --frozen && \ + pixi install -e default -e legacy-transformers -e model-download -e transformers -e unified --frozen && \ pixi clean cache -y # ---------- Anyscale Python packages ---------- From 49292bf4d6c632a215363a87e15081db2ac6cea9 Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sat, 28 Feb 2026 19:34:39 -0600 Subject: [PATCH 10/16] Go --- cosmos_curate/all_nodes_init_script.py | 4 ++-- cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile | 5 +++++ cosmos_curate/hello_world.yaml | 4 ++-- cosmos_curate/reference_pipeline.yaml | 4 ++-- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cosmos_curate/all_nodes_init_script.py b/cosmos_curate/all_nodes_init_script.py index 93ec81f..7955f30 100644 --- a/cosmos_curate/all_nodes_init_script.py +++ b/cosmos_curate/all_nodes_init_script.py @@ -19,9 +19,9 @@ @ray.remote(num_cpus=0) def run_init(script): try: - return subprocess.check_output(script, shell=True, stderr=subprocess.STDOUT) + subprocess.check_call(script, shell=True, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: - raise RuntimeError(f"Init script failed (exit code {e.returncode}):\n{e.output.decode()}") from None + raise RuntimeError(f"Init script failed (exit code {e.returncode})") from None if __name__ == "__main__": models = sys.argv[1] diff --git a/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile b/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile index f1ac078..a1296e3 100644 --- a/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile +++ b/cosmos_curate/docker/anyscale-cosmos-curate.Dockerfile @@ -199,6 +199,11 @@ RUN cd /tmp \ && sudo ./aws/install \ && rm -rf aws awscliv2.zip +# ---------- cosmos-curate source code ---------- +COPY --chown=ray:ray cosmos_curate /opt/cosmos-curate/cosmos_curate +COPY --chown=ray:ray tests /opt/cosmos-curate/tests +COPY 
--chown=ray:ray pytest.ini .coveragerc /opt/cosmos-curate/ + # Workspace shell setup (Anyscale workspace requirement). RUN set -euxo pipefail \ && echo 'PROMPT_COMMAND="history -a"' >> /home/ray/.bashrc \ diff --git a/cosmos_curate/hello_world.yaml b/cosmos_curate/hello_world.yaml index 08d27ed..fbbc757 100644 --- a/cosmos_curate/hello_world.yaml +++ b/cosmos_curate/hello_world.yaml @@ -1,5 +1,5 @@ -name: custom-image-cosmos -image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/anyscale-cosmos-curate:3 +name: cosmos-curate-hello-world +image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/anyscale-cosmos-curate:4 ray_version: 2.48.0 entrypoint: > python all_nodes_init_script.py gpt2 diff --git a/cosmos_curate/reference_pipeline.yaml b/cosmos_curate/reference_pipeline.yaml index ee327db..d19348f 100644 --- a/cosmos_curate/reference_pipeline.yaml +++ b/cosmos_curate/reference_pipeline.yaml @@ -1,5 +1,5 @@ -name: custom-image-cosmos -image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/anyscale-cosmos-curate:2 +name: cosmos-curate-reference-pipeline +image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/anyscale-cosmos-curate:4 ray_version: 2.48.0 entrypoint: > python all_nodes_init_script.py qwen2.5_vl,transnetv2,internvideo2_mm,bert From 79a8afa5d4cbe9429491df809f4bea810cb57feb Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sat, 28 Feb 2026 20:26:00 -0600 Subject: [PATCH 11/16] Go --- cosmos_curate/README.md | 10 ---------- cosmos_curate/hello_world.yaml | 4 ++-- cosmos_curate/reference_pipeline.yaml | 8 ++++---- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/cosmos_curate/README.md b/cosmos_curate/README.md index 31322e5..d1f4284 100644 --- a/cosmos_curate/README.md +++ b/cosmos_curate/README.md @@ -1,16 +1,6 @@ # Cosmos Curate -* awscli in image -* remove `COSMOS_S3_PROFILE_PATH` use default -* bake in PIXI for workspaces -* 1 layer for pixi again? * can we eliminate code from being in the image at all for clarity? 
-* validate py_modules vs. baked in vs. entrypoint pixi run ref - -* ideally you can easily configure the (1) models downloaded (2) pipeline you run with options. entrypoint for (2), but (1) is hidden in script in a script -* ideally just a local cred file would be sufficient - just broadcast to nodes -* ideally do not think about s3 at all -* model download output should stream to stdout This repository has example Anyscale Jobs for the `cosmos-curate` Hello World & Reference Video Pipelines. diff --git a/cosmos_curate/hello_world.yaml b/cosmos_curate/hello_world.yaml index fbbc757..59501f0 100644 --- a/cosmos_curate/hello_world.yaml +++ b/cosmos_curate/hello_world.yaml @@ -1,10 +1,10 @@ name: cosmos-curate-hello-world -image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/anyscale-cosmos-curate:4 +image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/anyscale-cosmos-curate:6 ray_version: 2.48.0 entrypoint: > python all_nodes_init_script.py gpt2 && pixi run python -m cosmos_curate.pipelines.examples.hello_world_pipeline -py_modules: ["/Users/davidwagner/git/davidwagnerkc/cosmos-curate"] +py_modules: ["./cosmos-curate"] compute_config: head_node: instance_type: m5.2xlarge diff --git a/cosmos_curate/reference_pipeline.yaml b/cosmos_curate/reference_pipeline.yaml index d19348f..971c0f3 100644 --- a/cosmos_curate/reference_pipeline.yaml +++ b/cosmos_curate/reference_pipeline.yaml @@ -1,15 +1,15 @@ name: cosmos-curate-reference-pipeline -image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/anyscale-cosmos-curate:4 +image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/anyscale-cosmos-curate:6 ray_version: 2.48.0 entrypoint: > python all_nodes_init_script.py qwen2.5_vl,transnetv2,internvideo2_mm,bert && pixi run python -m cosmos_curate.pipelines.video.run_pipeline split --input-video-path "s3://ray-example-data/videos/Hollywood2-actions-videos/Hollywood2/AVIClips/" - --output-clip-path "/mnt/user_storage/output_clips/" -py_modules: 
["/Users/davidwagner/git/davidwagnerkc/cosmos-curate"] + --output-clip-path "/mnt/user_storage/output_clips_4x1/" +py_modules: ["./cosmos-curate"] compute_config: head_node: - instance_type: m5.2xlarge + instance_type: m5.2xlarge resources: CPU: 8 GPU: 0 From 5fe2294cd5eb476f9cb59bf4e9a0095811671a7d Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sat, 28 Feb 2026 20:27:49 -0600 Subject: [PATCH 12/16] ignore --- cosmos_curate/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 cosmos_curate/.gitignore diff --git a/cosmos_curate/.gitignore b/cosmos_curate/.gitignore new file mode 100644 index 0000000..3a3397f --- /dev/null +++ b/cosmos_curate/.gitignore @@ -0,0 +1 @@ +cosmos-curate From e9b30a5860156c33363ed64de903e20de3d86ee6 Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sun, 1 Mar 2026 08:30:52 -0600 Subject: [PATCH 13/16] Up --- cosmos_curate/reference_pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cosmos_curate/reference_pipeline.yaml b/cosmos_curate/reference_pipeline.yaml index 971c0f3..b28602a 100644 --- a/cosmos_curate/reference_pipeline.yaml +++ b/cosmos_curate/reference_pipeline.yaml @@ -5,7 +5,7 @@ entrypoint: > python all_nodes_init_script.py qwen2.5_vl,transnetv2,internvideo2_mm,bert && pixi run python -m cosmos_curate.pipelines.video.run_pipeline split --input-video-path "s3://ray-example-data/videos/Hollywood2-actions-videos/Hollywood2/AVIClips/" - --output-clip-path "/mnt/user_storage/output_clips_4x1/" + --output-clip-path "/mnt/user_storage/output_clips/" py_modules: ["./cosmos-curate"] compute_config: head_node: From fd56b5612f1dc7b93a0dcc9a9ddf918891d973a6 Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sun, 1 Mar 2026 08:48:23 -0600 Subject: [PATCH 14/16] prints --- cosmos_curate/all_nodes_init_script.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cosmos_curate/all_nodes_init_script.py b/cosmos_curate/all_nodes_init_script.py index 7955f30..412810e 100644 --- 
a/cosmos_curate/all_nodes_init_script.py +++ b/cosmos_curate/all_nodes_init_script.py @@ -5,15 +5,9 @@ SCRIPT = """ set -e -echo '---------------------------------------' -echo '---------------------------------------' -pwd -ls -hlart bash write_s3_creds_file.sh cp cosmos_curate_tokens.yaml /cosmos_curate/config/cosmos_curate.yaml pixi run -e model-download python -m cosmos_curate.core.managers.model_cli download --models {models} -echo '---------------------------------------' -echo '---------------------------------------' """ @ray.remote(num_cpus=0) From b63c4fd62fd3779025b6f315aacebd55667ee65c Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sun, 1 Mar 2026 09:42:23 -0600 Subject: [PATCH 15/16] draft readme --- cosmos_curate/README.md | 116 ++++++++++++++++++++++++++-------------- 1 file changed, 76 insertions(+), 40 deletions(-) diff --git a/cosmos_curate/README.md b/cosmos_curate/README.md index d1f4284..e880ec8 100644 --- a/cosmos_curate/README.md +++ b/cosmos_curate/README.md @@ -1,10 +1,8 @@ # Cosmos Curate -* can we eliminate code from being in the image at all for clarity? - This repository has example Anyscale Jobs for the `cosmos-curate` Hello World & Reference Video Pipelines. -To run these on Anyscale looks like: +To run these jobs on Anyscale we will run: ``` anyscale job submit -f hello_world.yaml ``` @@ -14,73 +12,111 @@ or: anyscale job submit -f reference_video_pipeline.yaml ``` -The `entrypoint:` in each job will run: +First we need a few things: + +### 1. Docker image + +Update the ECR information in `./push_anyscale.sh` with your own repo then: +``` +TAG=1 +./build_anyscale.sh $TAG && ./push_anyscale.sh $TAG +``` + +The `anyscale-cosmos-curate.Dockerfile` adds [Anyscale requirements](https://docs.anyscale.com/container-image/image-requirement) prior to building the `pixi` layers as `chown`'ing these layers later almost doubles the image size. 
This image used `./generate_dockerfile.sh` from `cosmos-curate` repo to generate the `cosmos-curate.Dockerfile` without `cuml` env then added the Anyscale portion to that generated Dockerfile. + +You can update the jobs' `image_uri:` with your image once it is built and pushed. -1. **python all_nodes_init_script.py** +### 2. cosmos_curate.yaml (API auth) -This runs the same script on all nodes to initialize state of the cluster. In particular +`cosmos-curate` expects `/cosmos_curate/config/cosmos_curate.yaml` to control the authentication to APIs and model registries. `huggingface` is all that is required to run the two examples in this repo. You can add your credentials locally, and when the job runs the `entrypoint:` will distribute them to all nodes at that path with `all_nodes_init_script.py`. -(a) We have to set the hardcoded **/cosmos_curate/config/cosmos_curate.yaml** on each node from the shared storage `/mnt/user_storage/`. This is how `cosmos-curate` does API and model authentication. +``` +μ cat cosmos_curate_tokens.yaml +huggingface: + user: "" + api_key: "" +``` -(b) We need to write our S3 creds to `COSMOS_S3_PROFILE_PATH` configurable default path of `/dev/shm/s3_creds_file`. +### 3. s3_creds_file.yaml (S3 auth) -(c) We run the `pixi run -e model-download python -m cosmos_curate.core.managers.model_cli download --models gpt2` commands to download models for the job. +`cosmos-curate` expects an S3 credential file at `/dev/shm/s3_creds_file`. This is configurable by `COSMOS_S3_PROFILE_PATH`. For this example the jobs run on AWS where the IAM has S3 permissions, so we use the `aws` cli to write out temporary credentials for the job to this path. -2. **pixi run python -m cosmos_curate.pipelines.examples.hello_world_pipeline** +If you need to authenticate in a different way need to ensure this file is written and distributed to all nodes at the expected filepath. -This is the actual pipeline entrypoint command.
The `pixi run` depends on `PIXI_PROJECT_MANIFEST` being properly set to match what was built into the image. +### Cosmos Curate on Anyscale -Where in turn +Let's breakdown the the `reference_video_pipeline.yaml` to get a sense for how the setup comes together, starting from defining the hardware we want to use up to the user code defining the pipeline. +### 1. Compute Config + +This defines the nodes we will require to run the pipeline. Typically the head nodes in Ray clusters should be set to have zero resources, but the `cosmos-curate` library expects it. ``` -name: custom-image-cosmos -image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/wagner-west-2:15 -ray_version: 2.48.0 -entrypoint: > - python all_nodes_init_script.py - && pixi run python -m cosmos_curate.pipelines.examples.hello_world_pipeline -py_modules: ["/Users/davidwagner/git/davidwagnerkc/cosmos-curate"] compute_config: head_node: - instance_type: m5.2xlarge + instance_type: m5.2xlarge resources: CPU: 8 GPU: 0 flags: {} worker_nodes: - - instance_type: g6e.2xlarge + - instance_type: g6e.4xlarge flags: {} - min_nodes: 1 - max_nodes: 1 + min_nodes: 4 + max_nodes: 4 market_type: ON_DEMAND -working_dir: "." -max_retries: 0 -env_vars: - PIXI_PROJECT_MANIFEST: /opt/cosmos-curate/pixi.toml ``` -Key state of the `cosmos-curate` setup: +The reference video pipeline defaults to 4 1xL40S instances. The logs at end of pipeline will report on runtimes. 
Here are 4 GPUs compared to 16 GPUs for ~1k videos, about 3h of video: + +4 GPUs took 44m +``` +2026-02-28 19:12:46.030 | INFO | cosmos_curate.pipelines.video.splitting_pipeline:split:703 - Split-Transcode-Filter-Annotate pipeline: input_build_time=0.01 / pipeline_run_time=44.26 / summary_run_time=0.02 mins processing time for total_video_length=3.191 hours of raw videos +``` + +16 GPUs took 13m +``` +2026-03-01 05:56:58.599 | INFO | cosmos_curate.pipelines.video.splitting_pipeline:split:703 - Split-Transcode-Filter-Annotate pipeline: input_build_time=0.01 / pipeline_run_time=12.71 / summary_run_time=0.01 mins processing time for total_video_length=3.191 hours of raw videos +``` -**/cosmos_curate/config/cosmos_curate.yaml**: +### 2. Image -Where the +This block defines the name of the job, the image all the nodes will start with, and, since the image is custom built, the Ray version we expect to be running. +``` +image_uri: 367974485317.dkr.ecr.us-west-2.amazonaws.com/anyscale-cosmos-curate:6 +ray_version: 2.48.0 +``` -To run `comsmos-curate` on Anyscale you need: +When the job runs it will acquire all the nodes and use our image which handles a few things for us: +* All `pixi` environments are built into the container +* While on Anyscale we typically just use `working_dir` or `py_modules` to ship code for use at runtime, `cosmos-curate` expects code at `/opt/cosmos-curate/cosmos_curate` so there is a copy of the code in there as well for referencing the `all_models.json` file and some other configurations. +* We set `PIXI_PROJECT_MANIFEST` in the image so that runtime `pixi run` calls (whether by the `entrypoint:` or in the pipeline model classes `py_executable` to enable switching between different envs for specific models) all know where these environments are built and cached. The `default` `pixi` environment is the default `python` on `PATH` if you call `python` directly outside of `pixi run`. -1. Anyscale compatible Docker image. +### 3. Runtime Environment -2.
File based authentication +Anyscale will ship your `working_dir` which should be the `examples/cosmos-curate/` directory. This allows us to access files for setting up the nodes, addition python scripts to run, python packages, etc. This allows us to generally update code running on the image without requiring rebuild. -# Building Anyscale Compatible Docker Image +`py_modules` grabs local copy of `cosmos-curate` and makes it available vs. leaning on being in the `/opt/cosmos-curate/cosmos_curate/` directory for all execution required in the Docker image. -Can skip steps (0) and (1) as the `cosmos-curate.Dockerfile` is committed (and modified to layer per `pixi` env for faster pulling). This is how other `cosmos-curate` build configurations can be built. If you already have an image built you can start at (4) updating the image name and tag to build an Anyscale compatible image on top. +``` +py_modules: ["./cosmos-curate"] +working_dir: "." +``` -Inside the `docker/` folder: +### 4. Entrypoint -0. `pip install -e .` inside of the **cosmos-curate/** repo to make the `cosmos-curate` cli command available +The `entrypoint:` will be executed on the head node only. Typically this might be as simple as `entrypoint: python main.py`, but for `cosmos-curate` we want to coordinate some startup logic so use Ray to distribute initialization logic before executing the main entrypoint from the `cosmos_curate` library. -1. `./generate_dockerfile.sh` to create **cosmos-curate.Dockerfile** +``` +entrypoint: > + python all_nodes_init_script.py qwen2.5_vl,transnetv2,internvideo2_mm,bert + && pixi run python -m cosmos_curate.pipelines.video.run_pipeline split + --input-video-path "s3://ray-example-data/videos/Hollywood2-actions-videos/Hollywood2/AVIClips/" + --output-clip-path "/mnt/user_storage/output_clips/" +``` -2. `./build_cosmos.sh`. to produce `cosmos-curate:1` image to build Anyscale image on. 
+The `all_nodes_init_script.py` handles a few initialization steps for the cluster: +1. Use `write_s3_creds_file.sh` to put an S3 credential file where it is expected on each node. +2. Copy our local `cosmos_curate_tokens.yaml` to the expected location on each not for API and model registry auth. +3. Use the `model-download` `pixi` env to run `python -m cosmos_curate.core.managers.model_cli download` and pass the list of models needed for the pipeline we are going to run (if you do not specify the models it will download all models which takes a while and 500GB+ of space). -4. `./build_and_push_anyscale.sh` to build and push `anyscale-cosmos-curate:1` image. +Now the actual pipelne uses the default `pixi` env to run `python -m cosmos_curate.pipelines.video.run_pipeline split`. There are many cli options you can pass to the pipelines `cosmos-curate` provides, but here we just set the minimal input and output paths and accept the rest as default. From b4b77eb13d033f89eac39c0a89a856975b027cd4 Mon Sep 17 00:00:00 2001 From: David Wagner Date: Sun, 1 Mar 2026 09:57:50 -0600 Subject: [PATCH 16/16] re-org from editor in cheif claud --- cosmos_curate/README.md | 48 ++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/cosmos_curate/README.md b/cosmos_curate/README.md index e880ec8..9eebb1c 100644 --- a/cosmos_curate/README.md +++ b/cosmos_curate/README.md @@ -1,18 +1,26 @@ # Cosmos Curate -This repository has example Anyscale Jobs for the `cosmos-curate` Hello World & Reference Video Pipelines. +This repository is an example of running NVIDIA [cosmos-curate](https://github.com/nvidia-cosmos/cosmos-curate) pipelines on Anyscale. Examples include the Hello World and Reference Video Pipelines. 
-To run these jobs on Anyscale we will run: -``` -anyscale job submit -f hello_world.yaml -``` +## Prerequisites -or: +- An [Anyscale account](https://console.anyscale.com/) with the `anyscale` CLI installed (`pip install anyscale`) +- An AWS account with ECR access (for pushing the Docker image) and S3 permissions on the nodes +- A local clone (or symlink, e.g. `ln -sf /path/to/cosmos-curate ./cosmos-curate`) of the [`cosmos-curate`](https://github.com/nvidia-cosmos/cosmos-curate) repo (see [Runtime Environment](#3-runtime-environment) for details) + +Your directory layout should look like: ``` -anyscale job submit -f reference_video_pipeline.yaml +cosmos_curate/ # this directory +├── cosmos-curate/ # clone of the cosmos-curate repo +├── docker/ +├── hello_world.yaml +├── reference_pipeline.yaml +├── all_nodes_init_script.py +├── cosmos_curate_tokens.yaml +└── ... ``` -First we need a few things: +## Setup ### 1. Docker image @@ -43,7 +51,22 @@ huggingface: If you need to authenticate in a different way, you need to ensure this file is written and distributed to all nodes at the expected filepath. -### Cosmos Curate on Anyscale +## Run + +The Hello World Pipeline runs in a few minutes and only requires 1 T4 GPU node. + +``` +anyscale job submit -f hello_world.yaml +``` + +The Reference Video Pipeline will take ~45m with the default setup of 4 L40S GPUs on ~3h of video. +``` +anyscale job submit -f reference_pipeline.yaml +``` + +## How It Works + +### Cosmos Curate on Anyscale Let's break down the `reference_pipeline.yaml` to get a sense for how the setup comes together, starting from defining the hardware we want to use up to the user code defining the pipeline. @@ -95,7 +118,7 @@ When the job runs it will acquire all the nodes and use our image which handles Anyscale will ship your `working_dir` which should be the `examples/cosmos-curate/` directory. This allows us to access files for setting up the nodes, additional Python scripts to run, python packages, etc.
This allows us to generally update code running on the image without requiring a rebuild. -`py_modules` grabs local copy of `cosmos-curate` and makes it available vs. leaning on being in the `/opt/cosmos-curate/cosmos_curate/` directory for all execution required in the Docker image. +`py_modules` packages a local clone of the `cosmos-curate` repo (the `./cosmos-curate` directory listed in [Prerequisites](#prerequisites)) and ships it to all nodes at runtime. This lets you iterate on `cosmos-curate` source code without rebuilding the Docker image, overriding the copy baked into the image at `/opt/cosmos-curate/cosmos_curate/`. ``` py_modules: ["./cosmos-curate"] @@ -114,9 +137,14 @@ entrypoint: > --output-clip-path "/mnt/user_storage/output_clips/" ``` +#### python all_nodes_init_script.py + The `all_nodes_init_script.py` handles a few initialization steps for the cluster: + 1. Use `write_s3_creds_file.sh` to put an S3 credential file where it is expected on each node. 2. Copy our local `cosmos_curate_tokens.yaml` to the expected location on each node for API and model registry auth. 3. Use the `model-download` `pixi` env to run `python -m cosmos_curate.core.managers.model_cli download` and pass the list of models needed for the pipeline we are going to run (if you do not specify the models it will download all models which takes a while and 500GB+ of space). +#### python -m cosmos_curate.pipelines.video.run_pipeline split + Now the actual pipeline uses the default `pixi` env to run `python -m cosmos_curate.pipelines.video.run_pipeline split`. There are many cli options you can pass to the pipelines `cosmos-curate` provides, but here we just set the minimal input and output paths and accept the rest as default.