From dc40fb554680fbc30c842c0250238823a4542909 Mon Sep 17 00:00:00 2001
From: linnan wang
Date: Mon, 26 Jan 2026 23:20:12 -0800
Subject: [PATCH] add weekly tests and align with huy's exp

Signed-off-by: linnan wang
---
 .../launch_pretrain_wan21_weekly_image.sh     | 89 +++++++++++++++++++
 .../launch_pretrain_wan21_weekly_video.sh     | 89 +++++++++++++++++++
 .../cicd/wan21_cicd_weekly_image.yaml         | 62 +++++++++++++
 .../cicd/wan21_cicd_weekly_video.yaml         | 62 +++++++++++++
 4 files changed, 302 insertions(+)
 create mode 100644 examples/automodel/cicd_convergence_tests/wan21/weekly/launch_pretrain_wan21_weekly_image.sh
 create mode 100644 examples/automodel/cicd_convergence_tests/wan21/weekly/launch_pretrain_wan21_weekly_video.sh
 create mode 100644 examples/automodel/pretrain/cicd/wan21_cicd_weekly_image.yaml
 create mode 100644 examples/automodel/pretrain/cicd/wan21_cicd_weekly_video.yaml

diff --git a/examples/automodel/cicd_convergence_tests/wan21/weekly/launch_pretrain_wan21_weekly_image.sh b/examples/automodel/cicd_convergence_tests/wan21/weekly/launch_pretrain_wan21_weekly_image.sh
new file mode 100644
index 00000000..26f039c9
--- /dev/null
+++ b/examples/automodel/cicd_convergence_tests/wan21/weekly/launch_pretrain_wan21_weekly_image.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+#SBATCH -A coreai_dlalgo_llm
+#SBATCH -p batch
+#SBATCH -N 1
+#SBATCH --ntasks-per-node 1
+#SBATCH --gpus-per-node=8
+#SBATCH --time 04:00:00
+#SBATCH --exclusive
+#SBATCH --output=./CICD_weekly_RUN_slurm_%x_%j.out
+#SBATCH --error=./CICD_weekly_RUN_slurm_%x_%j.err
+#SBATCH -J DFM_Multinode
+
+# Multi-node env: MASTER_ADDR is the first hostname in the Slurm node list; WORLD_SIZE = NUM_GPUS * SLURM_NNODES
+export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+export MASTER_PORT=29500
+export NUM_GPUS=8
+export WORLD_SIZE=$(($NUM_GPUS * $SLURM_NNODES))
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+export NCCL_NVLS_ENABLE=0
+
+
+# Experiment env
+# WANDB_API_KEY is read from the submitting environment (never commit it); the script aborts if it is unset or empty
+export WANDB_API_KEY="${WANDB_API_KEY:?WANDB_API_KEY must be set in the submitting environment}"
+export HF_HOME="/linnanw/hdvilla_sample/cache"
+export HF_TOKEN=""
+
+
+# SHARED paths on Lustre (visible to ALL nodes)
+# TODO: update the path
+UV_SHARED_DIR="/lustre/fsw/portfolios/coreai/users/linnanw/uv_cache/${SLURM_JOB_ID}"
+
+# Step 1: Pre-build on a SINGLE node first (avoids race conditions)
+# Create a shared venv on LUSTRE that ALL nodes can access
+read -r -d '' PREBUILD_CMD <