From dc40fb554680fbc30c842c0250238823a4542909 Mon Sep 17 00:00:00 2001
From: linnan wang <linnanw@nvidia.com>
Date: Mon, 26 Jan 2026 23:20:12 -0800
Subject: [PATCH] add weekly tests and align with huy's exp

Signed-off-by: linnan wang <linnanw@nvidia.com>
---
 .../launch_pretrain_wan21_weekly_image.sh     | 89 +++++++++++++++++++
 .../launch_pretrain_wan21_weekly_video.sh     | 89 +++++++++++++++++++
 .../cicd/wan21_cicd_weekly_image.yaml         | 62 +++++++++++++
 .../cicd/wan21_cicd_weekly_video.yaml         | 62 +++++++++++++
 4 files changed, 302 insertions(+)
 create mode 100644 examples/automodel/cicd_convergence_tests/wan21/weekly/launch_pretrain_wan21_weekly_image.sh
 create mode 100644 examples/automodel/cicd_convergence_tests/wan21/weekly/launch_pretrain_wan21_weekly_video.sh
 create mode 100644 examples/automodel/pretrain/cicd/wan21_cicd_weekly_image.yaml
 create mode 100644 examples/automodel/pretrain/cicd/wan21_cicd_weekly_video.yaml

diff --git a/examples/automodel/cicd_convergence_tests/wan21/weekly/launch_pretrain_wan21_weekly_image.sh b/examples/automodel/cicd_convergence_tests/wan21/weekly/launch_pretrain_wan21_weekly_image.sh
new file mode 100644
index 00000000..26f039c9
--- /dev/null
+++ b/examples/automodel/cicd_convergence_tests/wan21/weekly/launch_pretrain_wan21_weekly_image.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+#SBATCH -A coreai_dlalgo_llm
+#SBATCH -p batch
+#SBATCH -N 1
+#SBATCH --ntasks-per-node 1
+#SBATCH --gpus-per-node=8
+#SBATCH --time 04:00:00
+#SBATCH --exclusive
+#SBATCH --output=./CICD_weekly_RUN_slurm_%x_%j.out
+#SBATCH --error=./CICD_weekly_RUN_slurm_%x_%j.err
+#SBATCH -J DFM_Multinode
+
+# Distributed-launch env: rendezvous endpoint and per-node process count used by the torchrun command below.
+export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)  # first hostname in the job's node list
+export MASTER_PORT=29500
+export NUM_GPUS=8  # keep in sync with "#SBATCH --gpus-per-node=8"
+export WORLD_SIZE=$(($NUM_GPUS * $SLURM_NNODES))  # GPUs per node x allocated nodes
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1  # CUDA/NCCL runtime settings; exported so the srun steps (--export=ALL) pass them on
+export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+export NCCL_NVLS_ENABLE=0
+
+
+# Experiment env. Credentials are taken from the submitting shell's environment; never commit them to this file.
+# WANDB_API_KEY is required (the YAML config uses wandb mode "online"), so abort early when it is missing.
+export WANDB_API_KEY="${WANDB_API_KEY:?export WANDB_API_KEY in your shell before submitting this job}"
+export HF_HOME="/linnanw/hdvilla_sample/cache"  # container path; /linnanw is bind-mounted by the srun steps below
+export HF_TOKEN="${HF_TOKEN:-}"  # optional: keep the caller's token if set, otherwise empty as before
+
+
+# Per-job uv cache and venv directory on Lustre (shared storage, visible to ALL nodes).
+# TODO: update the path (currently a user-specific directory)
+UV_SHARED_DIR="/lustre/fsw/portfolios/coreai/users/linnanw/uv_cache/${SLURM_JOB_ID}"
+
+# Step 1: Pre-build on a SINGLE node first (avoids race conditions).
+# Creates a shared venv on Lustre that ALL nodes can access. The heredoc is unquoted, so ${UV_SHARED_DIR} is expanded here, before the text is handed to the container.
+read -r -d '' PREBUILD_CMD <<EOF
+cd /opt/DFM/
+echo "=== Pre-building on single node ==="
+mkdir -p ${UV_SHARED_DIR}
+export UV_CACHE_DIR=${UV_SHARED_DIR}/cache
+export UV_PROJECT_ENVIRONMENT=${UV_SHARED_DIR}/.venv
+# Sync creates the venv and installs all packages (including building local packages)
+uv sync --group automodel
+echo "=== Pre-build complete ==="
+echo "Venv created at: ${UV_SHARED_DIR}/.venv"
+ls -la ${UV_SHARED_DIR}/.venv/bin/python
+EOF
+echo "$PREBUILD_CMD"  # log the exact command text in the job output
+
+#TODO: the container image should be updated, also the container-mounts
+echo "Running pre-build step on single node..."
+srun \
+    -N 1 \
+    --ntasks=1 \
+    --mpi=pmix \
+    --container-entrypoint \
+    --no-container-mount-home \
+    --container-image=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_genai/users/pthombre/containers/nvidian+dfm+19397877341.sqsh \
+    --container-mounts=/lustre:/lustre,/lustre/fsw/portfolios/coreai/users/linnanw:/linnanw,/lustre/fsw/portfolios/coreai/users/linnanw/Diffuser/DFM:/opt/DFM/ \
+    --export=ALL \
+    bash -c "$PREBUILD_CMD"
+
+# Step 2: Now run on all nodes using the SAME pre-built venv on Lustre
+read -r -d '' CMD <<EOF
+cd /opt/DFM/; whoami; date; pwd;
+# Activate the pre-built venv
+echo "Activating venv at: ${UV_SHARED_DIR}/.venv"
+source ${UV_SHARED_DIR}/.venv/bin/activate
+# CRITICAL: Set PYTHONPATH so that even when torchrun spawns workers using
+# /usr/bin/python directly (bypassing venv symlink), they still find packages
+export PYTHONPATH="${UV_SHARED_DIR}/.venv/lib/python3.12/site-packages:\${PYTHONPATH}"
+echo "PYTHONPATH: \$PYTHONPATH"
+which python
+python -c "import nemo_automodel; print('nemo_automodel OK')"
+# Now torchrun workers will find packages via PYTHONPATH
+torchrun --nnodes=\$SLURM_NNODES --nproc-per-node=\$NUM_GPUS --rdzv_backend=c10d --rdzv_endpoint=\$MASTER_ADDR:\$MASTER_PORT examples/automodel/pretrain/pretrain.py  -c examples/automodel/pretrain/cicd/wan21_cicd_weekly_image.yaml
+EOF
+echo "$CMD"
+
+echo "Running training on all nodes..."
+srun \
+    --mpi=pmix \
+    --container-entrypoint \
+    --no-container-mount-home \
+    --container-image=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_genai/users/pthombre/containers/nvidian+dfm+19397877341.sqsh \
+    --container-mounts=/lustre:/lustre,/lustre/fsw/portfolios/coreai/users/linnanw:/linnanw,/lustre/fsw/portfolios/coreai/users/linnanw/Diffuser/DFM:/opt/DFM/ \
+    --export=ALL \
+    bash -c "$CMD"
diff --git a/examples/automodel/cicd_convergence_tests/wan21/weekly/launch_pretrain_wan21_weekly_video.sh b/examples/automodel/cicd_convergence_tests/wan21/weekly/launch_pretrain_wan21_weekly_video.sh
new file mode 100644
index 00000000..4c970001
--- /dev/null
+++ b/examples/automodel/cicd_convergence_tests/wan21/weekly/launch_pretrain_wan21_weekly_video.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+#SBATCH -A coreai_dlalgo_llm
+#SBATCH -p batch
+#SBATCH -N 1
+#SBATCH --ntasks-per-node 1
+#SBATCH --gpus-per-node=8
+#SBATCH --time 04:00:00
+#SBATCH --exclusive
+#SBATCH --output=./CICD_weekly_RUN_slurm_%x_%j.out
+#SBATCH --error=./CICD_weekly_RUN_slurm_%x_%j.err
+#SBATCH -J DFM_Multinode
+
+# Distributed-launch env: rendezvous endpoint and per-node process count used by the torchrun command below.
+export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)  # first hostname in the job's node list
+export MASTER_PORT=29500
+export NUM_GPUS=8  # keep in sync with "#SBATCH --gpus-per-node=8"
+export WORLD_SIZE=$(($NUM_GPUS * $SLURM_NNODES))  # GPUs per node x allocated nodes
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1  # one identifier, no space: matches the weekly image launcher
+export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+export NCCL_NVLS_ENABLE=0
+
+
+# Experiment env. Credentials are taken from the submitting shell's environment; never commit them to this file.
+# WANDB_API_KEY is required (the YAML config uses wandb mode "online"), so abort early when it is missing.
+export WANDB_API_KEY="${WANDB_API_KEY:?export WANDB_API_KEY in your shell before submitting this job}"
+export HF_HOME="/linnanw/hdvilla_sample/cache"  # container path; /linnanw is bind-mounted by the srun steps below
+export HF_TOKEN="${HF_TOKEN:-}"  # optional: keep the caller's token if set, otherwise empty as before
+
+
+# Per-job uv cache and venv directory on Lustre (shared storage, visible to ALL nodes).
+# TODO: update the path (currently a user-specific directory)
+UV_SHARED_DIR="/lustre/fsw/portfolios/coreai/users/linnanw/uv_cache/${SLURM_JOB_ID}"
+
+# Step 1: Pre-build on a SINGLE node first (avoids race conditions).
+# Creates a shared venv on Lustre that ALL nodes can access. The heredoc is unquoted, so ${UV_SHARED_DIR} is expanded here, before the text is handed to the container.
+read -r -d '' PREBUILD_CMD <<EOF
+cd /opt/DFM/
+echo "=== Pre-building on single node ==="
+mkdir -p ${UV_SHARED_DIR}
+export UV_CACHE_DIR=${UV_SHARED_DIR}/cache
+export UV_PROJECT_ENVIRONMENT=${UV_SHARED_DIR}/.venv
+# Sync creates the venv and installs all packages (including building local packages)
+uv sync --group automodel
+echo "=== Pre-build complete ==="
+echo "Venv created at: ${UV_SHARED_DIR}/.venv"
+ls -la ${UV_SHARED_DIR}/.venv/bin/python
+EOF
+echo "$PREBUILD_CMD"  # log the exact command text in the job output
+
+#TODO: the container image should be updated, also the container-mounts
+echo "Running pre-build step on single node..."
+srun \
+    -N 1 \
+    --ntasks=1 \
+    --mpi=pmix \
+    --container-entrypoint \
+    --no-container-mount-home \
+    --container-image=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_genai/users/pthombre/containers/nvidian+dfm+19397877341.sqsh \
+    --container-mounts=/lustre:/lustre,/lustre/fsw/portfolios/coreai/users/linnanw:/linnanw,/lustre/fsw/portfolios/coreai/users/linnanw/Diffuser/DFM:/opt/DFM/ \
+    --export=ALL \
+    bash -c "$PREBUILD_CMD"
+
+# Step 2: Now run on all nodes using the SAME pre-built venv on Lustre
+read -r -d '' CMD <<EOF
+cd /opt/DFM/; whoami; date; pwd;
+# Activate the pre-built venv
+echo "Activating venv at: ${UV_SHARED_DIR}/.venv"
+source ${UV_SHARED_DIR}/.venv/bin/activate
+# CRITICAL: Set PYTHONPATH so that even when torchrun spawns workers using
+# /usr/bin/python directly (bypassing venv symlink), they still find packages
+export PYTHONPATH="${UV_SHARED_DIR}/.venv/lib/python3.12/site-packages:\${PYTHONPATH}"
+echo "PYTHONPATH: \$PYTHONPATH"
+which python
+python -c "import nemo_automodel; print('nemo_automodel OK')"
+# Now torchrun workers will find packages via PYTHONPATH
+torchrun --nnodes=\$SLURM_NNODES --nproc-per-node=\$NUM_GPUS --rdzv_backend=c10d --rdzv_endpoint=\$MASTER_ADDR:\$MASTER_PORT examples/automodel/pretrain/pretrain.py  -c examples/automodel/pretrain/cicd/wan21_cicd_weekly_video.yaml
+EOF
+echo "$CMD"
+
+echo "Running training on all nodes..."
+srun \
+    --mpi=pmix \
+    --container-entrypoint \
+    --no-container-mount-home \
+    --container-image=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_genai/users/pthombre/containers/nvidian+dfm+19397877341.sqsh \
+    --container-mounts=/lustre:/lustre,/lustre/fsw/portfolios/coreai/users/linnanw:/linnanw,/lustre/fsw/portfolios/coreai/users/linnanw/Diffuser/DFM:/opt/DFM/ \
+    --export=ALL \
+    bash -c "$CMD"
diff --git a/examples/automodel/pretrain/cicd/wan21_cicd_weekly_image.yaml b/examples/automodel/pretrain/cicd/wan21_cicd_weekly_image.yaml
new file mode 100644
index 00000000..91004390
--- /dev/null
+++ b/examples/automodel/pretrain/cicd/wan21_cicd_weekly_image.yaml
@@ -0,0 +1,62 @@
+seed: 42  # weekly CI/CD pretraining config for the image dataset; passed via -c by launch_pretrain_wan21_weekly_image.sh
+
+wandb:
+  project: wan-t2v-cicd-weekly-pretrain
+  mode: online  # online logging: the launcher must provide WANDB_API_KEY
+  name: WAN21_WEEKLY_VERIFY
+
+dist_env:
+  backend: nccl
+  timeout_minutes: 30
+
+model:
+  pretrained_model_name_or_path: Wan-AI/Wan2.1-T2V-1.3B-Diffusers
+  mode: pretrain
+
+step_scheduler:
+  global_batch_size: 80  # equals 8 GPUs (launcher: 1 node x 8) x local_batch_size 10
+  local_batch_size: 10  # presumably the per-GPU micro-batch; confirm against the step scheduler
+  ckpt_every_steps: 500
+  num_epochs: 40
+  log_every: 1
+
+data:
+  dataloader:
+    _target_: dfm.src.automodel.datasets.build_dataloader
+    meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/CICD/Wan21/weekly/image  # host Lustre path; the launcher mounts /lustre:/lustre into the container
+    num_workers: 10
+    device: cpu
+
+optim:
+  learning_rate: 5e-5
+  clip_grad: 2.0
+  optimizer:
+    weight_decay: 0.1
+    betas: [0.9, 0.95]
+
+fsdp:
+  tp_size: 1
+  cp_size: 1
+  pp_size: 1
+  dp_replicate_size: 1
+  dp_size: none  # NOTE(review): plain YAML reads this as the string "none", not null; confirm the config loader maps it to None
+
+flow_matching:
+  adapter_type: "simple"
+  adapter_kwargs: {}
+  use_sigma_noise: true
+  timestep_sampling: logit_normal
+  logit_mean: 0.0
+  logit_std: 1.5
+  flow_shift: 2.5
+  mix_uniform_ratio: 0.2
+  sigma_min: 0.0
+  sigma_max: 1.0
+
+
+checkpoint:
+  enabled: true
+  checkpoint_dir: /opt/DFM/wan21_fsdp_weekly_image/  # /opt/DFM is bind-mounted from the host DFM checkout by the launcher
+  model_save_format: torch_save
+  save_consolidated: false
+  restore_from: null  # no checkpoint path given for this run
diff --git a/examples/automodel/pretrain/cicd/wan21_cicd_weekly_video.yaml b/examples/automodel/pretrain/cicd/wan21_cicd_weekly_video.yaml
new file mode 100644
index 00000000..f5866b67
--- /dev/null
+++ b/examples/automodel/pretrain/cicd/wan21_cicd_weekly_video.yaml
@@ -0,0 +1,62 @@
+seed: 42  # weekly CI/CD pretraining config for the video dataset; passed via -c by launch_pretrain_wan21_weekly_video.sh
+
+wandb:
+  project: wan-t2v-cicd-weekly-pretrain
+  mode: online  # online logging: the launcher must provide WANDB_API_KEY
+  name: WAN21_WEEKLY_VERIFY
+
+dist_env:
+  backend: nccl
+  timeout_minutes: 30
+
+model:
+  pretrained_model_name_or_path: Wan-AI/Wan2.1-T2V-1.3B-Diffusers
+  mode: pretrain
+
+step_scheduler:
+  global_batch_size: 80  # 10x the 8 samples per step that 8 GPUs x local_batch_size 1 provide; presumably reached by gradient accumulation (confirm)
+  local_batch_size: 1  # presumably the per-GPU micro-batch; confirm against the step scheduler
+  ckpt_every_steps: 100
+  num_epochs: 520
+  log_every: 1
+
+data:
+  dataloader:
+    _target_: dfm.src.automodel.datasets.build_dataloader
+    meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/CICD/Wan21/weekly/video  # host Lustre path; the launcher mounts /lustre:/lustre into the container
+    num_workers: 10
+    device: cpu
+
+optim:
+  learning_rate: 5e-5
+  clip_grad: 2.0
+  optimizer:
+    weight_decay: 0.1
+    betas: [0.9, 0.95]
+
+fsdp:
+  tp_size: 1
+  cp_size: 1
+  pp_size: 1
+  dp_replicate_size: 1
+  dp_size: none  # NOTE(review): plain YAML reads this as the string "none", not null; confirm the config loader maps it to None
+
+flow_matching:
+  adapter_type: "simple"
+  adapter_kwargs: {}
+  use_sigma_noise: true
+  timestep_sampling: logit_normal
+  logit_mean: 0.0
+  logit_std: 1.5
+  flow_shift: 2.5
+  mix_uniform_ratio: 0.2
+  sigma_min: 0.0
+  sigma_max: 1.0
+
+
+checkpoint:
+  enabled: true
+  checkpoint_dir: /opt/DFM/wan21_fsdp_weekly_video/  # /opt/DFM is bind-mounted from the host DFM checkout by the launcher
+  model_save_format: torch_save
+  save_consolidated: false
+  restore_from: /opt/DFM/wan21_fsdp_weekly_image/LATEST  # inside the weekly image config's checkpoint_dir; presumably the image job must run first (TODO confirm)