From 01234e59c0fb1fc618a7d933314f5699211a06fe Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Sat, 27 Dec 2025 15:01:10 +0800 Subject: [PATCH 1/7] =?UTF-8?q?=E6=96=87=E6=A1=A3=E7=8E=AF=E5=A2=83?= =?UTF-8?q?=E5=AE=89=E8=A3=85=E4=B8=8A=E6=89=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/zh/Pipeline_Usage/Setup_NPU_doc.md | 55 ++++++++++++++++++++++++ docs/zh/Pipeline_Usage/Setup_NPU_pip.md | 56 +++++++++++++++++++++++++ pyproject.toml | 18 +++++++- 3 files changed, 127 insertions(+), 2 deletions(-) create mode 100644 docs/zh/Pipeline_Usage/Setup_NPU_doc.md create mode 100644 docs/zh/Pipeline_Usage/Setup_NPU_pip.md diff --git a/docs/zh/Pipeline_Usage/Setup_NPU_doc.md b/docs/zh/Pipeline_Usage/Setup_NPU_doc.md new file mode 100644 index 000000000..a4f90ea64 --- /dev/null +++ b/docs/zh/Pipeline_Usage/Setup_NPU_doc.md @@ -0,0 +1,55 @@ +# 安装依赖 + +从源码安装(推荐): + +``` +git clone https://github.com/modelscope/DiffSynth-Studio.git +cd DiffSynth-Studio +pip install -e . +``` + +从 pypi 安装(存在版本更新延迟,如需使用最新功能,请从源码安装) + +``` +pip install diffsynth +``` + +## GPU/NPU 支持 + +* NVIDIA GPU + +按照以上方式安装即可。 + +* AMD GPU + +需安装支持 ROCm 的 `torch` 包,以 ROCm 6.4(本文更新于 2025 年 12 月 15 日)、Linux 系统为例,请运行以下命令 + +```shell +pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.4 +``` + +* Ascend NPU + +1. 通过官方文档安装[CANN](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/softwareinst/instg/instg_quick.html?Mode=PmIns&InstallType=local&OS=openEuler&Software=cannToolKit) + +2. 根据[安装依赖](#安装依赖)安装diffsynth仓 + +3. 安装torch-npu仓。Ascend NPU 通过 `torch-npu` 包提供支持,以 `2.7.1` 版本(本文更新于 2025 年 12 月 15 日)为例,请运行以下命令 + ```shell + # aarch64/ARM + pip install torch-npu==2.7.1 torchvision==0.22.1 + # x86 + pip install torch==2.7.1+cpu torchvision==0.22.1+cpu --extra-index-url "https://download.pytorch.org/whl/cpu" + pip install torch-npu==2.7.1 + ``` + +使用 Ascend NPU 时,请将 Python 代码中的 `"cuda"` 改为 `"npu"`,详见[NPU 支持](/docs/zh/Pipeline_Usage/GPU_support.md#ascend-npu)。 + +## 其他安装问题 + +如果在安装过程中遇到问题,可能是由上游依赖包导致的,请参考这些包的文档: + +* [torch](https://pytorch.org/get-started/locally/) +* [Ascend/pytorch](https://github.com/Ascend/pytorch) +* [sentencepiece](https://github.com/google/sentencepiece) +* [cmake](https://cmake.org) diff --git a/docs/zh/Pipeline_Usage/Setup_NPU_pip.md b/docs/zh/Pipeline_Usage/Setup_NPU_pip.md new file mode 100644 index 000000000..e0616cc12 --- /dev/null +++ b/docs/zh/Pipeline_Usage/Setup_NPU_pip.md @@ -0,0 +1,56 @@ +# 安装依赖 + +从源码安装(推荐): + +``` +git clone https://github.com/modelscope/DiffSynth-Studio.git +cd DiffSynth-Studio +pip install -e . +``` + +从 pypi 安装(存在版本更新延迟,如需使用最新功能,请从源码安装) + +``` +pip install diffsynth +``` + +## GPU/NPU 支持 + +* NVIDIA GPU + +按照以上方式安装即可。 + +* AMD GPU + +需安装支持 ROCm 的 `torch` 包,以 ROCm 6.4(本文更新于 2025 年 12 月 15 日)、Linux 系统为例,请运行以下命令 + +```shell +pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.4 +``` + +* Ascend NPU + +1. 通过官方文档安装[CANN](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/softwareinst/instg/instg_quick.html?Mode=PmIns&InstallType=local&OS=openEuler&Software=cannToolKit) + +2. 
从源码安装 + ```shell + git clone https://github.com/modelscope/DiffSynth-Studio.git + cd DiffSynth-Studio + # aarch64/ARM + pip install -e .[npu_aarch64] --extra-index-url "https://download.pytorch.org/whl/cpu" + # x86 + pip install -e .[npu] + ``` + + + +使用 Ascend NPU 时,请将 Python 代码中的 `"cuda"` 改为 `"npu"`,详见[NPU 支持](/docs/zh/Pipeline_Usage/GPU_support.md#ascend-npu)。 + +## 其他安装问题 + +如果在安装过程中遇到问题,可能是由上游依赖包导致的,请参考这些包的文档: + +* [torch](https://pytorch.org/get-started/locally/) +* [Ascend/pytorch](https://github.com/Ascend/pytorch) +* [sentencepiece](https://github.com/google/sentencepiece) +* [cmake](https://cmake.org) diff --git a/pyproject.toml b/pyproject.toml index cb00b4d38..3b165d366 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,8 +10,6 @@ authors = [{name = "ModelScope Team"}] license = {text = "Apache-2.0"} requires-python = ">=3.10" dependencies = [ - "torch>=2.0.0", - "torchvision", "transformers", "imageio", "imageio[ffmpeg]", @@ -32,6 +30,22 @@ classifiers = [ "Operating System :: OS Independent", ] +[project.optional-dependencies] +gpu = [ + "torch>=2.0.0", + "torchvision" +] +npu_aarch64 = [ + "torch==2.7.1", + "torch-npu==2.7.1", + "torchvision==0.22.1" +] +npu = [ + "torch==2.7.1+cpu", + "torch-npu==2.7.1", + "torchvision==0.22.1+cpu" +] + [tool.setuptools.packages.find] [tool.setuptools] From 9834d72e4de4dd499a2352329bb9336baa730b9b Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Sat, 27 Dec 2025 16:11:27 +0800 Subject: [PATCH 2/7] =?UTF-8?q?=E6=96=87=E6=A1=A3=E7=8E=AF=E5=A2=83?= =?UTF-8?q?=E5=AE=89=E8=A3=85=E4=B8=8A=E6=89=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2131.md" | 0 .../Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2132.md" | 0 pyproject.toml | 6 ++---- 3 files changed, 2 insertions(+), 4 deletions(-) rename docs/zh/Pipeline_Usage/Setup_NPU_doc.md => "docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2131.md" (100%) rename docs/zh/Pipeline_Usage/Setup_NPU_pip.md => "docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2132.md" (100%) diff --git a/docs/zh/Pipeline_Usage/Setup_NPU_doc.md "b/docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2131.md" similarity index 100% rename from docs/zh/Pipeline_Usage/Setup_NPU_doc.md rename to "docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2131.md" diff --git a/docs/zh/Pipeline_Usage/Setup_NPU_pip.md "b/docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2132.md" similarity index 100% rename from docs/zh/Pipeline_Usage/Setup_NPU_pip.md rename to "docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2132.md" diff --git a/pyproject.toml b/pyproject.toml index 3b165d366..7597d8142 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,8 @@ authors = [{name = "ModelScope Team"}] license = {text = "Apache-2.0"} requires-python = ">=3.10" dependencies = [ + "torch>=2.0.0", + "torchvision", "transformers", "imageio", "imageio[ffmpeg]", @@ -31,10 +33,6 @@ classifiers = [ ] [project.optional-dependencies] -gpu = [ - "torch>=2.0.0", - "torchvision" -] npu_aarch64 = [ "torch==2.7.1", "torch-npu==2.7.1", From a5935e973a5efb252f8f8ee7fa9631461ea628b8 Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Mon, 29 Dec 2025 09:23:59 +0800 Subject: [PATCH 3/7] =?UTF-8?q?=E8=AE=AD=E7=BB=83=E5=BF=AB=E9=80=9F?= =?UTF-8?q?=E4=B8=8A=E6=89=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../GPU_support_\346\240\267\344\276\2131.md" | 86 ++++++++++++ 
.../GPU_support_\346\240\267\344\276\2132.md" | 129 ++++++++++++++++++ .../full/Wan2.2-I2V-A14B-NPU.sh | 42 ++++++ 3 files changed, 257 insertions(+) create mode 100644 "docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2131.md" create mode 100644 "docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2132.md" create mode 100644 examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh diff --git "a/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2131.md" "b/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2131.md" new file mode 100644 index 000000000..03138cc7e --- /dev/null +++ "b/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2131.md" @@ -0,0 +1,86 @@ +# GPU/NPU 支持 + +`DiffSynth-Studio` 支持多种 GPU/NPU,本文介绍如何在这些设备上运行模型推理和训练。 + +在开始前,请参考[安装依赖](/docs/zh/Pipeline_Usage/Setup.md)安装好 GPU/NPU 相关的依赖包。 + +## NVIDIA GPU + +本项目提供的所有样例代码默认支持 NVIDIA GPU,无需额外修改。 + +## AMD GPU + +AMD 提供了基于 ROCm 的 torch 包,所以大多数模型无需修改代码即可运行,少数模型由于依赖特定的 cuda 指令无法运行。 + +## Ascend NPU + +### 推理 +使用 Ascend NPU 时,需把代码中的 `"cuda"` 改为 `"npu"`。 + +例如,Wan2.1-T2V-1.3B 的推理代码: + +```diff +import torch +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig + +vram_config = { + "offload_dtype": "disk", + "offload_device": "disk", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, +- "preparing_device": "cuda", ++ "preparing_device": "npu", + "computation_dtype": torch.bfloat16, +- "computation_device": "cuda", ++ "preparing_device": "npu", +} +pipe = WanVideoPipeline.from_pretrained( + torch_dtype=torch.bfloat16, +- device="cuda", ++ device="npu", + model_configs=[ + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", **vram_config), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", **vram_config), + ], + tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"), +- vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2, ++ vram_limit=torch.npu.mem_get_info("npu:0")[1] / (1024 ** 3) - 2, +) + +video = pipe( + prompt="纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地,偶尔点缀着几朵野花,远处隐约可见蓝天和几片白云。透视感鲜明,捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。", + negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走", + seed=0, tiled=True, +) +save_video(video, "video.mp4", fps=15, quality=5) +``` + +### 训练 +当前已为每类模型添加NPU的启动脚本样例,例如 `examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh` + +NPU训练脚本中添加了优化性能的环境变量,针对特定模型,还添加一些参数 + +#### 环境变量 +``` +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +``` +expandable_segments:: 使能内存池扩展段功能,即虚拟内存特征 + +``` +export CPU_AFFINITY_CONF=1 +``` +设置0或未设置: 表示不启用绑核功能 + +1: 表示开启粗粒度绑核 + +2: 表示开启细粒度绑核 + +#### 特定模型参数 +| 模型 | 参数 | 备注 | +|-----------|------|-------------------| +| Wan 14B系列 | --initialize_model_on_cpu | NPU显存不支持单卡存放14B模型 | + + diff --git "a/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2132.md" "b/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2132.md" new file mode 100644 index 000000000..615cde631 --- /dev/null +++ "b/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2132.md" @@ -0,0 +1,129 @@ +# GPU/NPU 支持 + +`DiffSynth-Studio` 支持多种 
GPU/NPU,本文介绍如何在这些设备上运行模型推理和训练。 + +在开始前,请参考[安装依赖](/docs/zh/Pipeline_Usage/Setup.md)安装好 GPU/NPU 相关的依赖包。 + +## NVIDIA GPU + +本项目提供的所有样例代码默认支持 NVIDIA GPU,无需额外修改。 + +## AMD GPU + +AMD 提供了基于 ROCm 的 torch 包,所以大多数模型无需修改代码即可运行,少数模型由于依赖特定的 cuda 指令无法运行。 + +## Ascend NPU + +### 推理 +使用 Ascend NPU 时,需把代码中的 `"cuda"` 改为 `"npu"`。 + +例如,Wan2.1-T2V-1.3B 的推理代码: + +```diff +import torch +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig + +vram_config = { + "offload_dtype": "disk", + "offload_device": "disk", + "onload_dtype": torch.bfloat16, + "onload_device": "cpu", + "preparing_dtype": torch.bfloat16, +- "preparing_device": "cuda", ++ "preparing_device": "npu", + "computation_dtype": torch.bfloat16, +- "computation_device": "cuda", ++ "preparing_device": "npu", +} +pipe = WanVideoPipeline.from_pretrained( + torch_dtype=torch.bfloat16, +- device="cuda", ++ device="npu", + model_configs=[ + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", **vram_config), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", **vram_config), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", **vram_config), + ], + tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"), +- vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2, ++ vram_limit=torch.npu.mem_get_info("npu:0")[1] / (1024 ** 3) - 2, +) + +video = pipe( + prompt="纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地,偶尔点缀着几朵野花,远处隐约可见蓝天和几片白云。透视感鲜明,捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。", + negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走", + seed=0, tiled=True, +) +save_video(video, "video.mp4", fps=15, quality=5) +``` + +### 训练 +使用 Ascend NPU 时,可以添加优化性能的环境变量,针对特定模型,还需添加参数 + +例如,Wan2.2-I2V-A14B 的训练代码: +```diff ++ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True ++ export CPU_AFFINITY_CONF=1 + +accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \ + --dataset_base_path data/example_video_dataset \ + --dataset_metadata_path data/example_video_dataset/metadata.csv \ + --height 480 \ + --width 832 \ + --num_frames 49 \ + --dataset_repeat 100 \ + --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:high_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \ + --learning_rate 1e-5 \ + --num_epochs 2 \ + --remove_prefix_in_ckpt "pipe.dit." 
\ + --output_path "./models/train/Wan2.2-I2V-A14B_high_noise_full" \ + --trainable_models "dit" \ + --extra_inputs "input_image" \ + --use_gradient_checkpointing_offload \ + --max_timestep_boundary 0.358 \ + --min_timestep_boundary 0 \ ++ --initialize_model_on_cpu +# boundary corresponds to timesteps [900, 1000] + +accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \ + --dataset_base_path data/example_video_dataset \ + --dataset_metadata_path data/example_video_dataset/metadata.csv \ + --height 480 \ + --width 832 \ + --num_frames 49 \ + --dataset_repeat 100 \ + --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:low_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \ + --learning_rate 1e-5 \ + --num_epochs 2 \ + --remove_prefix_in_ckpt "pipe.dit." \ + --output_path "./models/train/Wan2.2-I2V-A14B_low_noise_full" \ + --trainable_models "dit" \ + --extra_inputs "input_image" \ + --use_gradient_checkpointing_offload \ + --max_timestep_boundary 1 \ + --min_timestep_boundary 0.358 \ ++ --initialize_model_on_cpu +# boundary corresponds to timesteps [0, 900) +``` +#### 环境变量 +``` +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +``` +expandable_segments:: 使能内存池扩展段功能,即虚拟内存特征 + +``` +export CPU_AFFINITY_CONF=1 +``` +设置0或未设置: 表示不启用绑核功能 + +1: 表示开启粗粒度绑核 + +2: 表示开启细粒度绑核 + +#### 特定模型参数 +| 模型 | 参数 | 备注 | +|-----------|------|-------------------| +| Wan 14B系列 | --initialize_model_on_cpu | NPU显存不支持单卡存放14B模型 | + + diff --git a/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh b/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh new file mode 100644 index 000000000..b214af1ee --- /dev/null +++ b/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh @@ -0,0 +1,42 @@ +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +export CPU_AFFINITY_CONF=1 + +accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \ + --dataset_base_path data/example_video_dataset \ + --dataset_metadata_path data/example_video_dataset/metadata.csv \ + --height 480 \ + --width 832 \ + --num_frames 49 \ + --dataset_repeat 100 \ + --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:high_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \ + --learning_rate 1e-5 \ + --num_epochs 2 \ + --remove_prefix_in_ckpt "pipe.dit." \ + --output_path "./models/train/Wan2.2-I2V-A14B_high_noise_full" \ + --trainable_models "dit" \ + --extra_inputs "input_image" \ + --use_gradient_checkpointing_offload \ + --max_timestep_boundary 0.358 \ + --min_timestep_boundary 0 \ + --initialize_model_on_cpu +# boundary corresponds to timesteps [900, 1000] + +accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \ + --dataset_base_path data/example_video_dataset \ + --dataset_metadata_path data/example_video_dataset/metadata.csv \ + --height 480 \ + --width 832 \ + --num_frames 49 \ + --dataset_repeat 100 \ + --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:low_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \ + --learning_rate 1e-5 \ + --num_epochs 2 \ + --remove_prefix_in_ckpt "pipe.dit." 
\ + --output_path "./models/train/Wan2.2-I2V-A14B_low_noise_full" \ + --trainable_models "dit" \ + --extra_inputs "input_image" \ + --use_gradient_checkpointing_offload \ + --max_timestep_boundary 1 \ + --min_timestep_boundary 0.358 \ + --initialize_model_on_cpu +# boundary corresponds to timesteps [0, 900) \ No newline at end of file From c758769a024b7b1bde6301eff7a5fd50cc6253f1 Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Mon, 29 Dec 2025 09:25:46 +0800 Subject: [PATCH 4/7] =?UTF-8?q?=E8=AE=AD=E7=BB=83=E5=BF=AB=E9=80=9F?= =?UTF-8?q?=E4=B8=8A=E6=89=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2131.md" | 2 +- .../zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2132.md" | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git "a/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2131.md" "b/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2131.md" index 03138cc7e..cfec21c07 100644 --- "a/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2131.md" +++ "b/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2131.md" @@ -61,7 +61,7 @@ save_video(video, "video.mp4", fps=15, quality=5) ### 训练 当前已为每类模型添加NPU的启动脚本样例,例如 `examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh` -NPU训练脚本中添加了优化性能的环境变量,针对特定模型,还添加一些参数 +NPU训练脚本中添加了优化性能的环境变量,针对特定模型,还需添加必要参数 #### 环境变量 ``` diff --git "a/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2132.md" "b/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2132.md" index 615cde631..95a94ad94 100644 --- "a/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2132.md" +++ "b/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2132.md" @@ -59,7 +59,7 @@ save_video(video, "video.mp4", fps=15, quality=5) ``` ### 训练 -使用 Ascend NPU 时,可以添加优化性能的环境变量,针对特定模型,还需添加参数 +使用 Ascend NPU 时,可以添加优化性能的环境变量,针对特定模型,还需添加必要参数 例如,Wan2.2-I2V-A14B 的训练代码: ```diff From 848bfd6993a0e8c1c542ab4c821dbd14d96f07b8 Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Wed, 21 Jan 2026 10:25:31 +0800 Subject: [PATCH 5/7] [NPU]:Support USP feature in NPU --- diffsynth/utils/xfuser/xdit_context_parallel.py | 15 ++++++++++++--- docs/en/Pipeline_Usage/GPU_support.md | 8 ++++++++ docs/zh/Pipeline_Usage/GPU_support.md | 7 +++++++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/diffsynth/utils/xfuser/xdit_context_parallel.py b/diffsynth/utils/xfuser/xdit_context_parallel.py index 21dc3b33c..94b92c77a 100644 --- a/diffsynth/utils/xfuser/xdit_context_parallel.py +++ b/diffsynth/utils/xfuser/xdit_context_parallel.py @@ -1,10 +1,13 @@ import torch from typing import Optional from einops import rearrange +from yunchang.kernels import AttnType from xfuser.core.distributed import (get_sequence_parallel_rank, get_sequence_parallel_world_size, get_sp_group) from xfuser.core.long_ctx_attention import xFuserLongContextAttention + +from ... 
import IS_NPU_AVAILABLE from ...core.device import parse_nccl_backend, parse_device_type @@ -35,8 +38,9 @@ def pad_freqs(original_tensor, target_len): s1, s2, dtype=original_tensor.dtype, - device=original_tensor.device) - padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0) + device='cpu') + original_tensor_device = original_tensor.device + padded_tensor = torch.cat([original_tensor.cpu(), padding_tensor], dim=0).to(device=original_tensor_device) return padded_tensor def rope_apply(x, freqs, num_heads): @@ -133,7 +137,12 @@ def usp_attn_forward(self, x, freqs): k = rearrange(k, "b s (n d) -> b s n d", n=self.num_heads) v = rearrange(v, "b s (n d) -> b s n d", n=self.num_heads) - x = xFuserLongContextAttention()( + attn_type = AttnType.FA + ring_impl_type = "basic" + if IS_NPU_AVAILABLE: + attn_type = AttnType.NPU + ring_impl_type = "basic_npu" + x = xFuserLongContextAttention(attn_type=attn_type, ring_impl_type=ring_impl_type)( None, query=q, key=k, diff --git a/docs/en/Pipeline_Usage/GPU_support.md b/docs/en/Pipeline_Usage/GPU_support.md index aba570649..2c67fed42 100644 --- a/docs/en/Pipeline_Usage/GPU_support.md +++ b/docs/en/Pipeline_Usage/GPU_support.md @@ -58,6 +58,14 @@ video = pipe( save_video(video, "video.mp4", fps=15, quality=5) ``` +#### USP(Unified Sequence Parallel) +If you want to use this feature on NPU, please install additional third-party libraries as follows: +```shell +pip install git+https://github.com/feifeibear/long-context-attention.git +pip install git+https://github.com/xdit-project/xDiT.git +``` + + ### Training NPU startup script samples have been added for each type of model,the scripts are stored in the `examples/xxx/special/npu_training`, for example `examples/wanvideo/model_training/special/npu_training/Wan2.2-T2V-A14B-NPU.sh`. 
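
The `usp_attn_forward` hunk above routes `xFuserLongContextAttention` to the NPU attention kernel (`AttnType.NPU`) and the `"basic_npu"` ring implementation whenever `IS_NPU_AVAILABLE` is set; on CUDA the defaults (`AttnType.FA`, ring `"basic"`) are kept. The definition of `IS_NPU_AVAILABLE` is not part of this patch; a minimal sketch of how such a flag is commonly derived (assuming `torch_npu` follows its usual behaviour of registering the `npu` backend on import; verify against the actual diffsynth implementation) might look like:

```python
# Illustrative sketch only: the real IS_NPU_AVAILABLE lives elsewhere in
# diffsynth and is not shown in this patch.
def _npu_available() -> bool:
    try:
        import torch
        import torch_npu  # noqa: F401  (side effect: registers the torch.npu backend)
    except ImportError:
        return False
    # torch.npu is only present once torch_npu has been imported successfully.
    return hasattr(torch, "npu") and torch.npu.is_available()

IS_NPU_AVAILABLE = _npu_available()
```

With a flag derived this way, the CUDA/GPU path stays the default and the NPU-specific attention backend is only selected when `torch_npu` is importable and a device is visible.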
diff --git a/docs/zh/Pipeline_Usage/GPU_support.md b/docs/zh/Pipeline_Usage/GPU_support.md index 8124147e2..b5c0e3369 100644 --- a/docs/zh/Pipeline_Usage/GPU_support.md +++ b/docs/zh/Pipeline_Usage/GPU_support.md @@ -58,6 +58,13 @@ video = pipe( save_video(video, "video.mp4", fps=15, quality=5) ``` +#### USP(Unified Sequence Parallel) +如果想要在NPU上使用该特性,请通过如下方式安装额外的第三方库: +```shell +pip install git+https://github.com/feifeibear/long-context-attention.git +pip install git+https://github.com/xdit-project/xDiT.git +``` + ### 训练 当前已为每类模型添加NPU的启动脚本样例,脚本存放在`examples/xxx/special/npu_training`目录下,例如 `examples/wanvideo/model_training/special/npu_training/Wan2.2-T2V-A14B-NPU.sh`。 From d879d66c62242fc79c3353ea6d53f0c5fb03054c Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Wed, 21 Jan 2026 10:34:09 +0800 Subject: [PATCH 6/7] [NPU]:Support USP feature in NPU --- .../GPU_support_\346\240\267\344\276\2131.md" | 86 ------------ .../GPU_support_\346\240\267\344\276\2132.md" | 129 ------------------ .../Setup_NPU_\346\240\267\344\276\2131.md" | 55 -------- .../Setup_NPU_\346\240\267\344\276\2132.md" | 56 -------- .../full/Wan2.2-I2V-A14B-NPU.sh | 42 ------ pyproject.toml | 12 -- 6 files changed, 380 deletions(-) delete mode 100644 "docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2131.md" delete mode 100644 "docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2132.md" delete mode 100644 "docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2131.md" delete mode 100644 "docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2132.md" delete mode 100644 examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh diff --git "a/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2131.md" "b/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2131.md" deleted file mode 100644 index cfec21c07..000000000 --- "a/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2131.md" +++ /dev/null @@ -1,86 +0,0 @@ -# GPU/NPU 支持 - -`DiffSynth-Studio` 支持多种 GPU/NPU,本文介绍如何在这些设备上运行模型推理和训练。 - -在开始前,请参考[安装依赖](/docs/zh/Pipeline_Usage/Setup.md)安装好 GPU/NPU 相关的依赖包。 - -## NVIDIA GPU - -本项目提供的所有样例代码默认支持 NVIDIA GPU,无需额外修改。 - -## AMD GPU - -AMD 提供了基于 ROCm 的 torch 包,所以大多数模型无需修改代码即可运行,少数模型由于依赖特定的 cuda 指令无法运行。 - -## Ascend NPU - -### 推理 -使用 Ascend NPU 时,需把代码中的 `"cuda"` 改为 `"npu"`。 - -例如,Wan2.1-T2V-1.3B 的推理代码: - -```diff -import torch -from diffsynth.utils.data import save_video, VideoData -from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig - -vram_config = { - "offload_dtype": "disk", - "offload_device": "disk", - "onload_dtype": torch.bfloat16, - "onload_device": "cpu", - "preparing_dtype": torch.bfloat16, -- "preparing_device": "cuda", -+ "preparing_device": "npu", - "computation_dtype": torch.bfloat16, -- "computation_device": "cuda", -+ "preparing_device": "npu", -} -pipe = WanVideoPipeline.from_pretrained( - torch_dtype=torch.bfloat16, -- device="cuda", -+ device="npu", - model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", **vram_config), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", **vram_config), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", **vram_config), - ], - tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"), -- vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2, -+ vram_limit=torch.npu.mem_get_info("npu:0")[1] / (1024 ** 3) - 2, -) - -video = pipe( - 
prompt="纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地,偶尔点缀着几朵野花,远处隐约可见蓝天和几片白云。透视感鲜明,捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。", - negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走", - seed=0, tiled=True, -) -save_video(video, "video.mp4", fps=15, quality=5) -``` - -### 训练 -当前已为每类模型添加NPU的启动脚本样例,例如 `examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh` - -NPU训练脚本中添加了优化性能的环境变量,针对特定模型,还需添加必要参数 - -#### 环境变量 -``` -export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -``` -expandable_segments:: 使能内存池扩展段功能,即虚拟内存特征 - -``` -export CPU_AFFINITY_CONF=1 -``` -设置0或未设置: 表示不启用绑核功能 - -1: 表示开启粗粒度绑核 - -2: 表示开启细粒度绑核 - -#### 特定模型参数 -| 模型 | 参数 | 备注 | -|-----------|------|-------------------| -| Wan 14B系列 | --initialize_model_on_cpu | NPU显存不支持单卡存放14B模型 | - - diff --git "a/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2132.md" "b/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2132.md" deleted file mode 100644 index 95a94ad94..000000000 --- "a/docs/zh/Pipeline_Usage/GPU_support_\346\240\267\344\276\2132.md" +++ /dev/null @@ -1,129 +0,0 @@ -# GPU/NPU 支持 - -`DiffSynth-Studio` 支持多种 GPU/NPU,本文介绍如何在这些设备上运行模型推理和训练。 - -在开始前,请参考[安装依赖](/docs/zh/Pipeline_Usage/Setup.md)安装好 GPU/NPU 相关的依赖包。 - -## NVIDIA GPU - -本项目提供的所有样例代码默认支持 NVIDIA GPU,无需额外修改。 - -## AMD GPU - -AMD 提供了基于 ROCm 的 torch 包,所以大多数模型无需修改代码即可运行,少数模型由于依赖特定的 cuda 指令无法运行。 - -## Ascend NPU - -### 推理 -使用 Ascend NPU 时,需把代码中的 `"cuda"` 改为 `"npu"`。 - -例如,Wan2.1-T2V-1.3B 的推理代码: - -```diff -import torch -from diffsynth.utils.data import save_video, VideoData -from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig - -vram_config = { - "offload_dtype": "disk", - "offload_device": "disk", - "onload_dtype": torch.bfloat16, - "onload_device": "cpu", - "preparing_dtype": torch.bfloat16, -- "preparing_device": "cuda", -+ "preparing_device": "npu", - "computation_dtype": torch.bfloat16, -- "computation_device": "cuda", -+ "preparing_device": "npu", -} -pipe = WanVideoPipeline.from_pretrained( - torch_dtype=torch.bfloat16, -- device="cuda", -+ device="npu", - model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", **vram_config), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", **vram_config), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", **vram_config), - ], - tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"), -- vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2, -+ vram_limit=torch.npu.mem_get_info("npu:0")[1] / (1024 ** 3) - 2, -) - -video = pipe( - prompt="纪实摄影风格画面,一只活泼的小狗在绿茵茵的草地上迅速奔跑。小狗毛色棕黄,两只耳朵立起,神情专注而欢快。阳光洒在它身上,使得毛发看上去格外柔软而闪亮。背景是一片开阔的草地,偶尔点缀着几朵野花,远处隐约可见蓝天和几片白云。透视感鲜明,捕捉小狗奔跑时的动感和四周草地的生机。中景侧面移动视角。", - negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走", - seed=0, tiled=True, -) -save_video(video, "video.mp4", fps=15, quality=5) -``` - -### 训练 -使用 Ascend NPU 时,可以添加优化性能的环境变量,针对特定模型,还需添加必要参数 - -例如,Wan2.2-I2V-A14B 的训练代码: -```diff -+ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -+ export CPU_AFFINITY_CONF=1 - -accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \ - --dataset_base_path 
data/example_video_dataset \ - --dataset_metadata_path data/example_video_dataset/metadata.csv \ - --height 480 \ - --width 832 \ - --num_frames 49 \ - --dataset_repeat 100 \ - --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:high_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \ - --learning_rate 1e-5 \ - --num_epochs 2 \ - --remove_prefix_in_ckpt "pipe.dit." \ - --output_path "./models/train/Wan2.2-I2V-A14B_high_noise_full" \ - --trainable_models "dit" \ - --extra_inputs "input_image" \ - --use_gradient_checkpointing_offload \ - --max_timestep_boundary 0.358 \ - --min_timestep_boundary 0 \ -+ --initialize_model_on_cpu -# boundary corresponds to timesteps [900, 1000] - -accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \ - --dataset_base_path data/example_video_dataset \ - --dataset_metadata_path data/example_video_dataset/metadata.csv \ - --height 480 \ - --width 832 \ - --num_frames 49 \ - --dataset_repeat 100 \ - --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:low_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \ - --learning_rate 1e-5 \ - --num_epochs 2 \ - --remove_prefix_in_ckpt "pipe.dit." \ - --output_path "./models/train/Wan2.2-I2V-A14B_low_noise_full" \ - --trainable_models "dit" \ - --extra_inputs "input_image" \ - --use_gradient_checkpointing_offload \ - --max_timestep_boundary 1 \ - --min_timestep_boundary 0.358 \ -+ --initialize_model_on_cpu -# boundary corresponds to timesteps [0, 900) -``` -#### 环境变量 -``` -export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -``` -expandable_segments:: 使能内存池扩展段功能,即虚拟内存特征 - -``` -export CPU_AFFINITY_CONF=1 -``` -设置0或未设置: 表示不启用绑核功能 - -1: 表示开启粗粒度绑核 - -2: 表示开启细粒度绑核 - -#### 特定模型参数 -| 模型 | 参数 | 备注 | -|-----------|------|-------------------| -| Wan 14B系列 | --initialize_model_on_cpu | NPU显存不支持单卡存放14B模型 | - - diff --git "a/docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2131.md" "b/docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2131.md" deleted file mode 100644 index a4f90ea64..000000000 --- "a/docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2131.md" +++ /dev/null @@ -1,55 +0,0 @@ -# 安装依赖 - -从源码安装(推荐): - -``` -git clone https://github.com/modelscope/DiffSynth-Studio.git -cd DiffSynth-Studio -pip install -e . -``` - -从 pypi 安装(存在版本更新延迟,如需使用最新功能,请从源码安装) - -``` -pip install diffsynth -``` - -## GPU/NPU 支持 - -* NVIDIA GPU - -按照以上方式安装即可。 - -* AMD GPU - -需安装支持 ROCm 的 `torch` 包,以 ROCm 6.4(本文更新于 2025 年 12 月 15 日)、Linux 系统为例,请运行以下命令 - -```shell -pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.4 -``` - -* Ascend NPU - -1. 通过官方文档安装[CANN](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/softwareinst/instg/instg_quick.html?Mode=PmIns&InstallType=local&OS=openEuler&Software=cannToolKit) - -2. 根据[安装依赖](#安装依赖)安装diffsynth仓 - -3. 
安装torch-npu仓。Ascend NPU 通过 `torch-npu` 包提供支持,以 `2.7.1` 版本(本文更新于 2025 年 12 月 15 日)为例,请运行以下命令 - ```shell - # aarch64/ARM - pip install torch-npu==2.7.1 torchvision==0.22.1 - # x86 - pip install torch==2.7.1+cpu torchvision==0.22.1+cpu --extra-index-url "https://download.pytorch.org/whl/cpu" - pip install torch-npu==2.7.1 - ``` - -使用 Ascend NPU 时,请将 Python 代码中的 `"cuda"` 改为 `"npu"`,详见[NPU 支持](/docs/zh/Pipeline_Usage/GPU_support.md#ascend-npu)。 - -## 其他安装问题 - -如果在安装过程中遇到问题,可能是由上游依赖包导致的,请参考这些包的文档: - -* [torch](https://pytorch.org/get-started/locally/) -* [Ascend/pytorch](https://github.com/Ascend/pytorch) -* [sentencepiece](https://github.com/google/sentencepiece) -* [cmake](https://cmake.org) diff --git "a/docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2132.md" "b/docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2132.md" deleted file mode 100644 index e0616cc12..000000000 --- "a/docs/zh/Pipeline_Usage/Setup_NPU_\346\240\267\344\276\2132.md" +++ /dev/null @@ -1,56 +0,0 @@ -# 安装依赖 - -从源码安装(推荐): - -``` -git clone https://github.com/modelscope/DiffSynth-Studio.git -cd DiffSynth-Studio -pip install -e . -``` - -从 pypi 安装(存在版本更新延迟,如需使用最新功能,请从源码安装) - -``` -pip install diffsynth -``` - -## GPU/NPU 支持 - -* NVIDIA GPU - -按照以上方式安装即可。 - -* AMD GPU - -需安装支持 ROCm 的 `torch` 包,以 ROCm 6.4(本文更新于 2025 年 12 月 15 日)、Linux 系统为例,请运行以下命令 - -```shell -pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.4 -``` - -* Ascend NPU - -1. 通过官方文档安装[CANN](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/softwareinst/instg/instg_quick.html?Mode=PmIns&InstallType=local&OS=openEuler&Software=cannToolKit) - -2. 从源码安装 - ```shell - git clone https://github.com/modelscope/DiffSynth-Studio.git - cd DiffSynth-Studio - # aarch64/ARM - pip install -e .[npu_aarch64] --extra-index-url "https://download.pytorch.org/whl/cpu" - # x86 - pip install -e .[npu] - ``` - - - -使用 Ascend NPU 时,请将 Python 代码中的 `"cuda"` 改为 `"npu"`,详见[NPU 支持](/docs/zh/Pipeline_Usage/GPU_support.md#ascend-npu)。 - -## 其他安装问题 - -如果在安装过程中遇到问题,可能是由上游依赖包导致的,请参考这些包的文档: - -* [torch](https://pytorch.org/get-started/locally/) -* [Ascend/pytorch](https://github.com/Ascend/pytorch) -* [sentencepiece](https://github.com/google/sentencepiece) -* [cmake](https://cmake.org) diff --git a/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh b/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh deleted file mode 100644 index b214af1ee..000000000 --- a/examples/wanvideo/model_training/full/Wan2.2-I2V-A14B-NPU.sh +++ /dev/null @@ -1,42 +0,0 @@ -export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -export CPU_AFFINITY_CONF=1 - -accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \ - --dataset_base_path data/example_video_dataset \ - --dataset_metadata_path data/example_video_dataset/metadata.csv \ - --height 480 \ - --width 832 \ - --num_frames 49 \ - --dataset_repeat 100 \ - --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:high_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \ - --learning_rate 1e-5 \ - --num_epochs 2 \ - --remove_prefix_in_ckpt "pipe.dit." 
\ - --output_path "./models/train/Wan2.2-I2V-A14B_high_noise_full" \ - --trainable_models "dit" \ - --extra_inputs "input_image" \ - --use_gradient_checkpointing_offload \ - --max_timestep_boundary 0.358 \ - --min_timestep_boundary 0 \ - --initialize_model_on_cpu -# boundary corresponds to timesteps [900, 1000] - -accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \ - --dataset_base_path data/example_video_dataset \ - --dataset_metadata_path data/example_video_dataset/metadata.csv \ - --height 480 \ - --width 832 \ - --num_frames 49 \ - --dataset_repeat 100 \ - --model_id_with_origin_paths "Wan-AI/Wan2.2-I2V-A14B:low_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-I2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-I2V-A14B:Wan2.1_VAE.pth" \ - --learning_rate 1e-5 \ - --num_epochs 2 \ - --remove_prefix_in_ckpt "pipe.dit." \ - --output_path "./models/train/Wan2.2-I2V-A14B_low_noise_full" \ - --trainable_models "dit" \ - --extra_inputs "input_image" \ - --use_gradient_checkpointing_offload \ - --max_timestep_boundary 1 \ - --min_timestep_boundary 0.358 \ - --initialize_model_on_cpu -# boundary corresponds to timesteps [0, 900) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c82697ad0..059e21d31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,18 +32,6 @@ classifiers = [ "Operating System :: OS Independent", ] -[project.optional-dependencies] -npu_aarch64 = [ - "torch==2.7.1", - "torch-npu==2.7.1", - "torchvision==0.22.1" -] -npu = [ - "torch==2.7.1+cpu", - "torch-npu==2.7.1", - "torchvision==0.22.1+cpu" -] - [tool.setuptools.packages.find] where = ["./"] include = ["diffsynth", "diffsynth.*"] From b3cc652dea3a7022b3484f653ced899992328064 Mon Sep 17 00:00:00 2001 From: feng0w0 Date: Wed, 21 Jan 2026 10:38:27 +0800 Subject: [PATCH 7/7] [NPU]:Support USP feature in NPU --- diffsynth/utils/xfuser/xdit_context_parallel.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/diffsynth/utils/xfuser/xdit_context_parallel.py b/diffsynth/utils/xfuser/xdit_context_parallel.py index 94b92c77a..6f712fd6a 100644 --- a/diffsynth/utils/xfuser/xdit_context_parallel.py +++ b/diffsynth/utils/xfuser/xdit_context_parallel.py @@ -33,14 +33,16 @@ def sinusoidal_embedding_1d(dim, position): def pad_freqs(original_tensor, target_len): seq_len, s1, s2 = original_tensor.shape pad_size = target_len - seq_len + original_tensor_device = original_tensor.device + if original_tensor.device == "npu": + original_tensor = original_tensor.cpu() padding_tensor = torch.ones( pad_size, s1, s2, dtype=original_tensor.dtype, - device='cpu') - original_tensor_device = original_tensor.device - padded_tensor = torch.cat([original_tensor.cpu(), padding_tensor], dim=0).to(device=original_tensor_device) + device=original_tensor.device) + padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0).to(device=original_tensor_device) return padded_tensor def rope_apply(x, freqs, num_heads):
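
For reference, the pattern implemented by the final `pad_freqs` hunk (move an NPU-resident tensor to the host, build the padding there, then restore the original device) can be exercised without Ascend hardware. The sketch below is illustrative rather than repository code; it checks `device.type`, since comparing a `torch.device` against the bare string `"npu"`, as the added `if` does, is unlikely to match a fully qualified device such as `npu:0`.

```python
import torch

def pad_on_host(original: torch.Tensor, target_len: int) -> torch.Tensor:
    # Remember where the tensor lives so the result can be moved back.
    original_device = original.device
    work = original
    if original_device.type == "npu":   # device.type, not equality with the string "npu"
        work = work.cpu()               # build the padding on the host
    seq_len, s1, s2 = work.shape
    padding = torch.ones(target_len - seq_len, s1, s2,
                         dtype=work.dtype, device=work.device)
    return torch.cat([work, padding], dim=0).to(original_device)

# CPU-only smoke test (no torch_npu required).
freqs = torch.randn(5, 4, 2)
assert pad_on_host(freqs, 8).shape == (8, 4, 2)
```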