diff --git a/diffsynth/utils/xfuser/xdit_context_parallel.py b/diffsynth/utils/xfuser/xdit_context_parallel.py
index 21dc3b33..6f712fd6 100644
--- a/diffsynth/utils/xfuser/xdit_context_parallel.py
+++ b/diffsynth/utils/xfuser/xdit_context_parallel.py
@@ -1,10 +1,13 @@
 import torch
 from typing import Optional
 from einops import rearrange
+from yunchang.kernels import AttnType
 from xfuser.core.distributed import (get_sequence_parallel_rank,
                                      get_sequence_parallel_world_size,
                                      get_sp_group)
 from xfuser.core.long_ctx_attention import xFuserLongContextAttention
+
+from ... import IS_NPU_AVAILABLE
 from ...core.device import parse_nccl_backend, parse_device_type
@@ -30,13 +33,16 @@ def sinusoidal_embedding_1d(dim, position):
 def pad_freqs(original_tensor, target_len):
     seq_len, s1, s2 = original_tensor.shape
     pad_size = target_len - seq_len
+    original_tensor_device = original_tensor.device
+    if original_tensor.device.type == "npu":
+        original_tensor = original_tensor.cpu()
     padding_tensor = torch.ones(
         pad_size,
         s1,
         s2,
         dtype=original_tensor.dtype,
         device=original_tensor.device)
-    padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
+    padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0).to(device=original_tensor_device)
     return padded_tensor
 
 def rope_apply(x, freqs, num_heads):
@@ -133,7 +139,12 @@ def usp_attn_forward(self, x, freqs):
     k = rearrange(k, "b s (n d) -> b s n d", n=self.num_heads)
     v = rearrange(v, "b s (n d) -> b s n d", n=self.num_heads)
 
-    x = xFuserLongContextAttention()(
+    attn_type = AttnType.FA
+    ring_impl_type = "basic"
+    if IS_NPU_AVAILABLE:
+        attn_type = AttnType.NPU
+        ring_impl_type = "basic_npu"
+    x = xFuserLongContextAttention(attn_type=attn_type, ring_impl_type=ring_impl_type)(
         None,
         query=q,
         key=k,
diff --git a/docs/en/Pipeline_Usage/GPU_support.md b/docs/en/Pipeline_Usage/GPU_support.md
index aba57064..2c67fed4 100644
--- a/docs/en/Pipeline_Usage/GPU_support.md
+++ b/docs/en/Pipeline_Usage/GPU_support.md
@@ -58,6 +58,14 @@ video = pipe(
 save_video(video, "video.mp4", fps=15, quality=5)
 ```
 
+#### USP (Unified Sequence Parallel)
+To use this feature on NPU, install the additional third-party libraries as follows:
+```shell
+pip install git+https://github.com/feifeibear/long-context-attention.git
+pip install git+https://github.com/xdit-project/xDiT.git
+```
+
+
 ### Training
 NPU startup script samples have been added for each type of model; the scripts are stored in the `examples/xxx/special/npu_training` directory, for example `examples/wanvideo/model_training/special/npu_training/Wan2.2-T2V-A14B-NPU.sh`.
diff --git a/docs/zh/Pipeline_Usage/GPU_support.md b/docs/zh/Pipeline_Usage/GPU_support.md
index 8124147e..b5c0e336 100644
--- a/docs/zh/Pipeline_Usage/GPU_support.md
+++ b/docs/zh/Pipeline_Usage/GPU_support.md
@@ -58,6 +58,13 @@ video = pipe(
 save_video(video, "video.mp4", fps=15, quality=5)
 ```
 
+#### USP (Unified Sequence Parallel)
+If you want to use this feature on NPU, install the additional third-party libraries as follows:
+```shell
+pip install git+https://github.com/feifeibear/long-context-attention.git
+pip install git+https://github.com/xdit-project/xDiT.git
+```
+
 ### Training
 NPU startup script samples have been added for each type of model; the scripts are stored in the `examples/xxx/special/npu_training` directory, for example `examples/wanvideo/model_training/special/npu_training/Wan2.2-T2V-A14B-NPU.sh`.
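
For reference, here is a condensed, standalone sketch of the backend selection this patch introduces — not the repo's exact code. It assumes the two libraries from the docs sections above are installed; the `torch_npu` import probe and the `build_usp_attention` helper are illustrative (in the repo, `IS_NPU_AVAILABLE` is imported from the `diffsynth` package, and the selection happens inline in `usp_attn_forward`):

```python
from yunchang.kernels import AttnType
from xfuser.core.long_ctx_attention import xFuserLongContextAttention

# Assumption: a torch_npu import probe is one plausible way the package-level
# IS_NPU_AVAILABLE flag could be derived; the repo defines it elsewhere.
try:
    import torch_npu  # noqa: F401
    IS_NPU_AVAILABLE = True
except ImportError:
    IS_NPU_AVAILABLE = False


def build_usp_attention() -> xFuserLongContextAttention:
    """Pick the attention backend the way the patched usp_attn_forward does.

    On GPU hosts: flash-attention kernels with the default ring implementation.
    On Ascend NPU hosts: the NPU kernel with the "basic_npu" ring implementation.
    Call this only after xDiT's sequence-parallel process groups have been
    initialized (e.g. inside a torchrun launch).
    """
    attn_type = AttnType.NPU if IS_NPU_AVAILABLE else AttnType.FA
    ring_impl_type = "basic_npu" if IS_NPU_AVAILABLE else "basic"
    return xFuserLongContextAttention(attn_type=attn_type,
                                      ring_impl_type=ring_impl_type)
```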