[Feature] Optimize DeepSeek's DeepEP on Ascend NPU (#8355)

Co-authored-by: ronnie_zheng <zl19940307@163.com>
Co-authored-by: Hexq0210 <hexq0809521@gmail.com>
This commit is contained in:
Even Zhou
2025-08-09 16:35:00 +08:00
committed by GitHub
parent 52e1f52f32
commit 137e75daa1
7 changed files with 210 additions and 61 deletions

View File

@@ -50,6 +50,8 @@ from sglang.srt.utils import (
supports_custom_op,
)
+_is_npu = is_npu()
@dataclass
class GraphCaptureContext:
@@ -591,7 +593,7 @@ class GroupCoordinator:
)
def all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
-if not supports_custom_op():
+if _is_npu or not supports_custom_op():
self._all_gather_into_tensor(output, input)
else:
torch.ops.sglang.reg_all_gather_into_tensor(
@@ -1127,7 +1129,7 @@ def init_model_parallel_group(
group_ranks=group_ranks,
local_rank=local_rank,
torch_distributed_backend=backend,
-use_pynccl=not is_npu(),
+use_pynccl=not _is_npu,
use_pymscclpp=use_mscclpp_allreduce,
use_custom_allreduce=use_custom_allreduce,
use_hpu_communicator=True,