[Fix] Fix update_aclgraph_sizes when running MoE models (#913)

### What this PR does / why we need it?
Fix update_aclgraph_sizes when running MoE models.

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
Author: yiz-liu
Date: 2025-05-30 15:17:11 +08:00 (committed by GitHub)
Parent: 3442fbdb23
Commit: 5a1689fc64
5 changed files with 47 additions and 35 deletions

@@ -22,18 +22,17 @@ def get_etp_group() -> GroupCoordinator:
 def init_ascend_model_parallel(
-    tensor_model_parallel_size: int = 1,
-    pipeline_model_parallel_size: int = 1,
+    expert_parallel_size: int = 1,
     expert_tensor_parallel_size: int = 1,
+    world_size: Optional[int] = None,
     backend: Optional[str] = None,
 ):
     assert torch.distributed.is_initialized()
-    world_size: int = torch.distributed.get_world_size()
+    world_size = world_size or torch.distributed.get_world_size()
     backend = backend or torch.distributed.get_backend(
         get_world_group().device_group)
-    num_expert_parallel_groups: int = expert_tensor_parallel_size
-    num_expert_tensor_parallel_groups: int = (world_size //
-                                              expert_tensor_parallel_size)
+    num_expert_parallel_groups = expert_tensor_parallel_size
+    num_expert_tensor_parallel_groups = expert_parallel_size
 
     global _EP
     group_ranks = []
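
For reference, here is a minimal sketch of how the reworked entry point might be invoked after this change. The import path, the distributed setup, and the concrete sizes are illustrative assumptions, not code from this commit:

```python
import torch

# Assumed import path for illustration; adjust to wherever
# init_ascend_model_parallel lives in your checkout.
from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel

# The function asserts that torch.distributed is already initialized,
# so the engine (or a test harness) must set up the process group first.
assert torch.distributed.is_initialized()

init_ascend_model_parallel(
    expert_parallel_size=4,         # illustrative EP size
    expert_tensor_parallel_size=2,  # illustrative ETP size
    world_size=8,                   # optional; defaults to torch.distributed.get_world_size()
)
```

Making `world_size` an explicit, optional argument lets callers pass a size that differs from the full `torch.distributed` world size, while the default preserves the previous behaviour of reading it from the global process group.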