[Fix] Fix update_aclgraph_sizes when running MoE models (#913)
### What this PR does / why we need it?

Fix `update_aclgraph_sizes` when running MoE models. In `init_ascend_model_parallel`, a caller-supplied `world_size` is no longer overwritten by `torch.distributed.get_world_size()`, and the expert-parallel group counts are now derived directly from `expert_parallel_size` and `expert_tensor_parallel_size` instead of from `world_size // expert_tensor_parallel_size`.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
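For background, the point of trimming ACL graph sizes is that every extra communication group (the EP/ETP groups a MoE model creates) consumes part of a fixed graph resource budget, leaving room for fewer captured batch sizes. The sketch below illustrates that idea only; apart from the motivation, the helper name `trim_capture_sizes`, the budget constants, and the formula are all invented for illustration and do not come from the repository:

```python
from typing import List

# Hypothetical sketch of the kind of adjustment update_aclgraph_sizes makes.
# Every name, constant, and formula here is an assumption for illustration,
# not the repository's actual implementation.
def trim_capture_sizes(capture_sizes: List[int],
                       num_comm_groups: int,
                       budget: int = 120) -> List[int]:
    # Assume each communication group (e.g. EP/ETP groups for MoE models)
    # reserves a slice of a fixed graph budget.
    max_sizes = max(1, budget - 10 * num_comm_groups)
    if len(capture_sizes) <= max_sizes:
        return capture_sizes
    # Keep an evenly spaced subset of the original capture sizes.
    step = (len(capture_sizes) - 1) / (max_sizes - 1) if max_sizes > 1 else 0
    return [capture_sizes[round(i * step)] for i in range(max_sizes)]
```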
```diff
@@ -22,18 +22,17 @@ def get_etp_group() -> GroupCoordinator:
 def init_ascend_model_parallel(
     tensor_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
     expert_parallel_size: int = 1,
     expert_tensor_parallel_size: int = 1,
     world_size: Optional[int] = None,
     backend: Optional[str] = None,
 ):
     assert torch.distributed.is_initialized()
-    world_size: int = torch.distributed.get_world_size()
+    world_size = world_size or torch.distributed.get_world_size()
     backend = backend or torch.distributed.get_backend(
         get_world_group().device_group)
-    num_expert_parallel_groups: int = expert_tensor_parallel_size
-    num_expert_tensor_parallel_groups: int = (world_size //
-                                              expert_tensor_parallel_size)
+    num_expert_parallel_groups = expert_tensor_parallel_size
+    num_expert_tensor_parallel_groups = expert_parallel_size

     global _EP
     group_ranks = []
```
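The two counters changed in the hunk decide how many communication groups get created. A minimal sketch of why the fixed values partition the ranks cleanly, assuming the common strided assignment of ranks to expert-parallel groups (the helper `build_ep_group_ranks` and the layout are illustrative, not taken from the repository):

```python
from typing import List

# Illustrative sketch only: build_ep_group_ranks is a hypothetical helper.
# It shows why, after the fix, num_expert_parallel_groups ==
# expert_tensor_parallel_size groups of expert_parallel_size ranks each
# cover every rank exactly once.
def build_ep_group_ranks(world_size: int,
                         expert_parallel_size: int,
                         expert_tensor_parallel_size: int) -> List[List[int]]:
    assert world_size == expert_parallel_size * expert_tensor_parallel_size
    # One EP group per ETP "column": ranks strided by expert_tensor_parallel_size.
    return [
        list(range(start, world_size, expert_tensor_parallel_size))
        for start in range(expert_tensor_parallel_size)
    ]

# world_size=8, EP=2, ETP=4 -> 4 EP groups of 2 ranks each.
print(build_ep_group_ranks(8, 2, 4))
# [[0, 4], [1, 5], [2, 6], [3, 7]]
```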