[EPLB][bugfix] Bugfix for fused mc2 (#6794)
### What this PR does / why we need it?
This pull request addresses a bug related to the fused mc2 functionality
within the EPLB (Expert Parallelism Load Balancing) system, specifically
impacting quantization and MoE communication.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main:
83b47f67b1
Signed-off-by: Spicy-Stick <873805887@qq.com>
Signed-off-by: root <root@localhost.localdomain>
This commit is contained in:
@@ -70,7 +70,7 @@ class FusedExpertsResult:
     before_dispatch_evt: torch.npu.Event | None = None
     before_combine_evt: torch.npu.Event | None = None
     # For dynamic_eplb
-    group_list_type: int | None = None
+    group_list_type: int = 1
     expert_tokens: torch.Tensor | None = None
@@ -355,7 +355,6 @@ class FusedMC2CommImpl(MoECommMethod):
         if log2phy is not None:
             topk_ids = log2phy[topk_ids]

-        group_list_type = None
         expert_tokens = None
         if envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 1:
             out = torch.empty_like(hidden_states)
@@ -375,7 +374,6 @@ class FusedMC2CommImpl(MoECommMethod):
             expert_tokens = self.expert_token_nums
         elif envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 2:
             assert expert_map is not None, "expert_map cannot be None."
-            group_list_type = 1
             out, expert_tokens = torch.ops._C_ascend.dispatch_gmm_combine_decode(  # type: ignore
                 x=hidden_states,
                 expert_ids=topk_ids,
@@ -393,4 +391,4 @@ class FusedMC2CommImpl(MoECommMethod):
             )
         else:
             raise ValueError(f"Wrong value of {envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2=}")
-        return FusedExpertsResult(routed_out=out, group_list_type=group_list_type, expert_tokens=expert_tokens)
+        return FusedExpertsResult(routed_out=out, expert_tokens=expert_tokens)
Reference in New Issue
Block a user