[Refactor] Adjustments to moe_comm_method selection process (#3001)
### What this PR does / why we need it?
Fix issues mentioned in
https://github.com/vllm-project/vllm-ascend/pull/2791 and some minor
refactoring.
1. Use Enum instead of string.
2. Avoid setting a new property to forward_context in
AscendFusedMoE.forward().
3. Enabling TokenDispatcherWithMoge.
4. Remove redundant code.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Qwen3-30B-A3B/Qwen3-30B-A3B-W8A8/DeepSeek-V3-W4A8-Pruning/deepseek-mtp/pangu-pro-moe-pruning:
1. Enable/Disable EP
2. Aclgraph & eager
- vLLM version: v0.10.2
- vLLM main:
9607d5eb44
Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
This commit is contained in:
@@ -117,11 +117,11 @@ class EagleProposer(Proposer):
                          skip_attn: bool = False,
                          num_reqs: int = 0,
                          num_tokens_across_dp: Optional[torch.Tensor] = None):
-        moe_comm_method = self.runner._select_moe_comm_method(
+        moe_comm_type = self.runner._select_moe_comm_method(
             num_tokens, with_prefill)
         with set_ascend_forward_context(None,
                                         self.vllm_config,
-                                        moe_comm_method=moe_comm_method,
+                                        moe_comm_type=moe_comm_type,
                                         num_tokens=num_tokens):
             self.model(
                 input_ids=self.input_ids[:num_tokens],
@@ -454,7 +454,7 @@ class EagleProposer(Proposer):
         with_prefill = attn_metadata.attn_state not in [
             AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
         ]
-        moe_comm_method = self.runner._select_moe_comm_method(
+        moe_comm_type = self.runner._select_moe_comm_method(
             num_input_tokens, with_prefill)

         # copy inputs to buffer for cudagraph
@@ -463,7 +463,7 @@ class EagleProposer(Proposer):
         attn_metadata.block_tables = block_table.to(device)
         with set_ascend_forward_context(attn_metadata,
                                         self.vllm_config,
-                                        moe_comm_method=moe_comm_method,
+                                        moe_comm_type=moe_comm_type,
                                         num_tokens=num_input_tokens):
             last_hidden_states, hidden_states = self.model(
                 input_ids=self.input_ids[:num_input_tokens],
@@ -495,7 +495,7 @@ class EagleProposer(Proposer):
         else:
             input_batch_size = batch_size

-        moe_comm_method = self.runner._select_moe_comm_method(
+        moe_comm_type = self.runner._select_moe_comm_method(
             input_batch_size, False)

         attn_metadata.num_actual_tokens = batch_size
@@ -568,7 +568,7 @@ class EagleProposer(Proposer):
         # Run the model.
         with set_ascend_forward_context(attn_metadata,
                                         self.vllm_config,
-                                        moe_comm_method=moe_comm_method,
+                                        moe_comm_type=moe_comm_type,
                                         num_tokens=input_batch_size):

             last_hidden_states, hidden_states = self.model(
||||
Reference in New Issue
Block a user