[BugFix] A2 MoE method && layerwise MTP bugfix && Mamba gdn_metadata bugfix (#7364)

### What this PR does / why we need it?
Several bug fixes, mainly:
1. On A2, MC2 requires an EP world size of at least 16 and no more than
24 experts per card. This PR fixes the A2 MoE communication method
selection, which previously chose an incorrect communication method
when the number of model experts exceeds 256. For example, when loading
a Qwen3.5 series model on a 16-card A2 PD-disaggregation D node, the
invalid MC2 method was chosen (a sketch follows the first diff hunk
below).
2. Fixed an issue where the layerwise connector sent the kv-cache of
the MTP layer multiple times when `num_spec_tokens` > 1. Now the
kv-cache is sent only on the MTP layer's first forward pass (a sketch
follows the second diff hunk below).
3. Fixed an accuracy issue with Qwen3.5 when using MTP under PD
disaggregation. The cause: `num_decode_draft_tokens` does not account
for the fact that no `spec_tokens` exist yet during the first inference
under PD disaggregation (they are only produced by that first
inference), while `spec_tokens_padding` is still added by the
`recomputed_scheduler`. As a result, `gdn_metadata` incorrectly treats
the step as a prefill of length 2 (a sketch follows the third diff hunk
below).
---------
Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Signed-off-by: zxr2333 <64738772+nwpu-zxr@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Commit 5645ca8392 (parent a457d0f0e8), authored by zxr2333 on 2026-03-17 23:03:45 +08:00, committed by GitHub.
3 changed files with 16 additions and 12 deletions


@@ -233,11 +233,12 @@ def select_moe_comm_method(num_tokens: int, vllm_config: VllmConfig, is_draft_mo
     if not vllm_config.parallel_config.enable_expert_parallel or get_ep_group().world_size == 1:
         moe_comm_type = MoECommType.ALLGATHER
     elif soc_version in {AscendDeviceType.A2}:
-        if (
-            num_tokens <= mc2_tokens_capacity
-            and vllm_config.parallel_config.world_size_across_dp / vllm_config.parallel_config.pipeline_parallel_size
-            >= 16
-        ):
+        num_experts = vllm_config.model_config.get_num_experts()
+        ep_world_size = (
+            vllm_config.parallel_config.world_size_across_dp // vllm_config.parallel_config.pipeline_parallel_size
+        )
+        num_experts_per_device = num_experts // ep_world_size
+        if num_experts_per_device <= 24 and ep_world_size >= 16 and num_tokens <= mc2_tokens_capacity:
             moe_comm_type = MoECommType.MC2
         else:
             moe_comm_type = MoECommType.ALLGATHER
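
For illustration, a minimal standalone sketch of the fixed selection logic. The concrete numbers (a 512-expert model on 16 cards, the `mc2_tokens_capacity` figure) are assumed example values, not taken from this PR:

```python
# Hedged sketch: why the old world-size-only check mis-selects MC2 for a
# large-expert-count model, and why the new per-device check rejects it.
from types import SimpleNamespace

parallel = SimpleNamespace(world_size_across_dp=16, pipeline_parallel_size=1)
num_experts = 512          # assumed expert count of a Qwen3.5-like MoE model
mc2_tokens_capacity = 512  # assumed MC2 token capacity
num_tokens = 128

# Old check: EP world size and token count only.
old_picks_mc2 = (num_tokens <= mc2_tokens_capacity
                 and parallel.world_size_across_dp / parallel.pipeline_parallel_size >= 16)

# New check: additionally requires at most 24 experts per device.
ep_world_size = parallel.world_size_across_dp // parallel.pipeline_parallel_size
num_experts_per_device = num_experts // ep_world_size  # 512 // 16 = 32
new_picks_mc2 = (num_experts_per_device <= 24 and ep_world_size >= 16
                 and num_tokens <= mc2_tokens_capacity)

print(old_picks_mc2, new_picks_mc2)  # True False -> now falls back to ALLGATHER
```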


@@ -1493,6 +1493,9 @@ class MooncakeLayerwiseConnectorWorker:
     ) -> None:
         """MooncakeLayerwiseConnector does not save explicitly."""
         if self.vllm_config.kv_transfer_config.is_kv_producer and connector_metadata.requests.keys():
+            if self.current_layer >= self.total_layers:
+                self.current_layer += 1
+                return
             # get reshape and cache event
             if layer_name == "":
                 layer_name = self.index_to_name[self.current_layer][0]
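
A self-contained sketch of the guard's effect; the `LayerwiseSender` class and layer names are hypothetical stand-ins for the connector worker in the diff:

```python
# Hedged sketch: once current_layer has walked past total_layers, further
# forward passes of the MTP layer (num_spec_tokens > 1) send nothing extra.
class LayerwiseSender:
    def __init__(self, total_layers: int) -> None:
        self.total_layers = total_layers
        self.current_layer = 0
        self.sent: list[str] = []

    def save_kv_layer(self, layer_name: str) -> None:
        if self.current_layer >= self.total_layers:
            self.current_layer += 1  # MTP layer re-entered; skip duplicate send
            return
        self.sent.append(layer_name)  # stand-in for the real kv-cache transfer
        self.current_layer += 1

sender = LayerwiseSender(total_layers=3)  # e.g. 2 main layers + 1 MTP layer
for name in ["layer.0", "layer.1", "mtp", "mtp", "mtp"]:  # MTP forwarded 3x
    sender.save_kv_layer(name)
print(sender.sent)  # ['layer.0', 'layer.1', 'mtp'] -> MTP kv-cache sent once
```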


@@ -831,6 +831,7 @@ class NPUModelRunner(GPUModelRunner):
         num_draft_tokens = np.zeros(num_reqs, dtype=np.int32)
         # For chunked prefills, use -1 as mask rather than 0, as guided
         # decoding may rollback speculative tokens.
+        new_schedule_reqs = [x.req_id for x in scheduler_output.scheduled_new_reqs]
         num_decode_draft_tokens = np.full(num_reqs, -1, dtype=np.int32)
         for (
             req_id,
@@ -838,13 +839,12 @@
         ) in scheduler_output.scheduled_spec_decode_tokens.items():
             req_idx = self.input_batch.req_id_to_index[req_id]
             num_draft_tokens[req_idx] = len(draft_token_ids)
-            num_decode_draft_tokens[req_idx] = (
-                len(draft_token_ids)
-                if (
-                    self.input_batch.num_computed_tokens_cpu[req_idx] >= self.input_batch.num_prompt_tokens[req_idx]
-                )
-                else -1
-            )
+            if (self.is_kv_consumer and req_id in new_schedule_reqs) or \
+                (self.input_batch.num_computed_tokens_cpu[req_idx] >= \
+                self.input_batch.num_prompt_tokens[req_idx]):
+                num_decode_draft_tokens[req_idx] = len(draft_token_ids)
+            else:
+                num_decode_draft_tokens[req_idx] = -1
         spec_decode_metadata = self._calc_spec_decode_metadata(
             num_draft_tokens,
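
A hedged sketch of the new branch in isolation; the function and its arguments are illustrative stand-ins for the `input_batch` state used in the diff:

```python
# Hedged sketch: on a kv-consumer (D node), a newly scheduled request is a
# decode step even though its locally computed tokens still trail the prompt
# (the prefill ran on the P node), so its draft tokens are counted rather
# than masked with -1 as a chunked prefill.
def decode_draft_len(draft_len: int, computed: int, prompt: int,
                     is_kv_consumer: bool, is_new_req: bool) -> int:
    if (is_kv_consumer and is_new_req) or computed >= prompt:
        return draft_len
    return -1

# First D-node step under PD disaggregation: fixed behavior counts the drafts.
print(decode_draft_len(2, computed=0, prompt=100,
                       is_kv_consumer=True, is_new_req=True))    # 2
# A genuine chunked prefill on a normal node keeps the -1 mask.
print(decode_draft_len(2, computed=40, prompt=100,
                       is_kv_consumer=False, is_new_req=False))  # -1
```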