From eb4c08f05dac1ae5d12bd9aa7e4aa76a4ab72032 Mon Sep 17 00:00:00 2001
From: zhenwenqi2024 <155598497+zhenwenqi2024@users.noreply.github.com>
Date: Wed, 17 Dec 2025 01:35:26 +0800
Subject: [PATCH] [bugfix] fix mtp accept rate (#5093)

### What this PR does / why we need it?
1. Now that npu_model_runner reuses gpu_model_runner, this PR deletes some attrs that are already defined in gpu_model_runner.
2. Fix the MTP accept rate by disabling in_profile_run in the proposers' dummy runs.
3. Remove redundant MoE comm method selection logic.
4. Revert vllm-project/vllm-ascend#5082, which broke CI in https://github.com/vllm-project/vllm-ascend/actions/runs/20266314048/job/58190426832?pr=5088

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: zhenwenqi2024
Signed-off-by: Mengqing Cao
Co-authored-by: Mengqing Cao
---
 .../aclnn_matmul_allreduce_add_rmsnorm.cpp |  8 ++---
 vllm_ascend/ascend_forward_context.py      |  4 +--
 vllm_ascend/spec_decode/eagle_proposer.py  |  1 -
 vllm_ascend/spec_decode/mtp_proposer.py    |  1 -
 vllm_ascend/worker/model_runner_v1.py      | 32 +++-----------------
 5 files changed, 10 insertions(+), 36 deletions(-)

diff --git a/csrc/matmul_allreduce_add_rmsnorm/op_host/aclnn_matmul_allreduce_add_rmsnorm.cpp b/csrc/matmul_allreduce_add_rmsnorm/op_host/aclnn_matmul_allreduce_add_rmsnorm.cpp
index 396da512..ec71fa91 100644
--- a/csrc/matmul_allreduce_add_rmsnorm/op_host/aclnn_matmul_allreduce_add_rmsnorm.cpp
+++ b/csrc/matmul_allreduce_add_rmsnorm/op_host/aclnn_matmul_allreduce_add_rmsnorm.cpp
@@ -26,10 +26,6 @@ enum NnopbaseHcclServerType {
 };
 
 extern "C" void __attribute__((weak)) NnopbaseSetHcclServerType(void *executor, NnopbaseHcclServerType sType);
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 extern aclnnStatus aclnnInnerMatmulAllreduceAddRmsnormGetWorkspaceSize(
     const aclTensor *x1,
     const aclTensor *x2,
@@ -52,6 +48,10 @@ extern aclnnStatus aclnnInnerMatmulAllreduceAddRmsnorm(
     aclOpExecutor *executor,
     aclrtStream stream);
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 aclnnStatus aclnnMatmulAllreduceAddRmsnormGetWorkspaceSize(
     const aclTensor *x1,
     const aclTensor *x2,
diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py
index b4343e76..8618792f 100644
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -64,7 +64,7 @@ def set_ascend_forward_context(
         get_moe_comm_method
     moe_comm_type = select_moe_comm_method(num_tokens, vllm_config)
     # TODO: remove this after moe_comm_type selection logic is finalized
-    if in_profile_run and is_mtp_model:
+    if is_mtp_model:
         moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
                          == MoECommType.FUSED_ALLTOALL else moe_comm_type)
     forward_context.moe_comm_type = moe_comm_type
@@ -298,8 +298,6 @@ def select_moe_comm_method(num_tokens: int,
                          if fused_all2all_enable else MoECommType.ALLTOALL)
     else:
         raise ValueError(f"Unsupported soc_version: {soc_version}")
-    moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
-                     == MoECommType.FUSED_ALLTOALL else moe_comm_type)
     # PanguProMoE only supports allgather
     if model_type == "PanguProMoE":
         moe_comm_type = MoECommType.ALLGATHER
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 24a846d9..3ec37245 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -145,7 +145,6 @@ class EagleProposer(Proposer):
                 dummy_compute_logits=lambda hidden_states: None):
             with set_ascend_forward_context(None,
                                             self.vllm_config,
-                                            in_profile_run=True,
                                             num_tokens=num_tokens):
                 self.model(
                     input_ids=self.input_ids[:num_tokens],
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index 348956d6..8cb46fa2 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -293,7 +293,6 @@ class MtpProposer(Proposer):
                 self.vllm_config,
                 num_tokens=num_tokens,
                 with_prefill=with_prefill,
-                in_profile_run=True,
                 num_tokens_across_dp=num_tokens_across_dp,
                 num_actual_tokens=0,
                 aclgraph_runtime_mode=aclgraph_runtime_mode,
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index d4b4b25b..9fc89f53 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -244,8 +244,6 @@ class NPUModelRunner(GPUModelRunner):
         self.need_accepted_tokens: bool = False
 
         self.is_multimodal_model = self.model_config.is_multimodal_model
-        self.is_pooling_model = self.model_config.pooler_config is not None
-        self.enable_prompt_embeds = self.model_config.enable_prompt_embeds
         self.block_size = vllm_config.cache_config.block_size
         # Set up Attention
         self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
@@ -338,24 +336,6 @@ class NPUModelRunner(GPUModelRunner):
             ascend_config = get_ascend_config()
             self.eplb_updator = EplbUpdator(ascend_config, self.eplb_loader,
                                             self.eplb_process, self.process)
-
-        self.use_async_scheduling = self.scheduler_config.async_scheduling
-        self.async_output_copy_stream = torch.npu.Stream() if \
-            self.use_async_scheduling else None
-        self.num_spec_tokens = 0
-        if self.speculative_config:
-            self.num_spec_tokens = self.speculative_config.num_speculative_tokens  # noqa
-        self.valid_sampled_token_count_event: torch.npu.Event | None = None
-        self.valid_sampled_token_count_copy_stream: torch.npu.Stream | None = None
-        if self.use_async_scheduling and self.num_spec_tokens:
-            self.valid_sampled_token_count_event = torch.npu.Event()
-            self.valid_sampled_token_count_copy_stream = torch.npu.Stream()
-            self.valid_sampled_token_count_cpu = torch.empty(
-                self.max_num_reqs,
-                dtype=torch.int64,
-                device="cpu",
-                pin_memory=self.pin_memory,
-            )
         # Input Batch
         # NOTE(Chen): Ideally, we should initialize the input batch inside
         # `initialize_kv_cache` based on the kv cache config. However, as in
@@ -386,23 +366,20 @@ class NPUModelRunner(GPUModelRunner):
             cp_kv_cache_interleave_size=self.parallel_config.
             cp_kv_cache_interleave_size,
         )
-        self.num_accepted_tokens = self._make_buffer(self.max_num_reqs,
-                                                     dtype=torch.int64)
         self.num_draft_tokens = self._make_buffer(self.max_num_reqs,
                                                   dtype=torch.int32)
+        # NOTE: the pinned CPU buffer below uses int32
         self.sampled_token_ids_pinned_cpu = torch.empty(
             (self.max_num_reqs, 1),
             dtype=torch.int32,
             device="cpu",
             pin_memory=self.pin_memory,
         )
-        # None in the first PP rank. The rest are set after load_model.
-        # the attr below is in gpu_modelrunner, but occurs lint so add them here
-        self.intermediate_tensors: IntermediateTensors | None = None
+        # For clean code only: the three attrs below are already defined in gpu_model_runner
         self.execute_model_state: ExecuteModelState | None = None
+        # None in the first PP rank. The rest are set after load_model.
+        self.intermediate_tensors: IntermediateTensors | None = None
         self.reorder_batch_threshold: int | None = None
-        self.query_start_loc = self._make_buffer(self.max_num_reqs + 1,
-                                                 dtype=torch.int32)
 
     def _init_device_properties(self) -> None:
         self.num_sms = None
@@ -3395,6 +3372,7 @@ def _torch_cuda_wrapper():
 
     try:
         # replace cuda APIs with xpu APIs, this should work by default
+        torch.Event = torch.npu.Event
         torch.cuda.Event = torch.npu.Event
        torch.cuda.Stream = torch.npu.Stream
        torch.cuda.default_stream = torch.npu.default_stream
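
Reviewer note (illustration only, not part of the patch): the core behavioral change is that the FUSED_ALLTOALL -> ALLTOALL downgrade in set_ascend_forward_context now keys off is_mtp_model alone instead of in_profile_run and is_mtp_model, so the MTP draft model resolves the same MoE comm type during dummy runs and real decode steps. A minimal sketch of that rule follows; MoECommType and resolve_moe_comm_type here are simplified stand-ins, not the actual vllm_ascend APIs.

```python
from enum import Enum, auto


class MoECommType(Enum):
    """Simplified stand-in for vllm_ascend's MoECommType."""
    ALLGATHER = auto()
    ALLTOALL = auto()
    FUSED_ALLTOALL = auto()


def resolve_moe_comm_type(selected: MoECommType, is_mtp_model: bool) -> MoECommType:
    # Post-PR rule: any MTP model downgrades FUSED_ALLTOALL to ALLTOALL,
    # regardless of whether the current forward pass is a profile run.
    if is_mtp_model and selected == MoECommType.FUSED_ALLTOALL:
        return MoECommType.ALLTOALL
    return selected


# The draft (MTP) model now picks the same comm type in dummy runs and in
# real decode steps, which is the consistency the accept-rate fix relies on.
assert resolve_moe_comm_type(MoECommType.FUSED_ALLTOALL, True) == MoECommType.ALLTOALL
assert resolve_moe_comm_type(MoECommType.FUSED_ALLTOALL, False) == MoECommType.FUSED_ALLTOALL
```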