From eb4c08f05dac1ae5d12bd9aa7e4aa76a4ab72032 Mon Sep 17 00:00:00 2001
From: zhenwenqi2024 <155598497+zhenwenqi2024@users.noreply.github.com>
Date: Wed, 17 Dec 2025 01:35:26 +0800
Subject: [PATCH] [bugfix] fix mtp accept rate (#5093)

### What this PR does / why we need it?
1. Now that npu_model_runner reuses gpu_model_runner, this PR deletes some attrs that are already defined in gpu_model_runner.
2. Fix the MTP accept rate by disabling in_profile_run in the proposers' dummy runs.
3. Remove redundant MoE comm method selection logic.
4. Revert vllm-project/vllm-ascend#5082, which broke CI in https://github.com/vllm-project/vllm-ascend/actions/runs/20266314048/job/58190426832?pr=5088

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: zhenwenqi2024
Signed-off-by: Mengqing Cao
Co-authored-by: Mengqing Cao
---
 .../aclnn_matmul_allreduce_add_rmsnorm.cpp |  8 ++---
 vllm_ascend/ascend_forward_context.py      |  4 +--
 vllm_ascend/spec_decode/eagle_proposer.py  |  1 -
 vllm_ascend/spec_decode/mtp_proposer.py    |  1 -
 vllm_ascend/worker/model_runner_v1.py      | 32 +++-----------------
 5 files changed, 10 insertions(+), 36 deletions(-)

diff --git a/csrc/matmul_allreduce_add_rmsnorm/op_host/aclnn_matmul_allreduce_add_rmsnorm.cpp b/csrc/matmul_allreduce_add_rmsnorm/op_host/aclnn_matmul_allreduce_add_rmsnorm.cpp
index 396da512..ec71fa91 100644
--- a/csrc/matmul_allreduce_add_rmsnorm/op_host/aclnn_matmul_allreduce_add_rmsnorm.cpp
+++ b/csrc/matmul_allreduce_add_rmsnorm/op_host/aclnn_matmul_allreduce_add_rmsnorm.cpp
@@ -26,10 +26,6 @@ enum NnopbaseHcclServerType {
 };
 
 extern "C" void __attribute__((weak)) NnopbaseSetHcclServerType(void *executor, NnopbaseHcclServerType sType);
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 extern aclnnStatus aclnnInnerMatmulAllreduceAddRmsnormGetWorkspaceSize(
     const aclTensor *x1,
     const aclTensor *x2,
@@ -52,6 +48,10 @@ extern aclnnStatus aclnnInnerMatmulAllreduceAddRmsnorm(
     aclOpExecutor *executor,
     aclrtStream stream);
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 aclnnStatus aclnnMatmulAllreduceAddRmsnormGetWorkspaceSize(
     const aclTensor *x1,
     const aclTensor *x2,
diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py
index b4343e76..8618792f 100644
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -64,7 +64,7 @@ def set_ascend_forward_context(
         get_moe_comm_method
     moe_comm_type = select_moe_comm_method(num_tokens, vllm_config)
     # TODO: remove this after moe_comm_type selection logic is finalized
-    if in_profile_run and is_mtp_model:
+    if is_mtp_model:
         moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
                          == MoECommType.FUSED_ALLTOALL else moe_comm_type)
     forward_context.moe_comm_type = moe_comm_type
@@ -298,8 +298,6 @@ def select_moe_comm_method(num_tokens: int,
                          if fused_all2all_enable else MoECommType.ALLTOALL)
     else:
         raise ValueError(f"Unsupported soc_version: {soc_version}")
-    moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
-                     == MoECommType.FUSED_ALLTOALL else moe_comm_type)
     # PanguProMoE only supports allgather
     if model_type == "PanguProMoE":
         moe_comm_type = MoECommType.ALLGATHER
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 24a846d9..3ec37245 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -145,7 +145,6 @@ class EagleProposer(Proposer):
                 dummy_compute_logits=lambda hidden_states: None):
             with set_ascend_forward_context(None,
                                             self.vllm_config,
-                                            in_profile_run=True,
                                             num_tokens=num_tokens):
                 self.model(
                     input_ids=self.input_ids[:num_tokens],
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index 348956d6..8cb46fa2 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -293,7 +293,6 @@ class MtpProposer(Proposer):
                 self.vllm_config,
                 num_tokens=num_tokens,
                 with_prefill=with_prefill,
-                in_profile_run=True,
                 num_tokens_across_dp=num_tokens_across_dp,
                 num_actual_tokens=0,
                 aclgraph_runtime_mode=aclgraph_runtime_mode,
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index d4b4b25b..9fc89f53 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -244,8 +244,6 @@ class NPUModelRunner(GPUModelRunner):
         self.need_accepted_tokens: bool = False
 
         self.is_multimodal_model = self.model_config.is_multimodal_model
-        self.is_pooling_model = self.model_config.pooler_config is not None
-        self.enable_prompt_embeds = self.model_config.enable_prompt_embeds
         self.block_size = vllm_config.cache_config.block_size
         # Set up Attention
         self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
@@ -338,24 +336,6 @@ class NPUModelRunner(GPUModelRunner):
             ascend_config = get_ascend_config()
             self.eplb_updator = EplbUpdator(ascend_config, self.eplb_loader,
                                             self.eplb_process, self.process)
-
-        self.use_async_scheduling = self.scheduler_config.async_scheduling
-        self.async_output_copy_stream = torch.npu.Stream() if \
-            self.use_async_scheduling else None
-        self.num_spec_tokens = 0
-        if self.speculative_config:
-            self.num_spec_tokens = self.speculative_config.num_speculative_tokens  # noqa
-        self.valid_sampled_token_count_event: torch.npu.Event | None = None
-        self.valid_sampled_token_count_copy_stream: torch.npu.Stream | None = None
-        if self.use_async_scheduling and self.num_spec_tokens:
-            self.valid_sampled_token_count_event = torch.npu.Event()
-            self.valid_sampled_token_count_copy_stream = torch.npu.Stream()
-            self.valid_sampled_token_count_cpu = torch.empty(
-                self.max_num_reqs,
-                dtype=torch.int64,
-                device="cpu",
-                pin_memory=self.pin_memory,
-            )
         # Input Batch
         # NOTE(Chen): Ideally, we should initialize the input batch inside
         # `initialize_kv_cache` based on the kv cache config. However, as in
@@ -386,23 +366,20 @@ class NPUModelRunner(GPUModelRunner):
             cp_kv_cache_interleave_size=self.parallel_config.
             cp_kv_cache_interleave_size,
         )
-        self.num_accepted_tokens = self._make_buffer(self.max_num_reqs,
-                                                     dtype=torch.int64)
         self.num_draft_tokens = self._make_buffer(self.max_num_reqs,
                                                   dtype=torch.int32)
+        # NOTE: the pinned CPU buffer below uses int32
         self.sampled_token_ids_pinned_cpu = torch.empty(
             (self.max_num_reqs, 1),
             dtype=torch.int32,
             device="cpu",
             pin_memory=self.pin_memory,
         )
-        # None in the first PP rank. The rest are set after load_model.
-        # the attr below is in gpu_modelrunner, but occurs lint so add them here
-        self.intermediate_tensors: IntermediateTensors | None = None
+        # For clean code only: the three attrs below are already defined in gpu_model_runner
         self.execute_model_state: ExecuteModelState | None = None
+        # None in the first PP rank. The rest are set after load_model.
+        self.intermediate_tensors: IntermediateTensors | None = None
         self.reorder_batch_threshold: int | None = None
-        self.query_start_loc = self._make_buffer(self.max_num_reqs + 1,
-                                                 dtype=torch.int32)
 
     def _init_device_properties(self) -> None:
         self.num_sms = None
@@ -3395,6 +3372,7 @@ def _torch_cuda_wrapper():
 
     try:
         # replace cuda APIs with xpu APIs, this should work by default
+        torch.Event = torch.npu.Event
         torch.cuda.Event = torch.npu.Event
        torch.cuda.Stream = torch.npu.Stream
        torch.cuda.default_stream = torch.npu.default_stream
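
Reviewer note (illustration only, not part of the patch): the core behavioral change is that the FUSED_ALLTOALL -> ALLTOALL downgrade in set_ascend_forward_context now keys off is_mtp_model alone instead of in_profile_run and is_mtp_model, so the MTP draft model resolves the same MoE comm type during dummy runs and real decode steps. A minimal sketch of that rule follows; MoECommType and resolve_moe_comm_type here are simplified stand-ins, not the actual vllm_ascend APIs.

```python
from enum import Enum, auto


class MoECommType(Enum):
    """Simplified stand-in for vllm_ascend's MoECommType."""
    ALLGATHER = auto()
    ALLTOALL = auto()
    FUSED_ALLTOALL = auto()


def resolve_moe_comm_type(selected: MoECommType, is_mtp_model: bool) -> MoECommType:
    # Post-PR rule: any MTP model downgrades FUSED_ALLTOALL to ALLTOALL,
    # regardless of whether the current forward pass is a profile run.
    if is_mtp_model and selected == MoECommType.FUSED_ALLTOALL:
        return MoECommType.ALLTOALL
    return selected


# The draft (MTP) model now picks the same comm type in dummy runs and in
# real decode steps, which is the consistency the accept-rate fix relies on.
assert resolve_moe_comm_type(MoECommType.FUSED_ALLTOALL, True) == MoECommType.ALLTOALL
assert resolve_moe_comm_type(MoECommType.FUSED_ALLTOALL, False) == MoECommType.FUSED_ALLTOALL
```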