【Feature】refactor npu_modelrunner for profile_run (#4993)

### What this PR does / why we need it?
(1) refactor npu_model_runner for profile_run
(2) move _select_moe_comm_method to ascend_forward_context
(3) delete _init_model_kwargs in npu_model_runner

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
N/A
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Signed-off-by: zhenwenqi2024 <155598497+zhenwenqi2024@users.noreply.github.com>
This commit is contained in:
zhenwenqi2024
2025-12-16 17:44:04 +08:00
committed by GitHub
parent af64087732
commit 4ed2951400
6 changed files with 127 additions and 205 deletions

View File

@@ -28,8 +28,7 @@ from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm.v1.utils import CpuGpuBuffer
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
from vllm_ascend.ascend_forward_context import (MoECommType,
set_ascend_forward_context)
from vllm_ascend.ascend_forward_context import set_ascend_forward_context
from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
@@ -242,11 +241,6 @@ class MtpProposer(Proposer):
# NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
# and _propose.
aclgraph_runtime_mode = CUDAGraphMode.NONE
moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
# TODO: remove this after moe_comm_type selection logic is finalized
moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
== MoECommType.FUSED_ALLTOALL else moe_comm_type)
if aclgraph_runtime_mode == CUDAGraphMode.FULL:
if len(self.runner.attn_groups) > 0:
num_computed_tokens_cpu = (
@@ -299,9 +293,8 @@ class MtpProposer(Proposer):
self.vllm_config,
num_tokens=num_tokens,
with_prefill=with_prefill,
in_profile_run=True,
num_tokens_across_dp=num_tokens_across_dp,
moe_comm_type=moe_comm_type,
in_profile_run=self.runner.in_profile_run,
num_actual_tokens=0,
aclgraph_runtime_mode=aclgraph_runtime_mode,
batch_descriptor=batch_descriptor,
@@ -720,11 +713,6 @@ class MtpProposer(Proposer):
with_prefill) = self.runner._sync_metadata_across_dp(
num_input_tokens, self.runner.with_prefill)
moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens)
# TODO: remove this after moe_comm_type selection logic is finalized
moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
== MoECommType.FUSED_ALLTOALL else moe_comm_type)
# Enable shared_expert_dp and MTP FULL graph may cause accuracy issues.
if scheduler_output and not self.enable_shared_expert_dp:
max_query_len = common_attn_metadata.max_query_len
@@ -771,7 +759,6 @@ class MtpProposer(Proposer):
num_tokens=num_input_tokens,
with_prefill=with_prefill,
num_tokens_across_dp=num_tokens_across_dp,
moe_comm_type=moe_comm_type,
aclgraph_runtime_mode=aclgraph_runtime_mode,
batch_descriptor=batch_descriptor,
in_profile_run=self.runner.in_profile_run,