【Feature】refactor npu_modelrunner for profile_run (#4993)
### What this PR does / why we need it?
(1) Refactor npu_model_runner for profile_run
(2) move _select_moe_comm_method to ascend_forward_context
(3) delete _init_model_kwargs in npu_model_runner
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
N/A
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Signed-off-by: zhenwenqi2024 <155598497+zhenwenqi2024@users.noreply.github.com>
This commit is contained in:
@@ -123,10 +123,9 @@ class EagleProposer(Proposer):
|
||||
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
||||
batch_descriptor=None,
|
||||
dummy_compute_logits=lambda hidden_states: None):
|
||||
moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
|
||||
with set_ascend_forward_context(None,
|
||||
self.vllm_config,
|
||||
moe_comm_type=moe_comm_type,
|
||||
in_profile_run=True,
|
||||
num_tokens=num_tokens):
|
||||
self.model(
|
||||
input_ids=self.input_ids[:num_tokens],
|
||||
@@ -458,15 +457,12 @@ class EagleProposer(Proposer):
|
||||
else:
|
||||
num_input_tokens = num_tokens
|
||||
|
||||
moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens)
|
||||
|
||||
# copy inputs to buffer for cudagraph
|
||||
self.positions[:num_tokens] = target_positions.to(device)
|
||||
self.hidden_states[:num_tokens] = target_hidden_states
|
||||
attn_metadata.block_tables = block_table.to(device)
|
||||
with set_ascend_forward_context(attn_metadata,
|
||||
self.vllm_config,
|
||||
moe_comm_type=moe_comm_type,
|
||||
num_tokens=num_input_tokens):
|
||||
last_hidden_states, hidden_states = self.model(
|
||||
input_ids=self.input_ids[:num_input_tokens],
|
||||
@@ -498,8 +494,6 @@ class EagleProposer(Proposer):
|
||||
else:
|
||||
input_batch_size = batch_size
|
||||
|
||||
moe_comm_type = self.runner._select_moe_comm_method(input_batch_size)
|
||||
|
||||
attn_metadata.num_actual_tokens = batch_size
|
||||
attn_metadata.max_query_len = 1
|
||||
attn_metadata.query_start_loc = self.arange[:batch_size + 1]
|
||||
@@ -575,7 +569,6 @@ class EagleProposer(Proposer):
|
||||
# Run the model.
|
||||
with set_ascend_forward_context(attn_metadata,
|
||||
self.vllm_config,
|
||||
moe_comm_type=moe_comm_type,
|
||||
num_tokens=input_batch_size):
|
||||
|
||||
last_hidden_states, hidden_states = self.model(
|
||||
|
||||
Reference in New Issue
Block a user