【Feature】refactor npu_modelrunner for profile_run (#4993)
### What this PR does / why we need it?
(1) Refactor npu_model_runner for profile_run
(2) move _select_moe_comm_method to ascend_forward_context
(3) delete _init_model_kwargs in npu_model_runner
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
N/A
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Signed-off-by: zhenwenqi2024 <155598497+zhenwenqi2024@users.noreply.github.com>
This commit is contained in:
@@ -123,10 +123,9 @@ class EagleProposer(Proposer):
|
||||
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
||||
batch_descriptor=None,
|
||||
dummy_compute_logits=lambda hidden_states: None):
|
||||
moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
|
||||
with set_ascend_forward_context(None,
|
||||
self.vllm_config,
|
||||
moe_comm_type=moe_comm_type,
|
||||
in_profile_run=True,
|
||||
num_tokens=num_tokens):
|
||||
self.model(
|
||||
input_ids=self.input_ids[:num_tokens],
|
||||
@@ -458,15 +457,12 @@ class EagleProposer(Proposer):
|
||||
else:
|
||||
num_input_tokens = num_tokens
|
||||
|
||||
moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens)
|
||||
|
||||
# copy inputs to buffer for cudagraph
|
||||
self.positions[:num_tokens] = target_positions.to(device)
|
||||
self.hidden_states[:num_tokens] = target_hidden_states
|
||||
attn_metadata.block_tables = block_table.to(device)
|
||||
with set_ascend_forward_context(attn_metadata,
|
||||
self.vllm_config,
|
||||
moe_comm_type=moe_comm_type,
|
||||
num_tokens=num_input_tokens):
|
||||
last_hidden_states, hidden_states = self.model(
|
||||
input_ids=self.input_ids[:num_input_tokens],
|
||||
@@ -498,8 +494,6 @@ class EagleProposer(Proposer):
|
||||
else:
|
||||
input_batch_size = batch_size
|
||||
|
||||
moe_comm_type = self.runner._select_moe_comm_method(input_batch_size)
|
||||
|
||||
attn_metadata.num_actual_tokens = batch_size
|
||||
attn_metadata.max_query_len = 1
|
||||
attn_metadata.query_start_loc = self.arange[:batch_size + 1]
|
||||
@@ -575,7 +569,6 @@ class EagleProposer(Proposer):
|
||||
# Run the model.
|
||||
with set_ascend_forward_context(attn_metadata,
|
||||
self.vllm_config,
|
||||
moe_comm_type=moe_comm_type,
|
||||
num_tokens=input_batch_size):
|
||||
|
||||
last_hidden_states, hidden_states = self.model(
|
||||
|
||||
Reference in New Issue
Block a user