【Feature】Refactor npu_model_runner for profile_run (#4993)
### What this PR does / why we need it?
(1) refactor npu_model_runner for profile_run
(2) move _select_moe_comm_method to ascend_forward_context
(3) delete _init_model_kwargs in npu_model_runner
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
N/A
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Signed-off-by: zhenwenqi2024 <155598497+zhenwenqi2024@users.noreply.github.com>
This commit is contained in:
@@ -28,8 +28,7 @@ from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
||||
from vllm.v1.utils import CpuGpuBuffer
|
||||
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
|
||||
|
||||
from vllm_ascend.ascend_forward_context import (MoECommType,
|
||||
set_ascend_forward_context)
|
||||
from vllm_ascend.ascend_forward_context import set_ascend_forward_context
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
|
||||
from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
|
||||
@@ -242,11 +241,6 @@ class MtpProposer(Proposer):
|
||||
# NOTE: we need to set aclgraph_runtime_mode to None in both dummy_run
|
||||
# and _propose.
|
||||
aclgraph_runtime_mode = CUDAGraphMode.NONE
|
||||
moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
|
||||
# TODO: remove this after moe_comm_type selection logic is finalized
|
||||
moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
|
||||
== MoECommType.FUSED_ALLTOALL else moe_comm_type)
|
||||
|
||||
if aclgraph_runtime_mode == CUDAGraphMode.FULL:
|
||||
if len(self.runner.attn_groups) > 0:
|
||||
num_computed_tokens_cpu = (
|
||||
@@ -299,9 +293,8 @@ class MtpProposer(Proposer):
|
||||
self.vllm_config,
|
||||
num_tokens=num_tokens,
|
||||
with_prefill=with_prefill,
|
||||
in_profile_run=True,
|
||||
num_tokens_across_dp=num_tokens_across_dp,
|
||||
moe_comm_type=moe_comm_type,
|
||||
in_profile_run=self.runner.in_profile_run,
|
||||
num_actual_tokens=0,
|
||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||
batch_descriptor=batch_descriptor,
|
||||
@@ -720,11 +713,6 @@ class MtpProposer(Proposer):
|
||||
with_prefill) = self.runner._sync_metadata_across_dp(
|
||||
num_input_tokens, self.runner.with_prefill)
|
||||
|
||||
moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens)
|
||||
# TODO: remove this after moe_comm_type selection logic is finalized
|
||||
moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
|
||||
== MoECommType.FUSED_ALLTOALL else moe_comm_type)
|
||||
|
||||
# Enable shared_expert_dp and MTP FULL graph may cause accuracy issues.
|
||||
if scheduler_output and not self.enable_shared_expert_dp:
|
||||
max_query_len = common_attn_metadata.max_query_len
|
||||
@@ -771,7 +759,6 @@ class MtpProposer(Proposer):
|
||||
num_tokens=num_input_tokens,
|
||||
with_prefill=with_prefill,
|
||||
num_tokens_across_dp=num_tokens_across_dp,
|
||||
moe_comm_type=moe_comm_type,
|
||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||
batch_descriptor=batch_descriptor,
|
||||
in_profile_run=self.runner.in_profile_run,
|
||||
|
||||
Reference in New Issue
Block a user