[bugfix] fix mtp accept rate (#5093)
### What this PR does / why we need it? 1. npu_model_runner now reuses gpu_model_runner, so this PR deletes some attrs that are already defined in gpu_model_runner. 2. Fix the MTP accept rate by disabling in_profile_run. 3. Remove redundant MoE method selection logic. 4. Revert vllm-project/vllm-ascend#5082, which broke CI in https://github.com/vllm-project/vllm-ascend/actions/runs/20266314048/job/58190426832?pr=5088 ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com> Signed-off-by: Mengqing Cao <cmq0113@163.com> Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -26,10 +26,6 @@ enum NnopbaseHcclServerType {
|
|||||||
};
|
};
|
||||||
extern "C" void __attribute__((weak)) NnopbaseSetHcclServerType(void *executor, NnopbaseHcclServerType sType);
|
extern "C" void __attribute__((weak)) NnopbaseSetHcclServerType(void *executor, NnopbaseHcclServerType sType);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
extern aclnnStatus aclnnInnerMatmulAllreduceAddRmsnormGetWorkspaceSize(
|
extern aclnnStatus aclnnInnerMatmulAllreduceAddRmsnormGetWorkspaceSize(
|
||||||
const aclTensor *x1,
|
const aclTensor *x1,
|
||||||
const aclTensor *x2,
|
const aclTensor *x2,
|
||||||
@@ -52,6 +48,10 @@ extern aclnnStatus aclnnInnerMatmulAllreduceAddRmsnorm(
|
|||||||
aclOpExecutor *executor,
|
aclOpExecutor *executor,
|
||||||
aclrtStream stream);
|
aclrtStream stream);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
aclnnStatus aclnnMatmulAllreduceAddRmsnormGetWorkspaceSize(
|
aclnnStatus aclnnMatmulAllreduceAddRmsnormGetWorkspaceSize(
|
||||||
const aclTensor *x1,
|
const aclTensor *x1,
|
||||||
const aclTensor *x2,
|
const aclTensor *x2,
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ def set_ascend_forward_context(
|
|||||||
get_moe_comm_method
|
get_moe_comm_method
|
||||||
moe_comm_type = select_moe_comm_method(num_tokens, vllm_config)
|
moe_comm_type = select_moe_comm_method(num_tokens, vllm_config)
|
||||||
# TODO: remove this after moe_comm_type selection logic is finalized
|
# TODO: remove this after moe_comm_type selection logic is finalized
|
||||||
if in_profile_run and is_mtp_model:
|
if is_mtp_model:
|
||||||
moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
|
moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
|
||||||
== MoECommType.FUSED_ALLTOALL else moe_comm_type)
|
== MoECommType.FUSED_ALLTOALL else moe_comm_type)
|
||||||
forward_context.moe_comm_type = moe_comm_type
|
forward_context.moe_comm_type = moe_comm_type
|
||||||
@@ -298,8 +298,6 @@ def select_moe_comm_method(num_tokens: int,
|
|||||||
if fused_all2all_enable else MoECommType.ALLTOALL)
|
if fused_all2all_enable else MoECommType.ALLTOALL)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported soc_version: {soc_version}")
|
raise ValueError(f"Unsupported soc_version: {soc_version}")
|
||||||
moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
|
|
||||||
== MoECommType.FUSED_ALLTOALL else moe_comm_type)
|
|
||||||
# PanguProMoE only supports allgather
|
# PanguProMoE only supports allgather
|
||||||
if model_type == "PanguProMoE":
|
if model_type == "PanguProMoE":
|
||||||
moe_comm_type = MoECommType.ALLGATHER
|
moe_comm_type = MoECommType.ALLGATHER
|
||||||
|
|||||||
@@ -145,7 +145,6 @@ class EagleProposer(Proposer):
|
|||||||
dummy_compute_logits=lambda hidden_states: None):
|
dummy_compute_logits=lambda hidden_states: None):
|
||||||
with set_ascend_forward_context(None,
|
with set_ascend_forward_context(None,
|
||||||
self.vllm_config,
|
self.vllm_config,
|
||||||
in_profile_run=True,
|
|
||||||
num_tokens=num_tokens):
|
num_tokens=num_tokens):
|
||||||
self.model(
|
self.model(
|
||||||
input_ids=self.input_ids[:num_tokens],
|
input_ids=self.input_ids[:num_tokens],
|
||||||
|
|||||||
@@ -293,7 +293,6 @@ class MtpProposer(Proposer):
|
|||||||
self.vllm_config,
|
self.vllm_config,
|
||||||
num_tokens=num_tokens,
|
num_tokens=num_tokens,
|
||||||
with_prefill=with_prefill,
|
with_prefill=with_prefill,
|
||||||
in_profile_run=True,
|
|
||||||
num_tokens_across_dp=num_tokens_across_dp,
|
num_tokens_across_dp=num_tokens_across_dp,
|
||||||
num_actual_tokens=0,
|
num_actual_tokens=0,
|
||||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||||
|
|||||||
@@ -244,8 +244,6 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
self.need_accepted_tokens: bool = False
|
self.need_accepted_tokens: bool = False
|
||||||
|
|
||||||
self.is_multimodal_model = self.model_config.is_multimodal_model
|
self.is_multimodal_model = self.model_config.is_multimodal_model
|
||||||
self.is_pooling_model = self.model_config.pooler_config is not None
|
|
||||||
self.enable_prompt_embeds = self.model_config.enable_prompt_embeds
|
|
||||||
self.block_size = vllm_config.cache_config.block_size
|
self.block_size = vllm_config.cache_config.block_size
|
||||||
# Set up Attention
|
# Set up Attention
|
||||||
self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
|
self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
|
||||||
@@ -338,24 +336,6 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
ascend_config = get_ascend_config()
|
ascend_config = get_ascend_config()
|
||||||
self.eplb_updator = EplbUpdator(ascend_config, self.eplb_loader,
|
self.eplb_updator = EplbUpdator(ascend_config, self.eplb_loader,
|
||||||
self.eplb_process, self.process)
|
self.eplb_process, self.process)
|
||||||
|
|
||||||
self.use_async_scheduling = self.scheduler_config.async_scheduling
|
|
||||||
self.async_output_copy_stream = torch.npu.Stream() if \
|
|
||||||
self.use_async_scheduling else None
|
|
||||||
self.num_spec_tokens = 0
|
|
||||||
if self.speculative_config:
|
|
||||||
self.num_spec_tokens = self.speculative_config.num_speculative_tokens # noqa
|
|
||||||
self.valid_sampled_token_count_event: torch.npu.Event | None = None
|
|
||||||
self.valid_sampled_token_count_copy_stream: torch.npu.Stream | None = None
|
|
||||||
if self.use_async_scheduling and self.num_spec_tokens:
|
|
||||||
self.valid_sampled_token_count_event = torch.npu.Event()
|
|
||||||
self.valid_sampled_token_count_copy_stream = torch.npu.Stream()
|
|
||||||
self.valid_sampled_token_count_cpu = torch.empty(
|
|
||||||
self.max_num_reqs,
|
|
||||||
dtype=torch.int64,
|
|
||||||
device="cpu",
|
|
||||||
pin_memory=self.pin_memory,
|
|
||||||
)
|
|
||||||
# Input Batch
|
# Input Batch
|
||||||
# NOTE(Chen): Ideally, we should initialize the input batch inside
|
# NOTE(Chen): Ideally, we should initialize the input batch inside
|
||||||
# `initialize_kv_cache` based on the kv cache config. However, as in
|
# `initialize_kv_cache` based on the kv cache config. However, as in
|
||||||
@@ -386,23 +366,20 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
cp_kv_cache_interleave_size=self.parallel_config.
|
cp_kv_cache_interleave_size=self.parallel_config.
|
||||||
cp_kv_cache_interleave_size,
|
cp_kv_cache_interleave_size,
|
||||||
)
|
)
|
||||||
self.num_accepted_tokens = self._make_buffer(self.max_num_reqs,
|
|
||||||
dtype=torch.int64)
|
|
||||||
self.num_draft_tokens = self._make_buffer(self.max_num_reqs,
|
self.num_draft_tokens = self._make_buffer(self.max_num_reqs,
|
||||||
dtype=torch.int32)
|
dtype=torch.int32)
|
||||||
|
# here we use int32
|
||||||
self.sampled_token_ids_pinned_cpu = torch.empty(
|
self.sampled_token_ids_pinned_cpu = torch.empty(
|
||||||
(self.max_num_reqs, 1),
|
(self.max_num_reqs, 1),
|
||||||
dtype=torch.int32,
|
dtype=torch.int32,
|
||||||
device="cpu",
|
device="cpu",
|
||||||
pin_memory=self.pin_memory,
|
pin_memory=self.pin_memory,
|
||||||
)
|
)
|
||||||
# None in the first PP rank. The rest are set after load_model.
|
# for cleancode , actually the three attrs is defined in gpu_model_runner
|
||||||
# the attr below is in gpu_modelrunner, but occurs lint so add them here
|
|
||||||
self.intermediate_tensors: IntermediateTensors | None = None
|
|
||||||
self.execute_model_state: ExecuteModelState | None = None
|
self.execute_model_state: ExecuteModelState | None = None
|
||||||
|
# None in the first PP rank. The rest are set after load_model.
|
||||||
|
self.intermediate_tensors: IntermediateTensors | None = None
|
||||||
self.reorder_batch_threshold: int | None = None
|
self.reorder_batch_threshold: int | None = None
|
||||||
self.query_start_loc = self._make_buffer(self.max_num_reqs + 1,
|
|
||||||
dtype=torch.int32)
|
|
||||||
|
|
||||||
def _init_device_properties(self) -> None:
|
def _init_device_properties(self) -> None:
|
||||||
self.num_sms = None
|
self.num_sms = None
|
||||||
@@ -3395,6 +3372,7 @@ def _torch_cuda_wrapper():
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# replace cuda APIs with xpu APIs, this should work by default
|
# replace cuda APIs with xpu APIs, this should work by default
|
||||||
|
torch.Event = torch.npu.Event
|
||||||
torch.cuda.Event = torch.npu.Event
|
torch.cuda.Event = torch.npu.Event
|
||||||
torch.cuda.Stream = torch.npu.Stream
|
torch.cuda.Stream = torch.npu.Stream
|
||||||
torch.cuda.default_stream = torch.npu.default_stream
|
torch.cuda.default_stream = torch.npu.default_stream
|
||||||
|
|||||||
Reference in New Issue
Block a user