[bugfix] fix mtp accept rate (#5093)
### What this PR does / why we need it?

1. npu_model_runner now reuses gpu_model_runner, so this PR deletes some attributes that are already defined in gpu_model_runner.
2. Fix the MTP accept rate by disabling in_profile_run.
3. Remove redundant MoE method selection logic.
4. Revert vllm-project/vllm-ascend#5082, which broke CI in https://github.com/vllm-project/vllm-ascend/actions/runs/20266314048/job/58190426832?pr=5088

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Signed-off-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -26,10 +26,6 @@ enum NnopbaseHcclServerType {
|
||||
};
|
||||
extern "C" void __attribute__((weak)) NnopbaseSetHcclServerType(void *executor, NnopbaseHcclServerType sType);
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern aclnnStatus aclnnInnerMatmulAllreduceAddRmsnormGetWorkspaceSize(
|
||||
const aclTensor *x1,
|
||||
const aclTensor *x2,
|
||||
@@ -52,6 +48,10 @@ extern aclnnStatus aclnnInnerMatmulAllreduceAddRmsnorm(
|
||||
aclOpExecutor *executor,
|
||||
aclrtStream stream);
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
aclnnStatus aclnnMatmulAllreduceAddRmsnormGetWorkspaceSize(
|
||||
const aclTensor *x1,
|
||||
const aclTensor *x2,
|
||||
|
||||
Reference in New Issue
Block a user