Remove VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE (#5272)

`VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE` is only used together with
`VLLM_ASCEND_ENABLE_PREFETCH_MLP` which is useless totally. This PR
remove it.
- vLLM version: release/v0.13.0
- vLLM main:
ad32e3e19c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-12-25 11:09:56 +08:00
committed by GitHub
parent 13cd6362c6
commit 2ae0bad96d
8 changed files with 7 additions and 21 deletions

View File

@@ -102,8 +102,7 @@ def set_ascend_forward_context(
# TODO(rjg-lyh): refactor mlp weight prefetch method
# set for mlp weight prefetch
prefetch_mlp_enabled = envs_ascend.VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE and \
envs_ascend.VLLM_ASCEND_ENABLE_PREFETCH_MLP and \
prefetch_mlp_enabled = envs_ascend.VLLM_ASCEND_ENABLE_PREFETCH_MLP and \
forward_context.layer_idx is not None and \
num_tokens is not None and num_tokens < 500
if prefetch_mlp_enabled:

View File

@@ -108,11 +108,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
"VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE":
lambda: int(
os.getenv("VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE", 18 * 1024 * 1024)),
# Whether to enable dense model and general optimizations for better performance.
# Since we modified the base parent class `linear`, this optimization is also applicable to other model types.
# However, there might be hidden issues, and it is currently recommended to prioritize its use with dense models.
"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE":
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE", '0'))),
# Whether to enable msMonitor tool to monitor the performance of vllm-ascend.
"MSMONITOR_USE_DAEMON":
lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", '0'))),

View File

@@ -53,13 +53,13 @@ from vllm.distributed import (split_tensor_along_last_dim,
from vllm.distributed.parallel_state import get_tp_group
from vllm.forward_context import get_forward_context
from vllm_ascend import envs as envs_ascend
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.parallel_state import (get_flashcomm2_odp_group,
get_flashcomm2_otp_group,
get_mlp_tp_group,
get_otp_group)
from vllm_ascend.utils import (dense_optim_enable, enable_sp,
flashcomm2_enable,
from vllm_ascend.utils import (enable_sp, flashcomm2_enable,
get_flashcomm2_reorgnized_batch_ids,
matmul_allreduce_enable, mlp_tp_enable,
oproj_tp_enable, shared_expert_dp_enabled)
@@ -135,7 +135,7 @@ class CustomRowParallelOp(CustomLinearOp):
def apply(self, input_):
output, output_bias = self.apply_impl(input_)
if dense_optim_enable():
if envs_ascend.VLLM_ASCEND_ENABLE_PREFETCH_MLP:
torch.ops.vllm.maybe_prefetch_mlp_gate_up_proj(output, self.prefix)
if not self.return_bias:
return output

View File

@@ -772,10 +772,6 @@ def matmul_allreduce_enable() -> bool:
return envs_ascend.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE
def dense_optim_enable() -> bool:
return envs_ascend.VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE
def enable_sp(vllm_config=None, enable_shared_expert_dp: bool = False) -> bool:
global _ENABLE_SP
if _ENABLE_SP is None: