Remove VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE (#5272)
`VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE` is only ever used together with
`VLLM_ASCEND_ENABLE_PREFETCH_MLP`, which makes it entirely redundant. This PR
removes it.
- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
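For context, a minimal standalone sketch (not vllm-ascend code, though the env var names are the real ones) of why the flag was dead weight: the old gate required both flags, so the only behavioral difference after removal is for users who set the prefetch flag without the dense-optimize flag:

import os

# Standalone illustration only; reads the same env vars the diff below touches.
dense = bool(int(os.getenv("VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE", "0")))
prefetch = bool(int(os.getenv("VLLM_ASCEND_ENABLE_PREFETCH_MLP", "0")))

old_gate = dense and prefetch   # gate before this PR
new_gate = prefetch             # gate after this PR

# The two gates differ only when prefetch is requested without dense optimize.
assert (old_gate == new_gate) or (prefetch and not dense)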
@@ -102,8 +102,7 @@ def set_ascend_forward_context(
     # TODO(rjg-lyh): refactor mlp weight prefetch method
     # set for mlp weight prefetch
-    prefetch_mlp_enabled = envs_ascend.VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE and \
-        envs_ascend.VLLM_ASCEND_ENABLE_PREFETCH_MLP and \
+    prefetch_mlp_enabled = envs_ascend.VLLM_ASCEND_ENABLE_PREFETCH_MLP and \
         forward_context.layer_idx is not None and \
         num_tokens is not None and num_tokens < 500
     if prefetch_mlp_enabled:
 
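The simplified condition can be read in isolation; below is a hedged sketch with `_Ctx` as a hypothetical stand-in for the real forward context, showing the runtime guards that remain once the flag is gone:

from dataclasses import dataclass
from typing import Optional

@dataclass
class _Ctx:
    # Hypothetical stand-in for forward_context; only layer_idx matters here.
    layer_idx: Optional[int] = None

def should_prefetch(prefetch_mlp: bool, ctx: _Ctx,
                    num_tokens: Optional[int]) -> bool:
    # Mirrors the post-PR gate: one flag plus runtime guards; the < 500
    # ceiling restricts prefetch to small (decode-sized) token batches.
    return (prefetch_mlp
            and ctx.layer_idx is not None
            and num_tokens is not None and num_tokens < 500)

assert should_prefetch(True, _Ctx(layer_idx=0), num_tokens=32)
assert not should_prefetch(True, _Ctx(), num_tokens=32)              # no layer index
assert not should_prefetch(True, _Ctx(layer_idx=0), num_tokens=512)  # batch too large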
@@ -108,11 +108,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
     "VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE":
     lambda: int(
         os.getenv("VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE", 18 * 1024 * 1024)),
-    # Whether to enable dense model and general optimizations for better performance.
-    # Since we modified the base parent class `linear`, this optimization is also applicable to other model types.
-    # However, there might be hidden issues, and it is currently recommended to prioritize its use with dense models.
-    "VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE":
-    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE", '0'))),
     # Whether to enable msMonitor tool to monitor the performance of vllm-ascend.
     "MSMONITOR_USE_DAEMON":
     lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", '0'))),
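The registry above follows the lazy-lambda pattern of vLLM's envs module; a minimal sketch of that pattern (illustrative contents, assuming a module-level `__getattr__` in the style of that module):

import os
from typing import Any, Callable, Dict

env_variables: Dict[str, Callable[[], Any]] = {
    "MSMONITOR_USE_DAEMON":
    lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", '0'))),
}

def __getattr__(name: str) -> Any:
    # Runs on each attribute access at module scope (PEP 562), so the env
    # var is re-read at lookup time rather than frozen at import time.
    if name in env_variables:
        return env_variables[name]()
    raise AttributeError(f"module has no attribute {name!r}")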
@@ -53,13 +53,13 @@ from vllm.distributed import (split_tensor_along_last_dim,
 from vllm.distributed.parallel_state import get_tp_group
 from vllm.forward_context import get_forward_context
 
+from vllm_ascend import envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.distributed.parallel_state import (get_flashcomm2_odp_group,
                                                     get_flashcomm2_otp_group,
                                                     get_mlp_tp_group,
                                                     get_otp_group)
-from vllm_ascend.utils import (dense_optim_enable, enable_sp,
-                               flashcomm2_enable,
+from vllm_ascend.utils import (enable_sp, flashcomm2_enable,
                                get_flashcomm2_reorgnized_batch_ids,
                                matmul_allreduce_enable, mlp_tp_enable,
                                oproj_tp_enable, shared_expert_dp_enabled)
@@ -135,7 +135,7 @@ class CustomRowParallelOp(CustomLinearOp):
 
     def apply(self, input_):
        output, output_bias = self.apply_impl(input_)
-        if dense_optim_enable():
+        if envs_ascend.VLLM_ASCEND_ENABLE_PREFETCH_MLP:
             torch.ops.vllm.maybe_prefetch_mlp_gate_up_proj(output, self.prefix)
         if not self.return_bias:
             return output
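After the change, the prefetch hook in `CustomRowParallelOp.apply` is gated by a single flag; a hedged sketch of the pattern, with `maybe_prefetch` as a hypothetical placeholder for `torch.ops.vllm.maybe_prefetch_mlp_gate_up_proj`:

import os
import torch

PREFETCH_MLP = bool(int(os.getenv("VLLM_ASCEND_ENABLE_PREFETCH_MLP", "0")))

def maybe_prefetch(output: torch.Tensor, prefix: str) -> None:
    # Placeholder: the real custom op presumably kicks off an MLP
    # gate/up-projection weight prefetch keyed by the layer prefix.
    pass

def apply_tail(output: torch.Tensor, prefix: str) -> torch.Tensor:
    if PREFETCH_MLP:  # single flag replaces the removed dense_optim_enable()
        maybe_prefetch(output, prefix)
    return output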
@@ -772,10 +772,6 @@ def matmul_allreduce_enable() -> bool:
     return envs_ascend.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE
 
 
-def dense_optim_enable() -> bool:
-    return envs_ascend.VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE
-
-
 def enable_sp(vllm_config=None, enable_shared_expert_dp: bool = False) -> bool:
     global _ENABLE_SP
     if _ENABLE_SP is None:
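The removed `dense_optim_enable()` was a thin wrapper over the env flag, so callers can read the flag directly. Its neighbor `enable_sp` instead memoizes into a module global; that compute-once pattern, sketched with a placeholder body (not the real derivation):

from typing import Optional

_ENABLE_SP: Optional[bool] = None

def enable_sp_sketch(configured: bool = False) -> bool:
    # Compute-once pattern as in enable_sp() above: the module-level cache
    # avoids re-deriving the flag on every call.
    global _ENABLE_SP
    if _ENABLE_SP is None:
        _ENABLE_SP = configured
    return _ENABLE_SP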