From 2ae0bad96dd51d4b6de68af2c0dcf7afe747fc39 Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Thu, 25 Dec 2025 11:09:56 +0800
Subject: [PATCH] Remove VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE (#5272)

`VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE` is only ever used together with
`VLLM_ASCEND_ENABLE_PREFETCH_MLP`, so keeping it as a separate flag serves no
purpose. This PR removes it.

- vLLM version: release/v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

Signed-off-by: wangxiyuan
---
 docs/source/tutorials/Qwen3-Dense.md                       | 6 +++---
 tests/e2e/multicard/test_offline_inference_distributed.py  | 2 --
 .../features/test_qwen3_32b_int8_a3_feature_stack3.py      | 1 -
 tests/e2e/nightly/models/test_qwen3_32b_int8.py            | 1 -
 vllm_ascend/ascend_forward_context.py                      | 3 +--
 vllm_ascend/envs.py                                        | 5 -----
 vllm_ascend/ops/linear_op.py                               | 6 +++---
 vllm_ascend/utils.py                                       | 4 ----
 8 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/docs/source/tutorials/Qwen3-Dense.md b/docs/source/tutorials/Qwen3-Dense.md
index b49053d3..6b5ced14 100644
--- a/docs/source/tutorials/Qwen3-Dense.md
+++ b/docs/source/tutorials/Qwen3-Dense.md
@@ -165,8 +165,8 @@ export TASK_QUEUE_ENABLE=1
 # Enable the AIVector core to directly schedule ROCE communication
 export HCCL_OP_EXPANSION_MODE="AIV"
 
-# Enable dense model and general optimizations for better performance.
-export VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE=1
+# Enable MLP prefetch for better performance.
+export VLLM_ASCEND_ENABLE_PREFETCH_MLP=1
 
 # Enable FlashComm_v1 optimization when tensor parallel is enabled.
 export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
@@ -334,7 +334,7 @@ In dense model scenarios, the MLP's gate_up_proj and down_proj linear layers oft
 It is important to emphasize that, since we use vector computations to hide the weight prefetching pipeline, the setting of the prefetch buffer size is crucial. If the buffer size is too small, the optimization benefits will not be fully realized, while a larger buffer size may lead to resource contention, resulting in performance degradation. To accommodate different scenarios, we have exposed two environment variables `VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE` and `VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE` to allow for flexible buffer size configuration based on the specific workload.
 
-This optimization requires setting the environment variable `VLLM_ASCEND_ENABLE_PREFETCH_MLP = 1` and `VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE = 1` to be enabled.
+This optimization is enabled by setting the environment variable `VLLM_ASCEND_ENABLE_PREFETCH_MLP=1`.
 
 ### 6. Zerolike Elimination
 
 This elimination removes unnecessary operations related to zero-like tensors in Attention forward, improving the efficiency of matrix operations and reducing memory usage.
diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py
index e5f4b2c2..34b5b3d4 100644
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -178,7 +178,6 @@ def test_deepseek_v2_lite_fc1_tp2() -> None:
 
 
 @pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
 def test_qwen3_dense_fc1_tp2(model):
     example_prompts = [
@@ -197,7 +196,6 @@ def test_qwen3_dense_fc1_tp2(model):
 
 
 @pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
 def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
     example_prompts = [
diff --git a/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py b/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
index 9fa2d1e5..add4c960 100644
--- a/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
+++ b/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
@@ -72,7 +72,6 @@ async def test_models(model: str, tp_size: int) -> None:
         "OMP_PROC_BIND": "false",
         "VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1",
         "VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
-        "VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1",
         "VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"
     }
     server_args = [
diff --git a/tests/e2e/nightly/models/test_qwen3_32b_int8.py b/tests/e2e/nightly/models/test_qwen3_32b_int8.py
index 0b047cc3..9005e732 100644
--- a/tests/e2e/nightly/models/test_qwen3_32b_int8.py
+++ b/tests/e2e/nightly/models/test_qwen3_32b_int8.py
@@ -81,7 +81,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
     port = get_open_port()
     env_dict = {
         "TASK_QUEUE_ENABLE": "1",
-        "VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1",
         "HCCL_OP_EXPANSION_MODE": "AIV",
         "VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
         "VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"
diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py
index ada22242..c28caa09 100644
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -102,8 +102,7 @@ def set_ascend_forward_context(
 
     # TODO(rjg-lyh): refactor mlp weight prefetch method
     # set for mlp weight prefetch
-    prefetch_mlp_enabled = envs_ascend.VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE and \
-        envs_ascend.VLLM_ASCEND_ENABLE_PREFETCH_MLP and \
+    prefetch_mlp_enabled = envs_ascend.VLLM_ASCEND_ENABLE_PREFETCH_MLP and \
         forward_context.layer_idx is not None and \
         num_tokens is not None and num_tokens < 500
     if prefetch_mlp_enabled:
diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 8d8af052..4bd3987b 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -108,11 +108,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
     "VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE":
     lambda: int(
         os.getenv("VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE", 18 * 1024 * 1024)),
-    # Whether to enable dense model and general optimizations for better performance.
-    # Since we modified the base parent class `linear`, this optimization is also applicable to other model types.
-    # However, there might be hidden issues, and it is currently recommended to prioritize its use with dense models.
- "VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": - lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE", '0'))), # Whether to enable msMonitor tool to monitor the performance of vllm-ascend. "MSMONITOR_USE_DAEMON": lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", '0'))), diff --git a/vllm_ascend/ops/linear_op.py b/vllm_ascend/ops/linear_op.py index 980fd2a2..674dab54 100644 --- a/vllm_ascend/ops/linear_op.py +++ b/vllm_ascend/ops/linear_op.py @@ -53,13 +53,13 @@ from vllm.distributed import (split_tensor_along_last_dim, from vllm.distributed.parallel_state import get_tp_group from vllm.forward_context import get_forward_context +from vllm_ascend import envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import (get_flashcomm2_odp_group, get_flashcomm2_otp_group, get_mlp_tp_group, get_otp_group) -from vllm_ascend.utils import (dense_optim_enable, enable_sp, - flashcomm2_enable, +from vllm_ascend.utils import (enable_sp, flashcomm2_enable, get_flashcomm2_reorgnized_batch_ids, matmul_allreduce_enable, mlp_tp_enable, oproj_tp_enable, shared_expert_dp_enabled) @@ -135,7 +135,7 @@ class CustomRowParallelOp(CustomLinearOp): def apply(self, input_): output, output_bias = self.apply_impl(input_) - if dense_optim_enable(): + if envs_ascend.VLLM_ASCEND_ENABLE_PREFETCH_MLP: torch.ops.vllm.maybe_prefetch_mlp_gate_up_proj(output, self.prefix) if not self.return_bias: return output diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index c8bf448c..8ab33b49 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -772,10 +772,6 @@ def matmul_allreduce_enable() -> bool: return envs_ascend.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE -def dense_optim_enable() -> bool: - return envs_ascend.VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE - - def enable_sp(vllm_config=None, enable_shared_expert_dp: bool = False) -> bool: global _ENABLE_SP if _ENABLE_SP is None: