diff --git a/docs/source/tutorials/DeepSeek-R1.md b/docs/source/tutorials/DeepSeek-R1.md
index 0d301e56..f0e16f96 100644
--- a/docs/source/tutorials/DeepSeek-R1.md
+++ b/docs/source/tutorials/DeepSeek-R1.md
@@ -94,7 +94,6 @@ export HCCL_IF_IP=$local_ip
 export GLOO_SOCKET_IFNAME=$nic_name
 export TP_SOCKET_IFNAME=$nic_name
 export HCCL_SOCKET_IFNAME=$nic_name
-export VLLM_ASCEND_ENABLE_MLAPO=1
 export VLLM_ASCEND_BALANCE_SCHEDULING=1
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
 export VLLM_USE_MODELSCOPE=True
@@ -121,7 +120,6 @@ vllm serve vllm-ascend/DeepSeek-R1-W8A8 \
 
 **Notice:** The parameters are explained as follows:
 
-- Setting the environment variable `VLLM_ASCEND_ENABLE_MLAPO=1` enables a fusion operator that can significantly improve performance, though it requires more NPU memory. It is therefore recommended to enable this option when sufficient NPU memory is available.
 - Setting the environment variable `VLLM_ASCEND_BALANCE_SCHEDULING=1` enables balance scheduling. This may help increase output throughput and reduce TPOT in v1 scheduler. However, TTFT may degrade in some scenarios. Furthermore, enabling this feature is not recommended in scenarios where PD is separated.
 - For single-node deployment, we recommend using `dp4tp4` instead of `dp2tp8`.
 - `--max-model-len` specifies the maximum context length - that is, the sum of input and output tokens for a single request. For performance testing with an input length of 3.5K and output length of 1.5K, a value of `16384` is sufficient, however, for precision testing, please set it at least `35000`.
@@ -151,7 +149,6 @@ export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=1
 export HCCL_BUFFSIZE=200
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export VLLM_ASCEND_ENABLE_MLAPO=1
 export VLLM_ASCEND_BALANCE_SCHEDULING=1
 export HCCL_INTRA_PCIE_ENABLE=1
 export HCCL_INTRA_ROCE_ENABLE=0
@@ -198,7 +195,6 @@ export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=1
 export HCCL_BUFFSIZE=200
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export VLLM_ASCEND_ENABLE_MLAPO=1
 export VLLM_ASCEND_BALANCE_SCHEDULING=1
 export HCCL_INTRA_PCIE_ENABLE=1
 export HCCL_INTRA_ROCE_ENABLE=0
diff --git a/docs/source/tutorials/DeepSeek-V3.1.md b/docs/source/tutorials/DeepSeek-V3.1.md
index 1201f1d8..37c82125 100644
--- a/docs/source/tutorials/DeepSeek-V3.1.md
+++ b/docs/source/tutorials/DeepSeek-V3.1.md
@@ -106,7 +106,6 @@ export HCCL_IF_IP=$local_ip
 export GLOO_SOCKET_IFNAME=$nic_name
 export TP_SOCKET_IFNAME=$nic_name
 export HCCL_SOCKET_IFNAME=$nic_name
-export VLLM_ASCEND_ENABLE_MLAPO=1
 export VLLM_ASCEND_BALANCE_SCHEDULING=1
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
 
@@ -133,7 +132,6 @@ vllm serve /weights/DeepSeek-V3.1-w8a8-mtp-QuaRot \
 
 **Notice:** The parameters are explained as follows:
 
-- Setting the environment variable `VLLM_ASCEND_ENABLE_MLAPO=1` enables a fusion operator that can significantly improve performance, though it requires more NPU memory. It is therefore recommended to enable this option when sufficient NPU memory is available.
 - Setting the environment variable `VLLM_ASCEND_BALANCE_SCHEDULING=1` enables balance scheduling. This may help increase output throughput and reduce TPOT in v1 scheduler. However, TTFT may degrade in some scenarios. Furthermore, enabling this feature is not recommended in scenarios where PD is separated.
 - For single-node deployment, we recommend using `dp4tp4` instead of `dp2tp8`.
 - `--max-model-len` specifies the maximum context length - that is, the sum of input and output tokens for a single request. For performance testing with an input length of 3.5K and output length of 1.5K, a value of `16384` is sufficient, however, for precision testing, please set it at least `35000`.
@@ -171,7 +169,6 @@ export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=1
 export HCCL_BUFFSIZE=200
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export VLLM_ASCEND_ENABLE_MLAPO=1
 export VLLM_ASCEND_BALANCE_SCHEDULING=1
 export HCCL_INTRA_PCIE_ENABLE=1
 export HCCL_INTRA_ROCE_ENABLE=0
@@ -224,7 +221,6 @@ export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=1
 export HCCL_BUFFSIZE=200
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export VLLM_ASCEND_ENABLE_MLAPO=1
 export VLLM_ASCEND_BALANCE_SCHEDULING=1
 export HCCL_INTRA_PCIE_ENABLE=1
 export HCCL_INTRA_ROCE_ENABLE=0
@@ -449,7 +445,6 @@ export HCCL_CONNECT_TIMEOUT=120
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export VLLM_ASCEND_ENABLE_MLAPO=1
 export HCCL_BUFFSIZE=1100
 export TASK_QUEUE_ENABLE=1
 export HCCL_OP_EXPANSION_MODE="AIV"
@@ -526,7 +521,6 @@ export HCCL_CONNECT_TIMEOUT=120
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export VLLM_ASCEND_ENABLE_MLAPO=1
 export HCCL_BUFFSIZE=1100
 export TASK_QUEUE_ENABLE=1
 export HCCL_OP_EXPANSION_MODE="AIV"
diff --git a/docs/source/tutorials/DeepSeek-V3.2.md b/docs/source/tutorials/DeepSeek-V3.2.md
index 9b8e2cdf..a0ac1696 100644
--- a/docs/source/tutorials/DeepSeek-V3.2.md
+++ b/docs/source/tutorials/DeepSeek-V3.2.md
@@ -441,8 +441,6 @@ Before you start, please
 
 export ASCEND_RT_VISIBLE_DEVICES=$1
 
-export VLLM_ASCEND_ENABLE_MLAPO=1
-
 vllm serve /root/.cache/Eco-Tech/DeepSeek-V3.2-w8a8-mtp-QuaRot \
     --host 0.0.0.0 \
@@ -520,8 +518,6 @@ Before you start, please
 
 export ASCEND_RT_VISIBLE_DEVICES=$1
 
-export VLLM_ASCEND_ENABLE_MLAPO=1
-
 vllm serve /root/.cache/Eco-Tech/DeepSeek-V3.2-w8a8-mtp-QuaRot \
     --host 0.0.0.0 \
diff --git a/docs/source/tutorials/long_sequence_context_parallel_multi_node.md b/docs/source/tutorials/long_sequence_context_parallel_multi_node.md
index 081ceb0e..9754fe26 100644
--- a/docs/source/tutorials/long_sequence_context_parallel_multi_node.md
+++ b/docs/source/tutorials/long_sequence_context_parallel_multi_node.md
@@ -224,7 +224,6 @@ export OMP_NUM_THREADS=1
 export HCCL_OP_EXPANSION_MODE="AIV"
 export VLLM_USE_V1=1
 export TASK_QUEUE_ENABLE=1
-export VLLM_ASCEND_ENABLE_MLAPO="1"
 export VLLM_ASCEND_ENABLE_CONTEXT_PARALLEL=1
 
 vllm serve /path_to_weight/DeepSeek-V3.1_w8a8mix_mtp \
diff --git a/docs/source/tutorials/pd_disaggregation_mooncake_multi_node.md b/docs/source/tutorials/pd_disaggregation_mooncake_multi_node.md
index fd51b804..c9472c49 100644
--- a/docs/source/tutorials/pd_disaggregation_mooncake_multi_node.md
+++ b/docs/source/tutorials/pd_disaggregation_mooncake_multi_node.md
@@ -370,7 +369,6 @@ export HCCL_SOCKET_IFNAME=$nic_name
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export VLLM_ASCEND_ENABLE_MLAPO=1
 export HCCL_BUFFSIZE=600
 export TASK_QUEUE_ENABLE=1
 export HCCL_OP_EXPANSION_MODE="AIV"
@@ -429,7 +428,6 @@ export HCCL_SOCKET_IFNAME=$nic_name
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export VLLM_ASCEND_ENABLE_MLAPO=1
 export HCCL_BUFFSIZE=600
 export TASK_QUEUE_ENABLE=1
 export HCCL_OP_EXPANSION_MODE="AIV"
@@ -615,7 +613,6 @@ export HCCL_SOCKET_IFNAME=$nic_name
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export VLLM_ASCEND_ENABLE_MLAPO=1
 export HCCL_BUFFSIZE=600
 export TASK_QUEUE_ENABLE=1
 export HCCL_OP_EXPANSION_MODE="AIV"
@@ -674,7 +671,6 @@ export HCCL_SOCKET_IFNAME=$nic_name
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export VLLM_ASCEND_ENABLE_MLAPO=1
 export HCCL_BUFFSIZE=600
 export TASK_QUEUE_ENABLE=1
 export HCCL_OP_EXPANSION_MODE="AIV"
diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-A2.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-A2.yaml
index 262ee15b..f4296009 100644
--- a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-A2.yaml
+++ b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-A2.yaml
@@ -3,7 +3,6 @@ model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
 num_nodes: 2
 npu_per_node: 8
 env_common:
-  VLLM_ASCEND_ENABLE_MLAPO: 1
   VLLM_ASCEND_BALANCE_SCHEDULING: 1
   HCCL_INTRA_PCIE_ENABLE: 1
   HCCL_INTRA_ROCE_ENABLE: 0
diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-A3-dual-nodes.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-A3-dual-nodes.yaml
index 985b3ea4..0bb313be 100644
--- a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-A3-dual-nodes.yaml
+++ b/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-A3-dual-nodes.yaml
@@ -10,7 +10,6 @@ env_common:
   SERVER_PORT: 8080
   OMP_PROC_BIND: false
   OMP_NUM_THREADS: 1
-  VLLM_ASCEND_ENABLE_MLAPO: 1
   PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
   VLLM_ASCEND_ENABLE_FLASHCOMM1: 0
   ASCEND_A3_EBA_ENABLE: 1
diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py
index 0f1e4ff1..b71d8854 100644
--- a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py
@@ -31,7 +31,6 @@ MODELS = [
 MODES = [
     "single",
     "aclgraph",
-    "aclgraph_mlapo",
 ]
 
 prompts = [
@@ -88,8 +87,6 @@ async def test_models(model: str, mode: str) -> None:
     ]
     if mode == "single":
         server_args.append("--enforce-eager")
-    if mode == "aclgraph_mlapo":
-        env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
     server_args.extend(["--additional-config", json.dumps(additional_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py
index c26c8ec2..08db9a15 100644
--- a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py
+++ b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py
@@ -56,7 +56,6 @@ async def test_models(model: str) -> None:
         "OMP_NUM_THREADS": "100",
         "OMP_PROC_BIND": "false",
         "HCCL_BUFFSIZE": "200",
-        "VLLM_ASCEND_ENABLE_MLAPO": "1",
         "VLLM_RPC_TIMEOUT": "3600000",
         "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000",
         "DISABLE_L2_CACHE": "1",
diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index e76c64a7..f695be26 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -22,7 +22,8 @@ from vllm_ascend.attention.context_parallel.common_cp import (
 from vllm_ascend.attention.utils import (
     AscendCommonAttentionMetadata, ascend_chunked_prefill_workspace_size,
     enable_cp, maybe_save_kv_layer_to_connector, split_decodes_and_prefills,
-    trans_rope_weight, transdata, wait_for_kv_layer_from_connector)
+    trans_rope_weight, transdata, wait_for_kv_layer_from_connector,
+    enabling_mlapo)
 from vllm_ascend.compilation.acl_graph import (
     get_draft_graph_params, get_graph_params,
     update_draft_graph_params_workspaces, update_graph_params_workspaces)
@@ -741,7 +742,7 @@ class AscendMLAImpl(MLAAttentionImpl):
         self.ring_mla_mask_size = 512
         self.speculative_config = self.vllm_config.speculative_config
 
-        self.enable_mlapo = envs.VLLM_ASCEND_ENABLE_MLAPO
+        self.enable_mlapo = enabling_mlapo(self.vllm_config)
         self.is_kv_producer = self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer
 
         self.layer_sharding_kwargs = []
@@ -1491,7 +1492,6 @@ class AscendMLAImpl(MLAAttentionImpl):
 
         # MLA Preprocess
         if self.enable_mlapo and \
-                not has_prefill and \
                 attn_metadata.num_decode_tokens <= MLAPO_MAX_SUPPORTED_TOKENS:
             hidden_states = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
                 hidden_states.contiguous(), need_gather_q_kv)
diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py
index f362e62c..5bc2641a 100644
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -375,6 +375,9 @@ class AscendSFAImpl(MLAAttentionImpl):
         ascend_config = get_ascend_config()
         self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
         self.enable_prefetch = ascend_config.weight_prefetch_config.enabled
+
+        # In SFA, prefill and decode share the same computation formula,
+        # so there is no need to distinguish between them here.
         self.enable_mlapo = envs.VLLM_ASCEND_ENABLE_MLAPO
 
         assert self.indexer is not None, "Indexer is required for DSA."
diff --git a/vllm_ascend/attention/utils.py b/vllm_ascend/attention/utils.py
index 79d2300c..8414fc5d 100644
--- a/vllm_ascend/attention/utils.py
+++ b/vllm_ascend/attention/utils.py
@@ -9,6 +9,7 @@ from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 
+from vllm_ascend import envs
 from vllm_ascend.utils import AscendDeviceType, get_ascend_config, get_ascend_device_type
 
 
@@ -302,3 +303,9 @@ def transdata(nd_mat, block_size: tuple = (16, 16)):
     )
     nz_mat = torch.reshape(nz_mat, (nz_mat.shape[0], nz_mat.shape[1] * nz_mat.shape[2], nz_mat.shape[3]))
     return nz_mat
+
+
+def enabling_mlapo(vllm_config: VllmConfig) -> bool:
+    """Return True when MLAPO should be used: the env switch is on and this is a decode (KV-consumer) instance."""
+    is_decode_instance = vllm_config.kv_transfer_config is not None and vllm_config.kv_transfer_config.is_kv_consumer
+    return bool(envs.VLLM_ASCEND_ENABLE_MLAPO and is_decode_instance)
diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 94645b18..8298a5f5 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -92,7 +92,11 @@ env_variables: dict[str, Callable[[], Any]] = {
     ),
     # Whether to enable msMonitor tool to monitor the performance of vllm-ascend.
     "MSMONITOR_USE_DAEMON": lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", "0"))),
-    "VLLM_ASCEND_ENABLE_MLAPO": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLAPO", "0"))),
+    # Whether to enable the MLAPO optimization for DeepSeek W8A8 series models.
+    # This option is enabled by default. MLAPO can improve performance, but it
+    # consumes more NPU memory. If reducing NPU memory usage is a higher priority
+    # in your DeepSeek W8A8 scenario, disable it.
+    "VLLM_ASCEND_ENABLE_MLAPO": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLAPO", "1"))),
     # Whether to enable weight cast format to FRACTAL_NZ.
     # 0: close nz;
     # 1: only quant case enable nz;