Default enable MLAPO (#5952)
### What this PR does / why we need it?
1) Default enable MLAPO for DeepSeek MLA Attention W8A8 models on PD
disaggregation D Instance, for example: DeepSeekV3-W8A8,
DeepSeek-R1-W8A8.
2) Default enable MLAPO for DeepSeek SFA Attention W8A8 models,
currently is DeepSeek-V3.2-W8A8.
### Does this PR introduce _any_ user-facing change?
Users no longer need to manually set VLLM_ASCEND_ENABLE_MLAPO=1 to
enable the MLAPO feature for DeepSeek W8A8 models.
The effect of enabling MLAPO SFA model deployed on a single A3 Node:
Tested
with: tests/e2e/nightly/single_node/models/test_deepseek_v3_2_exp_w8a8.py
dataset: gsm8k-lite, without MTP set, FULL GRAPH; shows about a 19% improvement:
Without MLAPO enabled by default:
├─────────────────────────┤
│ TTFT │ 14055.8836 ms │
├─────────────────────────┤
│ ITL │ 66.8171 ms │
├─────────────────────────┤
│ Output Token Throughput │ 104.9105 token/s │
├─────────────────────────┤
With MLAPO enabled by default:
├─────────────────────────┤
│ TTFT │ 3753.1547 ms │
├─────────────────────────┤
│ ITL │ 61.4236 ms │
├─────────────────────────┤
│ Output Token Throughput │ 125.2075 token/s │
├─────────────────────────┤
- vLLM version: v0.13.0
- vLLM main:
2c24bc6996
---------
Signed-off-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
@@ -3,7 +3,6 @@ model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
|
||||
num_nodes: 2
|
||||
npu_per_node: 8
|
||||
env_common:
|
||||
VLLM_ASCEND_ENABLE_MLAPO: 1
|
||||
VLLM_ASCEND_BALANCE_SCHEDULING: 1
|
||||
HCCL_INTRA_PCIE_ENABLE: 1
|
||||
HCCL_INTRA_ROCE_ENABLE: 0
|
||||
|
||||
@@ -10,7 +10,6 @@ env_common:
|
||||
SERVER_PORT: 8080
|
||||
OMP_PROC_BIND: false
|
||||
OMP_NUM_THREADS: 1
|
||||
VLLM_ASCEND_ENABLE_MLAPO: 1
|
||||
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
|
||||
VLLM_ASCEND_ENABLE_FLASHCOMM1: 0
|
||||
ASCEND_A3_EBA_ENABLE: 1
|
||||
|
||||
@@ -31,7 +31,6 @@ MODELS = [
|
||||
MODES = [
|
||||
"single",
|
||||
"aclgraph",
|
||||
"aclgraph_mlapo",
|
||||
]
|
||||
|
||||
prompts = [
|
||||
@@ -88,8 +87,6 @@ async def test_models(model: str, mode: str) -> None:
|
||||
]
|
||||
if mode == "single":
|
||||
server_args.append("--enforce-eager")
|
||||
if mode == "aclgraph_mlapo":
|
||||
env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
|
||||
server_args.extend(["--additional-config", json.dumps(additional_config)])
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
|
||||
@@ -56,7 +56,6 @@ async def test_models(model: str) -> None:
|
||||
"OMP_NUM_THREADS": "100",
|
||||
"OMP_PROC_BIND": "false",
|
||||
"HCCL_BUFFSIZE": "200",
|
||||
"VLLM_ASCEND_ENABLE_MLAPO": "1",
|
||||
"VLLM_RPC_TIMEOUT": "3600000",
|
||||
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000",
|
||||
"DISABLE_L2_CACHE": "1",
|
||||
|
||||
Reference in New Issue
Block a user