[CI] Add DeepSeek-V3.2-W8A8 nightly CI test (#4633)

### What this PR does / why we need it? Add a DeepSeek-V3.2-W8A8 nightly CI test: DeepSeek-V3.2-W8A8, 1 node, DP2+TP8: tests/e2e/nightly/models/test_deepseek_v3_2_w8a8.py ### Does this PR introduce _any_ user-facing change? - vLLM version: v0.12.0 - vLLM main: ad32e3e19c Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2026-01-20 21:05:15 +08:00
parent cea48c2a34
commit 750c06c78a
6 changed files with 30 additions and 93 deletions
--- a/tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hbm.py
+++ b/tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hbm.py
--- a/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_exp_w8a8.py
+++ b/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_exp_w8a8.py
@@ -23,13 +23,10 @@ from vllm.utils.network_utils import get_open_port
 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases

-MODELS = [
-    "vllm-ascend/DeepSeek-V3.2-Exp-W8A8",
-]
+MODELS = ["vllm-ascend/DeepSeek-V3.2-W8A8"]

 TENSOR_PARALLELS = [8]
 DATA_PARALLELS = [2]
-FULL_GRAPH = [True, False]

 prompts = [
    "San Francisco is a",
@@ -53,11 +50,11 @@ aisbench_cases = [{
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
-    "num_prompts": 16,
+    "num_prompts": 100,
    "max_out_len": 1500,
-    "batch_size": 8,
-    "request_rate": 0,
-    "baseline": 1,
+    "batch_size": 4,
+    "request_rate": 11.2,
+    "baseline": 120,
    "threshold": 0.97
 }]

@@ -66,25 +63,30 @@ aisbench_cases = [{
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
-@pytest.mark.parametrize("full_graph", FULL_GRAPH)
-async def test_models(model: str, tp_size: int, dp_size: int,
-                      full_graph: bool) -> None:
+async def test_models(model: str, tp_size: int, dp_size: int) -> None:
    port = get_open_port()
-    env_dict = {"HCCL_BUFFSIZE": "1024", "VLLM_ASCEND_ENABLE_MLAPO": "0"}
+    env_dict = {
+        "HCCL_OP_EXPANSION_MODE": "AIV",
+        "OMP_PROC_BIND": "false",
+        "OMP_NUM_THREADS": "1",
+        "HCCL_BUFFSIZE": "1024",
+        "VLLM_ASCEND_ENABLE_MLAPO": "1",
+        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
+        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "0",
+    }
+
    server_args = [
-        "--no-enable-prefix-caching", "--enable-expert-parallel",
-        "--tensor-parallel-size",
+        "--enable-expert-parallel", "--tensor-parallel-size",
        str(tp_size), "--data-parallel-size",
        str(dp_size), "--port",
-        str(port), "--max-model-len", "16384", "--max-num-batched-tokens",
-        "16384", "--block-size", "16", "--trust-remote-code", "--quantization",
-        "ascend", "--gpu-memory-utilization", "0.9"
+        str(port), "--max-model-len", "8192", "--max-num-batched-tokens",
+        "8192", "--max-num-seqs", "4", "--trust-remote-code", "--quantization",
+        "ascend", "--gpu-memory-utilization", "0.92", "--compilation-config",
+        '{"cudagraph_capture_sizes":[3, 6, 9, 12], "cudagraph_mode":"FULL_DECODE_ONLY"}',
+        "--speculative-config",
+        '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}',
+        "--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32"
    ]
-    if full_graph:
-        server_args += [
-            "--compilation-config",
-            '{"cudagraph_capture": [16], "cudagraph_model":"FULL_DECODE_ONLY"}'
-        ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }