[CI] Add DeepSeek-V3.2-W8A8-Pruning e2e test (#5922)

### What this PR does / why we need it?
1. Fix DeepSeek-V3.2-W8A8-Pruning mtp
2. Add DeepSeek-V3.2-W8A8-Pruning e2e test

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
11b6af5280

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
zhangxinyuehfad
2026-01-16 15:49:57 +08:00
committed by GitHub
parent 69b170b8b5
commit 4f446aec4c
4 changed files with 32 additions and 2 deletions

View File

@@ -235,3 +235,29 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@patch.dict(
    os.environ, {
        "HCCL_OP_EXPANSION_MODE": "AIV",
        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "0",
        "ASCEND_AGGREGATE_ENABLE": "1",
        "HCCL_BUFFSIZE": "1024",
    })
def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
    """E2E smoke test: DeepSeek-V3.2-W8A8-Pruning with MTP speculative
    decoding under TP=2 + expert parallelism and full-decode-only cudagraphs.

    Only checks that greedy generation runs to completion; output content
    is not asserted.
    """
    prompts = ["Hello, my name is"]
    # Keep the run tiny — this is a does-it-launch test, not a quality test.
    num_new_tokens = 5
    engine_kwargs = dict(
        tensor_parallel_size=2,
        quantization="ascend",
        enable_expert_parallel=True,
        compilation_config={
            "cudagraph_capture_sizes": [3, 6, 9, 12],
            "cudagraph_mode": "FULL_DECODE_ONLY"
        },
        speculative_config={
            "num_speculative_tokens": 2,
            "method": "deepseek_mtp"
        },
        reasoning_parser="deepseek_v3",
        tokenizer_mode="deepseek_v32",
    )
    with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
                    **engine_kwargs) as vllm_model:
        vllm_model.generate_greedy(prompts, num_new_tokens)