[CI] Add long and short prompt tests for DeepSeek-V3.2 (#6499)

### What this PR does / why we need it?

This PR enhances the `test_deepseek3_2_w8a8_pruning_mtp_tp2_ep` E2E test
by adding both a short and a long prompt test case:
- Short test: validates basic functionality with a minimal input (`"Hello "`)
- Long test: validates that the model can handle prompts near its maximum
context length (~163K tokens, approaching the `max_position_embeddings`
limit of 163,840)

Additionally, the PR explicitly sets `max_model_len=163840` so that the
test exercises the model's full context window.
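
As context, here is a minimal sketch of how the long prompt is sized
(mirroring the diff below; the constant name `MAX_POSITION_EMBEDDINGS` is
illustrative, and the construction assumes `"Hello "` encodes to roughly
one token in the model's tokenizer):

```python
# Sizing taken from this PR: the context window is 163,840 positions and
# the test generates up to 500 new tokens, so the prompt leaves headroom
# for the completion.
MAX_POSITION_EMBEDDINGS = 163840  # model's max_position_embeddings
max_tokens = 500                  # tokens reserved for generation

# 163,339 repetitions of "Hello " plus a trailing "Hello": roughly
# 163,340 prompt tokens, keeping prompt + completion within the
# 163,840-position limit.
long_prompt = "Hello " * (MAX_POSITION_EMBEDDINGS - 1 - max_tokens) + "Hello"
```
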
### Does this PR introduce _any_ user-facing change?

No. This change only affects internal E2E testing infrastructure.  

### How was this patch tested?

The modified test case runs as part of the E2E test suite and has been
validated in
[this CI run](https://github.com/vllm-project/vllm-ascend/actions/runs/21620195055/job/62308026205?pr=6499).
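
For a local reproduction, the test can be selected by name; a minimal
sketch, assuming a working vllm-ascend development environment with two
NPUs (the test file path is not shown in the diff, so `-k` selection is
used):

```python
# Hypothetical local run of the modified test via pytest's Python API;
# equivalent to `pytest -k test_deepseek3_2_w8a8_pruning_mtp_tp2_ep`.
import sys

import pytest

sys.exit(pytest.main(["-k", "test_deepseek3_2_w8a8_pruning_mtp_tp2_ep"]))
```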



- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

Signed-off-by: guozr <guozr1997@hotmail.com>
Co-authored-by: guozr <guozr1997@hotmail.com>
Author: starmountain1997
Date: 2026-02-04 09:10:50 +08:00
Committed by: GitHub
Parent: 78fad4e348
Commit: bfcc372f75

```diff
@@ -246,14 +246,19 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
 @patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"})
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
 def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
-    example_prompts = [
-        "Hello, my name is",
+    short_example_prompts = [
+        "Hello ",
     ]
-    max_tokens = 5
+    # "max_position_embeddings": 163840,
+    long_example_prompts = [
+        "Hello " * (163839 - 500) + "Hello"
+    ]
+    max_tokens = 500
     with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
                     tensor_parallel_size=2,
                     quantization="ascend",
                     enable_expert_parallel=True,
+                    max_model_len=163840,
                     compilation_config={
                         "cudagraph_capture_sizes": [3, 6, 9, 12],
                         "cudagraph_mode": "FULL_DECODE_ONLY"
@@ -267,7 +272,8 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
                     },
                     reasoning_parser="deepseek_v3",
                     tokenizer_mode="deepseek_v32") as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_model.generate_greedy(short_example_prompts, max_tokens)
+        vllm_model.generate_greedy(long_example_prompts, max_tokens)
 
 
 @pytest.mark.parametrize("model", QWEN_W4A4_MODELS)
```