[CI] Add long and short prompt tests for DeepSeek-V3.2 (#6499)
### What this PR does / why we need it?
This PR enhances the test_deepseek3_2_w8a8_pruning_mtp_tp2_ep E2E test
by adding both short and long prompt test cases:
- Short test: Validates basic functionality with minimal input ("Hello ")
- Long test: Validates the model can handle prompts near its maximum
context length (~163K tokens, approaching the max_position_embeddings
limit of 163,840)
Additionally, explicitly sets max_model_len=163840 to ensure the test
properly exercises the model's full context window capability.
### Does this PR introduce _any_ user-facing change?
No. This change only affects internal E2E testing infrastructure.
### How was this patch tested?
The modified test case will be executed as part of the E2E test suite
and has been validated
[here](https://github.com/vllm-project/vllm-ascend/actions/runs/21620195055/job/62308026205?pr=6499).
- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0
Signed-off-by: guozr <guozr1997@hotmail.com>
Co-authored-by: guozr <guozr1997@hotmail.com>
This commit is contained in:
@@ -246,14 +246,19 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
|
||||
@patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"})
|
||||
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
|
||||
def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
short_example_prompts = [
|
||||
"Hello ",
|
||||
]
|
||||
max_tokens = 5
|
||||
# "max_position_embeddings": 163840,
|
||||
long_example_prompts = [
|
||||
"Hello " * (163839 - 500) + "Hello"
|
||||
]
|
||||
max_tokens = 500
|
||||
with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
enable_expert_parallel=True,
|
||||
max_model_len=163840,
|
||||
compilation_config={
|
||||
"cudagraph_capture_sizes": [3, 6, 9, 12],
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||
@@ -267,7 +272,8 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
|
||||
},
|
||||
reasoning_parser="deepseek_v3",
|
||||
tokenizer_mode="deepseek_v32") as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
vllm_model.generate_greedy(short_example_prompts, max_tokens)
|
||||
vllm_model.generate_greedy(long_example_prompts, max_tokens)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", QWEN_W4A4_MODELS)
|
||||
|
||||
Reference in New Issue
Block a user