# What this PR does / why we need it?
This PR reverts commit 8134146ab6, which
modified the DeepSeek V3.2 (W8A8) single-node nightly test
configuration, as there is no limit between tp_size and MTP.
# Does this PR introduce any user-facing change?
No. This PR only affects CI/CD test configurations and does not
introduce any user-facing changes.
# How was this patch tested?
N/A for a revert PR. The changes restore the previously known working
configuration.
- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0
Signed-off-by: guozr <guozr1997@hotmail.com>
Co-authored-by: guozr <guozr1997@hotmail.com>
This commit is contained in:
@@ -54,7 +54,7 @@ aisbench_cases = [{
|
|||||||
"max_out_len": 1500,
|
"max_out_len": 1500,
|
||||||
"batch_size": 4,
|
"batch_size": 4,
|
||||||
"request_rate": 11.2,
|
"request_rate": 11.2,
|
||||||
"baseline": 110.5681,
|
"baseline": 134,
|
||||||
"threshold": 0.97
|
"threshold": 0.97
|
||||||
}]
|
}]
|
||||||
|
|
||||||
@@ -80,12 +80,13 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
|
|||||||
str(tp_size), "--data-parallel-size",
|
str(tp_size), "--data-parallel-size",
|
||||||
str(dp_size), "--port",
|
str(dp_size), "--port",
|
||||||
str(port), "--max-model-len", "8192", "--max-num-batched-tokens",
|
str(port), "--max-model-len", "8192", "--max-num-batched-tokens",
|
||||||
"8192", "--max-num-seqs", "8", "--trust-remote-code", "--quantization",
|
"8192", "--max-num-seqs", "4", "--trust-remote-code", "--quantization",
|
||||||
"ascend", "--gpu-memory-utilization", "0.98", "--compilation-config",
|
"ascend", "--gpu-memory-utilization", "0.92", "--compilation-config",
|
||||||
'{"cudagraph_capture_sizes":[8, 16, 24, 32], "cudagraph_mode":"FULL_DECODE_ONLY"}',
|
'{"cudagraph_capture_sizes":[3, 6, 9, 12], "cudagraph_mode":"FULL_DECODE_ONLY"}',
|
||||||
"--speculative-config",
|
"--speculative-config",
|
||||||
'{"num_speculative_tokens": 3, "method":"deepseek_mtp"}',
|
'{"num_speculative_tokens": 2, "method":"deepseek_mtp"}',
|
||||||
"--additional-config", '{"layer_sharding": ["q_b_proj", "o_proj"]}',
|
"--additional-config",
|
||||||
|
'{"layer_sharding": ["q_b_proj", "o_proj"]}',
|
||||||
"--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32"
|
"--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32"
|
||||||
]
|
]
|
||||||
request_keyword_args: dict[str, Any] = {
|
request_keyword_args: dict[str, Any] = {
|
||||||
|
|||||||
Reference in New Issue
Block a user