[BugFix][CI]Fix DeepSeek-R1-W8A8-longseq nightly CI (#6297)
### What this PR does / why we need it?
The precision issue arose because the KV cache of the P-node had not
been fetched for an extended period (>6 min) and was forcibly freed. To
avoid this problem, the batch size was reduced and the timeout period
was also extended.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.14.1
- vLLM main:
dc917cceb8
Signed-off-by: dsxsteven <dsxsteven@sina.com>
This commit is contained in:
@@ -13,6 +13,7 @@ env_common:
|
|||||||
HCCL_DETERMINISTIC: True
|
HCCL_DETERMINISTIC: True
|
||||||
TASK_QUEUE_ENABLE: 1
|
TASK_QUEUE_ENABLE: 1
|
||||||
HCCL_OP_RETRY_ENABLE: "L0:0, L1:0"
|
HCCL_OP_RETRY_ENABLE: "L0:0, L1:0"
|
||||||
|
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: 300000
|
||||||
|
|
||||||
disaggregated_prefill:
|
disaggregated_prefill:
|
||||||
enabled: true
|
enabled: true
|
||||||
@@ -34,7 +35,7 @@ deployment:
|
|||||||
--enable-expert-parallel
|
--enable-expert-parallel
|
||||||
--seed 1024
|
--seed 1024
|
||||||
--quantization ascend
|
--quantization ascend
|
||||||
--max-num-seqs 4
|
--max-num-seqs 3
|
||||||
--max-model-len 32768
|
--max-model-len 32768
|
||||||
--max-num-batched-tokens 16384
|
--max-num-batched-tokens 16384
|
||||||
--trust-remote-code
|
--trust-remote-code
|
||||||
@@ -71,12 +72,12 @@ deployment:
|
|||||||
--enable-expert-parallel
|
--enable-expert-parallel
|
||||||
--seed 1024
|
--seed 1024
|
||||||
--quantization ascend
|
--quantization ascend
|
||||||
--max-num-seqs 4
|
--max-num-seqs 8
|
||||||
--max-model-len 32768
|
--max-model-len 32768
|
||||||
--max-num-batched-tokens 256
|
--max-num-batched-tokens 256
|
||||||
--trust-remote-code
|
--trust-remote-code
|
||||||
--gpu-memory-utilization 0.85
|
--gpu-memory-utilization 0.85
|
||||||
--compilation_config '{"cudagraph_capture_sizes":[4,8,12,16],"cudagraph_mode": "FULL_DECODE_ONLY"}'
|
--compilation_config '{"cudagraph_capture_sizes":[4,8,16,32],"cudagraph_mode": "FULL_DECODE_ONLY"}'
|
||||||
--enable-chunked-prefill
|
--enable-chunked-prefill
|
||||||
--speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
|
--speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
|
||||||
--kv-transfer-config
|
--kv-transfer-config
|
||||||
@@ -103,6 +104,6 @@ benchmarks:
|
|||||||
request_conf: vllm_api_general_chat
|
request_conf: vllm_api_general_chat
|
||||||
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
|
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
|
||||||
max_out_len: 24576
|
max_out_len: 24576
|
||||||
batch_size: 512
|
batch_size: 16
|
||||||
baseline: 95
|
baseline: 95
|
||||||
threshold: 5
|
threshold: 5
|
||||||
|
|||||||
Reference in New Issue
Block a user