2026-06-25 17:36:43 +08:00
|
|
|
# NOTE(review): the schema that consumes `concurrency` is not visible in this
# file. The value 1 lines up with `--max-num-seqs` '1' in `command`; confirm
# whether the two are meant to be kept in sync.
concurrency: 1
|
2026-06-23 17:17:22 +08:00
|
|
|
# Process argv, one list entry per token: runs the module
# `vllm.entrypoints.openai.api_server` via `python3 -m`, followed by its
# command-line flags. A flag that takes a value is followed by that value as
# the next list entry. Numeric values are single-quoted so they load as
# strings rather than ints/floats.
# NOTE(review): the timestamp lines and lone `|` lines interleaved below are
# not YAML list entries; they look like residue from a blame/history view.
# Confirm against the real file before parsing.
command:
|
|
|
|
|
- python3
|
|
|
|
|
- -m
|
|
|
|
|
- vllm.entrypoints.openai.api_server
|
|
|
|
|
- --model
|
|
|
|
|
- /model
|
|
|
|
|
- --served-model-name
|
|
|
|
|
- llm
|
|
|
|
|
- --max-model-len
|
|
|
|
|
- '100000'
|
|
|
|
|
- --gpu-memory-utilization
|
2026-06-26 13:27:52 +08:00
|
|
|
- '0.9'
|
2026-06-23 17:17:22 +08:00
|
|
|
- --trust-remote-code
|
|
|
|
|
- -tp
|
|
|
|
|
- '4'
|
|
|
|
|
- --max-num-seqs
|
|
|
|
|
- '1'
|
|
|
|
|
- --disable-log-requests
|
|
|
|
|
- --disable-frontend-multiprocessing
|
|
|
|
|
- --max-num-batched-tokens
|
2026-06-26 12:55:02 +08:00
|
|
|
- '8192'
|
2026-06-23 17:17:22 +08:00
|
|
|
- --enable-chunked-prefill
|
|
|
|
|
- --max-seq-len-to-capture
|
|
|
|
|
- '32768'
|
|
|
|
|
- --enable-auto-tool-choice
|
|
|
|
|
- --tool-call-parser
|
|
|
|
|
- qwen3_coder
|
|
|
|
|
- --reasoning-parser
|
|
|
|
|
- qwen3
|
2026-06-26 13:27:52 +08:00
|
|
|
- --enable-prefix-caching
|
2026-06-23 17:17:22 +08:00
|
|
|
# Environment variables for the process, as a list of name/value entries.
# Values are quoted so they load as strings: an unquoted 3600 is parsed as an
# integer, which string-typed env schemas (e.g. Kubernetes EnvVar.value)
# reject. Quoting also matches the quoted numeric arguments in `command`.
# NOTE(review): the `_S` suffix suggests the timeout is in seconds; confirm
# against the vLLM environment-variable documentation.
env:
|
|
|
|
|
- name: VLLM_ENGINE_ITERATION_TIMEOUT_S
|
2026-06-25 17:36:43 +08:00
|
|
|
value: '3600'
|