# Listing header from the file viewer (not configuration): YAML, 34 lines, 678 B
# Launch configuration for vLLM's OpenAI-compatible API server.
# NOTE(review): the tool that consumes the `concurrency`, `command` and `env`
# keys is not visible from this file - confirm the schema against it.

# Presumably the number of requests handled at once by this instance; it
# lines up with `--max-num-seqs 1` below - TODO confirm with the consumer.
concurrency: 1

# Argument vector, one token per list item. Numeric arguments are quoted so
# that every item stays a string after YAML parsing.
command:
  - python3
  - -m
  - vllm.entrypoints.openai.api_server
  # Model path and the name it is served under.
  - --model
  - /model
  - --served-model-name
  - llm
  # Context length and GPU memory budget.
  - --max-model-len
  - '100000'
  - --gpu-memory-utilization
  - '0.95'
  - --trust-remote-code
  # `-tp` is the short form of vLLM's tensor-parallel-size option.
  - -tp
  - '4'
  # Scheduling limits.
  - --max-num-seqs
  - '1'
  - --disable-log-requests
  - --disable-frontend-multiprocessing
  - --max-num-batched-tokens
  - '4096'
  - --enable-chunked-prefill
  - --max-seq-len-to-capture
  - '32768'
  # Tool-calling and reasoning output parsers.
  - --enable-auto-tool-choice
  - --tool-call-parser
  - qwen3_coder
  - --reasoning-parser
  - qwen3

# Environment variables passed to the server process.
env:
  - name: VLLM_ENGINE_ITERATION_TIMEOUT_S
    # Quoted: environment variable values are strings, and a bare 3600 would
    # be loaded as an integer. This also matches the quoted numeric arguments
    # in `command`. Presumably a timeout in seconds (per the `_S` suffix).
    value: '3600'