Files
enginex-vllm-bi100-qwen36/computility-run.yaml

33 lines
673 B
YAML
Raw Normal View History

2026-06-23 17:17:22 +08:00
# GPU count for this run. It currently equals the `-tp` value passed in
# `command`.
# NOTE(review): presumably the two must be changed together — confirm.
gpu_num: 4
# Argument vector that starts the vLLM OpenAI-compatible API server module.
# Every flag and every flag value is its own list item, so item order matters:
# a value must directly follow the flag it belongs to.
# Number-like values are quoted so they stay strings in the argument list.
command:
- python3
- -m
- vllm.entrypoints.openai.api_server
# Model path and the model name exposed to API clients.
- --model
- /model
- --served-model-name
- llm
# Context length and GPU memory budget.
- --max-model-len
- '100000'
- --gpu-memory-utilization
- '0.95'
- --trust-remote-code
# `-tp` is vLLM's short form of `--tensor-parallel-size`; the value equals
# `gpu_num`.
- -tp
- '4'
# Scheduling: one sequence at a time, prefill chunked into batches of at most
# 4096 tokens.
- --max-num-seqs
- '1'
- --disable-log-requests
- --disable-frontend-multiprocessing
- --max-num-batched-tokens
- '4096'
- --enable-chunked-prefill
# NOTE(review): 32768 is lower than --max-model-len (100000); confirm that the
# behaviour for sequences longer than the capture limit is intended.
- --max-seq-len-to-capture
- '32768'
# Tool calling and reasoning output: automatic tool choice with the
# `qwen3_coder` tool-call parser and the `qwen3` reasoning parser.
- --enable-auto-tool-choice
- --tool-call-parser
- qwen3_coder
- --reasoning-parser
- qwen3
# Environment variables for the server process, one name/value mapping per
# list entry.
env:
- name: VLLM_ENGINE_ITERATION_TIMEOUT_S
  # `value` must be indented under the list entry: at column 0 it would parse
  # as a separate top-level key and the variable would have no value.
  # Quoted because environment values are strings, matching the quoted numeric
  # arguments in `command`. The `_S` suffix suggests seconds (3600 s = 1 h) —
  # confirm against the vLLM environment variable documentation.
  value: '3600'