Files
enginex-vllm-bi100-qwen36/computility-run.yaml
2026-06-26 13:27:52 +08:00

35 lines
707 B
YAML

# Launch configuration for a vLLM OpenAI-compatible API server.
# NOTE(review): the schema (concurrency / command / env) is defined by the
# consuming platform, which is not visible here — confirm key names there.

# Same value as --max-num-seqs in the command below.
concurrency: 1

# argv of the server process, one token per list item. Numeric arguments are
# quoted so that every item loads as a string.
command:
  - python3
  - -m
  - vllm.entrypoints.openai.api_server
  # Model path and the name it is served under.
  - --model
  - /model
  - --served-model-name
  - llm
  # Context length and GPU memory settings.
  - --max-model-len
  - '100000'
  - --gpu-memory-utilization
  - '0.9'
  - --trust-remote-code
  # Tensor parallelism across 4 devices.
  - -tp
  - '4'
  # Scheduling: a single sequence at a time, chunked prefill with an
  # 8192-token batch budget.
  - --max-num-seqs
  - '1'
  - --disable-log-requests
  - --disable-frontend-multiprocessing
  - --max-num-batched-tokens
  - '8192'
  - --enable-chunked-prefill
  - --max-seq-len-to-capture
  - '32768'
  # Tool-call and reasoning output parsing.
  - --enable-auto-tool-choice
  - --tool-call-parser
  - qwen3_coder
  - --reasoning-parser
  - qwen3
  - --enable-prefix-caching

# Environment variables for the server process.
env:
  - name: VLLM_ENGINE_ITERATION_TIMEOUT_S
    # Nested under the list item so that it is this variable's value and not a
    # separate top-level key; quoted because environment values are strings.
    # Presumably seconds, going by the _S suffix — confirm against vLLM docs.
    value: '3600'