Files
enginex-vllm-bi100-qwen36/computility-run.yaml

35 lines
707 B
YAML
Raw Normal View History

2026-06-25 17:36:43 +08:00
# Concurrency setting for this run, pinned to 1; the server command in this
# file passes the same value to `--max-num-seqs`.
# NOTE(review): how the consumer of this file interprets `concurrency` is not
# visible here — confirm before changing one without the other.
concurrency: 1
2026-06-23 17:17:22 +08:00
# Argument vector for the served process: `python3 -m` runs the
# vllm.entrypoints.openai.api_server module, followed by its flags.
# Each flag and its value are separate list items, and every numeric value is
# single-quoted so it stays a string.
# NOTE(review): the bare timestamp lines interleaved with the items look like
# history-view residue rather than YAML content — confirm before parsing.
command:
- python3
- -m
- vllm.entrypoints.openai.api_server
# Model path (/model) and the name it is served under (llm).
- --model
- /model
- --served-model-name
- llm
# Length and GPU-memory limits, as passed on the command line.
- --max-model-len
- '100000'
- --gpu-memory-utilization
2026-06-26 13:27:52 +08:00
- '0.9'
2026-06-23 17:17:22 +08:00
- --trust-remote-code
# NOTE(review): `-tp` is presumably the tensor-parallel-size shorthand, which
# would require 4 devices — confirm against the vLLM CLI reference.
- -tp
- '4'
# Same value as the top-level `concurrency` key in this file.
- --max-num-seqs
- '1'
- --disable-log-requests
- --disable-frontend-multiprocessing
# Batched-token cap, passed together with chunked prefill.
- --max-num-batched-tokens
2026-06-26 12:55:02 +08:00
- '8192'
2026-06-23 17:17:22 +08:00
- --enable-chunked-prefill
- --max-seq-len-to-capture
- '32768'
# Tool-call and reasoning parsers are selected by name (qwen3_coder, qwen3).
- --enable-auto-tool-choice
- --tool-call-parser
- qwen3_coder
- --reasoning-parser
- qwen3
2026-06-26 13:27:52 +08:00
- --enable-prefix-caching
2026-06-23 17:17:22 +08:00
# Environment variables for the served process, as a list of name/value entries.
env:
# NOTE(review): the `_S` suffix suggests the timeout is in seconds
# (3600 = one hour) — confirm against the vLLM environment-variable docs.
- name: VLLM_ENGINE_ITERATION_TIMEOUT_S
2026-06-25 17:36:43 +08:00
  # Indented so `value` belongs to the entry above instead of becoming a
  # separate top-level key, and quoted because environment values are strings
  # (the same convention as the quoted numbers in `command`).
  value: '3600'