enginex-vllm-bi100-qwen36/computility-run.yaml

# Run configuration: the launch command and environment for the model server.
# NOTE(review): the schema is defined by the consuming platform, which is not
# visible here; key names and nesting are kept exactly as they were.

# Presumably the number of simultaneous requests the platform sends to this
# instance; it matches --max-num-seqs below. TODO confirm with the platform docs.
concurrency: 1
command:
    # Run the vLLM OpenAI-compatible API server as a Python module.
    - python3
    - -m
    - vllm.entrypoints.openai.api_server
    # Model location and the name under which it is served to clients.
    - --model
    - /model
    - --served-model-name
    - llm
    # Numeric arguments are quoted so that they reach the command line as
    # strings exactly as written.
    - --max-model-len
    - '100000'
    - --gpu-memory-utilization
    - '0.9'
    - --trust-remote-code
    # -tp is vLLM's short form of --tensor-parallel-size (see vLLM engine args).
    - -tp
    - '4'
    # Kept in step with `concurrency` above.
    - --max-num-seqs
    - '1'
    - --disable-log-requests
    - --disable-frontend-multiprocessing
    # Batched-token limit, used together with chunked prefill.
    - --max-num-batched-tokens
    - '8192'
    - --enable-chunked-prefill
    - --max-seq-len-to-capture
    - '32768'
    # Tool calling and reasoning output parsers.
    - --enable-auto-tool-choice
    - --tool-call-parser
    - qwen3_coder
    - --reasoning-parser
    - qwen3
    - --enable-prefix-caching
env:
    # Environment variable values are strings. The value is quoted so YAML does
    # not resolve it to an integer, consistent with the quoted numeric command
    # arguments above. Presumably seconds, given the _S suffix - TODO confirm.
    - name: VLLM_ENGINE_ITERATION_TIMEOUT_S
      value: '3600'
调整配置参数 2026-06-25 17:36:43 +08:00			`concurrency: 1`
Add README and start commands 2026-06-23 17:17:22 +08:00			`command:`
			`- python3`
			`- -m`
			`- vllm.entrypoints.openai.api_server`
			`- --model`
			`- /model`
			`- --served-model-name`
			`- llm`
			`- --max-model-len`
			`- '100000'`
			`- --gpu-memory-utilization`
enable prefix caching 2026-06-26 13:27:52 +08:00			`- '0.9'`
Add README and start commands 2026-06-23 17:17:22 +08:00			`- --trust-remote-code`
			`- -tp`
			`- '4'`
			`- --max-num-seqs`
			`- '1'`
			`- --disable-log-requests`
			`- --disable-frontend-multiprocessing`
			`- --max-num-batched-tokens`
fix issues 2026-06-26 12:55:02 +08:00			`- '8192'`
Add README and start commands 2026-06-23 17:17:22 +08:00			`- --enable-chunked-prefill`
			`- --max-seq-len-to-capture`
			`- '32768'`
			`- --enable-auto-tool-choice`
			`- --tool-call-parser`
			`- qwen3_coder`
			`- --reasoning-parser`
			`- qwen3`
enable prefix caching 2026-06-26 13:27:52 +08:00			`- --enable-prefix-caching`
Add README and start commands 2026-06-23 17:17:22 +08:00			`env:`
			`- name: VLLM_ENGINE_ITERATION_TIMEOUT_S`
调整配置参数 2026-06-25 17:36:43 +08:00			`value: 3600`