# Launch spec for a vLLM OpenAI-compatible API server.
#
# NOTE(review): gpu_num presumably has to agree with the tensor-parallel
# size passed via `-tp` below (both are 4) — confirm with the consumer.
gpu_num: 4

# Process argument vector, one token per list item, in launch order.
# Numeric arguments are quoted so they are loaded as strings, not ints/floats.
command:
  - python3
  - -m
  - vllm.entrypoints.openai.api_server
  # Model path and the name it is served under.
  - --model
  - /model
  - --served-model-name
  - llm
  # Context length and GPU memory budget.
  - --max-model-len
  - '100000'
  - --gpu-memory-utilization
  - '0.95'
  - --trust-remote-code
  # Tensor parallelism; see the note on gpu_num above.
  - -tp
  - '4'
  # Scheduling: a single sequence at a time, chunked prefill enabled.
  - --max-num-seqs
  - '1'
  - --disable-log-requests
  - --disable-frontend-multiprocessing
  - --max-num-batched-tokens
  - '4096'
  - --enable-chunked-prefill
  - --max-seq-len-to-capture
  - '32768'
  # Tool calling and reasoning output parsing.
  - --enable-auto-tool-choice
  - --tool-call-parser
  - qwen3_coder
  - --reasoning-parser
  - qwen3

# Environment variables for the server process. Values are quoted because
# environment values are strings.
env:
  # NOTE(review): the _S suffix suggests seconds (3600 = 1 hour) — confirm
  # against the vLLM environment-variable documentation.
  - name: VLLM_ENGINE_ITERATION_TIMEOUT_S
    value: '3600'