提交vllm0.11.0开发分支

This commit is contained in:
chenyili
2025-12-10 17:51:24 +08:00
parent deab7dd0b6
commit 7c22d621fb
175 changed files with 31856 additions and 8683 deletions

View File

@@ -42,7 +42,7 @@ Online example:
python -m vllm.entrypoints.openai.api_server \
--host 0.0.0.0 \
--port 8000 \
--model /models/Qwen3-8B-Instruct\
--model /models/Qwen3-8B \
--gpu-memory-utilization 0.9 \
--trust-remote-code \
--max-model-len 32768 \
@@ -52,9 +52,17 @@ python -m vllm.entrypoints.openai.api_server \
--no-enable-chunked-prefill \
--distributed-executor-backend mp \
--served-model-name Qwen3-8B-Instruct \
--compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun",
"vllm.unified_attention", "vllm.unified_attention_with_output",
"vllm.mamba_mixer2"]}' \
--compilation-config '{"splitting_ops": ["vllm.unified_attention",
"vllm.unified_attention_with_output",
"vllm.unified_attention_with_output_kunlun",
"vllm.mamba_mixer2",
"vllm.mamba_mixer",
"vllm.short_conv",
"vllm.linear_attention",
"vllm.plamo2_mamba_mixer",
"vllm.gdn_attention",
"vllm.sparse_attn_indexer"]}' \
```