Commit the vLLM 0.11.0 development branch

This commit is contained in:
chenyili
2025-12-10 17:51:24 +08:00
parent deab7dd0b6
commit 7c22d621fb
175 changed files with 31856 additions and 8683 deletions


@@ -11,7 +11,7 @@ This document describes how to install vllm-kunlun manually.
- vLLM (same version as vllm-kunlun)
## Setup environment using container
We provide a clean, minimal base image for your use: `wjie520/vllm_kunlun:v0.0.1`. You can pull it using the `docker pull` command.
We provide a clean, minimal base image for your use: `iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.32`. You can pull it using the `docker pull` command.
### Container startup script
:::::{tab-set}
@@ -31,7 +31,7 @@ if [ $XPU_NUM -gt 0 ]; then
done
DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpuctrl:/dev/xpuctrl"
fi
export build_image="wjie520/vllm_kunlun:v0.0.1"
export build_image="iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.32"
docker run -itd ${DOCKER_DEVICE_CONFIG} \
--net=host \
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
@@ -46,16 +46,16 @@ docker run -itd ${DOCKER_DEVICE_CONFIG} \
::::
:::::
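Before installing anything inside the container, it can help to confirm that the XPU device nodes passed via the `--device` flags above are actually visible. A minimal check (a sketch; it assumes the startup script found at least one `/dev/xpu*` node on the host):
```
# Inside the container: list the XPU device nodes mapped in by the startup script.
ls -l /dev/xpu* /dev/xpuctrl
```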
## Install vLLM-kunlun
### Install vLLM 0.10.1.1
### Install vLLM 0.11.0
```
conda activate python310_torch25_cuda
pip install vllm==0.10.1.1 --no-build-isolation --no-deps
pip install vllm==0.11.0
```
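To confirm the pinned vLLM version landed in the active conda environment, a quick sanity check:
```
python -c "import vllm; print(vllm.__version__)"  # should print 0.11.0
```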
### Build and Install
Navigate to the vllm-kunlun directory and build the package:
```
git clone https://github.com/baidu/vLLM-Kunlun  # TODO: replace with the GitHub URL for vllm-kunlun
git clone xxxx  # TODO: replace with the GitHub URL for vllm-kunlun
cd vllm-kunlun
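# The actual build command is not shown in this hunk; a typical editable
# install would look like the following (an assumption, not the confirmed
# build step for this repository):
# pip install -e . --no-build-isolation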
@@ -71,28 +71,33 @@ Copy the eval_frame.py patch:
```
cp vllm_kunlun/patches/eval_frame.py /root/miniconda/envs/python310_torch25_cuda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py
```
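The destination path above hardcodes a specific conda environment. A sketch that resolves the installed torch location instead, assuming the target environment is currently active:
```
# Locate the installed torch package and copy the patch into its _dynamo directory.
TORCH_DIR=$(python -c "import torch, os; print(os.path.dirname(torch.__file__))")
cp vllm_kunlun/patches/eval_frame.py "${TORCH_DIR}/_dynamo/eval_frame.py"
```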
## Update xpytorch
## Install the KL3-customized build of PyTorch
```
wget https://klx-sdk-release-public.su.bcebos.com/kunlun2aiak_output/0830/xpytorch-cp310-torch251-ubuntu2004-x64.run
bash xpytorch-cp310-torch251-ubuntu2004-x64.run
wget https://klx-sdk-release-public.su.bcebos.com/xpytorch/release/3.3.2.7/xpytorch-cp310-torch251-ubuntu2004-x64.run && bash xpytorch-cp310-torch251-ubuntu2004-x64.run
```
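After the `.run` installer finishes, a quick import check confirms the KL3 build replaced the stock wheel (the exact version string it prints is an assumption based on the `torch251` tag in the installer name):
```
python -c "import torch; print(torch.__version__)"  # expect a 2.5.1-based KL3 build
```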
## Install custom ops
```
pip install \
https://xtorch_ops
pip install \
https://xspeedgate_ops-0.0.0-cp310-cp310-linux_x86_64.whl
pip uninstall xtorch_ops -y && pip install \
"https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/xtorch_ops-0.1.2028%2B1baf1b15-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-10-31T10%3A38%3A24Z%2F-1%2Fhost%2Faa1969b70a4a97c407d69614a5d5a3e26ea07286d13f0a2ab8daccc288152903"
```
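A minimal import check for the wheel above; the module name `xtorch_ops` is inferred from the wheel filename and is an assumption:
```
python -c "import xtorch_ops; print('xtorch_ops OK')"
```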
## Install the KLX3 custom Triton build
```
pip install \
"https://cce-ai-models.bj.bcebos.com/v1/vllm-kunlun-0.11.0/triton-3.0.0%2Bb2cde523-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKxPW2jzoJUuFZmI19s3yry%2F2025-11-05T02%3A47%3A29Z%2F-1%2Fhost%2Fd8c95dbd06187a3140ca3e681e00c6941c30e14bb1d4112a0c8bc3c93e5c9c3f"
```
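To verify that the custom Triton build is the one in use:
```
python -c "import triton; print(triton.__version__)"  # expect 3.0.0+b2cde523
```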
## Install the AIAK custom ops library
```
pip install \
"https://cce-ai-models.bj.bcebos.com/v1/chenyili/xspeedgate_ops-0.0.0-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKxPW2jzoJUuFZmI19s3yry%2F2025-11-18T01%3A56%3A21Z%2F-1%2Fhost%2F28b57cbc5dc62ac1bf946e74146b3ea4952d2ffff448617f0303980dcaf6cb49"
```
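As above, an import check; the module name `xspeedgate_ops` is inferred from the wheel filename and is an assumption:
```
python -c "import xspeedgate_ops; print('xspeedgate_ops OK')"
```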
## Quick Start
### Set up the environment
```
chmod +x /workspace/vllm-kunlun/setup_env.sh && source /workspace/vllm-kunlun/setup_env.sh
chmod +x /workspace/baidu/hac-aiacc/vllm-kunlun/setup_env.sh && source /workspace/baidu/hac-aiacc/vllm-kunlun/setup_env.sh
```
### Run the server
@@ -107,7 +112,7 @@ chmod +x /workspace/vllm-kunlun/setup_env.sh && source /workspace/vllm-kunlun/se
python -m vllm.entrypoints.openai.api_server \
--host 0.0.0.0 \
--port 8356 \
--model /models/Qwen3-8B \
--model models/Qwen3-VL-30B-A3B-Instruct \
--gpu-memory-utilization 0.9 \
--trust-remote-code \
--max-model-len 32768 \
@@ -115,15 +120,22 @@ python -m vllm.entrypoints.openai.api_server \
--dtype float16 \
--max_num_seqs 128 \
--max_num_batched_tokens 32768 \
--max-seq-len-to-capture 32768 \
--block-size 128 \
--no-enable-prefix-caching \
--no-enable-chunked-prefill \
--distributed-executor-backend mp \
--served-model-name Qwen3-8B \
--compilation-config '{"splitting_ops": ["vllm.unified_attention_with_output_kunlun",
"vllm.unified_attention", "vllm.unified_attention_with_output",
"vllm.mamba_mixer2"]}' \
--served-model-name Qwen3-VL-30B-A3B-Instruct \
--compilation-config '{"splitting_ops": ["vllm.unified_attention",
"vllm.unified_attention_with_output",
"vllm.unified_attention_with_output_kunlun",
"vllm.mamba_mixer2",
"vllm.mamba_mixer",
"vllm.short_conv",
"vllm.linear_attention",
"vllm.plamo2_mamba_mixer",
"vllm.gdn_attention",
"vllm.sparse_attn_indexer"]}' \
```
::::
:::::
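Once the server is up, it exposes the OpenAI-compatible API on port 8356. A minimal smoke test against the chat completions endpoint, using the served model name from the launch command above:
```
curl http://localhost:8356/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen3-VL-30B-A3B-Instruct",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 64
  }'
```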