[Doc] add DeepSeek-V3.2-Exp-w8a8 to installation.md and tutorials (#186)

Signed-off-by: WeiJie_Hong <1462519292@qq.com>
This commit is contained in:
WeiJie_Hong
2026-02-10 17:18:32 +08:00
committed by GitHub
parent 6f30bc439d
commit 42c7ef2f27
3 changed files with 271 additions and 106 deletions

View File

@@ -1,76 +1,76 @@
name: e2e-test
# name: e2e-test
on:
workflow_call:
pull_request:
branches: [main]
types: [opened, synchronize, reopened]
push:
branches: [main]
# on:
# workflow_call:
# pull_request:
# branches: [main]
# types: [opened, synchronize, reopened]
# push:
# branches: [main]
concurrency:
group: e2e-singlecard
cancel-in-progress: false
# concurrency:
# group: e2e-singlecard
# cancel-in-progress: false
jobs:
e2e:
name: e2e-test-singlecard
runs-on:
- self-hosted
- Linux
- X64
# jobs:
# e2e:
# name: e2e-test-singlecard
# runs-on:
# - self-hosted
# - Linux
# - X64
steps:
- name: Checkout PR code
uses: actions/checkout@v4
with:
fetch-depth: 0
# steps:
# - name: Checkout PR code
# uses: actions/checkout@v4
# with:
# fetch-depth: 0
- name: Verify PR workspace
run: |
echo "===== WORKSPACE ====="
pwd
ls -l
echo "===== GIT INFO ====="
git rev-parse HEAD
git log -1 --oneline
git status --porcelain
# - name: Verify PR workspace
# run: |
# echo "===== WORKSPACE ====="
# pwd
# ls -l
# echo "===== GIT INFO ====="
# git rev-parse HEAD
# git log -1 --oneline
# git status --porcelain
- name: Start docker
run: |
bash ci/scripts/docker/start_docker.sh
# - name: Start docker
# run: |
# bash ci/scripts/docker/start_docker.sh
- name: Install enviroments
env:
PROXY_URL: ${{ secrets.PROXY_URL }}
NO_PROXY_LIST: ${{ secrets.NO_PROXY_LIST }}
run: |
bash ci/scripts/env/install_env.sh
# - name: Install enviroments
# env:
# PROXY_URL: ${{ secrets.PROXY_URL }}
# NO_PROXY_LIST: ${{ secrets.NO_PROXY_LIST }}
# run: |
# bash ci/scripts/env/install_env.sh
- name: Start vLLM server
run: |
bash ci/scripts/server/start_vllm.sh
# - name: Start vLLM server
# run: |
# bash ci/scripts/server/start_vllm.sh
- name: Wait for vLLM ready
run: |
bash ci/scripts/server/wait_vllm.sh
# - name: Wait for vLLM ready
# run: |
# bash ci/scripts/server/wait_vllm.sh
- name: API Test
run: |
docker exec aiak-e2e-singlecard bash -lc '
curl http://localhost:8356/v1/chat/completions \
-H "Content-Type: application/json" \
-d @- << "EOF"
{
"model": "Qwen3-8B",
"messages": [
{ "role": "user", "content": "Who are you?" }
],
"max_tokens": 200,
"temperature": 0
}
EOF
'
# - name: API Test
# run: |
# docker exec aiak-e2e-singlecard bash -lc '
# curl http://localhost:8356/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d @- << "EOF"
# {
# "model": "Qwen3-8B",
# "messages": [
# { "role": "user", "content": "Who are you?" }
# ],
# "max_tokens": 200,
# "temperature": 0
# }
# EOF
# '
# - name: Accuracy testing
# run: |

View File

@@ -11,7 +11,9 @@ This document describes how to install vllm-kunlun manually.
- vLLM (same version as vllm-kunlun)
## Setup environment using container
We provide a clean, minimal base image for your use`wjie520/vllm_kunlun:base_v0.0.2` and `wjie520/vllm_kunlun:base_mimo_v0.0.2`(Only MIMO_V2 and GPT-OSS).You can pull it using the `docker pull` command.
We provide a clean, minimal base image for your use`wjie520/vllm_kunlun:uv_base`.You can pull it using the `docker pull wjie520/vllm_kunlun:uv_base` command.
We also provide images with xpytorch and ops installed.You can pull it using the `wjie520/vllm_kunlun:base_v0.0.2 and wjie520/vllm_kunlun:base_mimo_v0.0.2 (Only MIMO_V2 and GPT-OSS)` command
### Container startup script
:::::{tab-set}
@@ -19,9 +21,8 @@ We provide a clean, minimal base image for your use`wjie520/vllm_kunlun:base_v0.
::::{tab-item} start_docker.sh
:selected:
:sync: pip
:sync: uv pip
```{code-block} bash
:substitutions:
#!/bin/bash
XPU_NUM=8
DOCKER_DEVICE_CONFIG=""
@@ -31,7 +32,7 @@ if [ $XPU_NUM -gt 0 ]; then
done
DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpuctrl:/dev/xpuctrl"
fi
export build_image="wjie520/vllm_kunlun:base_v0.0.2"
export build_image="wjie520/vllm_kunlun:uv_base"
# or export build_image="iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.32"
docker run -itd ${DOCKER_DEVICE_CONFIG} \
@@ -63,8 +64,71 @@ uv pip install -r requirements.txt
python setup.py build
python setup.py install
```
### Replace eval_frame.py
Copy the eval_frame.py patch:
```
cp vllm_kunlun/patches/eval_frame.py /root/miniconda/envs/vllm_kunlun_0.10.1.1/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py
```
## Choose to download customized xpytorch
### Install the KL3-customized build of PyTorch
```
wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/1130/xpytorch-cp310-torch251-ubuntu2004-x64.run?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-12-02T05%3A01%3A27Z%2F-1%2Fhost%2Ff3cf499234f82303891aed2bcb0628918e379a21e841a3fac6bd94afef491ff7
(for the conda)
bash xpytorch-cp310-torch251-ubuntu2004-x64.run
(for the uv)
bash xpytorch-cp310-torch251-ubuntu2004-x64.run --noexec --target xpytorch_unpack && cd xpytorch_unpack/ && \
sed -i 's/pip/uv pip/g; s/CONDA_PREFIX/VIRTUAL_ENV/g' setup.sh && bash setup.sh
```
### Install the KL3-customized build of PyTorch (Only MIMO V2)
```
wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run https://klx-sdk-release-public.su.bcebos.com/kunlun2aiak_output/1231/xpytorch-cp310-torch251-ubuntu2004-x64.run
(for the conda)
bash xpytorch-cp310-torch251-ubuntu2004-x64.run
(for the uv)
bash xpytorch-cp310-torch251-ubuntu2004-x64.run --noexec --target xpytorch_unpack && cd xpytorch_unpack/ && \
sed -i 's/pip/uv pip/g; s/CONDA_PREFIX/VIRTUAL_ENV/g' setup.sh && bash setup.sh
```
### Install the KL3-customized build of PyTorch (Only DeepSeek-V3.2-Exp-w8a8)
```
wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run https://aihc-private-hcd.bj.bcebos.com/v1/vllm-kunlun-ds/xpytorch-cp310-torch251-ubuntu2004-x64.run?authorization=bce-auth-v1%2FALTAKvz6x4eqcmSsKjQxq3vZdB%2F2026-02-03T01%3A59%3A40Z%2F-1%2Fhost%2Ffc4b6f5b83c2fde70d48fdfc23c40c396efc9cb3c36d6f811fdca5f109073321
(for the conda)
bash xpytorch-cp310-torch251-ubuntu2004-x64.run
(for the uv)
bash xpytorch-cp310-torch251-ubuntu2004-x64.run --noexec --target xpytorch_unpack && cd xpytorch_unpack/ && \
mv torch_xray-999.9.9-cp310-cp310-linux_x86_64.whl torch_xray-2.0.3-cp310-cp310-linux_x86_64.whl && \
sed -i 's/pip/uv pip/g; s/CONDA_PREFIX/VIRTUAL_ENV/g; s/torch_xray-999.9.9/torch_xray-2.0.3/' setup.sh && bash setup.sh
```
## Choose to download customized ops
### Install custom ops
```
uv pip install "https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/1130/xtorch_ops-0.1.2209%2B6752ad20-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-12-05T06%3A18%3A00Z%2F-1%2Fhost%2F14936c2b7e7c557c1400e4c467c79f7a9217374a7aa4a046711ac4d948f460cd"
```
### Install custom ops (Only MIMO V2)
```
uv pip install "https://vllm-ai-models.bj.bcebos.com/v1/vLLM-Kunlun/ops/swa/xtorch_ops-0.1.2109%252B523cb26d-cp310-cp310-linux_x86_64.whl"
```
### Install custom ops (Only DeepSeek-V3.2-Exp-w8a8)
```
uv pip install "https://klx-sdk-release-public.su.bcebos.com/kunlun2aiak_output/1215/xtorch_ops-0.1.2263%2Bc030eebd-cp310-cp310-linux_x86_64.whl"
```
## Install the KLX3 custom Triton build
```
uv pip install "https://cce-ai-models.bj.bcebos.com/v1/vllm-kunlun-0.11.0/triton-3.0.0%2Bb2cde523-cp310-cp310-linux_x86_64.whl"
```
## Install the AIAK custom ops library
```
uv pip install "https://cce-ai-models.bj.bcebos.com/XSpeedGate-whl/release_merge/20251219_152418/xspeedgate_ops-0.0.0-cp310-cp310-linux_x86_64.whl"
```
## Quick Start
### Set up the environment
@@ -81,7 +145,6 @@ chmod +x /workspace/vLLM-Kunlun/setup_env.sh && source /workspace/vLLM-Kunlun/se
:selected:
:sync: pip
```{code-block} bash
:substitutions:
python -m vllm.entrypoints.openai.api_server \
--host 0.0.0.0 \
--port 8356 \
@@ -112,41 +175,3 @@ python -m vllm.entrypoints.openai.api_server \
```
::::
:::::
### xpytorch and ops install
We also provide xpytorch and ops link for custom installation.
### Replace eval_frame.py
Copy the eval_frame.py patch:
```
cp vllm_kunlun/patches/eval_frame.py /root/miniconda/envs/vllm_kunlun_0.10.1.1/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py
```
## Install the KL3-customized build of PyTorch
```
wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/1130/xpytorch-cp310-torch251-ubuntu2004-x64.run?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-12-02T05%3A01%3A27Z%2F-1%2Fhost%2Ff3cf499234f82303891aed2bcb0628918e379a21e841a3fac6bd94afef491ff7
bash xpytorch-cp310-torch251-ubuntu2004-x64.run
```
## Install the KL3-customized build of PyTorch(Only MIMO V2)
```
wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run https://klx-sdk-release-public.su.bcebos.com/kunlun2aiak_output/1231/xpytorch-cp310-torch251-ubuntu2004-x64.run
bash xpytorch-cp310-torch251-ubuntu2004-x64.run
```
## Install custom ops
```
pip install "https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/1130/xtorch_ops-0.1.2209%2B6752ad20-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-12-05T06%3A18%3A00Z%2F-1%2Fhost%2F14936c2b7e7c557c1400e4c467c79f7a9217374a7aa4a046711ac4d948f460cd"
```
## Install custom ops(Only MIMO V2)
```
pip install "https://vllm-ai-models.bj.bcebos.com/v1/vLLM-Kunlun/ops/swa/xtorch_ops-0.1.2109%252B523cb26d-cp310-cp310-linux_x86_64.whl"
```
## Install the KLX3 custom Triton build
```
pip install "https://cce-ai-models.bj.bcebos.com/v1/vllm-kunlun-0.11.0/triton-3.0.0%2Bb2cde523-cp310-cp310-linux_x86_64.whl"
```
## Install the AIAK custom ops library
```
pip install "https://cce-ai-models.bj.bcebos.com/XSpeedGate-whl/release_merge/20251219_152418/xspeedgate_ops-0.0.0-cp310-cp310-linux_x86_64.whl"
```

View File

@@ -0,0 +1,140 @@
# Multi XPU (DeepSeek-V3.2-Exp-w8a8)
## Run vllm-kunlun on Multi XPU
Setup environment using container:
Please follow the [installation.md](../installation.md) document to set up the environment first.
Create a container
```bash
# !/bin/bash
# rundocker.sh
XPU_NUM=8
DOCKER_DEVICE_CONFIG=""
if [ $XPU_NUM -gt 0 ]; then
for idx in $(seq 0 $((XPU_NUM-1))); do
DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpu${idx}:/dev/xpu${idx}"
done
DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpuctrl:/dev/xpuctrl"
fi
export build_image="xxx"
docker run -itd ${DOCKER_DEVICE_CONFIG} \
--net=host \
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
--tmpfs /dev/shm:rw,nosuid,nodev,exec,size=32g \
--cap-add=SYS_PTRACE \
-v /home/users/vllm-kunlun:/home/vllm-kunlun \
-v /usr/local/bin/xpu-smi:/usr/local/bin/xpu-smi \
--name "$1" \
-w /workspace \
"$build_image" /bin/bash
```
### Preparation Weight
- Pull DeepSeek-V3.2-Exp-w8a8-int8 weights
```
wget -O DeepSeek-V3.2-Exp-w8a8-int8.tar.gz https://aihc-private-hcd.bj.bcebos.com/v1/LLM/DeepSeek/DeepSeek-V3.2-Exp-w8a8-int8.tar.gz?authorization=bce-auth-v1%2FALTAKvz6x4eqcmSsKjQxq3vZdB%2F2025-12-24T06%3A07%3A10Z%2F-1%2Fhost%2Fa324bf469176934a05f75d3acabc3c1fb891be150f43fb1976e65b7ec68733db
```
- Ensure that the field "quantization_config" is included.If not, deployment will result in an OOM (Out of Memory) error.
vim model/DeepSeek-V3.2-Exp-w8a8-int8/config.json
```config.json
"quantization_config": {
"config_groups": {
"group_0": {
"format": "int-quantized",
"input_activations": {
"actorder": null,
"block_structure": null,
"dynamic": true,
"group_size": null,
"num_bits": 8,
"observer": null,
"observer_kwargs": {},
"strategy": "token",
"symmetric": true,
"type": "int"
},
"output_activations": null,
"targets": [
"Linear"
],
"weights": {
"actorder": null,
"block_structure": null,
"dynamic": false,
"group_size": null,
"num_bits": 8,
"observer": "minmax",
"observer_kwargs": {},
"strategy": "channel",
"symmetric": true,
"type": "int"
}
}
},
"format": "int-quantized",
"global_compression_ratio": null,
"ignore": [
"lm_head"
],
"kv_cache_scheme": null,
"quant_method": "compressed-tensors",
"quantization_status": "compressed",
"sparsity_config": {},
"transform_config": {},
"version": "0.12.2"
},
```
### Online Serving on Multi XPU
Start the vLLM server on multi XPU:
```bash
unset XPU_DUMMY_EVENT && \
export XPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && \
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && \
export XMLIR_CUDNN_ENABLED=1 && \
export XPU_USE_DEFAULT_CTX=1 && \
export XMLIR_FORCE_USE_XPU_GRAPH=1 && \
export XMLIR_ENABLE_FAST_FC=1 && \
export XPU_USE_FAST_SWIGLU=1 && \
export CUDA_GRAPH_OPTIMIZE_STREAM=1 && \
export XMLIR_ENABLE_MOCK_TORCH_COMPILE=false && \
export XPU_USE_MOE_SORTED_THRES=1 && \
export USE_ORI_ROPE=1 && \
export VLLM_USE_V1=1
python -m vllm.entrypoints.openai.api_server \
--host 0.0.0.0 \
--port 8806 \
--model /data/DeepSeek-V3.2-Exp-w8a8-int8 \
--gpu-memory-utilization 0.95 \
--trust-remote-code \
--max-model-len 32768 \
--tensor-parallel-size 8 \
--dtype float16 \
--max_num_seqs 32 \
--max_num_batched_tokens 8192 \
--block-size 64 \
--no-enable-chunked-prefill \
--distributed-executor-backend mp \
--disable-log-requests \
--no-enable-prefix-caching --kv-cache-dtype bfloat16 \
--compilation-config '{"splitting_ops":["vllm.unified_attention",
"vllm.unified_attention_with_output",
"vllm.unified_attention_with_output_kunlun",
"vllm.mamba_mixer2",
"vllm.mamba_mixer",
"vllm.short_conv",
"vllm.linear_attention",
"vllm.plamo2_mamba_mixer",
"vllm.gdn_attention",
"vllm.sparse_attn_indexer",
"vllm.sparse_attn_indexer_vllm_kunlun"]}'
```