diff --git a/.github/workflows/run-e2e.yml b/.github/workflows/run-e2e.yml index 4bd1cc3..0465bf3 100644 --- a/.github/workflows/run-e2e.yml +++ b/.github/workflows/run-e2e.yml @@ -1,76 +1,76 @@ -name: e2e-test +# name: e2e-test -on: - workflow_call: - pull_request: - branches: [main] - types: [opened, synchronize, reopened] - push: - branches: [main] +# on: +# workflow_call: +# pull_request: +# branches: [main] +# types: [opened, synchronize, reopened] +# push: +# branches: [main] -concurrency: - group: e2e-singlecard - cancel-in-progress: false +# concurrency: +# group: e2e-singlecard +# cancel-in-progress: false -jobs: - e2e: - name: e2e-test-singlecard - runs-on: - - self-hosted - - Linux - - X64 +# jobs: +# e2e: +# name: e2e-test-singlecard +# runs-on: +# - self-hosted +# - Linux +# - X64 - steps: - - name: Checkout PR code - uses: actions/checkout@v4 - with: - fetch-depth: 0 +# steps: +# - name: Checkout PR code +# uses: actions/checkout@v4 +# with: +# fetch-depth: 0 - - name: Verify PR workspace - run: | - echo "===== WORKSPACE =====" - pwd - ls -l - echo "===== GIT INFO =====" - git rev-parse HEAD - git log -1 --oneline - git status --porcelain +# - name: Verify PR workspace +# run: | +# echo "===== WORKSPACE =====" +# pwd +# ls -l +# echo "===== GIT INFO =====" +# git rev-parse HEAD +# git log -1 --oneline +# git status --porcelain - - name: Start docker - run: | - bash ci/scripts/docker/start_docker.sh +# - name: Start docker +# run: | +# bash ci/scripts/docker/start_docker.sh - - name: Install enviroments - env: - PROXY_URL: ${{ secrets.PROXY_URL }} - NO_PROXY_LIST: ${{ secrets.NO_PROXY_LIST }} - run: | - bash ci/scripts/env/install_env.sh +# - name: Install enviroments +# env: +# PROXY_URL: ${{ secrets.PROXY_URL }} +# NO_PROXY_LIST: ${{ secrets.NO_PROXY_LIST }} +# run: | +# bash ci/scripts/env/install_env.sh - - name: Start vLLM server - run: | - bash ci/scripts/server/start_vllm.sh +# - name: Start vLLM server +# run: | +# bash 
ci/scripts/server/start_vllm.sh - - name: Wait for vLLM ready - run: | - bash ci/scripts/server/wait_vllm.sh +# - name: Wait for vLLM ready +# run: | +# bash ci/scripts/server/wait_vllm.sh - - name: API Test - run: | - docker exec aiak-e2e-singlecard bash -lc ' - curl http://localhost:8356/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d @- << "EOF" - { - "model": "Qwen3-8B", - "messages": [ - { "role": "user", "content": "Who are you?" } - ], - "max_tokens": 200, - "temperature": 0 - } - EOF - ' +# - name: API Test +# run: | +# docker exec aiak-e2e-singlecard bash -lc ' +# curl http://localhost:8356/v1/chat/completions \ +# -H "Content-Type: application/json" \ +# -d @- << "EOF" +# { +# "model": "Qwen3-8B", +# "messages": [ +# { "role": "user", "content": "Who are you?" } +# ], +# "max_tokens": 200, +# "temperature": 0 +# } +# EOF +# ' # - name: Accuracy testing # run: | diff --git a/docs/source/installation.md b/docs/source/installation.md index fab5284..8da726b 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -11,7 +11,9 @@ This document describes how to install vllm-kunlun manually. - vLLM (same version as vllm-kunlun) ## Setup environment using container -We provide a clean, minimal base image for your use`wjie520/vllm_kunlun:base_v0.0.2` and `wjie520/vllm_kunlun:base_mimo_v0.0.2`(Only MIMO_V2 and GPT-OSS).You can pull it using the `docker pull` command. +We provide a clean, minimal base image for your use: `wjie520/vllm_kunlun:uv_base`. You can pull it using the `docker pull wjie520/vllm_kunlun:uv_base` command. + +We also provide images with xpytorch and ops installed. You can pull it using the `wjie520/vllm_kunlun:base_v0.0.2 and wjie520/vllm_kunlun:base_mimo_v0.0.2 (Only MIMO_V2 and GPT-OSS)` command. ### Container startup script :::::{tab-set} @@ -19,9 +21,8 @@ We provide a clean, minimal base image for your use`wjie520/vllm_kunlun:base_v0.
::::{tab-item} start_docker.sh :selected: -:sync: pip +:sync: uv pip ```{code-block} bash - :substitutions: #!/bin/bash XPU_NUM=8 DOCKER_DEVICE_CONFIG="" @@ -31,7 +32,7 @@ if [ $XPU_NUM -gt 0 ]; then done DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpuctrl:/dev/xpuctrl" fi -export build_image="wjie520/vllm_kunlun:base_v0.0.2" +export build_image="wjie520/vllm_kunlun:uv_base" # or export build_image="iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.32" docker run -itd ${DOCKER_DEVICE_CONFIG} \ @@ -63,8 +64,71 @@ uv pip install -r requirements.txt python setup.py build python setup.py install +``` + +### Replace eval_frame.py +Copy the eval_frame.py patch: +``` +cp vllm_kunlun/patches/eval_frame.py /root/miniconda/envs/vllm_kunlun_0.10.1.1/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py +``` + +## Choose to download customized xpytorch + +### Install the KL3-customized build of PyTorch +``` +wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/1130/xpytorch-cp310-torch251-ubuntu2004-x64.run?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-12-02T05%3A01%3A27Z%2F-1%2Fhost%2Ff3cf499234f82303891aed2bcb0628918e379a21e841a3fac6bd94afef491ff7 +(for the conda) +bash xpytorch-cp310-torch251-ubuntu2004-x64.run +(for the uv) +bash xpytorch-cp310-torch251-ubuntu2004-x64.run --noexec --target xpytorch_unpack && cd xpytorch_unpack/ && \ +sed -i 's/pip/uv pip/g; s/CONDA_PREFIX/VIRTUAL_ENV/g' setup.sh && bash setup.sh +``` +### Install the KL3-customized build of PyTorch (Only MIMO V2) +``` +wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run https://klx-sdk-release-public.su.bcebos.com/kunlun2aiak_output/1231/xpytorch-cp310-torch251-ubuntu2004-x64.run +(for the conda) +bash xpytorch-cp310-torch251-ubuntu2004-x64.run +(for the uv) +bash xpytorch-cp310-torch251-ubuntu2004-x64.run --noexec --target xpytorch_unpack && cd xpytorch_unpack/ && \ +sed -i 's/pip/uv pip/g; 
s/CONDA_PREFIX/VIRTUAL_ENV/g' setup.sh && bash setup.sh ``` + +### Install the KL3-customized build of PyTorch (Only DeepSeek-V3.2-Exp-w8a8) +``` +wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run https://aihc-private-hcd.bj.bcebos.com/v1/vllm-kunlun-ds/xpytorch-cp310-torch251-ubuntu2004-x64.run?authorization=bce-auth-v1%2FALTAKvz6x4eqcmSsKjQxq3vZdB%2F2026-02-03T01%3A59%3A40Z%2F-1%2Fhost%2Ffc4b6f5b83c2fde70d48fdfc23c40c396efc9cb3c36d6f811fdca5f109073321 +(for the conda) +bash xpytorch-cp310-torch251-ubuntu2004-x64.run +(for the uv) +bash xpytorch-cp310-torch251-ubuntu2004-x64.run --noexec --target xpytorch_unpack && cd xpytorch_unpack/ && \ +mv torch_xray-999.9.9-cp310-cp310-linux_x86_64.whl torch_xray-2.0.3-cp310-cp310-linux_x86_64.whl && \ +sed -i 's/pip/uv pip/g; s/CONDA_PREFIX/VIRTUAL_ENV/g; s/torch_xray-999.9.9/torch_xray-2.0.3/' setup.sh && bash setup.sh +``` +## Choose to download customized ops + +### Install custom ops +``` +uv pip install "https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/1130/xtorch_ops-0.1.2209%2B6752ad20-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-12-05T06%3A18%3A00Z%2F-1%2Fhost%2F14936c2b7e7c557c1400e4c467c79f7a9217374a7aa4a046711ac4d948f460cd" +``` +### Install custom ops (Only MIMO V2) +``` +uv pip install "https://vllm-ai-models.bj.bcebos.com/v1/vLLM-Kunlun/ops/swa/xtorch_ops-0.1.2109%252B523cb26d-cp310-cp310-linux_x86_64.whl" +``` +### Install custom ops (Only DeepSeek-V3.2-Exp-w8a8) +``` +uv pip install "https://klx-sdk-release-public.su.bcebos.com/kunlun2aiak_output/1215/xtorch_ops-0.1.2263%2Bc030eebd-cp310-cp310-linux_x86_64.whl" +``` + +## Install the KLX3 custom Triton build +``` +uv pip install "https://cce-ai-models.bj.bcebos.com/v1/vllm-kunlun-0.11.0/triton-3.0.0%2Bb2cde523-cp310-cp310-linux_x86_64.whl" +``` +## Install the AIAK custom ops library +``` +uv pip install 
"https://cce-ai-models.bj.bcebos.com/XSpeedGate-whl/release_merge/20251219_152418/xspeedgate_ops-0.0.0-cp310-cp310-linux_x86_64.whl" +``` + + ## Quick Start ### Set up the environment @@ -81,7 +145,6 @@ chmod +x /workspace/vLLM-Kunlun/setup_env.sh && source /workspace/vLLM-Kunlun/se :selected: :sync: pip ```{code-block} bash - :substitutions: python -m vllm.entrypoints.openai.api_server \ --host 0.0.0.0 \ --port 8356 \ @@ -112,41 +175,3 @@ python -m vllm.entrypoints.openai.api_server \ ``` :::: ::::: - - -### xpytorch and ops install -We also provide xpytorch and ops link for custom installation. - -### Replace eval_frame.py -Copy the eval_frame.py patch: -``` -cp vllm_kunlun/patches/eval_frame.py /root/miniconda/envs/vllm_kunlun_0.10.1.1/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py -``` -## Install the KL3-customized build of PyTorch -``` -wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/1130/xpytorch-cp310-torch251-ubuntu2004-x64.run?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-12-02T05%3A01%3A27Z%2F-1%2Fhost%2Ff3cf499234f82303891aed2bcb0628918e379a21e841a3fac6bd94afef491ff7 -bash xpytorch-cp310-torch251-ubuntu2004-x64.run -``` -## Install the KL3-customized build of PyTorch(Only MIMO V2) -``` -wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run https://klx-sdk-release-public.su.bcebos.com/kunlun2aiak_output/1231/xpytorch-cp310-torch251-ubuntu2004-x64.run -bash xpytorch-cp310-torch251-ubuntu2004-x64.run -``` - -## Install custom ops -``` -pip install "https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/1130/xtorch_ops-0.1.2209%2B6752ad20-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-12-05T06%3A18%3A00Z%2F-1%2Fhost%2F14936c2b7e7c557c1400e4c467c79f7a9217374a7aa4a046711ac4d948f460cd" -``` -## Install custom ops(Only MIMO V2) -``` -pip install 
"https://vllm-ai-models.bj.bcebos.com/v1/vLLM-Kunlun/ops/swa/xtorch_ops-0.1.2109%252B523cb26d-cp310-cp310-linux_x86_64.whl" -``` - -## Install the KLX3 custom Triton build -``` -pip install "https://cce-ai-models.bj.bcebos.com/v1/vllm-kunlun-0.11.0/triton-3.0.0%2Bb2cde523-cp310-cp310-linux_x86_64.whl" -``` -## Install the AIAK custom ops library -``` -pip install "https://cce-ai-models.bj.bcebos.com/XSpeedGate-whl/release_merge/20251219_152418/xspeedgate_ops-0.0.0-cp310-cp310-linux_x86_64.whl" -``` diff --git a/docs/source/tutorials/multi_xpu_DeepSeek-V3.2-Exp-w8a8.md b/docs/source/tutorials/multi_xpu_DeepSeek-V3.2-Exp-w8a8.md new file mode 100644 index 0000000..e4f2b6f --- /dev/null +++ b/docs/source/tutorials/multi_xpu_DeepSeek-V3.2-Exp-w8a8.md @@ -0,0 +1,140 @@ +# Multi XPU (DeepSeek-V3.2-Exp-w8a8) + +## Run vllm-kunlun on Multi XPU + +Setup environment using container: + +Please follow the [installation.md](../installation.md) document to set up the environment first. + +Create a container +```bash +# !/bin/bash +# rundocker.sh +XPU_NUM=8 +DOCKER_DEVICE_CONFIG="" +if [ $XPU_NUM -gt 0 ]; then + for idx in $(seq 0 $((XPU_NUM-1))); do + DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpu${idx}:/dev/xpu${idx}" + done + DOCKER_DEVICE_CONFIG="${DOCKER_DEVICE_CONFIG} --device=/dev/xpuctrl:/dev/xpuctrl" +fi + +export build_image="xxx" + +docker run -itd ${DOCKER_DEVICE_CONFIG} \ + --net=host \ + --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ + --tmpfs /dev/shm:rw,nosuid,nodev,exec,size=32g \ + --cap-add=SYS_PTRACE \ + -v /home/users/vllm-kunlun:/home/vllm-kunlun \ + -v /usr/local/bin/xpu-smi:/usr/local/bin/xpu-smi \ + --name "$1" \ + -w /workspace \ + "$build_image" /bin/bash +``` + +### Preparation Weight + +- Pull DeepSeek-V3.2-Exp-w8a8-int8 weights + ``` + wget -O DeepSeek-V3.2-Exp-w8a8-int8.tar.gz 
https://aihc-private-hcd.bj.bcebos.com/v1/LLM/DeepSeek/DeepSeek-V3.2-Exp-w8a8-int8.tar.gz?authorization=bce-auth-v1%2FALTAKvz6x4eqcmSsKjQxq3vZdB%2F2025-12-24T06%3A07%3A10Z%2F-1%2Fhost%2Fa324bf469176934a05f75d3acabc3c1fb891be150f43fb1976e65b7ec68733db + ``` +- Ensure that the field "quantization_config" is included.If not, deployment will result in an OOM (Out of Memory) error. + +vim model/DeepSeek-V3.2-Exp-w8a8-int8/config.json +```config.json +"quantization_config": { + "config_groups": { + "group_0": { + "format": "int-quantized", + "input_activations": { + "actorder": null, + "block_structure": null, + "dynamic": true, + "group_size": null, + "num_bits": 8, + "observer": null, + "observer_kwargs": {}, + "strategy": "token", + "symmetric": true, + "type": "int" + }, + "output_activations": null, + "targets": [ + "Linear" + ], + "weights": { + "actorder": null, + "block_structure": null, + "dynamic": false, + "group_size": null, + "num_bits": 8, + "observer": "minmax", + "observer_kwargs": {}, + "strategy": "channel", + "symmetric": true, + "type": "int" + } + } + }, + "format": "int-quantized", + "global_compression_ratio": null, + "ignore": [ + "lm_head" + ], + "kv_cache_scheme": null, + "quant_method": "compressed-tensors", + "quantization_status": "compressed", + "sparsity_config": {}, + "transform_config": {}, + "version": "0.12.2" + }, +``` + +### Online Serving on Multi XPU + +Start the vLLM server on multi XPU: + +```bash +unset XPU_DUMMY_EVENT && \ +export XPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && \ +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && \ +export XMLIR_CUDNN_ENABLED=1 && \ +export XPU_USE_DEFAULT_CTX=1 && \ +export XMLIR_FORCE_USE_XPU_GRAPH=1 && \ +export XMLIR_ENABLE_FAST_FC=1 && \ +export XPU_USE_FAST_SWIGLU=1 && \ +export CUDA_GRAPH_OPTIMIZE_STREAM=1 && \ +export XMLIR_ENABLE_MOCK_TORCH_COMPILE=false && \ +export XPU_USE_MOE_SORTED_THRES=1 && \ +export USE_ORI_ROPE=1 && \ +export VLLM_USE_V1=1 + +python -m vllm.entrypoints.openai.api_server \ + 
--host 0.0.0.0 \ + --port 8806 \ + --model /data/DeepSeek-V3.2-Exp-w8a8-int8 \ + --gpu-memory-utilization 0.95 \ + --trust-remote-code \ + --max-model-len 32768 \ + --tensor-parallel-size 8 \ + --dtype float16 \ + --max_num_seqs 32 \ + --max_num_batched_tokens 8192 \ + --block-size 64 \ + --no-enable-chunked-prefill \ + --distributed-executor-backend mp \ + --disable-log-requests \ + --no-enable-prefix-caching --kv-cache-dtype bfloat16 \ + --compilation-config '{"splitting_ops":["vllm.unified_attention", + "vllm.unified_attention_with_output", + "vllm.unified_attention_with_output_kunlun", + "vllm.mamba_mixer2", + "vllm.mamba_mixer", + "vllm.short_conv", + "vllm.linear_attention", + "vllm.plamo2_mamba_mixer", + "vllm.gdn_attention", + "vllm.sparse_attn_indexer", + "vllm.sparse_attn_indexer_vllm_kunlun"]}' +```