diff --git a/docs/source/conf.py b/docs/source/conf.py
index 149f3275..72428023 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -80,6 +80,9 @@ myst_substitutions = {
     'ci_vllm_version': 'v0.11.0',
 }
 
+# For cross-file header anchors
+myst_heading_anchors = 5
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
diff --git a/docs/source/developer_guide/evaluation/index.md b/docs/source/developer_guide/evaluation/index.md
index 16a80ded..8bc6894b 100644
--- a/docs/source/developer_guide/evaluation/index.md
+++ b/docs/source/developer_guide/evaluation/index.md
@@ -5,6 +5,7 @@
 :maxdepth: 1
 using_evalscope
 using_lm_eval
+using_ais_bench
 using_opencompass
 accuracy_report/index
 :::
diff --git a/docs/source/developer_guide/evaluation/using_ais_bench.md b/docs/source/developer_guide/evaluation/using_ais_bench.md
new file mode 100644
index 00000000..62b1db7b
--- /dev/null
+++ b/docs/source/developer_guide/evaluation/using_ais_bench.md
@@ -0,0 +1,283 @@
# Using AISBench

This document guides you through accuracy testing with [AISBench](https://gitee.com/aisbench/benchmark/tree/master). AISBench provides accuracy and performance evaluation for many datasets.

## Online Server

### 1. Start the vLLM server

You can run a docker container to start the vLLM server on a single NPU:

```{code-block} bash
 :substitutions:
# Update DEVICE according to your device (/dev/davinci[0-7])
export DEVICE=/dev/davinci7
# Update the vllm-ascend image
export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
docker run --rm \
--name vllm-ascend \
--shm-size=1g \
--device $DEVICE \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /root/.cache:/root/.cache \
-p 8000:8000 \
-e VLLM_USE_MODELSCOPE=True \
-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
-it $IMAGE \
/bin/bash
```

Run the vLLM server inside the container:

```{code-block} bash
 :substitutions:
vllm serve Qwen/Qwen2.5-0.5B-Instruct --max_model_len 35000 &
```

:::{note}
`--max_model_len` should be at least `35000`, which is suitable for most datasets; a smaller value may affect the accuracy evaluation.
:::

The vLLM server has started successfully if you see logs like the following:

```
INFO: Started server process [9446]
INFO: Waiting for application startup.
INFO: Application startup complete.
```

### 2. Run different datasets using AISBench

#### Install AISBench

Refer to [AISBench](https://gitee.com/aisbench/benchmark/tree/master) for details.
Install AISBench from source:

```shell
git clone https://gitee.com/aisbench/benchmark.git
cd benchmark/
pip3 install -e ./ --use-pep517
```

Install the extra AISBench dependencies:

```shell
pip3 install -r requirements/api.txt
pip3 install -r requirements/extra.txt
```

Run `ais_bench -h` to check the installation.

#### Download Dataset

You can choose one or multiple datasets for accuracy evaluation.

1. `C-Eval` dataset.

This guide takes the `C-Eval` dataset as an example; refer to [Datasets](https://gitee.com/aisbench/benchmark/tree/master/ais_bench/benchmark/configs/datasets) for more datasets.
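Each dataset is driven by a config file in the AISBench repository; the name passed to `--datasets` later (for example `ceval_gen_0_shot_cot_chat_prompt.py`) comes from that config tree. A quick way to browse what is available, assuming you are still in the cloned `benchmark/` directory (exact folder names may differ between releases):

```shell
# List the dataset families shipped with AISBench
ls ais_bench/benchmark/configs/datasets/
# List the runnable C-Eval configs
find ais_bench/benchmark/configs/datasets/ceval -name "*.py"
```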
Every dataset has a `README.md` describing the detailed download and installation process.

Download the dataset and install it to the expected path:

```shell
cd ais_bench/datasets
mkdir ceval/
mkdir ceval/formal_ceval
cd ceval/formal_ceval
wget https://www.modelscope.cn/datasets/opencompass/ceval-exam/resolve/master/ceval-exam.zip
unzip ceval-exam.zip
rm ceval-exam.zip
```

2. `MMLU` dataset.

```shell
cd ais_bench/datasets
wget http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu.zip
unzip mmlu.zip
rm mmlu.zip
```

3. `GPQA` dataset.

```shell
cd ais_bench/datasets
wget http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gpqa.zip
unzip gpqa.zip
rm gpqa.zip
```

4. `MATH` dataset.

```shell
cd ais_bench/datasets
wget http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip
unzip math.zip
rm math.zip
```

5. `LiveCodeBench` dataset.

```shell
cd ais_bench/datasets
git lfs install
git clone https://huggingface.co/datasets/livecodebench/code_generation_lite
```

6. `AIME 2024` dataset.

```shell
cd ais_bench/datasets
mkdir aime/
cd aime/
wget http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime.zip
unzip aime.zip
rm aime.zip
```

7. `GSM8K` dataset.

```shell
cd ais_bench/datasets
wget http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gsm8k.zip
unzip gsm8k.zip
rm gsm8k.zip
```

#### Configuration

Update the file `benchmark/ais_bench/benchmark/configs/models/vllm_api/vllm_api_general_chat.py`.
There are several arguments that you should update according to your environment:

- `path`: update to your model weight path.
- `model`: update to your model name in vLLM.
- `host_ip` and `host_port`: update to your vLLM server IP and port.
- `max_out_len`: note that `max_out_len` plus the prompt length must be less than `max-model-len` (configured in your vLLM server); `32768` is suitable for most datasets.
- `batch_size`: update according to your dataset.
- `temperature`: update the sampling arguments as needed.

```python
from ais_bench.benchmark.models import VLLMCustomAPIChat
from ais_bench.benchmark.utils.model_postprocessors import extract_non_reasoning_content

models = [
    dict(
        attr="service",
        type=VLLMCustomAPIChat,
        abbr='vllm-api-general-chat',
        path="xxxx",
        model="xxxx",
        request_rate = 0,
        retry = 2,
        host_ip = "localhost",
        host_port = 8000,
        max_out_len = xxx,
        batch_size = xxx,
        trust_remote_code=False,
        generation_kwargs = dict(
            temperature = 0.6,
            top_k = 10,
            top_p = 0.95,
            seed = None,
            repetition_penalty = 1.03,
        ),
        pred_postprocessor=dict(type=extract_non_reasoning_content)
    )
]
```
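Before launching a full run, it can help to confirm that `host_ip`, `host_port`, and `model` match the running server. A minimal check against vLLM's OpenAI-compatible endpoint, assuming the server from step 1 on `localhost:8000`:

```shell
# Returns a JSON model list; the "id" field is the value to use for `model` above.
curl -s http://localhost:8000/v1/models
```

#### Execute Accuracy Evaluation

Run the following commands to execute the accuracy evaluation for each dataset: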
+ +```shell +# run C-Eval dataset +ais_bench --models vllm_api_general_chat --datasets ceval_gen_0_shot_cot_chat_prompt.py --mode all --dump-eval-details --merge-ds + +# run MMLU dataset +ais_bench --models vllm_api_general_chat --datasets mmlu_gen_0_shot_cot_chat_prompt.py --mode all --dump-eval-details --merge-ds + +# run GPQA dataset +ais_bench --models vllm_api_general_chat --datasets gpqa_gen_0_shot_str.py --mode all --dump-eval-details --merge-ds + +# run MATH-500 dataset +ais_bench --models vllm_api_general_chat --datasets math500_gen_0_shot_cot_chat_prompt.py --mode all --dump-eval-details --merge-ds + +# run LiveCodeBench dataset +ais_bench --models vllm_api_general_chat --datasets livecodebench_code_generate_lite_gen_0_shot_chat.py --mode all --dump-eval-details --merge-ds + +# run AIME 2024 dataset +ais_bench --models vllm_api_general_chat --datasets aime2024_gen_0_shot_chat_prompt.py --mode all --dump-eval-details --merge-ds + +``` + +After each dataset execution, you can get the result from saved files such as `outputs/default/20250628_151326`, there is an example as follows: + +``` +20250628_151326/ +├── configs # Combined configuration file for model tasks, dataset tasks, and result presentation tasks +│ └── 20250628_151326_29317.py +├── logs # Execution logs; if --debug is added to the command, no intermediate logs are saved to disk (all are printed directly to the screen) +│ ├── eval +│ │ └── vllm-api-general-chat +│ │ └── demo_gsm8k.out # Logs of the accuracy evaluation process based on inference results in the predictions/ folder +│ └── infer +│ └── vllm-api-general-chat +│ └── demo_gsm8k.out # Logs of the inference process +├── predictions +│ └── vllm-api-general-chat +│ └── demo_gsm8k.json # Inference results (all outputs returned by the inference service) +├── results +│ └── vllm-api-general-chat +│ └── demo_gsm8k.json # Raw scores calculated from the accuracy evaluation +└── summary + ├── summary_20250628_151326.csv # Final accuracy scores (in table format) + ├── summary_20250628_151326.md # Final accuracy scores (in Markdown format) + └── summary_20250628_151326.txt # Final accuracy scores (in text format) +``` + +#### Execute Performance Evaluation + +```shell +# run C-Eval dataset +ais_bench --models vllm_api_general_chat --datasets ceval_gen_0_shot_cot_chat_prompt.py --summarizer default_perf --mode perf + +# run MMLU dataset +ais_bench --models vllm_api_general_chat --datasets mmlu_gen_0_shot_cot_chat_prompt.py --summarizer default_perf --mode perf + +# run GPQA dataset +ais_bench --models vllm_api_general_chat --datasets gpqa_gen_0_shot_str.py --summarizer default_perf --mode perf + +# run MATH-500 dataset +ais_bench --models vllm_api_general_chat --datasets math500_gen_0_shot_cot_chat_prompt.py --summarizer default_perf --mode perf + +# run LiveCodeBench dataset +ais_bench --models vllm_api_general_chat --datasets livecodebench_code_generate_lite_gen_0_shot_chat.py --summarizer default_perf --mode perf + +# run AIME 2024 dataset +ais_bench --models vllm_api_general_chat --datasets aime2024_gen_0_shot_chat_prompt.py --summarizer default_perf --mode perf +``` + +After execution, you can get the result from saved files, there is an example as follows: + +``` +20251031_070226/ +|-- configs # Combined configuration file for model tasks, dataset tasks, and result presentation tasks +| `-- 20251031_070226_122485.py +|-- logs +| `-- performances +| `-- vllm-api-general-chat +| `-- cevaldataset.out # Logs of the performance evaluation process +`-- performances + `-- 
    `-- vllm-api-general-chat
        |-- cevaldataset.csv             # Final performance results (table format)
        |-- cevaldataset.json            # Final performance results (JSON format)
        |-- cevaldataset_details.h5      # Detailed performance results
        |-- cevaldataset_details.json    # Detailed performance results
        |-- cevaldataset_plot.html       # Final performance results (HTML plot)
        `-- cevaldataset_rps_distribution_plot_with_actual_rps.html    # Request-rate distribution plot (HTML)
```
diff --git a/docs/source/developer_guide/evaluation/using_lm_eval.md b/docs/source/developer_guide/evaluation/using_lm_eval.md
index 2075c5f7..11670aa3 100644
--- a/docs/source/developer_guide/evaluation/using_lm_eval.md
+++ b/docs/source/developer_guide/evaluation/using_lm_eval.md
@@ -122,10 +122,10 @@ After 30 minutes, the output is as shown below:
 ```
 The markdown format results is as below:
 
-Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
+|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
 |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
 |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.3215|± |0.0129|
-| | |strict-match | 5|exact_match|↑ |0.2077|± |0.0112|
+|gsm8k| 3|strict-match | 5|exact_match|↑ |0.2077|± |0.0112|
 
 ```
@@ -187,7 +187,7 @@ The markdown format results is as below:
 Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
 |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
 |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.3412|± |0.0131|
-| | |strict-match | 5|exact_match|↑ |0.3139|± |0.0128|
+|gsm8k| 3|strict-match | 5|exact_match|↑ |0.3139|± |0.0128|
 
 ```
diff --git a/docs/source/installation.md b/docs/source/installation.md
index e8bc8cbb..fbc1c391 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -20,7 +20,7 @@ There are two installation methods:
 - **Using pip**: first prepare env manually or via CANN image, then install `vllm-ascend` using pip.
 - **Using docker**: use the `vllm-ascend` pre-built docker image directly.
 
-## Configure a new environment
+## Configure Ascend CANN environment
 
 Before installation, you need to make sure firmware/driver and CANN are installed correctly, refer to [Ascend Environment Setup Guide](https://ascend.github.io/docs/sources/ascend/quick_install.html) for more details.
@@ -109,14 +109,7 @@ No more extra step if you are using `vllm-ascend` prebuilt Docker image.
 
 Once it is done, you can start to set up `vllm` and `vllm-ascend`.
 
-## Setup vllm and vllm-ascend
-
-:::::{tab-set}
-:sync-group: install
-
-::::{tab-item} Using pip
-:selected:
-:sync: pip
+## Set up using Python
 
 First install system dependencies and configure pip mirror:
@@ -181,12 +174,19 @@ To build custom operators, gcc/g++ higher than 8 and c++ 17 or higher is required.
 If you encounter other problems during compiling, it is probably because unexpected compiler is being used, you may export `CXX_COMPILER` and `C_COMPILER` in environment to specify your g++ and gcc locations before compiling.
 ```
 
-::::
+## Set up using Docker
 
-::::{tab-item} Using docker
-:sync: docker
+`vllm-ascend` offers prebuilt Docker images for deployment. You can pull the **prebuilt image** from the image repository [ascend/vllm-ascend](https://quay.io/repository/ascend/vllm-ascend?tab=tags) and run it with bash.
 
-You can just pull the **prebuilt image** and run it with bash.
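To run one of these images, pull the tag matching your hardware and OS; a minimal sketch (the `|vllm_ascend_version|` placeholder resolves to a release tag, as elsewhere in this guide):

```{code-block} bash
 :substitutions:
# Atlas A2 on Ubuntu
docker pull quay.io/ascend/vllm-ascend:|vllm_ascend_version|
# Atlas A3 on openEuler would instead use the -a3-openeuler suffix:
# docker pull quay.io/ascend/vllm-ascend:|vllm_ascend_version|-a3-openeuler
```

The supported images are as follows: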
| Image tag | Hardware | OS |
|-|-|-|
| image-tag | Atlas A2 | Ubuntu |
| image-tag-openeuler | Atlas A2 | openEuler |
| image-tag-a3 | Atlas A3 | Ubuntu |
| image-tag-a3-openeuler | Atlas A3 | openEuler |
| image-tag-310p | Atlas 300I | Ubuntu |
| image-tag-310p-openeuler | Atlas 300I | openEuler |

:::{dropdown} Click here to see "Build from Dockerfile"
or build IMAGE from **source code**:

```bash
git clone https://github.com/vllm-project/vllm-ascend.git
cd vllm-ascend
docker build -t vllm-ascend-dev-image:latest -f ./Dockerfile .
```

:::

```{code-block} bash
 :substitutions:
-# Update DEVICE according to your device (/dev/davinci[0-7])
-export DEVICE=/dev/davinci7
-# Update the vllm-ascend image
+# Update --device according to your device (Atlas A2: /dev/davinci[0-7], Atlas A3: /dev/davinci[0-15]).
+# Update the vllm-ascend image according to your environment.
+# Note: you should download the model weights to /root/.cache in advance.
 export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
 docker run --rm \
 --name vllm-ascend-env \
 --shm-size=1g \
- --device $DEVICE \
+ --net=host \
+ --device /dev/davinci0 \
+ --device /dev/davinci1 \
+ --device /dev/davinci2 \
+ --device /dev/davinci3 \
+ --device /dev/davinci4 \
+ --device /dev/davinci5 \
+ --device /dev/davinci6 \
+ --device /dev/davinci7 \
 --device /dev/davinci_manager \
 --device /dev/devmm_svm \
 --device /dev/hisi_hdc \
 -v /usr/local/dcmi:/usr/local/dcmi \
+ -v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
 -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
 -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
 -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
@@ -223,9 +232,6 @@ docker run --rm \
 ```
 The default workdir is `/workspace`, vLLM and vLLM Ascend code are placed in `/vllm-workspace` and installed in [development mode](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) (`pip install -e`) to help developer immediately take place changes without requiring a new installation.
 
-::::
-
-:::::
 
 ## Extra information
@@ -287,3 +293,183 @@
 Prompt: 'The president of the United States is', Generated text: ' a very important person. He is the leader of the country and the people'
 Prompt: 'The capital of France is', Generated text: ' Paris. The oldest part of the city is Saint-Germain-des-Pr'
 Prompt: 'The future of AI is', Generated text: ' not bright\n\nThere is no doubt that the evolution of AI will have a huge'

## Multi-node Deployment

### Verify Multi-Node Communication

First, check physical-layer connectivity, then verify each node, and finally verify inter-node connectivity.

#### Physical Layer Requirements:

- The physical machines must be located on the same LAN, with network connectivity between them.
- All NPUs are connected with optical modules, and the connection status must be normal.

#### Per-Node Verification:

Execute the following commands on each node in sequence.
The results must all be `success` and the status must be `UP`:

:::::{tab-set}
::::{tab-item} A2 series

```bash
 # Check the remote switch ports
 for i in {0..7}; do hccn_tool -i $i -lldp -g | grep Ifname; done
 # Get the link status of the Ethernet ports (UP or DOWN)
 for i in {0..7}; do hccn_tool -i $i -link -g ; done
 # Check the network health status
 for i in {0..7}; do hccn_tool -i $i -net_health -g ; done
 # View the network detected IP configuration
 for i in {0..7}; do hccn_tool -i $i -netdetect -g ; done
 # View gateway configuration
 for i in {0..7}; do hccn_tool -i $i -gateway -g ; done
 # View NPU network configuration
 cat /etc/hccn.conf
```

::::
::::{tab-item} A3 series

```bash
 # Check the remote switch ports
 for i in {0..15}; do hccn_tool -i $i -lldp -g | grep Ifname; done
 # Get the link status of the Ethernet ports (UP or DOWN)
 for i in {0..15}; do hccn_tool -i $i -link -g ; done
 # Check the network health status
 for i in {0..15}; do hccn_tool -i $i -net_health -g ; done
 # View the network detected IP configuration
 for i in {0..15}; do hccn_tool -i $i -netdetect -g ; done
 # View gateway configuration
 for i in {0..15}; do hccn_tool -i $i -gateway -g ; done
 # View NPU network configuration
 cat /etc/hccn.conf
```

::::
:::::

#### Interconnect Verification:
##### 1. Get NPU IP Addresses
:::::{tab-set}
::::{tab-item} A2 series

```bash
for i in {0..7}; do hccn_tool -i $i -ip -g | grep ipaddr; done
```

::::
::::{tab-item} A3 series

```bash
for i in {0..15}; do hccn_tool -i $i -ip -g | grep ipaddr; done
```

::::
:::::

##### 2. Cross-Node PING Test

```bash
# Execute on one node, replacing x.x.x.x with an NPU IP address of the other node
hccn_tool -i 0 -ping -g address x.x.x.x
```
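To check the full mesh rather than a single device, you can loop the same ping over every local NPU. A minimal sketch, assuming an A2 node (8 devices) and that `hccn_tool` prints `success` for a reachable peer, as the checks above expect:

```bash
REMOTE_IP=x.x.x.x   # an NPU IP address obtained on the other node
for i in {0..7}; do
  if hccn_tool -i $i -ping -g address $REMOTE_IP | grep -q success; then
    echo "device $i -> $REMOTE_IP: OK"
  else
    echo "device $i -> $REMOTE_IP: FAILED"
  fi
done
```

### Run Container In Each Node

Using the official vllm-ascend container image is the most convenient way to set up a multi-node environment.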
Run the following command to start the container on each node (you should download the model weights to `/root/.cache` in advance):

:::::{tab-set}
::::{tab-item} A2 series

```{code-block} bash
 :substitutions:
# Update the vllm-ascend image
# openEuler:
# export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|-openeuler
# Ubuntu:
# export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|

# Run the container using the defined variables
# Note: if you are running a bridge network with docker, please expose the ports
# needed for multi-node communication in advance
docker run --rm \
--name vllm-ascend \
--net=host \
--shm-size=1g \
--device /dev/davinci0 \
--device /dev/davinci1 \
--device /dev/davinci2 \
--device /dev/davinci3 \
--device /dev/davinci4 \
--device /dev/davinci5 \
--device /dev/davinci6 \
--device /dev/davinci7 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /root/.cache:/root/.cache \
-it $IMAGE bash
```

::::
::::{tab-item} A3 series

```{code-block} bash
 :substitutions:
# Update the vllm-ascend image
# openEuler:
# export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|-a3-openeuler
# Ubuntu:
# export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|-a3
export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|-a3

# Run the container using the defined variables
# Note: if you are running a bridge network with docker, please expose the ports
# needed for multi-node communication in advance
docker run --rm \
--name vllm-ascend \
--net=host \
--shm-size=1g \
--device /dev/davinci0 \
--device /dev/davinci1 \
--device /dev/davinci2 \
--device /dev/davinci3 \
--device /dev/davinci4 \
--device /dev/davinci5 \
--device /dev/davinci6 \
--device /dev/davinci7 \
--device /dev/davinci8 \
--device /dev/davinci9 \
--device /dev/davinci10 \
--device /dev/davinci11 \
--device /dev/davinci12 \
--device /dev/davinci13 \
--device /dev/davinci14 \
--device /dev/davinci15 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /root/.cache:/root/.cache \
-it $IMAGE bash
```

::::
:::::

### Verify installation

TODO
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 6555a506..9ce0206a 100644
@@ -20,6 +20,10 @@
 # Update DEVICE according to your device (/dev/davinci[0-7])
 export DEVICE=/dev/davinci0
 # Update the vllm-ascend image
+# Atlas A2:
+# export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
+# Atlas A3:
+# export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|-a3
 export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
 docker run --rm \
 --name vllm-ascend \
@@ -50,6 +54,10 @@ apt-get update -y && apt-get install -y curl
 # Update DEVICE according to your device (/dev/davinci[0-7])
 export DEVICE=/dev/davinci0
 # Update the vllm-ascend image
+# Atlas A2:
+# export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|-openeuler
+# Atlas A3:
+# export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|-a3-openeuler
 export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|-openeuler
 docker run --rm \
 --name vllm-ascend \
diff --git a/docs/source/tutorials/DeepSeek-V3.2-Exp.md b/docs/source/tutorials/DeepSeek-V3.2-Exp.md
new file mode 100644
index 00000000..c3e7cbf6
--- /dev/null
+++ b/docs/source/tutorials/DeepSeek-V3.2-Exp.md
@@ -0,0 +1,429 @@
# DeepSeek-V3.2-Exp

## Introduction

DeepSeek-V3.2-Exp is a sparse-attention model. Its main architecture is similar to DeepSeek-V3.1, but it adds a sparse attention mechanism designed to explore and validate optimizations for training and inference efficiency in long-context scenarios.

This document walks through the main verification steps for the model: supported features, feature configuration, environment preparation, single-node and multi-node deployment, and accuracy and performance evaluation.

The `DeepSeek-V3.2-Exp` model is first supported in `vllm-ascend:v0.11.0rc0`.

## Supported Features

Refer to [supported features](../user_guide/support_matrix/supported_models.md) for the model's supported feature matrix.

Refer to [feature guide](../user_guide/feature_guide/index.md) for each feature's configuration.

## Environment Preparation

### Model Weight

- `DeepSeek-V3.2-Exp` (BF16 version): requires 2 Atlas 800 A3 (64G × 16) nodes or 4 Atlas 800 A2 (64G × 8) nodes. [Download model weight](https://modelers.cn/models/Modelers_Park/DeepSeek-V3.2-Exp-BF16)
- `DeepSeek-V3.2-Exp-w8a8` (quantized version): requires 1 Atlas 800 A3 (64G × 16) node or 2 Atlas 800 A2 (64G × 8) nodes. [Download model weight](https://modelers.cn/models/Modelers_Park/DeepSeek-V3.2-Exp-w8a8)

It is recommended to download the model weights to a directory shared across the nodes, such as `/root/.cache/`.

### Verify Multi-node Communication (Optional)

If you want to deploy a multi-node environment, verify multi-node communication according to [verify multi-node communication environment](../installation.md#verify-multi-node-communication).

### Installation

:::::{tab-set}
::::{tab-item} Use deepseek-v3.2 docker image

Currently, we provide the all-in-one images `quay.io/ascend/vllm-ascend:v0.11.0rc0-deepseek-v3.2-exp` (for Atlas 800 A2) and `quay.io/ascend/vllm-ascend:v0.11.0rc0-a3-deepseek-v3.2-exp` (for Atlas 800 A3).

Refer to [using docker](../installation.md#set-up-using-docker) to set up the environment with Docker, remembering to replace the image with the deepseek-v3.2 docker image.

:::{note}
The image is pinned to a specific release and will not receive new versions.
Only the AArch64 architecture is supported currently due to installation limitations of the extra operators.
:::

::::
::::{tab-item} Use vllm-ascend docker image

You can use our official docker image and install the extra operators required to support `DeepSeek-V3.2-Exp`.

:::{note}
Only the AArch64 architecture is supported currently due to installation limitations of the extra operators.
:::

For the `A3` image:
1. Start the docker container on your node; refer to [using docker](../installation.md#set-up-using-docker).

2. Install the package `custom-ops` to make the kernels available.

```shell
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run
chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run
./CANN-custom_ops-sfa-linux.aarch64.run --quiet
export ASCEND_CUSTOM_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize:${ASCEND_CUSTOM_OPP_PATH}
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl
pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl
```

3. Download and install `MLAPO`.

```shell
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-mlapo-linux.aarch64.run
# Please set a custom install path; /vllm-workspace/CANN is used as an example here.
chmod +x ./CANN-custom_ops-mlapo-linux.aarch64.run
./CANN-custom_ops-mlapo-linux.aarch64.run --quiet --install-path=/vllm-workspace/CANN
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/torch_npu-2.7.1%2Bgitb7c90d0-cp311-cp311-linux_aarch64.whl
pip install torch_npu-2.7.1+gitb7c90d0-cp311-cp311-linux_aarch64.whl
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/libopsproto_rt2.0.so
cp libopsproto_rt2.0.so /usr/local/Ascend/ascend-toolkit/8.2.RC1/opp/built-in/op_proto/lib/linux/aarch64/libopsproto_rt2.0.so
# Don't forget to replace /vllm-workspace/CANN/ with the custom path you set before.
source /vllm-workspace/CANN/vendors/customize/bin/set_env.bash
export LD_PRELOAD=/vllm-workspace/CANN/vendors/customize/op_proto/lib/linux/aarch64/libcust_opsproto_rt2.0.so:${LD_PRELOAD}
```

For the `A2` image, follow the same steps, replacing the `a3` release files in the `wget` URLs with their `a2` counterparts:

1. Start the docker container on your node; refer to [using docker](../installation.md#set-up-using-docker).

2. Install the package `custom-ops` to make the kernels available.

```shell
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a2/CANN-custom_ops-sfa-linux.aarch64.run
chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run
./CANN-custom_ops-sfa-linux.aarch64.run --quiet
export ASCEND_CUSTOM_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize:${ASCEND_CUSTOM_OPP_PATH}
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a2/custom_ops-1.0-cp311-cp311-linux_aarch64.whl
pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl
```

3. Download and install `MLAPO`.

```shell
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a2/CANN-custom_ops-mlapo-linux.aarch64.run
# Please set a custom install path; /vllm-workspace/CANN is used as an example here.
chmod +x ./CANN-custom_ops-mlapo-linux.aarch64.run
./CANN-custom_ops-mlapo-linux.aarch64.run --quiet --install-path=/vllm-workspace/CANN
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a2/torch_npu-2.7.1%2Bgitb7c90d0-cp311-cp311-linux_aarch64.whl
pip install torch_npu-2.7.1+gitb7c90d0-cp311-cp311-linux_aarch64.whl
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a2/libopsproto_rt2.0.so
cp libopsproto_rt2.0.so /usr/local/Ascend/ascend-toolkit/8.2.RC1/opp/built-in/op_proto/lib/linux/aarch64/libopsproto_rt2.0.so
# Don't forget to replace /vllm-workspace/CANN/ with the custom path you set before.
source /vllm-workspace/CANN/vendors/customize/bin/set_env.bash
export LD_PRELOAD=/vllm-workspace/CANN/vendors/customize/op_proto/lib/linux/aarch64/libcust_opsproto_rt2.0.so:${LD_PRELOAD}
```

::::
::::{tab-item} Build from source

You can build everything from source:

- Install `vllm-ascend`, refer to [set up using python](../installation.md#set-up-using-python).
- Install the extra operators required to support `DeepSeek-V3.2-Exp`; refer to the `Use vllm-ascend docker image` tab.

::::
:::::

If you want to deploy a multi-node environment, you need to set up the environment on each node.

## Deployment

### Single-node Deployment

Only the quantized model `DeepSeek-V3.2-Exp-w8a8` can be deployed on a single Atlas 800 A3 node.

Run the following script to start online inference:

```shell
#!/bin/sh
export VLLM_USE_MODELSCOPE=true

vllm serve vllm-ascend/DeepSeek-V3.2-Exp-W8A8 \
--host 0.0.0.0 \
--port 8000 \
--tensor-parallel-size 16 \
--seed 1024 \
--quantization ascend \
--served-model-name deepseek_v3.2 \
--max-num-seqs 16 \
--max-model-len 17450 \
--max-num-batched-tokens 17450 \
--enable-expert-parallel \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.92 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
```
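Startup can take a while for a model of this size. A simple way to wait for readiness is to poll vLLM's `/health` endpoint (a sketch; adjust host and port to your deployment):

```shell
# Poll until the server reports healthy; it can then accept requests.
until curl -sf http://localhost:8000/health > /dev/null; do
  echo "waiting for vllm server..."
  sleep 10
done
echo "server is ready"
```

### Multi-node Deployment

- `DeepSeek-V3.2-Exp`: requires 2 Atlas 800 A3 (64G × 16) nodes or 4 Atlas 800 A2 (64G × 8) nodes.
- `DeepSeek-V3.2-Exp-w8a8`: requires 2 Atlas 800 A2 (64G × 8) nodes.

:::::{tab-set}
::::{tab-item} DeepSeek-V3.2-Exp A3 series

Run the following scripts on the two nodes respectively.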
**Node 0**

```shell
#!/bin/sh

# Obtained via ifconfig;
# nic_name is the network interface name corresponding to local_ip of the current node
nic_name="xxxx"
local_ip="xxxx"

export VLLM_USE_MODELSCOPE=True
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export HCCL_BUFFSIZE=1024

vllm serve /root/.cache/Modelers_Park/DeepSeek-V3.2-Exp \
--host 0.0.0.0 \
--port 8000 \
--data-parallel-size 2 \
--data-parallel-size-local 1 \
--data-parallel-address $local_ip \
--data-parallel-rpc-port 13389 \
--tensor-parallel-size 16 \
--seed 1024 \
--served-model-name deepseek_v3.2 \
--enable-expert-parallel \
--max-num-seqs 16 \
--max-model-len 17450 \
--max-num-batched-tokens 17450 \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.9 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
```

**Node 1**

```shell
#!/bin/sh

# Obtained via ifconfig;
# nic_name is the network interface name corresponding to local_ip of the current node
nic_name="xxx"
local_ip="xxx"

# The value of node0_ip must be consistent with the value of local_ip set on node0 (the master node)
node0_ip="xxxx"

export VLLM_USE_MODELSCOPE=True
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export HCCL_BUFFSIZE=1024

vllm serve /root/.cache/Modelers_Park/DeepSeek-V3.2-Exp \
--host 0.0.0.0 \
--port 8000 \
--headless \
--data-parallel-size 2 \
--data-parallel-size-local 1 \
--data-parallel-start-rank 1 \
--data-parallel-address $node0_ip \
--data-parallel-rpc-port 13389 \
--tensor-parallel-size 16 \
--seed 1024 \
--served-model-name deepseek_v3.2 \
--max-num-seqs 16 \
--max-model-len 17450 \
--max-num-batched-tokens 17450 \
--enable-expert-parallel \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.92 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
```

::::
::::{tab-item} DeepSeek-V3.2-Exp-W8A8 A2 series

Run the following scripts on the two nodes respectively.
**Node 0**

```shell
#!/bin/sh

# Obtained via ifconfig;
# nic_name is the network interface name corresponding to local_ip of the current node
nic_name="xxxx"
local_ip="xxxx"

export VLLM_USE_MODELSCOPE=True
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export HCCL_BUFFSIZE=1024
export HCCL_OP_EXPANSION_MODE="AIV"
export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"

vllm serve vllm-ascend/DeepSeek-V3.2-Exp-W8A8 \
--host 0.0.0.0 \
--port 8000 \
--data-parallel-size 2 \
--data-parallel-size-local 1 \
--data-parallel-address $local_ip \
--data-parallel-rpc-port 13389 \
--tensor-parallel-size 8 \
--seed 1024 \
--served-model-name deepseek_v3.2 \
--enable-expert-parallel \
--max-num-seqs 16 \
--max-model-len 17450 \
--max-num-batched-tokens 17450 \
--trust-remote-code \
--quantization ascend \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.9 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
```

**Node 1**

```shell
#!/bin/sh

# Obtained via ifconfig;
# nic_name is the network interface name corresponding to local_ip of the current node
nic_name="xxx"
local_ip="xxx"

# The value of node0_ip must be consistent with the value of local_ip set on node0 (the master node)
node0_ip="xxxx"

export VLLM_USE_MODELSCOPE=True
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export HCCL_BUFFSIZE=1024
export HCCL_OP_EXPANSION_MODE="AIV"
export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"

vllm serve vllm-ascend/DeepSeek-V3.2-Exp-W8A8 \
--host 0.0.0.0 \
--port 8000 \
--headless \
--data-parallel-size 2 \
--data-parallel-size-local 1 \
--data-parallel-start-rank 1 \
--data-parallel-address $node0_ip \
--data-parallel-rpc-port 13389 \
--tensor-parallel-size 8 \
--seed 1024 \
--served-model-name deepseek_v3.2 \
--max-num-seqs 16 \
--max-model-len 17450 \
--max-num-batched-tokens 17450 \
--enable-expert-parallel \
--trust-remote-code \
--quantization ascend \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.92 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
```

::::
:::::

### Prefill-Decode Disaggregation

Not supported yet.

## Functional Verification

Once your server is started, you can query the model with input prompts (replace `<host_ip>` and `<port>` with your server address):

```shell
curl http://<host_ip>:<port>/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "deepseek_v3.2",
    "prompt": "The future of AI is",
    "max_tokens": 50,
    "temperature": 0
  }'
```

## Accuracy Evaluation

Two accuracy evaluation methods are shown here.

### Using AISBench

1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
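For example, with the model config's `host_ip`, `host_port`, and `model` pointed at this server, the C-Eval run from the AISBench guide looks like:

```shell
ais_bench --models vllm_api_general_chat --datasets ceval_gen_0_shot_cot_chat_prompt.py --mode all --dump-eval-details --merge-ds
```

2. After execution, you can get the result; here is the result of `DeepSeek-V3.2-Exp-W8A8` on `vllm-ascend:0.11.0rc0`, for reference only.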
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| cevaldataset | - | accuracy | gen | 92.20 |

### Using Language Model Evaluation Harness

As an example, take the `gsm8k` dataset as the test dataset and run the accuracy evaluation of `DeepSeek-V3.2-Exp-W8A8` in online mode.

1. Refer to [Using lm_eval](../developer_guide/evaluation/using_lm_eval.md) for `lm_eval` installation.

2. Run `lm_eval` to execute the accuracy evaluation.

```shell
lm_eval \
  --model local-completions \
  --model_args model=/root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-Exp-W8A8,base_url=http://127.0.0.1:8000/v1/completions,tokenized_requests=False,trust_remote_code=True \
  --tasks gsm8k \
  --output_path ./
```

3. After execution, you can get the result; here is the result of `DeepSeek-V3.2-Exp-W8A8` on `vllm-ascend:0.11.0rc0`, for reference only.

|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.9591|± |0.0055|
|gsm8k| 3|strict-match | 5|exact_match|↑ |0.9583|± |0.0055|

## Performance

### Using AISBench

Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.

### Using vLLM Benchmark

Run the performance evaluation of `DeepSeek-V3.2-Exp-W8A8` as an example.

Refer to [vllm benchmark](https://docs.vllm.ai/en/latest/contributing/benchmarks.html) for more details.

There are three `vllm bench` subcommands:

- `latency`: benchmark the latency of a single batch of requests.
- `serve`: benchmark the online serving throughput.
- `throughput`: benchmark offline inference throughput.

Take `serve` as an example. Run the command as follows:

```shell
export VLLM_USE_MODELSCOPE=true
vllm bench serve --model vllm-ascend/DeepSeek-V3.2-Exp-W8A8 --dataset-name random --random-input 200 --num-prompt 200 --request-rate 1 --save-result --result-dir ./
```

After a few minutes, you get the performance evaluation result.
diff --git a/docs/source/tutorials/index.md b/docs/source/tutorials/index.md
index aee2c6b3..30108264 100644
--- a/docs/source/tutorials/index.md
+++ b/docs/source/tutorials/index.md
@@ -14,7 +14,7 @@ multi_npu_moge
 multi_npu_qwen3_moe
 multi_npu_quantization
 single_node_300i
-multi-node_dsv3.2.md
+DeepSeek-V3.2-Exp.md
 multi_node
 multi_node_kimi
 multi_node_qwen3vl
diff --git a/docs/source/tutorials/multi-node_dsv3.2.md b/docs/source/tutorials/multi-node_dsv3.2.md
deleted file mode 100644
index 2e58f28f..00000000
--- a/docs/source/tutorials/multi-node_dsv3.2.md
+++ /dev/null
@@ -1,407 +0,0 @@
-# Multi-Node (DeepSeek V3.2)
-
-:::{note}
-Only machines with AArch64 are supported currently. x86 will be supported soon. This guide takes A3 as the example.
-:::
-
-## Verify Multi-Node Communication Environment
-
-### Physical Layer Requirements:
-
-- The physical machines must be located on the same WLAN, with network connectivity.
-- All NPUs are connected with optical modules, and the connection status must be normal.
-
-### Verification Process:
-
-Execute the following commands on each node in sequence. 
The results must all be `success` and the status must be `UP`: - -:::::{tab-set} -::::{tab-item} A2 series - -```bash - # Check the remote switch ports - for i in {0..7}; do hccn_tool -i $i -lldp -g | grep Ifname; done - # Get the link status of the Ethernet ports (UP or DOWN) - for i in {0..7}; do hccn_tool -i $i -link -g ; done - # Check the network health status - for i in {0..7}; do hccn_tool -i $i -net_health -g ; done - # View the network detected IP configuration - for i in {0..7}; do hccn_tool -i $i -netdetect -g ; done - # View gateway configuration - for i in {0..7}; do hccn_tool -i $i -gateway -g ; done - # View NPU network configuration - cat /etc/hccn.conf -``` - -:::: -::::{tab-item} A3 series - -```bash - # Check the remote switch ports - for i in {0..15}; do hccn_tool -i $i -lldp -g | grep Ifname; done - # Get the link status of the Ethernet ports (UP or DOWN) - for i in {0..15}; do hccn_tool -i $i -link -g ; done - # Check the network health status - for i in {0..15}; do hccn_tool -i $i -net_health -g ; done - # View the network detected IP configuration - for i in {0..15}; do hccn_tool -i $i -netdetect -g ; done - # View gateway configuration - for i in {0..15}; do hccn_tool -i $i -gateway -g ; done - # View NPU network configuration - cat /etc/hccn.conf -``` - -:::: -::::: - -### NPU Interconnect Verification: -#### 1. Get NPU IP Addresses -:::::{tab-set} -::::{tab-item} A2 series - -```bash -for i in {0..7}; do hccn_tool -i $i -ip -g | grep ipaddr; done -``` - -:::: -::::{tab-item} A3 series - -```bash -for i in {0..15}; do hccn_tool -i $i -ip -g | grep ipaddr; done -``` - -:::: -::::: - -#### 2. Cross-Node PING Test - -```bash -# Execute on the target node (replace with actual IP) -hccn_tool -i 0 -ping -g address 10.20.0.20 -``` - -## Deploy DeepSeek-V3.2-Exp with vLLM-Ascend - -Currently, we provide a all-in-one image (include CANN 8.2RC1 + [SparseFlashAttention/LightningIndexer](https://gitcode.com/cann/cann-recipes-infer/tree/master/ops/ascendc) + [MLAPO](https://github.com/vllm-project/vllm-ascend/pull/3226)). You can also build your own image by referring to [link](https://github.com/vllm-project/vllm-ascend/issues/3278). - -- `DeepSeek-V3.2-Exp`: require 2 Atlas 800 A3 (64G × 16) nodes or 4 Atlas 800 A2 (64G × 8). [Model weight link](https://modelers.cn/models/Modelers_Park/DeepSeek-V3.2-Exp-BF16) -- `DeepSeek-V3.2-Exp-w8a8`: require 1 Atlas 800 A3 (64G × 16) node or 2 Atlas 800 A2 (64G × 8). 
[Model weight link](https://modelers.cn/models/Modelers_Park/DeepSeek-V3.2-Exp-w8a8) - -Run the following command to start the container in each node (You should download the weight to /root/.cache in advance): - -:::::{tab-set} -::::{tab-item} A2 series - -```{code-block} bash - :substitutions: -# Update the vllm-ascend image -# export IMAGE=quay.io/ascend/vllm-ascend:v0.11.0rc0-deepseek-v3.2-exp -export IMAGE=quay.nju.edu.cn/ascend/vllm-ascend:v0.11.0rc0-deepseek-v3.2-exp -export NAME=vllm-ascend - -# Run the container using the defined variables -# Note if you are running bridge network with docker, Please expose available ports -# for multiple nodes communication in advance -docker run --rm \ ---name $NAME \ ---net=host \ ---shm-size=1g \ ---device /dev/davinci0 \ ---device /dev/davinci1 \ ---device /dev/davinci2 \ ---device /dev/davinci3 \ ---device /dev/davinci4 \ ---device /dev/davinci5 \ ---device /dev/davinci6 \ ---device /dev/davinci7 \ ---device /dev/davinci_manager \ ---device /dev/devmm_svm \ ---device /dev/hisi_hdc \ --v /usr/local/dcmi:/usr/local/dcmi \ --v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \ --v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ --v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ --v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ --v /etc/ascend_install.info:/etc/ascend_install.info \ --v /root/.cache:/root/.cache \ --it $IMAGE bash -``` - -:::: -::::{tab-item} A3 series - -```{code-block} bash - :substitutions: -# Update the vllm-ascend image -# openEuler: -# export IMAGE=quay.io/ascend/vllm-ascend:v0.11.0rc0-a3-openeuler-deepseek-v3.2-exp -# Ubuntu: -# export IMAGE=quay.io/ascend/vllm-ascend:v0.11.0rc0-a3-deepseek-v3.2-exp -export IMAGE=quay.nju.edu.cn/ascend/vllm-ascend:v0.11.0rc0-a3-deepseek-v3.2-exp -export NAME=vllm-ascend - -# Run the container using the defined variables -# Note if you are running bridge network with docker, Please expose available ports -# for multiple nodes communication in advance -docker run --rm \ ---name $NAME \ ---net=host \ ---shm-size=1g \ ---device /dev/davinci0 \ ---device /dev/davinci1 \ ---device /dev/davinci2 \ ---device /dev/davinci3 \ ---device /dev/davinci4 \ ---device /dev/davinci5 \ ---device /dev/davinci6 \ ---device /dev/davinci7 \ ---device /dev/davinci8 \ ---device /dev/davinci9 \ ---device /dev/davinci10 \ ---device /dev/davinci11 \ ---device /dev/davinci12 \ ---device /dev/davinci13 \ ---device /dev/davinci14 \ ---device /dev/davinci15 \ ---device /dev/davinci_manager \ ---device /dev/devmm_svm \ ---device /dev/hisi_hdc \ --v /usr/local/dcmi:/usr/local/dcmi \ --v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \ --v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ --v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ --v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ --v /etc/ascend_install.info:/etc/ascend_install.info \ --v /root/.cache:/root/.cache \ --it $IMAGE bash -``` - -:::: -::::: - -:::::{tab-set} -::::{tab-item} DeepSeek-V3.2-Exp A3 series - -Run the following scripts on two nodes respectively. - -:::{note} -Before launching the inference server, ensure the following environment variables are set for multi-node communication. 
-::: - -**Node 0** - -```shell -#!/bin/sh - -# this obtained through ifconfig -# nic_name is the network interface name corresponding to local_ip of the current node -nic_name="xxxx" -local_ip="xxxx" - -export VLLM_USE_MODELSCOPE=True -export HCCL_IF_IP=$local_ip -export GLOO_SOCKET_IFNAME=$nic_name -export TP_SOCKET_IFNAME=$nic_name -export HCCL_SOCKET_IFNAME=$nic_name -export OMP_PROC_BIND=false -export OMP_NUM_THREADS=100 -export HCCL_BUFFSIZE=1024 - -vllm serve /root/.cache/Modelers_Park/DeepSeek-V3.2-Exp \ ---host 0.0.0.0 \ ---port 8000 \ ---data-parallel-size 2 \ ---data-parallel-size-local 1 \ ---data-parallel-address $local_ip \ ---data-parallel-rpc-port 13389 \ ---tensor-parallel-size 16 \ ---seed 1024 \ ---served-model-name deepseek_v3.2 \ ---enable-expert-parallel \ ---max-num-seqs 16 \ ---max-model-len 17450 \ ---max-num-batched-tokens 17450 \ ---trust-remote-code \ ---no-enable-prefix-caching \ ---gpu-memory-utilization 0.9 \ ---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}' -``` - -**Node 1** - -```shell -#!/bin/sh - -# this obtained through ifconfig -# nic_name is the network interface name corresponding to local_ip of the current node -nic_name="xxx" -local_ip="xxx" - -# The value of node0_ip must be consistent with the value of local_ip set in node0 (master node) -node0_ip="xxxx" - -export VLLM_USE_MODELSCOPE=True -export HCCL_IF_IP=$local_ip -export GLOO_SOCKET_IFNAME=$nic_name -export TP_SOCKET_IFNAME=$nic_name -export HCCL_SOCKET_IFNAME=$nic_name -export OMP_PROC_BIND=false -export OMP_NUM_THREADS=100 -export HCCL_BUFFSIZE=1024 - -vllm serve /root/.cache/Modelers_Park/DeepSeek-V3.2-Exp \ ---host 0.0.0.0 \ ---port 8000 \ ---headless \ ---data-parallel-size 2 \ ---data-parallel-size-local 1 \ ---data-parallel-start-rank 1 \ ---data-parallel-address $node0_ip \ ---data-parallel-rpc-port 13389 \ ---tensor-parallel-size 16 \ ---seed 1024 \ ---served-model-name deepseek_v3.2 \ ---max-num-seqs 16 \ ---max-model-len 17450 \ ---max-num-batched-tokens 17450 \ ---enable-expert-parallel \ ---trust-remote-code \ ---no-enable-prefix-caching \ ---gpu-memory-utilization 0.92 \ ---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}' -``` - -:::: - -::::{tab-item} DeepSeek-V3.2-Exp-W8A8 A3 series - -```shell -#!/bin/sh -export VLLM_USE_MODELSCOPE=true - -vllm serve vllm-ascend/DeepSeek-V3.2-Exp-W8A8 \ ---host 0.0.0.0 \ ---port 8000 \ ---tensor-parallel-size 16 \ ---seed 1024 \ ---quantization ascend \ ---served-model-name deepseek_v3.2 \ ---max-num-seqs 16 \ ---max-model-len 17450 \ ---max-num-batched-tokens 17450 \ ---enable-expert-parallel \ ---trust-remote-code \ ---no-enable-prefix-caching \ ---gpu-memory-utilization 0.92 \ ---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}' -``` - -:::: -::::{tab-item} DeepSeek-V3.2-Exp-W8A8 A2 series - -Run the following scripts on two nodes respectively. 
- -**Node 0** - -```shell -#!/bin/sh - -# this obtained through ifconfig -# nic_name is the network interface name corresponding to local_ip of the current node -nic_name="xxxx" -local_ip="xxxx" - -export VLLM_USE_MODELSCOPE=True -export HCCL_IF_IP=$local_ip -export GLOO_SOCKET_IFNAME=$nic_name -export TP_SOCKET_IFNAME=$nic_name -export HCCL_SOCKET_IFNAME=$nic_name -export OMP_PROC_BIND=false -export OMP_NUM_THREADS=100 -export HCCL_BUFFSIZE=1024 -export HCCL_OP_EXPANSION_MODE="AIV" -export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True" - -vllm serve vllm-ascend/DeepSeek-V3.2-Exp-W8A8 \ ---host 0.0.0.0 \ ---port 8000 \ ---data-parallel-size 2 \ ---data-parallel-size-local 1 \ ---data-parallel-address $local_ip \ ---data-parallel-rpc-port 13389 \ ---tensor-parallel-size 8 \ ---seed 1024 \ ---served-model-name deepseek_v3.2 \ ---enable-expert-parallel \ ---max-num-seqs 16 \ ---max-model-len 17450 \ ---max-num-batched-tokens 17450 \ ---trust-remote-code \ ---quantization ascend \ ---no-enable-prefix-caching \ ---gpu-memory-utilization 0.9 \ ---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}' -``` - -**Node 1** - -```shell -#!/bin/sh - -# this obtained through ifconfig -# nic_name is the network interface name corresponding to local_ip of the current node -nic_name="xxx" -local_ip="xxx" - -# The value of node0_ip must be consistent with the value of local_ip set in node0 (master node) -node0_ip="xxxx" - -export VLLM_USE_MODELSCOPE=True -export HCCL_IF_IP=$local_ip -export GLOO_SOCKET_IFNAME=$nic_name -export TP_SOCKET_IFNAME=$nic_name -export HCCL_SOCKET_IFNAME=$nic_name -export OMP_PROC_BIND=false -export OMP_NUM_THREADS=100 -export HCCL_BUFFSIZE=1024 -export HCCL_OP_EXPANSION_MODE="AIV" -export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True" - -vllm serve vllm-ascend/DeepSeek-V3.2-Exp-W8A8 \ ---host 0.0.0.0 \ ---port 8000 \ ---headless \ ---data-parallel-size 2 \ ---data-parallel-size-local 1 \ ---data-parallel-start-rank 1 \ ---data-parallel-address $node0_ip \ ---data-parallel-rpc-port 13389 \ ---tensor-parallel-size 8 \ ---seed 1024 \ ---served-model-name deepseek_v3.2 \ ---max-num-seqs 16 \ ---max-model-len 17450 \ ---max-num-batched-tokens 17450 \ ---enable-expert-parallel \ ---trust-remote-code \ ---quantization ascend \ ---no-enable-prefix-caching \ ---gpu-memory-utilization 0.92 \ ---additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}' -``` - -:::: -::::: - -Once your server is started, you can query the model with input prompts: - -```shell -curl http://:/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "deepseek_v3.2", - "prompt": "The future of AI is", - "max_tokens": 50, - "temperature": 0 - }' -```