[Doc] Upgrade multi-node tutorial model to deepseek-v3.1-w8a8 (#2553)
### What this PR does / why we need it?
Upgrade multi-node tutorial model to deepseek-v3.1-w8a8
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.10.1.1
- vLLM main:
de02b07db4
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -57,7 +57,7 @@ hccn_tool -i 0 -ping -g address 10.20.0.20
|
||||
```
|
||||
|
||||
## Run with docker
|
||||
Assume you have two Atlas 800 A2(64G*8) nodes, and want to deploy the `deepseek-v3-w8a8` quantitative model across multi-node.
|
||||
Assume you have two Atlas 800 A2(64G*8) nodes, and want to deploy the `deepseek-v3.1-w8a8` quantized model across multiple nodes.
|
||||
|
||||
```{code-block} bash
|
||||
:substitutions:
|
||||
@@ -107,6 +107,7 @@ Before launch the inference server, ensure the following environment variables a
|
||||
nic_name="xxxx"
|
||||
local_ip="xxxx"
|
||||
|
||||
export VLLM_USE_MODELSCOPE=True
|
||||
export HCCL_IF_IP=$local_ip
|
||||
export GLOO_SOCKET_IFNAME=$nic_name
|
||||
export TP_SOCKET_IFNAME=$nic_name
|
||||
@@ -115,9 +116,9 @@ export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=100
|
||||
export HCCL_BUFFSIZE=1024
|
||||
|
||||
# The w8a8 weight can obtained from https://www.modelscope.cn/models/vllm-ascend/DeepSeek-V3-W8A8
|
||||
# The w8a8 weight can be obtained from https://www.modelscope.cn/models/vllm-ascend/DeepSeek-V3.1-W8A8
|
||||
# If you want to do the quantization manually, please refer to https://vllm-ascend.readthedocs.io/en/latest/user_guide/feature_guide/quantization.html
|
||||
vllm serve /root/.cache/ds_v3 \
|
||||
vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \
|
||||
--host 0.0.0.0 \
|
||||
--port 8004 \
|
||||
--data-parallel-size 4 \
|
||||
@@ -126,7 +127,7 @@ vllm serve /root/.cache/ds_v3 \
|
||||
--data-parallel-rpc-port 13389 \
|
||||
--tensor-parallel-size 4 \
|
||||
--seed 1024 \
|
||||
--served-model-name deepseek_v3 \
|
||||
--served-model-name deepseek_v3.1 \
|
||||
--enable-expert-parallel \
|
||||
--max-num-seqs 16 \
|
||||
--max-model-len 32768 \
|
||||
@@ -146,6 +147,7 @@ vllm serve /root/.cache/ds_v3 \
|
||||
nic_name="xxx"
|
||||
local_ip="xxx"
|
||||
|
||||
export VLLM_USE_MODELSCOPE=True
|
||||
export HCCL_IF_IP=$local_ip
|
||||
export GLOO_SOCKET_IFNAME=$nic_name
|
||||
export TP_SOCKET_IFNAME=$nic_name
|
||||
@@ -155,7 +157,7 @@ export OMP_NUM_THREADS=100
|
||||
export VLLM_USE_V1=1
|
||||
export HCCL_BUFFSIZE=1024
|
||||
|
||||
vllm serve /root/.cache/ds_v3 \
|
||||
vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \
|
||||
--host 0.0.0.0 \
|
||||
--port 8004 \
|
||||
--headless \
|
||||
@@ -167,7 +169,7 @@ vllm serve /root/.cache/ds_v3 \
|
||||
--tensor-parallel-size 4 \
|
||||
--seed 1024 \
|
||||
--quantization ascend \
|
||||
--served-model-name deepseek_v3 \
|
||||
--served-model-name deepseek_v3.1 \
|
||||
--max-num-seqs 16 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-batched-tokens 4096 \
|
||||
@@ -187,7 +189,7 @@ Once your server is started, you can query the model with input prompts:
|
||||
curl http://{ node0 ip:8004 }/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "/root/.cache/ds_v3",
|
||||
"model": "deepseek_v3.1",
|
||||
"prompt": "The future of AI is",
|
||||
"max_tokens": 50,
|
||||
"temperature": 0
|
||||
@@ -198,7 +200,8 @@ curl http://{ node0 ip:8004 }/v1/completions \
|
||||
For details please refer to [benchmark](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks)
|
||||
|
||||
```shell
|
||||
vllm bench serve --model /root/.cache/ds_v3 --served-model-name deepseek_v3 \
|
||||
export VLLM_USE_MODELSCOPE=true
|
||||
vllm bench serve --model vllm-ascend/DeepSeek-V3.1-W8A8 --served-model-name deepseek_v3.1 \
|
||||
--dataset-name random --random-input-len 128 --random-output-len 128 \
|
||||
--num-prompts 200 --trust-remote-code --base-url "http://{ node0 ip }:8004" --request-rate 1
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user