diff --git a/docs/source/tutorials/multi_node.md b/docs/source/tutorials/multi_node.md
index 5ef82ed..e4ca3c5 100644
--- a/docs/source/tutorials/multi_node.md
+++ b/docs/source/tutorials/multi_node.md
@@ -57,7 +57,7 @@ hccn_tool -i 0 -ping -g address 10.20.0.20
 ```
 
 ## Run with docker
-Assume you have two Atlas 800 A2(64G*8) nodes, and want to deploy the `deepseek-v3-w8a8` quantitative model across multi-node.
+Assume you have two Atlas 800 A2 (64G*8) nodes and want to deploy the `deepseek-v3.1-w8a8` quantized model across multiple nodes.
 
 ```{code-block} bash
 :substitutions:
@@ -107,6 +107,7 @@ Before launch the inference server, ensure the following environment variables a
 nic_name="xxxx"
 local_ip="xxxx"
 
+export VLLM_USE_MODELSCOPE=True
 export HCCL_IF_IP=$local_ip
 export GLOO_SOCKET_IFNAME=$nic_name
 export TP_SOCKET_IFNAME=$nic_name
@@ -115,9 +116,9 @@ export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 export HCCL_BUFFSIZE=1024
 
-# The w8a8 weight can obtained from https://www.modelscope.cn/models/vllm-ascend/DeepSeek-V3-W8A8
+# The w8a8 weights can be obtained from https://www.modelscope.cn/models/vllm-ascend/DeepSeek-V3.1-W8A8
 # If you want to the quantization manually, please refer to https://vllm-ascend.readthedocs.io/en/latest/user_guide/feature_guide/quantization.html
-vllm serve /root/.cache/ds_v3 \
+vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \
 --host 0.0.0.0 \
 --port 8004 \
 --data-parallel-size 4 \
@@ -126,7 +127,7 @@ vllm serve /root/.cache/ds_v3 \
 --data-parallel-rpc-port 13389 \
 --tensor-parallel-size 4 \
 --seed 1024 \
---served-model-name deepseek_v3 \
+--served-model-name deepseek_v3.1 \
 --enable-expert-parallel \
 --max-num-seqs 16 \
 --max-model-len 32768 \
@@ -146,6 +147,7 @@ vllm serve /root/.cache/ds_v3 \
 nic_name="xxx"
 local_ip="xxx"
 
+export VLLM_USE_MODELSCOPE=True
 export HCCL_IF_IP=$local_ip
 export GLOO_SOCKET_IFNAME=$nic_name
 export TP_SOCKET_IFNAME=$nic_name
@@ -155,7 +157,7 @@ export OMP_NUM_THREADS=100
 export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 
-vllm serve /root/.cache/ds_v3 \
+vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \
 --host 0.0.0.0 \
 --port 8004 \
 --headless \
@@ -167,7 +169,7 @@ vllm serve /root/.cache/ds_v3 \
 --tensor-parallel-size 4 \
 --seed 1024 \
 --quantization ascend \
---served-model-name deepseek_v3 \
+--served-model-name deepseek_v3.1 \
 --max-num-seqs 16 \
 --max-model-len 32768 \
 --max-num-batched-tokens 4096 \
@@ -187,7 +189,7 @@ Once your server is started, you can query the model with input prompts:
 curl http://{ node0 ip:8004 }/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "model": "/root/.cache/ds_v3",
+        "model": "deepseek_v3.1",
         "prompt": "The future of AI is",
         "max_tokens": 50,
         "temperature": 0
@@ -198,7 +200,8 @@ curl http://{ node0 ip:8004 }/v1/completions \
 For details please refer to [benchmark](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks)
 
 ```shell
-vllm bench serve --model /root/.cache/ds_v3 --served-model-name deepseek_v3 \
+export VLLM_USE_MODELSCOPE=True
+vllm bench serve --model vllm-ascend/DeepSeek-V3.1-W8A8 --served-model-name deepseek_v3.1 \
 --dataset-name random --random-input-len 128 --random-output-len 128 \
 --num-prompts 200 --trust-remote-code --base-url "http://{ node0 ip }:8004" --request-rate 1
 ```
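
With `VLLM_USE_MODELSCOPE=True`, the first `vllm serve` launch downloads `vllm-ascend/DeepSeek-V3.1-W8A8` from ModelScope on each node. If you would rather pre-fetch the weights before starting the servers, a minimal sketch using the ModelScope CLI (the install step and the default cache location are assumptions about your environment, not something this patch sets up):

```shell
# Assumption: the `modelscope` package, which provides the CLI, is installed.
pip install modelscope

# Pre-fetch the W8A8 weights into ModelScope's default cache so that
# `vllm serve` can start without a lengthy download. Run this on every node.
modelscope download --model vllm-ascend/DeepSeek-V3.1-W8A8
```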
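
Since the served model name changes from `deepseek_v3` to `deepseek_v3.1`, any client still requesting the old name will be rejected. A quick sanity check against the OpenAI-compatible endpoint (assuming the server started with the patched command is reachable at node0's IP on port 8004):

```shell
# The returned model list should contain "deepseek_v3.1"; requests that
# still use the old "deepseek_v3" name will fail with a model-not-found error.
curl http://{ node0 ip }:8004/v1/models
```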