From f952de93df7fa9d6f5c9506763296e2a09d5f228 Mon Sep 17 00:00:00 2001
From: 1092626063 <1092626063@qq.com>
Date: Fri, 19 Dec 2025 10:52:33 +0800
Subject: [PATCH] =?UTF-8?q?=E3=80=90Doc=E3=80=91Deepseekv3.1/R1=20doc=20en?=
 =?UTF-8?q?hancement=20(#4827)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?
DeepSeek V3.1 and DeepSeek R1 doc enhancement

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: 1092626063 <1092626063@qq.com>
---
 docs/source/tutorials/DeepSeek-R1.md          |  7 +++++++
 docs/source/tutorials/DeepSeek-V3.1.md        | 21 ++++++++++---------
 .../support_matrix/supported_models.md        |  4 ++--
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/docs/source/tutorials/DeepSeek-R1.md b/docs/source/tutorials/DeepSeek-R1.md
index 3329de55..32a3d6a3 100644
--- a/docs/source/tutorials/DeepSeek-R1.md
+++ b/docs/source/tutorials/DeepSeek-R1.md
@@ -113,6 +113,13 @@ vllm serve vllm-ascend/DeepSeek-R1-W8A8 \
 --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
 ```
 
+**Notice:**
+The parameters are explained as follows:
+- Setting the environment variable `VLLM_ASCEND_ENABLE_MLAPO=1` enables a fusion operator that can significantly improve performance, though it requires more NPU memory. It is therefore recommended to enable this option when sufficient NPU memory is available.
+- For single-node deployment, we recommend using `dp4tp4` instead of `dp2tp8`.
+- `--max-model-len` specifies the maximum context length, that is, the sum of input and output tokens for a single request. For performance testing with an input length of 3.5K and an output length of 1.5K, a value of `16384` is sufficient; for accuracy testing, however, set it to at least `35000`.
+- `--no-enable-prefix-caching` disables prefix caching. To enable prefix caching, remove this option.
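+
+Once the server reports it is ready, you can send a small request to the OpenAI-compatible endpoint to confirm the deployment works. The sketch below assumes the default port `8000` and that the model is addressed by the path passed to `vllm serve`; adjust both if you set `--port` or `--served-model-name`:
+
+```bash
+# Minimal smoke test; host, port, and model name depend on your deployment.
+curl http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "vllm-ascend/DeepSeek-R1-W8A8",
+    "prompt": "Hello, my name is",
+    "max_tokens": 32
+  }'
+```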
+
 ::::
 
 ::::{tab-item} DeepSeek-R1-W8A8 A2 series
diff --git a/docs/source/tutorials/DeepSeek-V3.1.md b/docs/source/tutorials/DeepSeek-V3.1.md
index 058f809a..8114cb4e 100644
--- a/docs/source/tutorials/DeepSeek-V3.1.md
+++ b/docs/source/tutorials/DeepSeek-V3.1.md
@@ -72,7 +72,7 @@ docker run --rm \
 -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
 -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
 -v /etc/ascend_install.info:/etc/ascend_install.info \
--v /mnt/sfs_turbo/.cache:/root/.cache \
+-v /root/.cache:/root/.cache \
 -it $IMAGE bash
 ```
 
@@ -104,14 +104,8 @@ export HCCL_IF_IP=$local_ip
 export GLOO_SOCKET_IFNAME=$nic_name
 export TP_SOCKET_IFNAME=$nic_name
 export HCCL_SOCKET_IFNAME=$nic_name
-export OMP_PROC_BIND=false
-export OMP_NUM_THREADS=10
-export VLLM_USE_V1=1
-export HCCL_BUFFSIZE=200
 export VLLM_ASCEND_ENABLE_MLAPO=1
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
-export VLLM_ASCEND_ENABLE_FLASHCOMM1=0
-export DISABLE_L2_CACHE=1
 
 vllm serve /weights/DeepSeek-V3.1_w8a8mix_mtp \
 --host 0.0.0.0 \
@@ -123,7 +117,7 @@ vllm serve /weights/DeepSeek-V3.1_w8a8mix_mtp \
 --served-model-name deepseek_v3 \
 --enable-expert-parallel \
 --max-num-seqs 16 \
---max-model-len 8192 \
+--max-model-len 16384 \
 --max-num-batched-tokens 4096 \
 --trust-remote-code \
 --no-enable-prefix-caching \
@@ -132,6 +126,13 @@ vllm serve /weights/DeepSeek-V3.1_w8a8mix_mtp \
 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
 ```
 
+**Notice:**
+The parameters are explained as follows:
+- Setting the environment variable `VLLM_ASCEND_ENABLE_MLAPO=1` enables a fusion operator that can significantly improve performance, though it requires more NPU memory. It is therefore recommended to enable this option when sufficient NPU memory is available.
+- For single-node deployment, we recommend using `dp4tp4` instead of `dp2tp8`.
+- `--max-model-len` specifies the maximum context length, that is, the sum of input and output tokens for a single request. For performance testing with an input length of 3.5K and an output length of 1.5K, a value of `16384` is sufficient; for accuracy testing, however, set it to at least `35000`.
+- `--no-enable-prefix-caching` disables prefix caching. To enable prefix caching, remove this option.
+
 ### Multi-node Deployment
 
 - `DeepSeek-V3.1_w8a8mix_mtp`: require at least 2 Atlas 800 A2 (64G × 8).
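+
+The environment block above references `$local_ip` and `$nic_name` without defining them. The sketch below is one way to populate them on each node; it assumes a single primary IPv4 interface, so verify the detected values (for example with `ip addr`) before relying on them:
+
+```bash
+# Pick the first IPv4 address reported by the host, then find the interface
+# that owns it. Override both variables manually if the guess is wrong.
+local_ip=$(hostname -I | awk '{print $1}')
+nic_name=$(ip -o -4 addr show | awk -v ip="$local_ip" 'index($4, ip"/") == 1 {print $2; exit}')
+echo "local_ip=$local_ip nic_name=$nic_name"
+```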
@@ -184,7 +185,7 @@ vllm serve /weights/DeepSeek-V3.1_w8a8mix_mtp \
 --served-model-name deepseek_v3 \
 --enable-expert-parallel \
 --max-num-seqs 20 \
---max-model-len 8192 \
+--max-model-len 16384 \
 --max-num-batched-tokens 4096 \
 --trust-remote-code \
 --no-enable-prefix-caching \
@@ -240,7 +241,7 @@ vllm serve /weights/DeepSeek-V3.1_w8a8mix_mtp \
 --served-model-name deepseek_v3 \
 --enable-expert-parallel \
 --max-num-seqs 20 \
---max-model-len 8192 \
+--max-model-len 16384 \
 --max-num-batched-tokens 4096 \
 --trust-remote-code \
 --no-enable-prefix-caching \
diff --git a/docs/source/user_guide/support_matrix/supported_models.md b/docs/source/user_guide/support_matrix/supported_models.md
index e7828723..abf9ffa5 100644
--- a/docs/source/user_guide/support_matrix/supported_models.md
+++ b/docs/source/user_guide/support_matrix/supported_models.md
@@ -15,9 +15,9 @@ Get the latest info here: https://github.com/vllm-project/vllm-ascend/issues/160
 | Qwen3 | ✅ | | ✅ | A2/A3 | ✅ | ✅ | ✅ ||| ✅ | ✅ ||| ✅ || ✅ | ✅ | 128k | ✅ | [Qwen3-Dense](../../tutorials/Qwen3-Dense.md) |
 | Qwen3-based | ✅ | |||||||||||||||||||
 | Qwen3-Coder | ✅ | | ✅ | A2/A3 ||✅|✅|✅|||✅|✅|✅|✅||||||[Qwen3-Coder-30B-A3B tutorial](../../tutorials/Qwen3-Coder-30B-A3B.md)|
-| Qwen3-Moe | ✅ | | ✅ | A2/A3 | ✅ | ✅ | ✅ ||| ✅ | ✅ || ✅ | ✅ | ✅ | ✅ | ✅ ||| [Qwen3-235B-A22B](../../tutorials/Qwen3-235B-A22B.md) |
+| Qwen3-Moe | ✅ | | ✅ | A2/A3 | ✅ | ✅ | ✅ ||| ✅ | ✅ || ✅ | ✅ | ✅ | ✅ | ✅ | 256k || [Qwen3-235B-A22B](../../tutorials/Qwen3-235B-A22B.md) |
 | Qwen3-Next | ✅ | | ✅ | A2/A3 | ✅ |||||| ✅ ||| ✅ || ✅ | ✅ ||| [Qwen3-Next](../../tutorials/Qwen3-Next.md) |
-| Qwen2.5 | ✅ | | ✅ | A2/A3 | ✅ | ✅ | ✅ |||| ✅ || ✅ | ✅ |||||| [Qwen2.5-7B](../../tutorials/Qwen2.5-7B.md) |
+| Qwen2.5 | ✅ | | ✅ | A2/A3 | ✅ | ✅ | ✅ |||| ✅ ||| ✅ |||||| [Qwen2.5-7B](../../tutorials/Qwen2.5-7B.md) |
 | Qwen2 | ✅ | |||||||||||||||||||
 | Qwen2-based | ✅ | |||||||||||||||||||
 | QwQ-32B | ✅ | |||||||||||||||||||
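
As a final sanity check for the multi-node DeepSeek-V3.1 deployment configured in the tutorial hunks above, a request like the one below can be sent to node 0 once both nodes are serving. This is a minimal sketch under assumptions not stated in the patch: the API listens on the default vLLM port `8000`, and the model is reachable under the `--served-model-name` value `deepseek_v3` from the serve commands; replace `<node0-ip>` with the address of node 0.

```bash
# Hypothetical smoke test against node 0; substitute the host and port
# with the values from your deployment.
curl http://<node0-ip>:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "deepseek_v3",
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_tokens": 32
  }'
```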