[DOC] enable both flashcomm1 and cudagraph (#6807)
## What this PR does / why we need it?
This PR updates the DeepSeek-V3.2 documentation to include the latest
performance optimizations and configuration improvements.
### Changes
- **Enable FlashComm1**: Added the `VLLM_ASCEND_ENABLE_FLASHCOMM1=1`
environment variable to all deployment scenarios, enabling FlashComm1
for improved communication performance
- **Layer Sharding**: Added `--additional-config '{"layer_sharding":
["q_b_proj", "o_proj"]}'` configuration to enable layer sharding for
better memory distribution
- **CUDA Graph Optimization**: Updated cudagraph capture sizes from
`[3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48]` to `[8, 16, 24, 32, 40,
48]`
- **Speculative Decoding**: Increased `num_speculative_tokens` from 2 to
3
- **Documentation Links**: Fixed request forwarding documentation to use
proper GitHub repository links
## Does this PR introduce _any_ user-facing change?
Yes, users can now follow the updated documentation to enable FlashComm1
and layer sharding for improved DeepSeek-V3.2 performance.
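For example, after this change a single-node serve command that combines these options looks roughly like the sketch below. It is assembled from the examples updated in this PR (note that the explicit `cudagraph_capture_sizes` list appears only in the multi-node examples); the model path, memory settings, and the parallelism/networking flags omitted here should be taken from the full examples for your scenario.

```shell
# Enable FlashComm1 (added to every deployment scenario in this PR);
# the other exported variables (HCCL_BUFFSIZE, VLLM_ASCEND_ENABLE_MLAPO, ...) are unchanged.
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1

vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
    --host 0.0.0.0 \
    --no-enable-prefix-caching \
    --gpu-memory-utilization 0.92 \
    --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [8, 16, 24, 32, 40, 48]}' \
    --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' \
    --speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
```

Once the server is up, a quick smoke test against the OpenAI-compatible API (assuming the default port 8000) is `curl http://localhost:8000/v1/models`.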
## How was this patch tested?
Existing documentation examples have been validated to ensure
configuration consistency across all deployment scenarios.
---
- vLLM version: v0.15.0
- vLLM main: 83b47f67b1
Signed-off-by: guozr <guozr1997@hotmail.com>
Co-authored-by: guozr <guozr1997@hotmail.com>
````diff
@@ -140,6 +140,7 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=200
 export VLLM_ASCEND_ENABLE_MLAPO=1
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 
 vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --host 0.0.0.0 \
@@ -157,6 +158,7 @@ vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
+--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' \
 --speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
 
 ```
@@ -197,6 +199,7 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=200
 export VLLM_ASCEND_ENABLE_MLAPO=1
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 
 vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --host 0.0.0.0 \
@@ -217,6 +220,7 @@ vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
+--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' \
 --speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
 ```
 
@@ -244,6 +248,7 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=200
 export VLLM_ASCEND_ENABLE_MLAPO=1
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 
 vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --host 0.0.0.0 \
@@ -266,6 +271,7 @@ vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
+--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' \
 --speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
 ```
 
@@ -301,7 +307,7 @@ export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 export HCCL_CONNECT_TIMEOUT=120
 export HCCL_INTRA_PCIE_ENABLE=1
 export HCCL_INTRA_ROCE_ENABLE=0
-
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 
 vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --host 0.0.0.0 \
@@ -321,8 +327,9 @@ vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --trust-remote-code \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
---compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48]}' \
---speculative-config '{"num_speculative_tokens": 2, "method": "deepseek_mtp"}'
+--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[8, 16, 24, 32, 40, 48]}' \
+--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' \
+--speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
 
 ```
 
@@ -354,7 +361,7 @@ export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 export HCCL_CONNECT_TIMEOUT=120
 export HCCL_INTRA_PCIE_ENABLE=1
 export HCCL_INTRA_ROCE_ENABLE=0
-
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 
 vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --host 0.0.0.0 \
@@ -376,8 +383,9 @@ vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --trust-remote-code \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
---compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48]}' \
---speculative-config '{"num_speculative_tokens": 2, "method": "deepseek_mtp"}'
+--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[8, 16, 24, 32, 40, 48]}' \
+--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' \
+--speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
 
 ```
 
@@ -832,7 +840,7 @@ python launch_online_dp.py --dp-size 8 --tp-size 4 --dp-size-local 4 --dp-rank-s
 
 ### Request Forwarding
 
-To set up request forwarding, run the following script on any machine :download:`load_balance_proxy_server_example.py <examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py>`
+To set up request forwarding, run the following script on any machine. You can get the proxy program in the repository's examples: [load_balance_proxy_server_example.py](https://github.com/vllm-project/vllm-ascend/blob/main/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py)
 
 ```shell
 unset http_proxy
````