diff --git a/docs/source/tutorials/models/DeepSeek-V3.2.md b/docs/source/tutorials/models/DeepSeek-V3.2.md
index 92aaedec..8573bf6d 100644
--- a/docs/source/tutorials/models/DeepSeek-V3.2.md
+++ b/docs/source/tutorials/models/DeepSeek-V3.2.md
@@ -140,6 +140,7 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=200
 export VLLM_ASCEND_ENABLE_MLAPO=1
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 
 vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --host 0.0.0.0 \
@@ -157,6 +158,7 @@ vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
+--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' \
 --speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
 ```
 
@@ -197,6 +199,7 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=200
 export VLLM_ASCEND_ENABLE_MLAPO=1
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 
 vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --host 0.0.0.0 \
@@ -217,6 +220,7 @@ vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
+--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' \
 --speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
 ```
 
@@ -244,6 +248,7 @@ export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=200
 export VLLM_ASCEND_ENABLE_MLAPO=1
 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 
 vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --host 0.0.0.0 \
@@ -266,6 +271,7 @@ vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
+--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' \
 --speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
 ```
 
@@ -301,7 +307,7 @@ export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 export HCCL_CONNECT_TIMEOUT=120
 export HCCL_INTRA_PCIE_ENABLE=1
 export HCCL_INTRA_ROCE_ENABLE=0
-
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 
 vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --host 0.0.0.0 \
@@ -321,8 +327,9 @@ vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --trust-remote-code \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
---compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48]}' \
---speculative-config '{"num_speculative_tokens": 2, "method": "deepseek_mtp"}'
+--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[8, 16, 24, 32, 40, 48]}' \
+--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' \
+--speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
 ```
 
 
@@ -354,7 +361,7 @@ export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 export HCCL_CONNECT_TIMEOUT=120
 export HCCL_INTRA_PCIE_ENABLE=1
 export HCCL_INTRA_ROCE_ENABLE=0
-
+export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
 
 vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --host 0.0.0.0 \
@@ -376,8 +383,9 @@ vllm serve /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3.2-W8A8 \
 --trust-remote-code \
 --no-enable-prefix-caching \
 --gpu-memory-utilization 0.92 \
---compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48]}' \
---speculative-config '{"num_speculative_tokens": 2, "method": "deepseek_mtp"}'
+--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[8, 16, 24, 32, 40, 48]}' \
+--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' \
+--speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'
 ```
 
 
@@ -832,7 +840,7 @@ python launch_online_dp.py --dp-size 8 --tp-size 4 --dp-size-local 4 --dp-rank-s
 
 ### Request Forwarding
 
-To set up request forwarding, run the following script on any machine :download:`load_balance_proxy_server_example.py `
+To set up request forwarding, run the following script on any machine. You can get the proxy program in the repository's examples: [load_balance_proxy_server_example.py](https://github.com/vllm-project/vllm-ascend/blob/main/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py)
 
 ```shell
 unset http_proxy