feat: support data parallel for deepseek (#1012)

### What this PR does / why we need it?

feat: support data parallel (DP) for DeepSeek models.

### Does this PR introduce _any_ user-facing change?

Yes. DeepSeek models can now be run with data parallelism (e.g. `-dp=2`).

### How was this patch tested?

```
export VLLM_ENABLE_MC2=0
export VLLM_USE_V1=1
export TASK_QUEUE_ENABLE=1
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
nohup python -m vllm.entrypoints.openai.api_server --model=/path/to/DeepSeek-R1-W8A8 \
  --quantization ascend \
  --served-model-name auto \
  --trust-remote-code \
  --distributed-executor-backend=mp \
  --port 8006 \
  -tp=8 \
  -dp=2 \
  --max-num-seqs 24 \
  --max-model-len 4096 \
  --max-num-batched-tokens 4096 \
  --block-size 128 \
  -O 0 \
  --no-enable-prefix-caching \
  --additional-config '{"torchair_graph_batch_sizes":[24],"expert_tensor_parallel_size":16,"ascend_scheduler_config":{},"enable_graph_mode":true}' \
  --gpu-memory-utilization 0.95 &> run.log &
disown
```

Signed-off-by: boying <897013703@qq.com>
2025-06-04 18:31:41 +08:00
parent 517811449e
commit da9acfca60
8 changed files with 212 additions and 88 deletions
--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -20,6 +20,7 @@ from typing import Any, Callable, Dict, Optional
 import torch
 import torch.distributed as dist
 import torch_npu
+from vllm.config import get_current_vllm_config
 from vllm.distributed import GroupCoordinator

 import vllm_ascend.envs as envs_ascend
@@ -508,6 +509,12 @@ class AscendW8A8DynamicFusedMoEMethod:

        self.ep_group = get_ep_group()

+        self.enable_graph_mode = False
+        additional_config = get_current_vllm_config().additional_config
+        if additional_config:
+            self.enable_graph_mode = additional_config.get(
+                "enable_graph_mode", False)
+
        try:
            device_group = self.ep_group.device_group
            # TODO: Try local_rank = ep_group.rank_in_group
@@ -629,7 +636,7 @@ class AscendW8A8DynamicFusedMoEMethod:
                top_k=top_k,
                expert_map=expert_map,
                moe_all_to_all_group_name=self.moe_all_to_all_group_name)
-        elif self.ep_group.world_size == 1:
+        elif self.enable_graph_mode or self.ep_group.world_size == 1:
            return fused_experts(hidden_states=x,
                                 w1=layer.w13_weight,
                                 w1_scale=layer.w13_weight_scale,