support deepseek quant & mix-parallel with graphmode (#585)

### What this PR does / why we need it?
1. Support DeepSeek with W8A8 quantization.
2. Support DeepSeek with mixed parallelism (multi-DP, EP+TP).
3. Support DeepSeek with graph mode.
---------

Signed-off-by: wen-jie666 <wenjie39@huawei.com>
Signed-off-by: Yizhou Liu <liuyizhou5@h-partners.com>
Signed-off-by: libaokui <libaokui@huawei.com>
Signed-off-by: linfeng-yuan <1102311262@qq.com>
Co-authored-by: wen-jie666 <wenjie39@huawei.com>
This commit is contained in:
zzzzwwjj
2025-04-23 16:23:25 +08:00
committed by GitHub
parent e74331a1ed
commit 5c6d05a59e
13 changed files with 520 additions and 221 deletions

View File

@@ -24,6 +24,7 @@ import torch_npu # noqa: F401
import vllm.envs as envs
from vllm.logger import logger
from vllm.platforms import Platform, PlatformEnum
from vllm.utils import supports_dynamo
CUSTOM_OP_ENABLED = False
try:
@@ -119,6 +120,15 @@ class NPUPlatform(Platform):
compilation_config.level)
compilation_config.level = CompilationLevel.NO_COMPILATION
if vllm_config.additional_config is not None:
enable_graph_mode = vllm_config.additional_config.get(
"enable_graph_mode", False)
if enable_graph_mode and not supports_dynamo():
logger.warning(
"enable_graph_mode is not supported because the version of torch is too low, forcing close enable_graph_mode"
)
vllm_config.additional_config["enable_graph_mode"] = False
parallel_config = vllm_config.parallel_config
if parallel_config and parallel_config.worker_cls == "auto":
if envs.VLLM_USE_V1: