Add DeepSeek V3.2 support (#3270)

### What this PR does / why we need it?

This PR adds initial DeepSeek V3.2 support on top of [vLLM
v0.11.0](https://github.com/vllm-project/vllm/tree/releases/v0.11.0)
(not yet released). We will complete the vLLM adaptation as soon as
possible; the feature should be ready within the next 1-2 days.

Related doc: https://github.com/vllm-project/vllm-ascend/pull/3223.

### Does this PR introduce _any_ user-facing change?
Yes!

### How was this patch tested?
CI passed; the DeepSeek doc will be run as a follow-up.


- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/tree/releases/v0.11.0

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: zzzzwwjj <1183291235@qq.com>
Signed-off-by: linfeng-yuan <1102311262@qq.com>
Signed-off-by: wxsIcey <1790571317@qq.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: zzzzwwjj <1183291235@qq.com>
Co-authored-by: linfeng-yuan <1102311262@qq.com>
Co-authored-by: wxsIcey <1790571317@qq.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
Commit 81bd6e4c99 (parent 5503a3142f), authored by wangxiyuan on
2025-09-30 03:25:58 +08:00, committed by GitHub.
27 changed files with 4354 additions and 70 deletions.


```diff
@@ -48,8 +48,8 @@ from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 class NPUTorchairModelRunner(NPUModelRunner):
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
-        ascend_config = get_ascend_config()
-        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
+        self.ascend_config = get_ascend_config()
+        self.enable_shared_expert_dp = self.ascend_config.enable_shared_expert_dp
         super().__init__(vllm_config, device)
         if self.speculative_config:
             self.actual_seq_lengths_q = list(
```
```diff
@@ -66,10 +66,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
         self.new_kv_cache_bytes = -1
         self.torchair_compiled_model = None  # type: ignore
         self.torchair_compiled_models = {}  # type: ignore
-        self.use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph
-        self.use_cached_kv_cache_bytes = ascend_config.torchair_graph_config.use_cached_kv_cache_bytes
-        self.torchair_graph_batch_sizes = ascend_config.torchair_graph_config.graph_batch_sizes
-        if ascend_config.torchair_graph_config.graph_batch_sizes_init:
+        self.use_cached_npu_graph = self.ascend_config.torchair_graph_config.use_cached_graph
+        self.use_cached_kv_cache_bytes = self.ascend_config.torchair_graph_config.use_cached_kv_cache_bytes
+        self.torchair_graph_batch_sizes = self.ascend_config.torchair_graph_config.graph_batch_sizes
+        if self.ascend_config.torchair_graph_config.graph_batch_sizes_init:
             self.init_torchair_graph_batch_sizes()
             self.update_torchair_graph_batch_sizes()
```
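
The two hunks above are a small refactor: the Ascend config is fetched once in `__init__` and stored as `self.ascend_config`, so later methods (such as the compile path in the next hunk) can reuse it instead of calling `get_ascend_config()` at each use site. A minimal sketch of the pattern; `_example_use` is a hypothetical stand-in for the real methods that read the config later:

```python
# Sketch only; `_example_use` is hypothetical and stands in for methods
# like the torchair compile path that consult the config after __init__.
class NPUTorchairModelRunner(NPUModelRunner):
    def __init__(self, vllm_config, device):
        # Fetch the config once and keep it on the instance.
        self.ascend_config = get_ascend_config()
        self.enable_shared_expert_dp = self.ascend_config.enable_shared_expert_dp
        super().__init__(vllm_config, device)

    def _example_use(self):
        # Reuse the cached reference; no repeated global lookups.
        return self.ascend_config.torchair_graph_config.use_cached_graph
```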
```diff
@@ -362,22 +362,23 @@ class NPUTorchairModelRunner(NPUModelRunner):
             communication_adaptation_310p()
         config = torchair.CompilerConfig()
-        if get_ascend_config().torchair_graph_config.mode:
-            config.mode = get_ascend_config().torchair_graph_config.mode
+        if self.ascend_config.torchair_graph_config.mode:
+            config.mode = self.ascend_config.torchair_graph_config.mode
         config.experimental_config.frozen_parameter = \
-            get_ascend_config().torchair_graph_config.enable_frozen_parameter
+            self.ascend_config.torchair_graph_config.enable_frozen_parameter
         # enabling tiling_schedule_optimize on 300I Duo has some bugs, so we have to
         # disable it on 300I Duo platform now.
         config.experimental_config.tiling_schedule_optimize = not is_310p()
         config.experimental_config.enable_view_optimize = \
-            get_ascend_config().torchair_graph_config.enable_view_optimize
+            self.ascend_config.torchair_graph_config.enable_view_optimize
         torch.npu.set_compile_mode(jit_compile=False)
         if not self.use_cached_npu_graph:
             npu_backend = torchair.get_npu_backend(compiler_config=config)
-            self.torchair_compiled_model = torch.compile(self.model,
-                                                         dynamic=True,
-                                                         fullgraph=True,
-                                                         backend=npu_backend)
+            self.torchair_compiled_model = torch.compile(
+                self.model,
+                dynamic=not self.ascend_config.use_sfa,
+                fullgraph=True,
+                backend=npu_backend)
             return self.torchair_compiled_model
         else:
             # Generate a new forward proxy code object to prevent the invalidation of
```
```diff
@@ -398,7 +399,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
                 self.torchair_compiled_models[
                     batch_size] = torchair.inference.cache_compile(
                         self.model.__dict__[forward_proxy_name],
-                        dynamic=True,
+                        dynamic=not self.ascend_config.use_sfa,
                         fullgraph=True,
                         cache_dir=TORCHAIR_CACHE_DIR,
                         config=config,
```
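
Beyond the config refactor, the functional change in this diff is the `dynamic` flag passed to `torch.compile` and `torchair.inference.cache_compile`: compilation stays dynamic-shape by default, but becomes static-shape when `use_sfa` is enabled (presumably the new sparse-attention path added for DeepSeek V3.2). A minimal sketch of the same gating, assuming only a boolean `use_sfa` field on the config:

```python
import torch

def compile_for_torchair(model, ascend_config, npu_backend):
    # When use_sfa is set, disable dynamic shapes so the compiled graph
    # is specialized for fixed shapes; otherwise keep the previous
    # behavior (dynamic=True).
    return torch.compile(
        model,
        dynamic=not ascend_config.use_sfa,
        fullgraph=True,
        backend=npu_backend,
    )
```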