Add DeepSeek V3.2 support (#3270)

### What this PR does / why we need it?

This PR adds initial DeepSeek V3.2 support on top of [vLLM
v0.11.0](https://github.com/vllm-project/vllm/tree/releases/v0.11.0)
(not yet released). We will complete the vLLM adaptation as soon as
possible; the feature should be ready within the next 1-2 days.

Related doc: https://github.com/vllm-project/vllm-ascend/pull/3223.

### Does this PR introduce _any_ user-facing change?
Yes!

### How was this patch tested?
CI passed; the DeepSeek doc will be run as a follow-up.


- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/tree/releases/v0.11.0

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: zzzzwwjj <1183291235@qq.com>
Signed-off-by: linfeng-yuan <1102311262@qq.com>
Signed-off-by: wxsIcey <1790571317@qq.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: zzzzwwjj <1183291235@qq.com>
Co-authored-by: linfeng-yuan <1102311262@qq.com>
Co-authored-by: wxsIcey <1790571317@qq.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
Commit 81bd6e4c99 (parent 5503a3142f), authored by wangxiyuan on
2025-09-30 03:25:58 +08:00, committed by GitHub.
27 changed files with 4354 additions and 70 deletions.


```diff
@@ -48,8 +48,8 @@ from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 class NPUTorchairModelRunner(NPUModelRunner):
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
-        ascend_config = get_ascend_config()
-        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
+        self.ascend_config = get_ascend_config()
+        self.enable_shared_expert_dp = self.ascend_config.enable_shared_expert_dp
         super().__init__(vllm_config, device)
         if self.speculative_config:
             self.actual_seq_lengths_q = list(
```
```diff
@@ -66,10 +66,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
         self.new_kv_cache_bytes = -1
         self.torchair_compiled_model = None  # type: ignore
         self.torchair_compiled_models = {}  # type: ignore
-        self.use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph
-        self.use_cached_kv_cache_bytes = ascend_config.torchair_graph_config.use_cached_kv_cache_bytes
-        self.torchair_graph_batch_sizes = ascend_config.torchair_graph_config.graph_batch_sizes
-        if ascend_config.torchair_graph_config.graph_batch_sizes_init:
+        self.use_cached_npu_graph = self.ascend_config.torchair_graph_config.use_cached_graph
+        self.use_cached_kv_cache_bytes = self.ascend_config.torchair_graph_config.use_cached_kv_cache_bytes
+        self.torchair_graph_batch_sizes = self.ascend_config.torchair_graph_config.graph_batch_sizes
+        if self.ascend_config.torchair_graph_config.graph_batch_sizes_init:
             self.init_torchair_graph_batch_sizes()
             self.update_torchair_graph_batch_sizes()
```
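
The two hunks above are a small refactor: the Ascend config is fetched once in `__init__` and stored as `self.ascend_config`, so later methods (such as the compile path in the next hunk) can reuse it instead of calling `get_ascend_config()` at each use site. A minimal sketch of the pattern; `_example_use` is a hypothetical stand-in for the real methods that read the config later:

```python
# Sketch only; `_example_use` is hypothetical and stands in for methods
# like the torchair compile path that consult the config after __init__.
class NPUTorchairModelRunner(NPUModelRunner):
    def __init__(self, vllm_config, device):
        # Fetch the config once and keep it on the instance.
        self.ascend_config = get_ascend_config()
        self.enable_shared_expert_dp = self.ascend_config.enable_shared_expert_dp
        super().__init__(vllm_config, device)

    def _example_use(self):
        # Reuse the cached reference; no repeated global lookups.
        return self.ascend_config.torchair_graph_config.use_cached_graph
```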
```diff
@@ -362,22 +362,23 @@ class NPUTorchairModelRunner(NPUModelRunner):
             communication_adaptation_310p()
         config = torchair.CompilerConfig()
-        if get_ascend_config().torchair_graph_config.mode:
-            config.mode = get_ascend_config().torchair_graph_config.mode
+        if self.ascend_config.torchair_graph_config.mode:
+            config.mode = self.ascend_config.torchair_graph_config.mode
         config.experimental_config.frozen_parameter = \
-            get_ascend_config().torchair_graph_config.enable_frozen_parameter
+            self.ascend_config.torchair_graph_config.enable_frozen_parameter
         # enabling tiling_schedule_optimize on 300I Duo has some bugs, so we have to
         # disable it on 300I Duo platform now.
         config.experimental_config.tiling_schedule_optimize = not is_310p()
         config.experimental_config.enable_view_optimize = \
-            get_ascend_config().torchair_graph_config.enable_view_optimize
+            self.ascend_config.torchair_graph_config.enable_view_optimize
         torch.npu.set_compile_mode(jit_compile=False)
         if not self.use_cached_npu_graph:
             npu_backend = torchair.get_npu_backend(compiler_config=config)
-            self.torchair_compiled_model = torch.compile(self.model,
-                                                         dynamic=True,
-                                                         fullgraph=True,
-                                                         backend=npu_backend)
+            self.torchair_compiled_model = torch.compile(
+                self.model,
+                dynamic=not self.ascend_config.use_sfa,
+                fullgraph=True,
+                backend=npu_backend)
             return self.torchair_compiled_model
         else:
             # Generate a new forward proxy code object to prevent the invalidation of
```
```diff
@@ -398,7 +399,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
                 self.torchair_compiled_models[
                     batch_size] = torchair.inference.cache_compile(
                         self.model.__dict__[forward_proxy_name],
-                        dynamic=True,
+                        dynamic=not self.ascend_config.use_sfa,
                         fullgraph=True,
                         cache_dir=TORCHAIR_CACHE_DIR,
                         config=config,
```
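
Beyond the config refactor, the functional change in this diff is the `dynamic` flag passed to `torch.compile` and `torchair.inference.cache_compile`: compilation stays dynamic-shape by default, but becomes static-shape when `use_sfa` is enabled (presumably the new sparse-attention path added for DeepSeek V3.2). A minimal sketch of the same gating, assuming only a boolean `use_sfa` field on the config:

```python
import torch

def compile_for_torchair(model, ascend_config, npu_backend):
    # When use_sfa is set, disable dynamic shapes so the compiled graph
    # is specialized for fixed shapes; otherwise keep the previous
    # behavior (dynamic=True).
    return torch.compile(
        model,
        dynamic=not ascend_config.use_sfa,
        fullgraph=True,
        backend=npu_backend,
    )
```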