From 13c73924169823fa6cc2e43b47fb6afc62e286fa Mon Sep 17 00:00:00 2001
From: Yaphets24 <44045681+Yaphets24@users.noreply.github.com>
Date: Tue, 14 Apr 2026 17:52:55 +0800
Subject: [PATCH] [BugFix] fix dsv3.1 service failed to start (#8207)

### What this PR does / why we need it?

This PR fixes a service startup failure for DeepSeek-V3.1 models by
removing a strict type assertion for `MLAAttentionSpec` in
`NPUModelRunner.get_kv_cache_spec`. The assertion was failing due to
class identity mismatches caused by the runtime patching of
`MLAAttentionSpec` with `AscendMLAAttentionSpec`.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Verified that the service starts correctly for DSV3.1 models.

Signed-off-by: mayumeng <m30059191@china.huawei.com>
Co-authored-by: mayumeng <m30059191@china.huawei.com>
---
 vllm_ascend/worker/model_runner_v1.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index bac7345a..84264698 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -3241,7 +3241,6 @@ class NPUModelRunner(GPUModelRunner):
                         cache_sparse_c8=self.use_sparse_c8_indexer,
                     )
                 elif spec := attn_module.get_kv_cache_spec(self.vllm_config):
-                    assert isinstance(spec, MLAAttentionSpec)
                     from vllm.v1.kv_cache_interface import MLAAttentionSpec as AscendMLAAttentionSpec
                     if getattr(attn_module.impl, "fa_quant_layer", False):
                         head_size = attn_module.head_size + attn_module.qk_rope_head_dim