[Feature]: Support running Qwen2.5/Qwen3 dense and Qwen2.5-VL models on 310P devices (#5776)

### What this PR does / why we need it?

Add basic 310P support. Currently, only dense models running in eager mode work.

- vLLM version: v0.13.0
- vLLM main: 2f4e6548ef

---------

Signed-off-by: Tflowers-0129 <2906339855@qq.com>
Signed-off-by: Shaoxu Cheng <2906339855@qq.com>
2026-01-17 11:49:18 +08:00
parent 7feb74590b
commit 1ffca8673f
17 changed files with 682 additions and 23 deletions
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -48,6 +48,7 @@ from vllm_ascend.utils import (
    update_aclgraph_sizes,
    update_cudagraph_capture_sizes,
    update_default_aclgraph_sizes,
+    is_310p,
 )

 if TYPE_CHECKING:
@@ -322,7 +323,9 @@ class NPUPlatform(Platform):
        if parallel_config and parallel_config.worker_cls == "auto":
            # TODO: this is a tricky way to disable `use_sequence_parallel_moe` in vllm.
            parallel_config.all2all_backend = "flashinfer_all2allv"
-            if ascend_config.xlite_graph_config.enabled:
+            if is_310p():
+                parallel_config.worker_cls = "vllm_ascend._310p.worker_310p.NPUWorker310"
+            elif ascend_config.xlite_graph_config.enabled:
                logger.info("openEuler Xlite enabled. See: https://atomgit.com/openeuler/GVirt/tree/master/xlite")
                parallel_config.worker_cls = "vllm_ascend.xlite.xlite_worker.XliteWorker"
            else:
@@ -394,13 +397,27 @@ class NPUPlatform(Platform):

    @classmethod
    def get_attn_backend_cls(cls, selected_backend, attn_selector_config):
+        key = (attn_selector_config.use_mla, attn_selector_config.use_sparse)
+
        backend_map = {
            (True, False): "vllm_ascend.attention.mla_v1.AscendMLABackend",
            (False, False): "vllm_ascend.attention.attention_v1.AscendAttentionBackend",
            (True, True): "vllm_ascend.attention.sfa_v1.AscendSFABackend",
        }
+        backend_map_310 = {
+            (
+                False,
+                False,
+            ): "vllm_ascend._310p.attention.attention_v1.AscendAttentionBackend310",
+            # TODO If MLA/SFA is supported in the future, consider implementing the logic described in these comments.
+            # (True, False): "...AscendMLABackend310",
+            # (True, True):  "...AscendSFABackend310",
+        }

-        return backend_map[(attn_selector_config.use_mla, attn_selector_config.use_sparse)]
+        if is_310p():
+            return backend_map_310.get(key, backend_map_310[(False, False)])
+
+        return backend_map[key]

    @classmethod
    def get_punica_wrapper(cls) -> str: