[Feature]: Support running qwen2.5/3 dense and qwen2.5vl models on 310P devices (#5776)

### What this PR does / why we need it?
Add basic 310P support. For now, only dense models are supported, and only in eager mode.

- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef

---------

Signed-off-by: Tflowers-0129 <2906339855@qq.com>
Signed-off-by: Shaoxu Cheng <2906339855@qq.com>
This commit is contained in:
Shaoxu Cheng
2026-01-17 11:49:18 +08:00
committed by GitHub
parent 7feb74590b
commit 1ffca8673f
17 changed files with 682 additions and 23 deletions

View File

@@ -48,6 +48,7 @@ from vllm_ascend.utils import (
update_aclgraph_sizes,
update_cudagraph_capture_sizes,
update_default_aclgraph_sizes,
is_310p,
)
if TYPE_CHECKING:
@@ -322,7 +323,9 @@ class NPUPlatform(Platform):
if parallel_config and parallel_config.worker_cls == "auto":
# TODO: this is a tricky way to disable `use_sequence_parallel_moe` in vllm.
parallel_config.all2all_backend = "flashinfer_all2allv"
if ascend_config.xlite_graph_config.enabled:
if is_310p():
parallel_config.worker_cls = "vllm_ascend._310p.worker_310p.NPUWorker310"
elif ascend_config.xlite_graph_config.enabled:
logger.info("openEuler Xlite enabled. See: https://atomgit.com/openeuler/GVirt/tree/master/xlite")
parallel_config.worker_cls = "vllm_ascend.xlite.xlite_worker.XliteWorker"
else:
@@ -394,13 +397,27 @@ class NPUPlatform(Platform):
@classmethod
def get_attn_backend_cls(cls, selected_backend, attn_selector_config):
    """Return the dotted import path of the attention backend class to use.

    The backend is selected from the pair
    ``(attn_selector_config.use_mla, attn_selector_config.use_sparse)``.
    ``selected_backend`` is not consulted by this implementation.

    On 310P devices only the standard (non-MLA, non-sparse) backend is
    registered, so every configuration resolves to that backend there.

    Raises:
        KeyError: On non-310P devices, when the ``(use_mla, use_sparse)``
            combination has no registered backend (``(False, True)``).
    """
    key = (attn_selector_config.use_mla, attn_selector_config.use_sparse)

    # The device check must come before the generic lookup: an unconditional
    # `return backend_map[...]` ahead of it would make the 310P dispatch
    # unreachable.
    if is_310p():
        backend_map_310 = {
            (False, False): "vllm_ascend._310p.attention.attention_v1.AscendAttentionBackend310",
            # TODO If MLA/SFA is supported in the future, consider implementing the logic described in these comments.
            # (True, False): "...AscendMLABackend310",
            # (True, True): "...AscendSFABackend310",
        }
        # NOTE(review): combinations without a 310P-specific backend fall
        # back silently to the standard attention backend rather than
        # raising — confirm this is the intended behavior for MLA/sparse
        # models on 310P.
        return backend_map_310.get(key, backend_map_310[(False, False)])

    backend_map = {
        (True, False): "vllm_ascend.attention.mla_v1.AscendMLABackend",
        (False, False): "vllm_ascend.attention.attention_v1.AscendAttentionBackend",
        (True, True): "vllm_ascend.attention.sfa_v1.AscendSFABackend",
    }
    return backend_map[key]
@classmethod
def get_punica_wrapper(cls) -> str: