qwen3_moe/qwen25 support torchair graph (#2403)

### What this PR does / why we need it?
Adds support for TorchAir graph mode to the qwen3_moe and qwen2.5 models.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
```python
llm = LLM(
    model=model,
    tensor_parallel_size=GPUs_per_dp_rank,
    enforce_eager=False,
    enable_expert_parallel=True,
    max_model_len=4096,
    max_num_seqs=16,
    trust_remote_code=trust_remote_code,
    gpu_memory_utilization=0.4,
    additional_config={
             "torchair_graph_config": {
                 "enabled": True,
                 "use_cached_graph": False,
                 "graph_batch_sizes_init": False,
                 "graph_batch_sizes": [16]
             },
             "ascend_scheduler_config": {
                 "enabled": True,
                 "chunked_prefill_enabled":True,
             },
             "refresh": True,
    },
)
```

- vLLM version: v0.10.0
- vLLM main:
b87cb97a53

Signed-off-by: taoyuxiang <oui.nicholas.tao@gmail.com>
This commit is contained in:
Nicholas Tao
2025-08-20 11:23:50 +08:00
committed by GitHub
parent 31ae249742
commit 7bec1a9b9c
9 changed files with 1123 additions and 9 deletions

View File

@@ -17,7 +17,7 @@ from typing import Optional
from vllm.logger import logger
TORCHAIR_MODEL_LIST = ["deepseek", "pangu", "kimi_k2"]
TORCHAIR_MODEL_LIST = ["deepseek", "pangu", "kimi_k2", "qwen"]
def _check_torchair_supported(model_type: str):
@@ -162,7 +162,7 @@ def check_ascend_config(vllm_config, enforce_eager):
else:
# torchair_graph case
if ascend_config.torchair_graph_config.enabled:
# torchair_graph is supported for deepseek/pangu model only.
# torchair_graph is supported for deepseek/pangu/qwen model only.
if vllm_config.model_config:
model_type = vllm_config.model_config.hf_config.model_type
if not _check_torchair_supported(model_type):