[Feature] support aclgraph for model runner v2 (#7110)

### What this PR does / why we need it?

This PR aims to add aclgraph support for model runner v2; please see RFC #5208 for details. The PR contains the following modifications:

- Adapt to the newest commit of the vLLM main branch.
- Provide a unified interface to the extra forward context for both model runner v1 and model runner v2.
- Implement graph mode for the main model.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- vLLM version: v0.16.0
- vLLM main: 4034c3d32e

---------

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
2026-03-13 09:11:46 +08:00
parent 1f71da80eb
commit c980e68d40
52 changed files with 840 additions and 309 deletions
--- a/vllm_ascend/ops/fused_moe/moe_mlp.py
+++ b/vllm_ascend/ops/fused_moe/moe_mlp.py
@@ -18,10 +18,9 @@
 import torch
 import torch_npu
 from torch.nn.functional import pad
-from vllm.forward_context import get_forward_context
 from vllm.triton_utils import HAS_TRITON

-from vllm_ascend.ascend_forward_context import MoECommType
+from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType
 from vllm_ascend.device.device_op import DeviceOperator
 from vllm_ascend.device.mxfp_compat import (
    ensure_mxfp8_moe_available,
@@ -147,7 +146,7 @@ def quant_apply_mlp(
    weight_prefetch_method = get_weight_prefetch_method()
    if weight_prefetch_method:
        weight_prefetch_method.maybe_prefetch_moe_weight_postprocess(hidden_states)
-    is_mc2 = get_forward_context().moe_comm_type == MoECommType.MC2
+    is_mc2 = _EXTRA_CTX.moe_comm_type == MoECommType.MC2
    if w1_scale_bias is None and w1_offset is None and is_mc2:
        if _custom_gmm_swiglu_enabled(fusion, dynamic_eplb) and not use_mxfp_quant:
            # gmm1: gate_up_proj & act_fn: swiglu