From 26b83f8bde0f85bf01934aa7d33295e563e32754 Mon Sep 17 00:00:00 2001 From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com> Date: Tue, 3 Feb 2026 10:32:27 +0800 Subject: [PATCH] [Bugfix] Improve Triton stability on Ascend for large grids (#6301) ### What this PR does / why we need it? Improve Triton stability on Ascend for large grids: set `TRITON_ALL_BLOCKS_PARALLEL=1` when the grid size exceeds 65535. - vLLM version: v0.14.1 - vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd Signed-off-by: hfadzxy --- vllm_ascend/ops/rotary_embedding.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py index 0c19b3e9..b4da71f3 100644 --- a/vllm_ascend/ops/rotary_embedding.py +++ b/vllm_ascend/ops/rotary_embedding.py @@ -16,6 +16,7 @@ # import math +import os from typing import Optional, Tuple import torch @@ -547,6 +548,9 @@ class AscendDeepseekScalingRotaryEmbedding(DeepseekScalingRotaryEmbedding): class AscendMRotaryEmbedding(MRotaryEmbedding): + # Empirical safety threshold for large Triton grids on Ascend NPU + _ASCEND_TRITON_GRID_LIMIT = 65535 + def forward_triton(self, positions: torch.Tensor, query: torch.Tensor, @@ -568,6 +572,12 @@ class AscendMRotaryEmbedding(MRotaryEmbedding): assert self.mrope_section + # When the grid becomes large, enable TRITON_ALL_BLOCKS_PARALLEL + # to avoid scheduler/runtime failures. + if (query_shape[0] > self._ASCEND_TRITON_GRID_LIMIT and + os.environ.get("TRITON_ALL_BLOCKS_PARALLEL") != "1"): + os.environ["TRITON_ALL_BLOCKS_PARALLEL"] = "1" + q, k = triton_mrope( query, key,