From 26b83f8bde0f85bf01934aa7d33295e563e32754 Mon Sep 17 00:00:00 2001
From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com>
Date: Tue, 3 Feb 2026 10:32:27 +0800
Subject: [PATCH] [Bugfix] Improve Triton stability on Ascend for large grids
 (#6301)

### What this PR does / why we need it?
Improve Triton stability on Ascend for large grids
set `TRITON_ALL_BLOCKS_PARALLEL=1` when grids > 65535

- vLLM version: v0.14.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
---
 vllm_ascend/ops/rotary_embedding.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py
index 0c19b3e9..b4da71f3 100644
--- a/vllm_ascend/ops/rotary_embedding.py
+++ b/vllm_ascend/ops/rotary_embedding.py
@@ -16,6 +16,7 @@
 #
 
 import math
+import os
 from typing import Optional, Tuple
 
 import torch
@@ -547,6 +548,9 @@ class AscendDeepseekScalingRotaryEmbedding(DeepseekScalingRotaryEmbedding):
 
 class AscendMRotaryEmbedding(MRotaryEmbedding):
 
+    # Empirical safety threshold for large Triton grids on Ascend NPU
+    _ASCEND_TRITON_GRID_LIMIT = 65535
+
     def forward_triton(self,
                        positions: torch.Tensor,
                        query: torch.Tensor,
@@ -568,6 +572,12 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
 
         assert self.mrope_section
 
+        # When the grid becomes large, enable TRITON_ALL_BLOCKS_PARALLEL 
+        # to avoid scheduler/runtime failures.
+        if (query_shape[0] > self._ASCEND_TRITON_GRID_LIMIT and 
+                os.environ.get("TRITON_ALL_BLOCKS_PARALLEL") != "1"):
+            os.environ["TRITON_ALL_BLOCKS_PARALLEL"] = "1"
+
         q, k = triton_mrope(
             query,
             key,