[Bugfix] Improve Triton stability on Ascend for large grids (#6301)

### What this PR does / why we need it? Improve Triton stability on Ascend for large grids: set `TRITON_ALL_BLOCKS_PARALLEL=1` when the grid size exceeds 65535. - vLLM version: v0.14.1 - vLLM main: dc917cceb8 Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2026-02-03 10:32:27 +08:00
parent 05cc03d785
commit 26b83f8bde
1 changed files with 10 additions and 0 deletions
--- a/vllm_ascend/ops/rotary_embedding.py
+++ b/vllm_ascend/ops/rotary_embedding.py
@@ -16,6 +16,7 @@
 #
 import math
 import os
 from typing import Optional, Tuple
 import torch
@@ -547,6 +548,9 @@ class AscendDeepseekScalingRotaryEmbedding(DeepseekScalingRotaryEmbedding):
 class AscendMRotaryEmbedding(MRotaryEmbedding):
    # Empirical safety threshold for large Triton grids on Ascend NPU
    _ASCEND_TRITON_GRID_LIMIT = 65535
    def forward_triton(self,
                       positions: torch.Tensor,
                       query: torch.Tensor,
@@ -568,6 +572,12 @@ class AscendMRotaryEmbedding(MRotaryEmbedding):
        assert self.mrope_section
        # When the grid becomes large, enable TRITON_ALL_BLOCKS_PARALLEL 
        # to avoid scheduler/runtime failures.
        if (query_shape[0] > self._ASCEND_TRITON_GRID_LIMIT and 
                os.environ.get("TRITON_ALL_BLOCKS_PARALLEL") != "1"):
            os.environ["TRITON_ALL_BLOCKS_PARALLEL"] = "1"
        q, k = triton_mrope(
            query,
            key,