From 0f7492d18e572f53cce67615aa714ad09090f859 Mon Sep 17 00:00:00 2001 From: zhenghaojiang Date: Wed, 13 Aug 2025 17:15:59 +0800 Subject: [PATCH] [Bugfix] fix the oom when chunkprefill with long context like 64k (#2319) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The attention mask was declared in mla.py; we don't need the splitfuse mask for MLA chunked prefill, and this mask will cause memory problems with long contexts like 64k or 128k - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/14a5d903ab826b723a24a2d89631006394de76a1 --------- Signed-off-by: haojiangzheng --- vllm_ascend/worker/model_runner_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 3aeabc6..9891a02 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -842,7 +842,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): def _make_attention_mask(self, seq_lens, query_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. - if attn_state == AscendAttentionState.ChunkedPrefill: + if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: return self.attn_mask_builder.get_splitfuse_attn_mask( seq_lens, query_lens, position, self.dtype, self.device) # Prefill without cache situation.