From 0f7492d18e572f53cce67615aa714ad09090f859 Mon Sep 17 00:00:00 2001 From: zhenghaojiang Date: Wed, 13 Aug 2025 17:15:59 +0800 Subject: [PATCH] [Bugfix] fix the oom when chunkprefill with long context like 64k (#2319) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The attention mask was declared in mla.py; we don't need the splitfuse mask for MLA chunked prefill, and this mask will cause memory problems with long contexts like 64k or 128k - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/14a5d903ab826b723a24a2d89631006394de76a1 --------- Signed-off-by: haojiangzheng --- vllm_ascend/worker/model_runner_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 3aeabc6..9891a02 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -842,7 +842,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): def _make_attention_mask(self, seq_lens, query_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. - if attn_state == AscendAttentionState.ChunkedPrefill: + if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: return self.attn_mask_builder.get_splitfuse_attn_mask( seq_lens, query_lens, position, self.dtype, self.device) # Prefill without cache situation.