[Bugfix] fix the oom when chunkprefill with long context like 64k (#2319)
The attention mask was declared in mla.py; we don't need the splitfuse
mask for MLA chunked prefill, and this mask causes memory problems with
long contexts such as 64k or 128k.
- vLLM version: v0.10.0
- vLLM main:
14a5d903ab
---------
Signed-off-by: haojiangzheng <justineric096@gmail.com>
This commit is contained in:
@@ -842,7 +842,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):

     def _make_attention_mask(self, seq_lens, query_lens, position,
                              attn_state) -> torch.Tensor:
         # Chunk Prefill situation.
-        if attn_state == AscendAttentionState.ChunkedPrefill:
+        if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla:
             return self.attn_mask_builder.get_splitfuse_attn_mask(
                 seq_lens, query_lens, position, self.dtype, self.device)
         # Prefill without cache situation.
|
|||||||
Reference in New Issue
Block a user