From 3ce5a34468e92512670759f7ee0aae0defa4ae94 Mon Sep 17 00:00:00 2001
From: Wang Xiaoran <1127858301@qq.com>
Date: Fri, 9 Jan 2026 15:55:30 +0800
Subject: [PATCH] [BugFix] Xlite: Bypass graph-mode padding in non-MTP cases to obtain the correct decode num. (#5711)

### What this PR does / why we need it?
This PR fixes a bug in the Xlite backend (https://atomgit.com/openeuler/GVirt/issues/1). The direct cause is that the XModel::PrepareAttn function received an illegal number of tokens to infer, -540. The illegal value comes from the padding applied during graph-mode inference combined with state carried over across steps, and it is triggered when a new prefill request is added in the same step in which a decode request finishes. As a first fix, the batch size is derived from num_decode_tokens instead of attn_metadata.num_decodes.

1. In graph mode, vllm_ascend pads its inputs. In the _prepare_inputs function, if the number of tokens to infer is below the configured threshold (8 in this case), attn_metadata.num_decodes is expanded to 8.
2. Meanwhile, vllm_ascend records the tokens to infer in the NPUModelRunner class variable self.query_start_loc. Because this state is carried across steps and coordinates poorly with the graph-mode padding, in some cases (for example, when a decode request completes in a step and a new prefill request is added at the same time) negative values are computed for attn_metadata.query_lens.
3. After type conversion, the negative values in query_lens overflow. Xlite then sees an excessively large number of tokens to infer for the decode request and raises a "decode len too long" alert.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Same as https://atomgit.com/openeuler/GVirt/issues/1

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d

Signed-off-by: wwwumr <1127858301@qq.com>
---
 vllm_ascend/xlite/xlite.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/vllm_ascend/xlite/xlite.py b/vllm_ascend/xlite/xlite.py
index e6c7437a..534b1b15 100644
--- a/vllm_ascend/xlite/xlite.py
+++ b/vllm_ascend/xlite/xlite.py
@@ -255,7 +255,14 @@ class XliteWrapper:
         ]
 
         if not with_prefill or self.full_mode:
-            batch = attn_metadata.num_prefills + attn_metadata.num_decodes
+            # TODO: When vllm_ascend enables graph mode, attn_metadata.num_decodes
+            # will be padded in decode requests. Therefore, it is first fixed using
+            # num_decode_tokens. However, in the future, when MTP is enabled, there
+            # may be cases where a single request involves multiple tokens, which
+            # will need to be solved.
+            num_decodes = attn_metadata.num_decode_tokens
+            num_prefills = attn_metadata.num_prefills
+            batch = num_prefills + num_decodes
             seq_lens = attn_metadata.seq_lens[:batch]
             seq_tensor = torch.cat([
                 torch.tensor([0]),
@@ -269,9 +276,9 @@ class XliteWrapper:
             xlite_attn_metadata = ModelAttnMeta()
             xlite_attn_metadata.lens = query_lens.tolist()
             xlite_attn_metadata.cached_lens = cached_lens.tolist()
-            xlite_attn_metadata.is_prefills = [
-                False
-            ] * attn_metadata.num_decodes + [True] * attn_metadata.num_prefills
+            xlite_attn_metadata.is_prefills = [False] * num_decodes + [
+                True
+            ] * num_prefills
             xlite_attn_metadata.block_tables = attn_metadata.block_tables.cpu(
             ).tolist()
 
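
For reference, a minimal, self-contained sketch of the failure mode is shown below. It is not the vllm_ascend or Xlite source; the variable names and numbers are illustrative, chosen only to show how a padded decode count plus stale per-step state can trip the "decode len too long" check, and how sizing the batch from the unpadded token count avoids it.

```python
# Illustrative sketch only: names and values are hypothetical, not taken from
# the vllm_ascend or Xlite sources.

# 1. Graph-mode padding: the reported decode count is padded up to the
#    threshold (8 here) even though fewer real decode requests exist this step.
num_decodes_padded = 8     # attn_metadata.num_decodes after padding
num_decode_tokens = 3      # real decode tokens (one per request without MTP)
num_prefills = 1

# 2. Sizing the batch from the padded count over-slices seq_lens and mislabels
#    the padded slots as decodes; combined with per-step bookkeeping that is
#    stale after a decode finished and a prefill arrived in the same step, this
#    can yield a negative per-request query length such as the observed -540.
wrong_batch = num_prefills + num_decodes_padded   # 9
query_len = -540

# 3. When that negative value is later read as an unsigned 32-bit length,
#    two's-complement reinterpretation makes it enormous, which is what trips
#    Xlite's "decode len too long" check.
print(query_len & 0xFFFFFFFF)   # 4294966756

# 4. The fix in this patch: size the batch from the unpadded token count, so
#    the sliced seq_lens and the is_prefills list match the real requests.
batch = num_prefills + num_decode_tokens          # 4, not 9
print(batch)
```

Note that once MTP is enabled, num_decode_tokens can exceed the number of decode requests (multiple tokens per request), which is why the TODO in the patch flags that case for follow-up work.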