From ccb6fb9ec19e6ae5c69bf7d73e7a8bbe6ef55c97 Mon Sep 17 00:00:00 2001 From: Yizhou <136800916+yiz-liu@users.noreply.github.com> Date: Thu, 16 Oct 2025 19:43:09 +0800 Subject: [PATCH] [Fix] Clears unused slot mappings and fixes accuracy issue with MLA models when enabling `FULL_DECODE_ONLY` (#3482) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? MLA and GQA use different computation logic: MLA slices batches and only computes on the actually valid tokens. That means outer padding must be handled carefully — the accuracy issue this PR fixes was caused by stale data in `slot_mapping` being reused by subsequent inference steps. So we zero out the portion of the slot mapping tensor that is not used by the currently scheduled tokens. ### Does this PR introduce _any_ user-facing change? None. ### How was this patch tested? Working on it. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: Yizhou Liu --- vllm_ascend/worker/model_runner_v1.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 69daf02..731b93f 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1462,6 +1462,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): slot_mapping[:total_num_scheduled_tokens], non_blocking=True, ) + self.slot_mapping[total_num_scheduled_tokens:].fill_(0) # Make AscendCommonAttentionMetadata common_attn_metadata = AscendCommonAttentionMetadata(