[Main2Main] Upgrade vllm commit to 0105 (#5595)

### What this PR does / why we need it? Upgrade vllm commit to 0105 (8be6432bdaf6275664d857b1e5e9bf8ed1ce299e) 1. Remove `maybe_padded_num_tokens` arg in `model_runner_v1.py` since https://github.com/vllm-project/vllm/pull/31517 deleted unused arg 2. Remove dense `Qwen/Qwen3-0.6B` in `tests/e2e/multicard/test_aclgraph_capture_replay.py` and `tests/e2e/multicard/test_data_parallel.py` due to https://github.com/vllm-project/vllm/pull/30739 where offline data parallel mode will not be supported/useful for dense models 3. Adapt `vllm_ascend/worker/worker.py` due to https://github.com/vllm-project/vllm/pull/31584 4. Adapt `self.block_size` calling due to https://github.com/vllm-project/vllm/pull/31540 5. Modify `test_mla_v1.py` due to https://github.com/vllm-project/vllm/pull/28454 , which refactorred `get_head_size()` ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.13.0 - vLLM main: 7157596103 Signed-off-by: wjunLu <wjunlu217@gmail.com>
2026-01-06 08:44:29 +08:00
parent c5e2f48510
commit 3cf059a72b
15 changed files with 61 additions and 38 deletions
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -136,6 +136,7 @@ class EagleProposer(VllmEagleProposer):
        draft_attn_layer_names = draft_attn_layer_names - draft_indexer_layer_names
        assert len(draft_attn_layer_names) == 1
        self.attn_layer_name = list(draft_attn_layer_names)
+        self.attn_layer_names = self.attn_layer_name

        # share embed_tokens with the target model if needed
        if get_pp_group().world_size == 1:
@@ -442,14 +443,19 @@ class EagleProposer(VllmEagleProposer):
            # For the requests that exceed the max model length, we set the
            # TODO: sequence length to 1 to minimize their overheads in attention.

+            if self.attn_metadata_builder is None:
+                attn_metadata_builder = self._get_attention_metadata_builder()
+            else:
+                attn_metadata_builder = self.attn_metadata_builder
+            block_size = attn_metadata_builder.kv_cache_spec.block_size
+
            # Compute the slot mapping.
-            block_numbers = (clamped_positions // self.block_size)
+            block_numbers = (clamped_positions // block_size)
            block_ids = attn_metadata.block_tables.gather(
                dim=1, index=block_numbers.view(-1, 1))
            block_ids = block_ids.view(-1)
-            slot_mapping_tmp = (
-                block_ids * self.vllm_config.cache_config.block_size +
-                clamped_positions % self.block_size)
+            slot_mapping_tmp = (block_ids * block_size +
+                                clamped_positions % block_size)

            # Mask out the slot mappings that exceed the max model length.
            # Otherwise, the KV cache will be inadvertently updated with the