[Main2Main] Upgrade vllm commit to 0105 (#5595)
### What this PR does / why we need it?
Upgrade vllm commit to 0105 (8be6432bdaf6275664d857b1e5e9bf8ed1ce299e)
1. Remove the `maybe_padded_num_tokens` arg in `model_runner_v1.py`, since
https://github.com/vllm-project/vllm/pull/31517 deleted this unused arg (a
minimal gating sketch is shown after this list)
2. Remove the dense `Qwen/Qwen3-0.6B` from
`tests/e2e/multicard/test_aclgraph_capture_replay.py` and
`tests/e2e/multicard/test_data_parallel.py` due to
https://github.com/vllm-project/vllm/pull/30739,
after which offline data parallel mode is no longer supported/useful for
dense models
3. Adapt `vllm_ascend/worker/worker.py` due to
https://github.com/vllm-project/vllm/pull/31584
4. Adapt the `self.block_size` usage due to
https://github.com/vllm-project/vllm/pull/31540 (see the slot-mapping sketch
after this list)
5. Modify `test_mla_v1.py` due to
https://github.com/vllm-project/vllm/pull/28454, which refactored
`get_head_size()`
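
The model-runner adaptations here follow one compatibility pattern: branch on `vllm_version_is('0.13.0')` and call the old or new vLLM API in each branch. Below is a minimal sketch of that gate for item 1; `runner` and `build_model_kwargs` are hypothetical stand-ins, not the actual `NPUModelRunner` code path.

```python
# Minimal sketch of the version gate used throughout this PR; `runner` is a
# hypothetical stand-in for the real NPUModelRunner.
from vllm_ascend.utils import vllm_version_is


def build_model_kwargs(runner, maybe_padded_num_tokens: int) -> dict:
    if vllm_version_is('0.13.0'):
        # vLLM v0.13.0 still expects the padded token count to be passed
        # through to _init_model_kwargs().
        return runner._init_model_kwargs(maybe_padded_num_tokens)
    # Newer vLLM commits (after vllm-project/vllm#31517) dropped the unused arg.
    return runner._init_model_kwargs()
```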
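For item 4, `EagleProposer` no longer reads a cached `self.block_size`; the block size is taken from the attention metadata builder's `kv_cache_spec`, and the slot mapping is derived from it. The standalone torch sketch below mirrors that arithmetic only; the function name and signature are illustrative, not the actual proposer code.

```python
import torch


def compute_slot_mapping(clamped_positions: torch.Tensor,
                         block_tables: torch.Tensor,
                         block_size: int) -> torch.Tensor:
    """Illustrative only: map per-request token positions to flat KV-cache slots."""
    # Logical block index of each position within its request.
    block_numbers = clamped_positions // block_size
    # Translate logical block indices into physical block ids via the block table.
    block_ids = block_tables.gather(dim=1, index=block_numbers.view(-1, 1)).view(-1)
    # Flat slot index = physical block id * block size + offset inside the block.
    return block_ids * block_size + clamped_positions % block_size
```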
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main: 7157596103
Signed-off-by: wjunLu <wjunlu217@gmail.com>
.github/workflows/bot_pr_create.yaml (2 changed lines)
@@ -34,7 +34,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=7157596103666ee7ccb7008acee8bff8a8ff1731
+          VLLM_COMMIT=8be6432bdaf6275664d857b1e5e9bf8ed1ce299e
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

       - name: Checkout repository
.github/workflows/pr_test_full.yaml (2 changed lines)
@@ -74,7 +74,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0]
+        vllm_version: [8be6432bdaf6275664d857b1e5e9bf8ed1ce299e, v0.13.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
.github/workflows/pr_test_light.yaml (6 changed lines)
@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 7157596103666ee7ccb7008acee8bff8a8ff1731
+      vllm: 8be6432bdaf6275664d857b1e5e9bf8ed1ce299e
   changes:
     runs-on: linux-aarch64-a2-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
       SOC_VERSION: ascend910b1
     strategy:
       matrix:
-        vllm_version: [7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0]
+        vllm_version: [8be6432bdaf6275664d857b1e5e9bf8ed1ce299e, v0.13.0]

     steps:
       - name: Free up disk space
@@ -163,7 +163,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0]
+        vllm_version: [8be6432bdaf6275664d857b1e5e9bf8ed1ce299e, v0.13.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
@@ -51,7 +51,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/
 For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.

 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | 7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
+| main | 8be6432bdaf6275664d857b1e5e9bf8ed1ce299e, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |

 ## Release cadence
@@ -28,7 +28,8 @@ from vllm.utils.network_utils import get_open_port
 from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

 MODELS = [
-    "Qwen/Qwen3-0.6B",
+    # Offline data parallel mode will be not supported/useful for dense models
+    # "Qwen/Qwen3-0.6B",
     "vllm-ascend/DeepSeek-V2-Lite-W8A8",
 ]
@@ -27,9 +27,7 @@ from unittest.mock import patch

 import pytest

-MODELS = [
-    "Qwen/Qwen3-0.6B", "Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"
-]
+MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]


 @pytest.mark.parametrize("model", MODELS)
@@ -9,7 +9,7 @@ from unittest.mock import patch

 import pytest

-MODELS = ["Qwen/Qwen3-0.6B"]
+MODELS = ["Qwen/Qwen3-30B-A3B"]


 @pytest.mark.parametrize("model", MODELS)
@@ -17,6 +17,7 @@ from vllm_ascend.attention.mla_v1 import (AscendMLABackend,
                                           AscendMLAPrefillMetadata,
                                           ChunkedContextMetadata)
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
+from vllm_ascend.utils import vllm_version_is


 class TestAscendMLABackend(TestBase):
@@ -392,7 +393,10 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
         self.mock_vllm_config.model_config = model_config
         self.kv_cache_spec = MagicMock()
         self.kv_cache_spec.num_layers = 32
-        self.kv_cache_spec.head_size = 128
+        if vllm_version_is('0.13.0'):
+            self.kv_cache_spec.head_size = 128
+        else:
+            self.kv_cache_spec.head_size = 64
         self.kv_cache_spec.num_heads = 32

     @patch("vllm_ascend.attention.mla_v1.get_cos_and_sin_mla")
@@ -18,13 +18,6 @@
 import sys
 from unittest.mock import MagicMock

-from vllm_ascend.utils import adapt_patch  # noqa E402
-from vllm_ascend.utils import register_ascend_customop
-
-# triton and torch_npu is not available in the environment, so we need to mock them
-sys.modules['torch_npu'].npu.current_device = MagicMock(return_value=0)
-sys.modules['torch_npu._inductor'] = MagicMock()
-
 triton_runtime = MagicMock()
 triton_runtime.driver.active.utils.get_device_properties.return_value = {
     'num_aic': 8,
@@ -32,6 +25,13 @@ triton_runtime.driver.active.utils.get_device_properties.return_value = {
 }
 sys.modules['triton.runtime'] = triton_runtime

+from vllm_ascend.utils import adapt_patch  # noqa E402
+from vllm_ascend.utils import register_ascend_customop  # noqa E402
+
+# triton and torch_npu is not available in the environment, so we need to mock them
+sys.modules['torch_npu'].npu.current_device = MagicMock(return_value=0)
+sys.modules['torch_npu._inductor'] = MagicMock()
+
-adapt_patch()
+adapt_patch(True)

@@ -58,7 +58,6 @@ class TestEagleProposerInitialization(TestBase):
                                  device=self.device,
                                  runner=self.runner)

-        self.assertEqual(proposer.block_size, 16)
         self.assertEqual(proposer.hidden_size, 4096)
         self.assertTrue(proposer.use_cuda_graph)

@@ -86,7 +86,6 @@ class TestMtpProposer:
         assert proposer.dtype == torch.float16
         assert proposer.num_speculative_tokens == 2
         assert proposer.hidden_size == 4096
-        assert proposer.block_size == 16

         # Test with mrope enabled
         assert hasattr(proposer, "positions")
@@ -197,6 +197,7 @@ class AscendAttentionMetadataBuilder(AttentionMetadataBuilder[AscendMetadata]):
         vllm_config: VllmConfig,
         device: torch.device,
     ):
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         self.compilation_config = vllm_config.compilation_config
@@ -136,6 +136,7 @@ class EagleProposer(VllmEagleProposer):
         draft_attn_layer_names = draft_attn_layer_names - draft_indexer_layer_names
         assert len(draft_attn_layer_names) == 1
         self.attn_layer_name = list(draft_attn_layer_names)
+        self.attn_layer_names = self.attn_layer_name

         # share embed_tokens with the target model if needed
         if get_pp_group().world_size == 1:
@@ -442,14 +443,19 @@ class EagleProposer(VllmEagleProposer):
         # For the requests that exceed the max model length, we set the
         # TODO: sequence length to 1 to minimize their overheads in attention.

+        if self.attn_metadata_builder is None:
+            attn_metadata_builder = self._get_attention_metadata_builder()
+        else:
+            attn_metadata_builder = self.attn_metadata_builder
+        block_size = attn_metadata_builder.kv_cache_spec.block_size
+
         # Compute the slot mapping.
-        block_numbers = (clamped_positions // self.block_size)
+        block_numbers = (clamped_positions // block_size)
         block_ids = attn_metadata.block_tables.gather(
             dim=1, index=block_numbers.view(-1, 1))
         block_ids = block_ids.view(-1)
-        slot_mapping_tmp = (
-            block_ids * self.vllm_config.cache_config.block_size +
-            clamped_positions % self.block_size)
+        slot_mapping_tmp = (block_ids * block_size +
+                            clamped_positions % block_size)

         # Mask out the slot mappings that exceed the max model length.
         # Otherwise, the KV cache will be inadvertently updated with the
@@ -107,7 +107,7 @@ from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
 from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration,
                                enable_sp, get_ascend_device_type, is_moe_model,
                                lmhead_tp_enable, maybe_trans_nz,
-                               set_weight_prefetch_method)
+                               set_weight_prefetch_method, vllm_version_is)
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 from vllm_ascend.worker.pcp_utils import PCPManager

@@ -1097,12 +1097,20 @@ class NPUModelRunner(GPUModelRunner):
                              intermediate_tensors,
                              inputs_embeds):
         assert self.model is not None
-        hidden_states = self.model(
-            input_ids=input_ids,
-            positions=positions,
-            intermediate_tensors=intermediate_tensors,
-            inputs_embeds=inputs_embeds,
-            **self._init_model_kwargs(maybe_padded_num_tokens))
+        if vllm_version_is('0.13.0'):
+            hidden_states = self.model(
+                input_ids=input_ids,
+                positions=positions,
+                intermediate_tensors=intermediate_tensors,
+                inputs_embeds=inputs_embeds,
+                **self._init_model_kwargs(maybe_padded_num_tokens))
+        else:
+            hidden_states = self.model(
+                input_ids=input_ids,
+                positions=positions,
+                intermediate_tensors=intermediate_tensors,
+                inputs_embeds=inputs_embeds,
+                **self._init_model_kwargs())

         forward_context = get_forward_context()
         if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL \
@@ -1548,10 +1556,16 @@ class NPUModelRunner(GPUModelRunner):
             logits = None
         else:
             if self.input_batch.pooling_params:
-                pool_output = self._pool(
-                    hidden_states,
-                    scheduler_output.total_num_scheduled_tokens,
-                    num_scheduled_tokens_np)
+                if vllm_version_is('0.13.0'):
+                    pool_output = self._pool(
+                        hidden_states,
+                        scheduler_output.total_num_scheduled_tokens,
+                        num_scheduled_tokens_np)
+                else:
+                    pool_output = self._pool(
+                        hidden_states,
+                        scheduler_output.total_num_scheduled_tokens,
+                        num_scheduled_tokens_np, kv_connector_output)
         if self.debugger is not None:
             self.debugger.stop()
             self.debugger.step()
@@ -299,7 +299,7 @@ class NPUWorker(WorkerBase):
     def execute_model(
         self,
        scheduler_output: "SchedulerOutput",
-    ) -> ModelRunnerOutput | None:
+    ) -> ModelRunnerOutput | AsyncModelRunnerOutput | None:
         # enable msMonitor to monitor the performance of vllm-ascend
         if envs_ascend.MSMONITOR_USE_DAEMON:
             dp.step()
@@ -318,7 +318,8 @@ class NPUWorker(WorkerBase):

         output = self.model_runner.execute_model(scheduler_output,
                                                  intermediate_tensors)
-        if isinstance(output, (ModelRunnerOutput, NoneType)):
+        if isinstance(output,
+                      (ModelRunnerOutput, AsyncModelRunnerOutput, NoneType)):
             return output

         assert isinstance(output, IntermediateTensors)