upgrade main to 0212 (#6712)

### What this PR does / why we need it?

Fixes the `transformers_utils/processors/__init__` import error, due to https://github.com/vllm-project/vllm/pull/33247

Fixes the Fused MoE break introduced by the `MoERunner abstraction`, due to https://github.com/vllm-project/vllm/pull/32344

> Delete `AscendMoERunner` when https://github.com/vllm-project/vllm/pull/35178 is merged.

Fixes `Make Qwen3VL compatible with Transformers v5`, due to https://github.com/vllm-project/vllm/pull/34262

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 9562912cea

---------

Signed-off-by: wxsIcey <1790571317@qq.com>
2026-02-25 09:17:29 +08:00
parent 0331f16a50
commit ee59429015
11 changed files with 167 additions and 32 deletions
--- a/.github/workflows/bot_pr_create.yaml
+++ b/.github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Get vLLM version
        run: |
-          VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007
+          VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
          echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
      - name: Checkout repository
--- a/.github/workflows/dockerfiles/Dockerfile.lint
+++ b/.github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007
+ARG VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
    cd /vllm-workspace/vllm && \
    git checkout $VLLM_COMMIT
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
    name: e2e-full
    strategy:
      matrix:
-        vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
+        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
    needs: [changes]
    if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
    uses: ./.github/workflows/_e2e_test.yaml
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
  lint:
    uses: ./.github/workflows/_pre_commit.yml
    with:
-      vllm: 9562912cead1f11e8540fb91306c5cbda66f0007
+      vllm: 83b47f67b1dfad505606070ae4d9f83e50ad4ebd
  changes:
    runs-on: linux-aarch64-a2b3-0
    outputs:
@@ -87,7 +87,7 @@ jobs:
    if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
    strategy:
      matrix:
-        vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
+        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
    uses: ./.github/workflows/_unit_test.yaml
    with:
      vllm: ${{ matrix.vllm_version }}
@@ -99,7 +99,7 @@ jobs:
    name: e2e-light
    strategy:
      matrix:
-        vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
+        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
    # Note (yikun): If CI resource are limited we can split job into two chain jobs
    needs: [lint, changes]
    # only trigger e2e test after lint passed and the change is e2e related with pull request.
--- a/.github/workflows/schedule_codecov_refresh.yaml
+++ b/.github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
    name: refresh codecov
    strategy:
      matrix:
-        vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007]
+        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd]
    uses: ./.github/workflows/_unit_test.yaml
    with:
      vllm: ${{ matrix.vllm_version }}
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -56,7 +56,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
 | vLLM Ascend | vLLM         | Python           | Stable CANN | PyTorch/torch_npu  |
 |-------------|--------------|------------------|-------------|--------------------|
-|     main    | 9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0 tag | >= 3.10, < 3.12   | 8.5.0 | 2.9.0 / 2.9.0 |
+|     main    | 83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0 tag | >= 3.10, < 3.12   | 8.5.0 | 2.9.0 / 2.9.0 |
 ## Release cadence
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,6 +13,7 @@ setuptools>=64
 setuptools-scm>=8
 torch==2.9.0
 torchvision
 torchaudio
 wheel
 xgrammar>=0.1.30
 pandas-stubs
--- a/tests/ut/eplb/core/test_eplb_utils.py
+++ b/tests/ut/eplb/core/test_eplb_utils.py
@@ -25,10 +25,6 @@ class TestAscendConfig(unittest.TestCase):
        if vllm_version_is("0.15.0"):
            moe_parallel_config = FusedMoEParallelConfig(
                2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
        else:
            moe_parallel_config = FusedMoEParallelConfig(
                2, 0, 1, 2, 1, 1, 1, 1, True, "hccl",
                is_sequence_parallel=False, enable_eplb=True)
            moe_config = FusedMoEConfig(
                num_experts=8,
                experts_per_token=8,
@@ -41,6 +37,23 @@ class TestAscendConfig(unittest.TestCase):
                moe_parallel_config=moe_parallel_config,
                in_dtype=torch.float16,
            )
        else:
            moe_parallel_config = FusedMoEParallelConfig(
                2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
                enable_eplb=True)
            moe_config = FusedMoEConfig(
                num_experts=8,
                experts_per_token=8,
                hidden_dim=8192,
                intermediate_size_per_partition=5,
                num_local_experts=8,
                num_logical_experts=8,
                activation="silu",
                device="npu",
                routing_method=RoutingMethodType.Simulated,
                moe_parallel_config=moe_parallel_config,
                in_dtype=torch.float16,
            )
        moe_config.supports_eplb = True
        self.vllm_config = vllm_config
        self.moe_config = moe_config
--- a/vllm_ascend/_310p/model_runner_310p.py
+++ b/vllm_ascend/_310p/model_runner_310p.py
@@ -236,22 +236,22 @@ class NPUModelRunner310(NPUModelRunner):
                prev_draft_token_indices.extend(range(start, start + draft_len))
                indices_match &= prev_index == flattened_index
                max_flattened_index = max(max_flattened_index, flattened_index)
-        num_commmon_tokens = len(sample_flattened_indices)
+        num_common_tokens = len(sample_flattened_indices)
        total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens
-        if num_commmon_tokens < total_without_spec:
+        if num_common_tokens < total_without_spec:
            self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
            if self.enable_prompt_embeds:
                self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
                self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
-        if num_commmon_tokens == 0:
+        if num_common_tokens == 0:
            return
-        if indices_match and max_flattened_index == (num_commmon_tokens - 1):
+        if indices_match and max_flattened_index == (num_common_tokens - 1):
            # NOTE: Override the copy_ function here
-            indices = torch.arange(num_commmon_tokens, device=self.input_ids.gpu.device)
+            indices = torch.arange(num_common_tokens, device=self.input_ids.gpu.device)
-            source = self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0]
+            source = self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0]
            self.input_ids.gpu.index_copy_(0, indices, source)
            if self.enable_prompt_embeds:
-                self.is_token_ids.gpu[:num_commmon_tokens] = True
+                self.is_token_ids.gpu[:num_common_tokens] = True
            return
        # Upload the index tensors asynchronously so the scatter can be non-blocking.
        sampled_tokens_index_tensor = torch.tensor(
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -28,6 +28,13 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
 from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
 from vllm_ascend.utils import vllm_version_is
 if not vllm_version_is("0.15.0"):
    from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase  # type: ignore
    from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter  # type: ignore
    from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner  # type: ignore
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.distributed.parallel_state import get_mc2_group
@@ -154,6 +161,77 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
        return final_hidden_states
 if not vllm_version_is("0.15.0"):
    # TODO(wxs): Remove this subclass once vLLM can be extended directly
    # (see https://github.com/vllm-project/vllm/pull/35178).
    class AscendMoERunner(DefaultMoERunner):
        """
        Ascend-specific MoE runner layered on vLLM's ``DefaultMoERunner``.

        Only two things differ from the parent class:

        - After the parent constructor has run, ``self.moe_forward`` is rebound
          to ``torch.ops.vllm.moe_forward`` when ``self.shared_experts`` is
          ``None`` and to ``torch.ops.vllm.moe_forward_shared`` otherwise.
        - ``forward_impl`` hands the computation back to the owning layer's own
          ``forward_impl`` instead of running the parent's implementation.

        Everything else is inherited from ``DefaultMoERunner`` unchanged.

        NOTE(review): the commit description says this class should be deleted
        once https://github.com/vllm-project/vllm/pull/35178 is merged.
        """
        def __init__(
            self,
            layer: torch.nn.Module,
            moe_config: FusedMoEConfig,
            router: FusedMoERouter,
            routed_input_transform: torch.nn.Module | None,
            gate: torch.nn.Module | None,
            shared_experts: torch.nn.Module | None,
            quant_method: FusedMoEMethodBase,
            reduce_results: bool,
            enable_dbo: bool,
        ):
            """
            Forward every argument positionally to ``DefaultMoERunner.__init__``
            and then select the ``moe_forward`` op.

            The positional order of the ``super().__init__`` call mirrors this
            signature one-to-one.
            """
            super().__init__(
                layer,
                moe_config,
                router,
                routed_input_transform,
                gate,
                shared_experts,
                quant_method,
                reduce_results,
                enable_dbo,
            )
            # Rebind the forward op after the parent constructor has run.
            # NOTE(review): presumably this overrides whatever the parent
            # selected so the ops visible under ``torch.ops.vllm`` on this
            # platform are used -- confirm against DefaultMoERunner.
            if self.shared_experts is None:
                self.moe_forward = torch.ops.vllm.moe_forward
            else:
                self.moe_forward = torch.ops.vllm.moe_forward_shared
        def forward_impl(
            self,
            layer: torch.nn.Module,
            hidden_states: torch.Tensor,
            router_logits: torch.Tensor,
            shared_input: torch.Tensor | None,
        ):
            """
            Override the default forward_impl to use the Ascend-specific
            implementation.

            Delegates to ``layer.forward_impl(hidden_states, router_logits)`` and
            returns its result unchanged.

            ``shared_input`` is accepted but not forwarded to the layer
            (presumably kept only to match the parent's signature -- confirm).
            """
            result = layer.forward_impl(hidden_states, router_logits)
            # If the layer has shared experts, forward_impl returns a tuple (shared_out, routed_out)
            # Otherwise, it returns just routed_out
            # The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared
            return result
 class AscendFusedMoE(FusedMoE):
    moe_counter = -1
    gate_stream: torch.npu.Stream | None = None
@@ -237,6 +315,26 @@ class AscendFusedMoE(FusedMoE):
        setup_moe_comm_method(self.moe_config)
        self.quant_type = self._get_quant_type()
        if not vllm_version_is("0.15.0"):
            self.runner = self._init_runner()
    if not vllm_version_is("0.15.0"):
        def _init_runner(self):
            """
            Build and return an ``AscendMoERunner`` for this layer.

            All constructor arguments are read from this layer's attributes at
            call time (``moe_config``, ``router``, ``_routed_input_transform``,
            ``gate``, ``shared_experts``, ``quant_method``, ``reduce_results``)
            plus ``vllm_config.parallel_config.enable_dbo``. The caller is
            responsible for storing the result (e.g. on ``self.runner``).
            """
            # Storing the runner in the FusedMoE is an intermediate state, eventually
            # the runner will own the FusedMoE layer and provide the execution interface
            # for MoE ops.
            return AscendMoERunner(
                layer=self,
                moe_config=self.moe_config,
                router=self.router,
                routed_input_transform=self._routed_input_transform,
                gate=self.gate,
                shared_experts=self.shared_experts,
                quant_method=self.quant_method,
                reduce_results=self.reduce_results,
                enable_dbo=self.vllm_config.parallel_config.enable_dbo,
            )
    def _get_quant_type(self) -> QuantType:
        quant_type = QuantType.NONE
@@ -266,6 +364,19 @@ class AscendFusedMoE(FusedMoE):
        """
        return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states)
    if not vllm_version_is("0.15.0"):
        def forward(
            self,
            hidden_states: torch.Tensor,
            router_logits: torch.Tensor,
        ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
            """
            Run the MoE layer through ``self.runner``.

            Calls ``ensure_moe_quant_config_init()`` first, then returns whatever
            ``self.runner.forward(hidden_states, router_logits)`` produces: per
            the annotation, either a single tensor or a pair of tensors.
            """
            # Must run before the runner is invoked.
            # NOTE(review): presumably lazily initialises the quant config -- confirm.
            self.ensure_moe_quant_config_init()
            return self.runner.forward(
                hidden_states,
                router_logits,
            )
    def forward_impl(  # type: ignore[override]
        self, hidden_states: torch.Tensor, router_logits: torch.Tensor, return_with_event: bool = False
    ) -> torch.Tensor | FusedMoEResult:
@@ -414,6 +525,10 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
            logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.")
        self._gate = gate
        if not vllm_version_is("0.15.0"):
            # Recreate the runner with the correct shared_experts parameter
            # The parent class created the runner before self._shared_experts was set
            self.runner = self._init_runner()
        if self.multistream_overlap_shared_expert:
            # Wrap the quant_method's process_weights_after_loading to validate that
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -525,6 +525,13 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
            "increase the number of supported shapes, set HCCL_OP_EXPANSION_MODE=AIV."
        )
    from vllm_ascend.utils import vllm_version_is
    if vllm_version_is("0.15.0"):
        arch_name = vllm_config.model_config.architectures[0]
    else:
        arch_name = vllm_config.model_config.architecture
    # If original sizes exceed maximum, sample a representative subset
    if max_num_batch_sizes < len(original_sizes):
        # Sample uniformly from original sizes
@@ -536,10 +543,9 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
        sampled_sizes = [original_sizes[i] for i in indices]
        update_cudagraph_capture_sizes(vllm_config, sampled_sizes)
        logger.info(
            "Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes",
-            vllm_config.model_config.architectures[0],
+            arch_name,
            num_hidden_layers,
            len(original_sizes),
            len(
@@ -551,7 +557,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
        compilation_config.cudagraph_capture_sizes = original_sizes
        logger.info(
            "No adjustment needed for ACL graph batch sizes: %s model (layers: %d) with %d sizes",
-            vllm_config.model_config.architectures[0],
+            arch_name,
            num_hidden_layers,
            len(original_sizes),
        )