upgrade main to 0212 (#6712)

### What this PR does / why we need it?
Fixes `transformers_utils/processors/__init__` import error, due to
https://github.com/vllm-project/vllm/pull/33247
Fixes Fused MoE break introduced by `MoERunner abstraction,` due to
https://github.com/vllm-project/vllm/pull/32344

> delete AscendMoERunnere when
https://github.com/vllm-project/vllm/pull/35178 is merged

Fixes `Make Qwen3VL compatible with Transformers v5`, due to
https://github.com/vllm-project/vllm/pull/34262

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main:
9562912cea

---------

Signed-off-by: wxsIcey <1790571317@qq.com>
This commit is contained in:
Icey
2026-02-25 09:17:29 +08:00
committed by GitHub
parent 0331f16a50
commit ee59429015
11 changed files with 167 additions and 32 deletions

View File

@@ -37,7 +37,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007
VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
- name: Checkout repository

View File

@@ -27,7 +27,7 @@ RUN apt-get update -y && \
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purpose, actually we need make a main2main matching.
ARG VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007
ARG VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
cd /vllm-workspace/vllm && \
git checkout $VLLM_COMMIT

View File

@@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: 9562912cead1f11e8540fb91306c5cbda66f0007
vllm: 83b47f67b1dfad505606070ae4d9f83e50ad4ebd
changes:
runs-on: linux-aarch64-a2b3-0
outputs:
@@ -87,7 +87,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -99,7 +99,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -33,7 +33,7 @@ jobs:
name: refresh codecov
strategy:
matrix:
vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007]
vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}

View File

@@ -56,7 +56,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | 9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
| main | 83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
## Release cadence

View File

@@ -13,6 +13,7 @@ setuptools>=64
setuptools-scm>=8
torch==2.9.0
torchvision
torchaudio
wheel
xgrammar>=0.1.30
pandas-stubs

View File

@@ -25,22 +25,35 @@ class TestAscendConfig(unittest.TestCase):
if vllm_version_is("0.15.0"):
moe_parallel_config = FusedMoEParallelConfig(
2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
moe_config = FusedMoEConfig(
num_experts=8,
experts_per_token=8,
hidden_dim=8192,
intermediate_size_per_partition=5,
num_local_experts=8,
activation="silu",
device="npu",
routing_method=RoutingMethodType.Simulated,
moe_parallel_config=moe_parallel_config,
in_dtype=torch.float16,
)
else:
moe_parallel_config = FusedMoEParallelConfig(
2, 0, 1, 2, 1, 1, 1, 1, True, "hccl",
is_sequence_parallel=False, enable_eplb=True)
moe_config = FusedMoEConfig(
num_experts=8,
experts_per_token=8,
hidden_dim=8192,
intermediate_size_per_partition=5,
num_local_experts=8,
activation="silu",
device="npu",
routing_method=RoutingMethodType.Simulated,
moe_parallel_config=moe_parallel_config,
in_dtype=torch.float16,
)
2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
enable_eplb=True)
moe_config = FusedMoEConfig(
num_experts=8,
experts_per_token=8,
hidden_dim=8192,
intermediate_size_per_partition=5,
num_local_experts=8,
num_logical_experts=8,
activation="silu",
device="npu",
routing_method=RoutingMethodType.Simulated,
moe_parallel_config=moe_parallel_config,
in_dtype=torch.float16,
)
moe_config.supports_eplb = True
self.vllm_config = vllm_config
self.moe_config = moe_config

View File

@@ -236,22 +236,22 @@ class NPUModelRunner310(NPUModelRunner):
prev_draft_token_indices.extend(range(start, start + draft_len))
indices_match &= prev_index == flattened_index
max_flattened_index = max(max_flattened_index, flattened_index)
num_commmon_tokens = len(sample_flattened_indices)
num_common_tokens = len(sample_flattened_indices)
total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens
if num_commmon_tokens < total_without_spec:
if num_common_tokens < total_without_spec:
self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
if self.enable_prompt_embeds:
self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
if num_commmon_tokens == 0:
if num_common_tokens == 0:
return
if indices_match and max_flattened_index == (num_commmon_tokens - 1):
if indices_match and max_flattened_index == (num_common_tokens - 1):
# NOTE: Override the copy_ function here
indices = torch.arange(num_commmon_tokens, device=self.input_ids.gpu.device)
source = self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0]
indices = torch.arange(num_common_tokens, device=self.input_ids.gpu.device)
source = self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0]
self.input_ids.gpu.index_copy_(0, indices, source)
if self.enable_prompt_embeds:
self.is_token_ids.gpu[:num_commmon_tokens] = True
self.is_token_ids.gpu[:num_common_tokens] = True
return
# Upload the index tensors asynchronously so the scatter can be non-blocking.
sampled_tokens_index_tensor = torch.tensor(

View File

@@ -28,6 +28,13 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
from vllm_ascend.utils import vllm_version_is
if not vllm_version_is("0.15.0"):
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase # type: ignore
from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter # type: ignore
from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import MoECommType
from vllm_ascend.distributed.parallel_state import get_mc2_group
@@ -154,6 +161,77 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
return final_hidden_states
if not vllm_version_is("0.15.0"):
# Please remove this inheritance after extending vllm, todo(wxs)
class AscendMoERunner(DefaultMoERunner):
"""
Default implementation of the MoE runner for executing Mixture of Experts layers.
This class provides a comprehensive implementation for running MoE computations
with support for:
- Expert routing and token dispatching
- Shared experts computation with optional parallel execution using CUDA streams
- Data parallel (DP) chunking for large batch processing
- Tensor model parallel and expert parallel operations
- Various quantization methods and custom operators
- Both monolithic and decomposed expert execution paths
The runner handles the complete MoE forward pass including routing tokens to
experts, executing expert computations, and combining results. It supports
advanced features like overlapped execution of shared experts and optimized
kernels for different parallel execution modes.
Eventually, this class will be split up and specialized for different
configurations, e.g. the presence or absence of shared experts, a gate, etc.
"""
def __init__(
self,
layer: torch.nn.Module,
moe_config: FusedMoEConfig,
router: FusedMoERouter,
routed_input_transform: torch.nn.Module | None,
gate: torch.nn.Module | None,
shared_experts: torch.nn.Module | None,
quant_method: FusedMoEMethodBase,
reduce_results: bool,
enable_dbo: bool,
):
super().__init__(
layer,
moe_config,
router,
routed_input_transform,
gate,
shared_experts,
quant_method,
reduce_results,
enable_dbo,
)
if self.shared_experts is None:
self.moe_forward = torch.ops.vllm.moe_forward
else:
self.moe_forward = torch.ops.vllm.moe_forward_shared
def forward_impl(
self,
layer: torch.nn.Module,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
shared_input: torch.Tensor | None,
):
"""
Override the default forward_impl to use Ascend-specific implementation.
This delegates to the layer's forward_impl method which contains the
Ascend-specific MoE computation logic.
"""
result = layer.forward_impl(hidden_states, router_logits)
# If the layer has shared experts, forward_impl returns a tuple (shared_out, routed_out)
# Otherwise, it returns just routed_out
# The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared
return result
class AscendFusedMoE(FusedMoE):
moe_counter = -1
gate_stream: torch.npu.Stream | None = None
@@ -237,6 +315,26 @@ class AscendFusedMoE(FusedMoE):
setup_moe_comm_method(self.moe_config)
self.quant_type = self._get_quant_type()
if not vllm_version_is("0.15.0"):
self.runner = self._init_runner()
if not vllm_version_is("0.15.0"):
def _init_runner(self):
# Storing the runner in the FusedMoE is an intermediate state, eventually
# the runner will own the FusedMoE layer and provide the execution interface
# for MoE ops.
return AscendMoERunner(
layer=self,
moe_config=self.moe_config,
router=self.router,
routed_input_transform=self._routed_input_transform,
gate=self.gate,
shared_experts=self.shared_experts,
quant_method=self.quant_method,
reduce_results=self.reduce_results,
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
)
def _get_quant_type(self) -> QuantType:
quant_type = QuantType.NONE
@@ -266,6 +364,19 @@ class AscendFusedMoE(FusedMoE):
"""
return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states)
if not vllm_version_is("0.15.0"):
def forward(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
self.ensure_moe_quant_config_init()
return self.runner.forward(
hidden_states,
router_logits,
)
def forward_impl( # type: ignore[override]
self, hidden_states: torch.Tensor, router_logits: torch.Tensor, return_with_event: bool = False
) -> torch.Tensor | FusedMoEResult:
@@ -414,6 +525,10 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.")
self._gate = gate
if not vllm_version_is("0.15.0"):
# Recreate the runner with the correct shared_experts parameter
# The parent class created the runner before self._shared_experts was set
self.runner = self._init_runner()
if self.multistream_overlap_shared_expert:
# Wrap the quant_method's process_weights_after_loading to validate that

View File

@@ -525,6 +525,13 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
"increase the number of supported shapes, set HCCL_OP_EXPANSION_MODE=AIV."
)
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.15.0"):
arch_name = vllm_config.model_config.architectures[0]
else:
arch_name = vllm_config.model_config.architecture
# If original sizes exceed maximum, sample a representative subset
if max_num_batch_sizes < len(original_sizes):
# Sample uniformly from original sizes
@@ -536,10 +543,9 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
sampled_sizes = [original_sizes[i] for i in indices]
update_cudagraph_capture_sizes(vllm_config, sampled_sizes)
logger.info(
"Adjusted ACL graph batch sizes for %s model (layers: %d): %d%d sizes",
vllm_config.model_config.architectures[0],
arch_name,
num_hidden_layers,
len(original_sizes),
len(
@@ -551,7 +557,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
compilation_config.cudagraph_capture_sizes = original_sizes
logger.info(
"No adjustment needed for ACL graph batch sizes: %s model (layers: %d) with %d sizes",
vllm_config.model_config.architectures[0],
arch_name,
num_hidden_layers,
len(original_sizes),
)