upgrade main to 0212 (#6712)

### What this PR does / why we need it?
Fixes `transformers_utils/processors/__init__` import error, due to
https://github.com/vllm-project/vllm/pull/33247
Fixes the Fused MoE break introduced by the `MoERunner abstraction` change, due to
https://github.com/vllm-project/vllm/pull/32344

> Delete `AscendMoERunner` when
https://github.com/vllm-project/vllm/pull/35178 is merged

Fixes the break introduced by `Make Qwen3VL compatible with Transformers v5`, due to
https://github.com/vllm-project/vllm/pull/34262

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main:
9562912cea

---------

Signed-off-by: wxsIcey <1790571317@qq.com>
This commit is contained in:
Icey
2026-02-25 09:17:29 +08:00
committed by GitHub
parent 0331f16a50
commit ee59429015
11 changed files with 167 additions and 32 deletions

View File

@@ -37,7 +37,7 @@ jobs:
steps: steps:
- name: Get vLLM version - name: Get vLLM version
run: | run: |
VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007 VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV" echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
- name: Checkout repository - name: Checkout repository

View File

@@ -27,7 +27,7 @@ RUN apt-get update -y && \
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purpose, actually we need make a main2main matching. # For lint purpose, actually we need make a main2main matching.
ARG VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007 ARG VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \ RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
cd /vllm-workspace/vllm && \ cd /vllm-workspace/vllm && \
git checkout $VLLM_COMMIT git checkout $VLLM_COMMIT

View File

@@ -75,7 +75,7 @@ jobs:
name: e2e-full name: e2e-full
strategy: strategy:
matrix: matrix:
vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0] vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -41,7 +41,7 @@ jobs:
lint: lint:
uses: ./.github/workflows/_pre_commit.yml uses: ./.github/workflows/_pre_commit.yml
with: with:
vllm: 9562912cead1f11e8540fb91306c5cbda66f0007 vllm: 83b47f67b1dfad505606070ae4d9f83e50ad4ebd
changes: changes:
runs-on: linux-aarch64-a2b3-0 runs-on: linux-aarch64-a2b3-0
outputs: outputs:
@@ -87,7 +87,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy: strategy:
matrix: matrix:
vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0] vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
uses: ./.github/workflows/_unit_test.yaml uses: ./.github/workflows/_unit_test.yaml
with: with:
vllm: ${{ matrix.vllm_version }} vllm: ${{ matrix.vllm_version }}
@@ -99,7 +99,7 @@ jobs:
name: e2e-light name: e2e-light
strategy: strategy:
matrix: matrix:
vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0] vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs # Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes] needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request. # only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -33,7 +33,7 @@ jobs:
name: refresh codecov name: refresh codecov
strategy: strategy:
matrix: matrix:
vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007] vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd]
uses: ./.github/workflows/_unit_test.yaml uses: ./.github/workflows/_unit_test.yaml
with: with:
vllm: ${{ matrix.vllm_version }} vllm: ${{ matrix.vllm_version }}

View File

@@ -56,7 +56,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------| |-------------|--------------|------------------|-------------|--------------------|
| main | 9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | | main | 83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
## Release cadence ## Release cadence

View File

@@ -13,6 +13,7 @@ setuptools>=64
setuptools-scm>=8 setuptools-scm>=8
torch==2.9.0 torch==2.9.0
torchvision torchvision
torchaudio
wheel wheel
xgrammar>=0.1.30 xgrammar>=0.1.30
pandas-stubs pandas-stubs

View File

@@ -25,10 +25,6 @@ class TestAscendConfig(unittest.TestCase):
if vllm_version_is("0.15.0"): if vllm_version_is("0.15.0"):
moe_parallel_config = FusedMoEParallelConfig( moe_parallel_config = FusedMoEParallelConfig(
2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True) 2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
else:
moe_parallel_config = FusedMoEParallelConfig(
2, 0, 1, 2, 1, 1, 1, 1, True, "hccl",
is_sequence_parallel=False, enable_eplb=True)
moe_config = FusedMoEConfig( moe_config = FusedMoEConfig(
num_experts=8, num_experts=8,
experts_per_token=8, experts_per_token=8,
@@ -41,6 +37,23 @@ class TestAscendConfig(unittest.TestCase):
moe_parallel_config=moe_parallel_config, moe_parallel_config=moe_parallel_config,
in_dtype=torch.float16, in_dtype=torch.float16,
) )
else:
moe_parallel_config = FusedMoEParallelConfig(
2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
enable_eplb=True)
moe_config = FusedMoEConfig(
num_experts=8,
experts_per_token=8,
hidden_dim=8192,
intermediate_size_per_partition=5,
num_local_experts=8,
num_logical_experts=8,
activation="silu",
device="npu",
routing_method=RoutingMethodType.Simulated,
moe_parallel_config=moe_parallel_config,
in_dtype=torch.float16,
)
moe_config.supports_eplb = True moe_config.supports_eplb = True
self.vllm_config = vllm_config self.vllm_config = vllm_config
self.moe_config = moe_config self.moe_config = moe_config

View File

@@ -236,22 +236,22 @@ class NPUModelRunner310(NPUModelRunner):
prev_draft_token_indices.extend(range(start, start + draft_len)) prev_draft_token_indices.extend(range(start, start + draft_len))
indices_match &= prev_index == flattened_index indices_match &= prev_index == flattened_index
max_flattened_index = max(max_flattened_index, flattened_index) max_flattened_index = max(max_flattened_index, flattened_index)
num_commmon_tokens = len(sample_flattened_indices) num_common_tokens = len(sample_flattened_indices)
total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens
if num_commmon_tokens < total_without_spec: if num_common_tokens < total_without_spec:
self.input_ids.copy_to_gpu(total_num_scheduled_tokens) self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
if self.enable_prompt_embeds: if self.enable_prompt_embeds:
self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens) self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens) self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
if num_commmon_tokens == 0: if num_common_tokens == 0:
return return
if indices_match and max_flattened_index == (num_commmon_tokens - 1): if indices_match and max_flattened_index == (num_common_tokens - 1):
# NOTE: Override the copy_ function here # NOTE: Override the copy_ function here
indices = torch.arange(num_commmon_tokens, device=self.input_ids.gpu.device) indices = torch.arange(num_common_tokens, device=self.input_ids.gpu.device)
source = self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0] source = self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0]
self.input_ids.gpu.index_copy_(0, indices, source) self.input_ids.gpu.index_copy_(0, indices, source)
if self.enable_prompt_embeds: if self.enable_prompt_embeds:
self.is_token_ids.gpu[:num_commmon_tokens] = True self.is_token_ids.gpu[:num_common_tokens] = True
return return
# Upload the index tensors asynchronously so the scatter can be non-blocking. # Upload the index tensors asynchronously so the scatter can be non-blocking.
sampled_tokens_index_tensor = torch.tensor( sampled_tokens_index_tensor = torch.tensor(

View File

@@ -28,6 +28,13 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
from vllm_ascend.utils import vllm_version_is
if not vllm_version_is("0.15.0"):
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase # type: ignore
from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter # type: ignore
from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore
from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import MoECommType from vllm_ascend.ascend_forward_context import MoECommType
from vllm_ascend.distributed.parallel_state import get_mc2_group from vllm_ascend.distributed.parallel_state import get_mc2_group
@@ -154,6 +161,77 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
return final_hidden_states return final_hidden_states
if not vllm_version_is("0.15.0"):
    # TODO(wxs): remove this inheritance once vLLM has been extended.
    class AscendMoERunner(DefaultMoERunner):
        """MoE runner that hands the forward pass back to the Ascend layer.

        This subclass changes two things relative to ``DefaultMoERunner`` and
        inherits everything else:

        - after the parent constructor has run, ``self.moe_forward`` is bound
          to ``torch.ops.vllm.moe_forward`` when the runner has no shared
          experts, and to ``torch.ops.vllm.moe_forward_shared`` otherwise;
        - ``forward_impl`` delegates to ``layer.forward_impl`` instead of
          running the parent's implementation.

        The class is only defined when the installed vLLM is not 0.15.0,
        matching the guarded import of ``DefaultMoERunner``.
        """

        def __init__(
            self,
            layer: torch.nn.Module,
            moe_config: FusedMoEConfig,
            router: FusedMoERouter,
            routed_input_transform: torch.nn.Module | None,
            gate: torch.nn.Module | None,
            shared_experts: torch.nn.Module | None,
            quant_method: FusedMoEMethodBase,
            reduce_results: bool,
            enable_dbo: bool,
        ):
            """Initialise the parent runner, then select the forward op.

            All arguments are passed through positionally and unchanged to
            ``DefaultMoERunner.__init__``.
            """
            super().__init__(
                layer,
                moe_config,
                router,
                routed_input_transform,
                gate,
                shared_experts,
                quant_method,
                reduce_results,
                enable_dbo,
            )
            # Pick the op variant by whether shared experts are present; only
            # the selected attribute of torch.ops.vllm is looked up.
            self.moe_forward = (
                torch.ops.vllm.moe_forward
                if self.shared_experts is None
                else torch.ops.vllm.moe_forward_shared
            )

        def forward_impl(
            self,
            layer: torch.nn.Module,
            hidden_states: torch.Tensor,
            router_logits: torch.Tensor,
            shared_input: torch.Tensor | None,
        ):
            """Run the Ascend-specific MoE computation owned by ``layer``.

            Only ``hidden_states`` and ``router_logits`` are forwarded to
            ``layer.forward_impl``; ``shared_input`` is accepted for signature
            compatibility and is not used here.

            Returns:
                Whatever ``layer.forward_impl`` returns, unmodified. Per the
                original author's note this is a ``(shared_out, routed_out)``
                tuple when the layer has shared experts and just
                ``routed_out`` otherwise, which is the return shape the
                selected ``moe_forward`` / ``moe_forward_shared`` op expects.
            """
            return layer.forward_impl(hidden_states, router_logits)
class AscendFusedMoE(FusedMoE): class AscendFusedMoE(FusedMoE):
moe_counter = -1 moe_counter = -1
gate_stream: torch.npu.Stream | None = None gate_stream: torch.npu.Stream | None = None
@@ -237,6 +315,26 @@ class AscendFusedMoE(FusedMoE):
setup_moe_comm_method(self.moe_config) setup_moe_comm_method(self.moe_config)
self.quant_type = self._get_quant_type() self.quant_type = self._get_quant_type()
if not vllm_version_is("0.15.0"):
self.runner = self._init_runner()
if not vllm_version_is("0.15.0"):

    def _init_runner(self):
        """Create the ``AscendMoERunner`` bound to this layer.

        Keeping the runner on the FusedMoE layer is an intermediate state:
        eventually the runner will own the FusedMoE layer and provide the
        execution interface for MoE ops.
        """
        # Collected in the same order as the original call so attribute
        # reads happen in the same sequence.
        runner_kwargs = dict(
            layer=self,
            moe_config=self.moe_config,
            router=self.router,
            routed_input_transform=self._routed_input_transform,
            gate=self.gate,
            shared_experts=self.shared_experts,
            quant_method=self.quant_method,
            reduce_results=self.reduce_results,
            enable_dbo=self.vllm_config.parallel_config.enable_dbo,
        )
        return AscendMoERunner(**runner_kwargs)
def _get_quant_type(self) -> QuantType: def _get_quant_type(self) -> QuantType:
quant_type = QuantType.NONE quant_type = QuantType.NONE
@@ -266,6 +364,19 @@ class AscendFusedMoE(FusedMoE):
""" """
return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states) return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states)
if not vllm_version_is("0.15.0"):

    def forward(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Forward the MoE layer through its runner.

        Calls ``self.ensure_moe_quant_config_init()`` first, then returns
        the result of ``self.runner.forward(hidden_states, router_logits)``
        unchanged.
        """
        # The quant-config initialisation must happen before the runner
        # is invoked, as in the original statement order.
        self.ensure_moe_quant_config_init()
        runner_output = self.runner.forward(hidden_states, router_logits)
        return runner_output
def forward_impl( # type: ignore[override] def forward_impl( # type: ignore[override]
self, hidden_states: torch.Tensor, router_logits: torch.Tensor, return_with_event: bool = False self, hidden_states: torch.Tensor, router_logits: torch.Tensor, return_with_event: bool = False
) -> torch.Tensor | FusedMoEResult: ) -> torch.Tensor | FusedMoEResult:
@@ -414,6 +525,10 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.") logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.")
self._gate = gate self._gate = gate
if not vllm_version_is("0.15.0"):
# Recreate the runner with the correct shared_experts parameter
# The parent class created the runner before self._shared_experts was set
self.runner = self._init_runner()
if self.multistream_overlap_shared_expert: if self.multistream_overlap_shared_expert:
# Wrap the quant_method's process_weights_after_loading to validate that # Wrap the quant_method's process_weights_after_loading to validate that

View File

@@ -525,6 +525,13 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
"increase the number of supported shapes, set HCCL_OP_EXPANSION_MODE=AIV." "increase the number of supported shapes, set HCCL_OP_EXPANSION_MODE=AIV."
) )
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.15.0"):
arch_name = vllm_config.model_config.architectures[0]
else:
arch_name = vllm_config.model_config.architecture
# If original sizes exceed maximum, sample a representative subset # If original sizes exceed maximum, sample a representative subset
if max_num_batch_sizes < len(original_sizes): if max_num_batch_sizes < len(original_sizes):
# Sample uniformly from original sizes # Sample uniformly from original sizes
@@ -536,10 +543,9 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
sampled_sizes = [original_sizes[i] for i in indices] sampled_sizes = [original_sizes[i] for i in indices]
update_cudagraph_capture_sizes(vllm_config, sampled_sizes) update_cudagraph_capture_sizes(vllm_config, sampled_sizes)
logger.info( logger.info(
"Adjusted ACL graph batch sizes for %s model (layers: %d): %d%d sizes", "Adjusted ACL graph batch sizes for %s model (layers: %d): %d%d sizes",
vllm_config.model_config.architectures[0], arch_name,
num_hidden_layers, num_hidden_layers,
len(original_sizes), len(original_sizes),
len( len(
@@ -551,7 +557,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
compilation_config.cudagraph_capture_sizes = original_sizes compilation_config.cudagraph_capture_sizes = original_sizes
logger.info( logger.info(
"No adjustment needed for ACL graph batch sizes: %s model (layers: %d) with %d sizes", "No adjustment needed for ACL graph batch sizes: %s model (layers: %d) with %d sizes",
vllm_config.model_config.architectures[0], arch_name,
num_hidden_layers, num_hidden_layers,
len(original_sizes), len(original_sizes),
) )