upgrade main to 0212 (#6712)
### What this PR does / why we need it?
Fixes the `transformers_utils/processors/__init__` import error introduced by
https://github.com/vllm-project/vllm/pull/33247
Fixes the Fused MoE breakage introduced by the `MoERunner` abstraction in
https://github.com/vllm-project/vllm/pull/32344 (the `vllm_version_is` gating used for this is sketched below)
> delete `AscendMoERunner` when
https://github.com/vllm-project/vllm/pull/35178 is merged
Fixes the breakage introduced by `Make Qwen3VL compatible with Transformers v5`,
https://github.com/vllm-project/vllm/pull/34262
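
All three fixes follow the same compatibility pattern: branch on the pinned vLLM version with `vllm_ascend.utils.vllm_version_is` and only import or define the new main-branch symbols (such as `DefaultMoERunner`) when they exist. Below is a minimal, self-contained sketch of that pattern, not code from this PR; `CURRENT_VLLM_VERSION` and the stub `DefaultMoERunner` are illustrative stand-ins so the snippet runs without vLLM installed.

```python
# Minimal sketch of the version-gating pattern used throughout this PR.
# CURRENT_VLLM_VERSION and the stub DefaultMoERunner are stand-ins; vllm-ascend
# uses vllm_ascend.utils.vllm_version_is for the real check.

CURRENT_VLLM_VERSION = "0.15.0"  # assumption: either a release tag or a main-branch commit


def vllm_version_is(target: str) -> bool:
    """Return True when the pinned vLLM matches the given release tag."""
    return CURRENT_VLLM_VERSION == target


if not vllm_version_is("0.15.0"):
    # Only vLLM main has the MoERunner abstraction; guard the import/definition
    # so the package still loads against the v0.15.0 release.
    class DefaultMoERunner:  # stand-in for the runner base class on vLLM main
        def forward(self, hidden_states, router_logits):
            raise NotImplementedError

    class AscendMoERunner(DefaultMoERunner):
        """Ascend-specific runner, defined only when the base class exists."""


def build_runner():
    """Mirror of how the layer only attaches a runner on vLLM main."""
    if vllm_version_is("0.15.0"):
        return None  # v0.15.0 keeps the pre-runner code path
    return AscendMoERunner()


if __name__ == "__main__":
    print("runner:", build_runner())
```

Flipping `CURRENT_VLLM_VERSION` to a main-branch commit string exercises the other path, which is effectively what the CI matrix (`vllm_version: [<commit>, v0.15.0]`) does in the workflow changes below.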
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: 9562912cea
---------
Signed-off-by: wxsIcey <1790571317@qq.com>
.github/workflows/bot_pr_create.yaml (vendored, 2 changed lines)
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Get vLLM version
        run: |
-          VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007
+          VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
          echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

      - name: Checkout repository
@@ -27,7 +27,7 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007
+ARG VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
    cd /vllm-workspace/vllm && \
    git checkout $VLLM_COMMIT

.github/workflows/pr_test_full.yaml (vendored, 2 changed lines)
@@ -75,7 +75,7 @@ jobs:
    name: e2e-full
    strategy:
      matrix:
-        vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
+        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
    needs: [changes]
    if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
    uses: ./.github/workflows/_e2e_test.yaml
.github/workflows/pr_test_light.yaml (vendored, 6 changed lines)
@@ -41,7 +41,7 @@ jobs:
  lint:
    uses: ./.github/workflows/_pre_commit.yml
    with:
-      vllm: 9562912cead1f11e8540fb91306c5cbda66f0007
+      vllm: 83b47f67b1dfad505606070ae4d9f83e50ad4ebd
  changes:
    runs-on: linux-aarch64-a2b3-0
    outputs:
@@ -87,7 +87,7 @@ jobs:
    if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
    strategy:
      matrix:
-        vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
+        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
    uses: ./.github/workflows/_unit_test.yaml
    with:
      vllm: ${{ matrix.vllm_version }}
@@ -99,7 +99,7 @@ jobs:
    name: e2e-light
    strategy:
      matrix:
-        vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
+        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
    # Note (yikun): If CI resource are limited we can split job into two chain jobs
    needs: [lint, changes]
    # only trigger e2e test after lint passed and the change is e2e related with pull request.
@@ -33,7 +33,7 @@ jobs:
    name: refresh codecov
    strategy:
      matrix:
-        vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007]
+        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd]
    uses: ./.github/workflows/_unit_test.yaml
    with:
      vllm: ${{ matrix.vllm_version }}
@@ -56,7 +56,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
-| main | 9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | 83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

## Release cadence

@@ -13,6 +13,7 @@ setuptools>=64
setuptools-scm>=8
torch==2.9.0
torchvision
+torchaudio
wheel
xgrammar>=0.1.30
pandas-stubs
@@ -25,22 +25,35 @@ class TestAscendConfig(unittest.TestCase):
        if vllm_version_is("0.15.0"):
            moe_parallel_config = FusedMoEParallelConfig(
                2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
            moe_config = FusedMoEConfig(
                num_experts=8,
                experts_per_token=8,
                hidden_dim=8192,
                intermediate_size_per_partition=5,
                num_local_experts=8,
                activation="silu",
                device="npu",
                routing_method=RoutingMethodType.Simulated,
                moe_parallel_config=moe_parallel_config,
                in_dtype=torch.float16,
            )
        else:
            moe_parallel_config = FusedMoEParallelConfig(
                2, 0, 1, 2, 1, 1, 1, 1, True, "hccl",
                is_sequence_parallel=False, enable_eplb=True)
            moe_config = FusedMoEConfig(
                num_experts=8,
                experts_per_token=8,
                hidden_dim=8192,
                intermediate_size_per_partition=5,
                num_local_experts=8,
                activation="silu",
                device="npu",
                routing_method=RoutingMethodType.Simulated,
                moe_parallel_config=moe_parallel_config,
                in_dtype=torch.float16,
            )
                2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
                enable_eplb=True)
            moe_config = FusedMoEConfig(
                num_experts=8,
                experts_per_token=8,
                hidden_dim=8192,
                intermediate_size_per_partition=5,
                num_local_experts=8,
                num_logical_experts=8,
                activation="silu",
                device="npu",
                routing_method=RoutingMethodType.Simulated,
                moe_parallel_config=moe_parallel_config,
                in_dtype=torch.float16,
            )
            moe_config.supports_eplb = True
        self.vllm_config = vllm_config
        self.moe_config = moe_config
@@ -236,22 +236,22 @@ class NPUModelRunner310(NPUModelRunner):
            prev_draft_token_indices.extend(range(start, start + draft_len))
            indices_match &= prev_index == flattened_index
            max_flattened_index = max(max_flattened_index, flattened_index)
-        num_commmon_tokens = len(sample_flattened_indices)
+        num_common_tokens = len(sample_flattened_indices)
        total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens
-        if num_commmon_tokens < total_without_spec:
+        if num_common_tokens < total_without_spec:
            self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
            if self.enable_prompt_embeds:
                self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
                self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
-        if num_commmon_tokens == 0:
+        if num_common_tokens == 0:
            return
-        if indices_match and max_flattened_index == (num_commmon_tokens - 1):
+        if indices_match and max_flattened_index == (num_common_tokens - 1):
            # NOTE: Override the copy_ function here
-            indices = torch.arange(num_commmon_tokens, device=self.input_ids.gpu.device)
-            source = self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0]
+            indices = torch.arange(num_common_tokens, device=self.input_ids.gpu.device)
+            source = self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0]
            self.input_ids.gpu.index_copy_(0, indices, source)
            if self.enable_prompt_embeds:
-                self.is_token_ids.gpu[:num_commmon_tokens] = True
+                self.is_token_ids.gpu[:num_common_tokens] = True
            return
        # Upload the index tensors asynchronously so the scatter can be non-blocking.
        sampled_tokens_index_tensor = torch.tensor(
@@ -28,6 +28,13 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE

+from vllm_ascend.utils import vllm_version_is
+
+if not vllm_version_is("0.15.0"):
+    from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase  # type: ignore
+    from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter  # type: ignore
+    from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner  # type: ignore
+
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import MoECommType
from vllm_ascend.distributed.parallel_state import get_mc2_group
@@ -154,6 +161,77 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
        return final_hidden_states


+if not vllm_version_is("0.15.0"):
+    # Please remove this inheritance after extending vllm, todo(wxs)
+    class AscendMoERunner(DefaultMoERunner):
+        """
+        Default implementation of the MoE runner for executing Mixture of Experts layers.
+
+        This class provides a comprehensive implementation for running MoE computations
+        with support for:
+        - Expert routing and token dispatching
+        - Shared experts computation with optional parallel execution using CUDA streams
+        - Data parallel (DP) chunking for large batch processing
+        - Tensor model parallel and expert parallel operations
+        - Various quantization methods and custom operators
+        - Both monolithic and decomposed expert execution paths
+
+        The runner handles the complete MoE forward pass including routing tokens to
+        experts, executing expert computations, and combining results. It supports
+        advanced features like overlapped execution of shared experts and optimized
+        kernels for different parallel execution modes.
+
+        Eventually, this class will be split up and specialized for different
+        configurations, e.g. the presence or absence of shared experts, a gate, etc.
+        """
+
+        def __init__(
+            self,
+            layer: torch.nn.Module,
+            moe_config: FusedMoEConfig,
+            router: FusedMoERouter,
+            routed_input_transform: torch.nn.Module | None,
+            gate: torch.nn.Module | None,
+            shared_experts: torch.nn.Module | None,
+            quant_method: FusedMoEMethodBase,
+            reduce_results: bool,
+            enable_dbo: bool,
+        ):
+            super().__init__(
+                layer,
+                moe_config,
+                router,
+                routed_input_transform,
+                gate,
+                shared_experts,
+                quant_method,
+                reduce_results,
+                enable_dbo,
+            )
+            if self.shared_experts is None:
+                self.moe_forward = torch.ops.vllm.moe_forward
+            else:
+                self.moe_forward = torch.ops.vllm.moe_forward_shared
+
+        def forward_impl(
+            self,
+            layer: torch.nn.Module,
+            hidden_states: torch.Tensor,
+            router_logits: torch.Tensor,
+            shared_input: torch.Tensor | None,
+        ):
+            """
+            Override the default forward_impl to use Ascend-specific implementation.
+            This delegates to the layer's forward_impl method which contains the
+            Ascend-specific MoE computation logic.
+            """
+            result = layer.forward_impl(hidden_states, router_logits)
+            # If the layer has shared experts, forward_impl returns a tuple (shared_out, routed_out)
+            # Otherwise, it returns just routed_out
+            # The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared
+            return result
+
+
class AscendFusedMoE(FusedMoE):
    moe_counter = -1
    gate_stream: torch.npu.Stream | None = None
@@ -237,6 +315,26 @@ class AscendFusedMoE(FusedMoE):
        setup_moe_comm_method(self.moe_config)
        self.quant_type = self._get_quant_type()
+        if not vllm_version_is("0.15.0"):
+            self.runner = self._init_runner()

+    if not vllm_version_is("0.15.0"):
+
+        def _init_runner(self):
+            # Storing the runner in the FusedMoE is an intermediate state, eventually
+            # the runner will own the FusedMoE layer and provide the execution interface
+            # for MoE ops.
+            return AscendMoERunner(
+                layer=self,
+                moe_config=self.moe_config,
+                router=self.router,
+                routed_input_transform=self._routed_input_transform,
+                gate=self.gate,
+                shared_experts=self.shared_experts,
+                quant_method=self.quant_method,
+                reduce_results=self.reduce_results,
+                enable_dbo=self.vllm_config.parallel_config.enable_dbo,
+            )

    def _get_quant_type(self) -> QuantType:
        quant_type = QuantType.NONE
@@ -266,6 +364,19 @@ class AscendFusedMoE(FusedMoE):
        """
        return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states)

+    if not vllm_version_is("0.15.0"):
+
+        def forward(
+            self,
+            hidden_states: torch.Tensor,
+            router_logits: torch.Tensor,
+        ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+            self.ensure_moe_quant_config_init()
+            return self.runner.forward(
+                hidden_states,
+                router_logits,
+            )

    def forward_impl(  # type: ignore[override]
        self, hidden_states: torch.Tensor, router_logits: torch.Tensor, return_with_event: bool = False
    ) -> torch.Tensor | FusedMoEResult:
@@ -414,6 +525,10 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
            logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.")

        self._gate = gate
+        if not vllm_version_is("0.15.0"):
+            # Recreate the runner with the correct shared_experts parameter
+            # The parent class created the runner before self._shared_experts was set
+            self.runner = self._init_runner()

        if self.multistream_overlap_shared_expert:
            # Wrap the quant_method's process_weights_after_loading to validate that
@@ -525,6 +525,13 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
            "increase the number of supported shapes, set HCCL_OP_EXPANSION_MODE=AIV."
        )

+    from vllm_ascend.utils import vllm_version_is
+
+    if vllm_version_is("0.15.0"):
+        arch_name = vllm_config.model_config.architectures[0]
+    else:
+        arch_name = vllm_config.model_config.architecture
+
    # If original sizes exceed maximum, sample a representative subset
    if max_num_batch_sizes < len(original_sizes):
        # Sample uniformly from original sizes
@@ -536,10 +543,9 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:

        sampled_sizes = [original_sizes[i] for i in indices]
        update_cudagraph_capture_sizes(vllm_config, sampled_sizes)

        logger.info(
            "Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes",
-            vllm_config.model_config.architectures[0],
+            arch_name,
            num_hidden_layers,
            len(original_sizes),
            len(
@@ -551,7 +557,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
        compilation_config.cudagraph_capture_sizes = original_sizes
        logger.info(
            "No adjustment needed for ACL graph batch sizes: %s model (layers: %d) with %d sizes",
-            vllm_config.model_config.architectures[0],
+            arch_name,
            num_hidden_layers,
            len(original_sizes),
        )