[Version] Drop 0.16.0 support (#7153)

### What this PR does / why we need it?
Drop 0.16.0 support in main
- Fix the eagle proposer breakage introduced by
https://github.com/vllm-project/vllm/pull/34552. The main change is to use
the draft attention group to initialize the attention metadata builder (see the first sketch below).
- Fix the `ModelRunner` has no attribute `cudagraph_capture_sizes`
error, which is a bug in vLLM v0.17.0 that was fixed by a later PR,
https://github.com/vllm-project/vllm/pull/30515 (see the second sketch below).
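
As a quick illustration of the first fix, here is a minimal sketch of the new builder selection. `attn_group.get_metadata_builder()` and the proposer attributes mirror the spec-decode proposer diff further down; the standalone helper and its name are only illustrative, not the actual vllm-ascend API.

```python
# Minimal sketch, not the actual vllm-ascend code: it condenses the builder
# selection added in the spec-decode proposer diff below.
def resolve_metadata_builder(proposer, attn_group=None):
    if attn_group is not None:
        # vLLM v0.17.0: each draft attention group owns its metadata builder,
        # so take it from the group instead of caching one on the proposer.
        return attn_group.get_metadata_builder()
    # vLLM main: fall back to the proposer-level builder.
    if proposer.attn_metadata_builder is None:
        return proposer._get_attention_metadata_builder()
    return proposer.attn_metadata_builder
```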
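
Similarly, a hedged sketch of the `cudagraph_capture_sizes` workaround: the real logic sits in `NPUModelRunner.__init__` (see the model-runner diff below) and reads vLLM's compilation config; the free function here is only for illustration.

```python
# Minimal sketch, not the actual vllm-ascend code: vLLM v0.17.0 no longer
# populates ModelRunner.cudagraph_capture_sizes, so the runner derives the
# batch sizes from the compilation config itself (see the diff below).
def derive_cudagraph_batch_sizes(capture_sizes, graphs_enabled):
    if capture_sizes and graphs_enabled:
        # The dispatcher expects the capture sizes in ascending order.
        return sorted(capture_sizes)
    return []
```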

- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
Mengqing Cao
2026-03-13 16:14:15 +08:00
committed by GitHub
parent 7ed9e9de69
commit 986cd45397
20 changed files with 255 additions and 268 deletions

View File

@@ -32,7 +32,7 @@ on:
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
vllm_version:
required: false
default: "v0.16.0"
default: "v0.17.0"
type: string
description: vllm version to use
vllm_ascend_remote_url:

View File

@@ -39,7 +39,7 @@ on:
vllm_version:
required: false
type: string
default: "v0.16.0"
default: "v0.17.0"
is_pr_test:
required: true
type: boolean

View File

@@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.16.0]
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -90,7 +90,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.16.0]
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.16.0]
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -276,7 +276,7 @@ jobs:
- Qwen3-Omni-30B-A3B-Instruct
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
with:
vllm: v0.16.0
vllm: v0.17.0
runner: ${{ matrix.test_config.os }}
model_list: ${{ toJson(matrix.test_config.model_list) }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11'

View File

@@ -51,7 +51,7 @@ jobs:
strategy:
matrix:
include:
- vllm_branch: v0.16.0
- vllm_branch: v0.17.0
vllm_ascend_branch: main
max-parallel: 1
container:

View File

@@ -50,7 +50,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -40,7 +40,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -36,7 +36,7 @@ COPY . /vllm-workspace/vllm-ascend/
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -49,7 +49,7 @@ RUN apt-get update -y && \
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -50,7 +50,7 @@ RUN yum update -y && \
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -50,7 +50,7 @@ RUN yum update -y && \
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -75,9 +75,9 @@ myst_substitutions = {
"pip_vllm_ascend_version": "0.16.0rc1",
"pip_vllm_version": "0.16.0",
# CANN image tag
"cann_image_tag": "8.5.0-910b-ubuntu22.04-py3.11",
"cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
# vllm version in ci
"ci_vllm_version": "v0.16.0",
"ci_vllm_version": "v0.17.0",
}
# For cross-file header anchors

View File

@@ -1,4 +1,5 @@
from unittest.mock import MagicMock, patch
import unittest
import numpy as np
import torch
@@ -137,7 +138,7 @@ class TestEagleProposerInitialization(TestBase):
expected_max_num_tokens = proposer.max_num_tokens
self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048))
@unittest.skip("Skip due to the changes in #7153, fix me later")
class TestEagleProposerLoadModel(TestBase):
def setUp(self):
self.vllm_config = MagicMock(spec=VllmConfig)

View File

@@ -26,7 +26,6 @@ from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType
from vllm_ascend.ops.fused_moe.experts_selector import zero_experts_compute
from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods
from vllm_ascend.quantization.methods.base import QuantType
from vllm_ascend.utils import vllm_version_is
from .experts_selector import select_experts
from .moe_comm_method import AllGatherCommImpl310
@@ -152,25 +151,22 @@ class AscendFusedMoE310(FusedMoE):
self.quant_type = self.get_quant_type()
_MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config)
if not vllm_version_is("0.16.0"):
self.runner = self._init_runner()
self.runner = self._init_runner()
if not vllm_version_is("0.16.0"):
def _init_runner(self):
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
def _init_runner(self):
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
return AscendMoERunner(
layer=self,
moe_config=self.moe_config,
router=self.router,
routed_input_transform=self._routed_input_transform,
gate=self.gate,
shared_experts=self.shared_experts,
quant_method=self.quant_method,
reduce_results=self.reduce_results,
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
)
return AscendMoERunner(
layer=self,
moe_config=self.moe_config,
router=self.router,
routed_input_transform=self._routed_input_transform,
gate=self.gate,
shared_experts=self.shared_experts,
quant_method=self.quant_method,
reduce_results=self.reduce_results,
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
)
def init_experts_map(self, moe_config):
"""

View File

@@ -25,17 +25,13 @@ from vllm.distributed import get_dp_group, get_ep_group, get_tp_group, tensor_mo
from vllm.forward_context import get_forward_context
from vllm.logger import logger
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase # type: ignore
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer
from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter # type: ignore
from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore
from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
from vllm_ascend.utils import vllm_version_is
if not vllm_version_is("0.16.0"):
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase # type: ignore
from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter # type: ignore
from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType
from vllm_ascend.distributed.parallel_state import get_mc2_group
@@ -50,7 +46,6 @@ from vllm_ascend.utils import (
npu_stream_switch,
shared_expert_dp_enabled,
shared_experts_calculation_stream,
vllm_version_is,
)
@@ -169,75 +164,74 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
return final_hidden_states
if not vllm_version_is("0.16.0"):
# Please remove this inheritance after extending vllm, todo(wxs)
class AscendMoERunner(DefaultMoERunner):
# Please remove this inheritance after extending vllm, todo(wxs)
class AscendMoERunner(DefaultMoERunner):
"""
Default implementation of the MoE runner for executing Mixture of Experts layers.
This class provides a comprehensive implementation for running MoE computations
with support for:
- Expert routing and token dispatching
- Shared experts computation with optional parallel execution using CUDA streams
- Data parallel (DP) chunking for large batch processing
- Tensor model parallel and expert parallel operations
- Various quantization methods and custom operators
- Both monolithic and decomposed expert execution paths
The runner handles the complete MoE forward pass including routing tokens to
experts, executing expert computations, and combining results. It supports
advanced features like overlapped execution of shared experts and optimized
kernels for different parallel execution modes.
Eventually, this class will be split up and specialized for different
configurations, e.g. the presence or absence of shared experts, a gate, etc.
"""
def __init__(
self,
layer: torch.nn.Module,
moe_config: FusedMoEConfig,
router: FusedMoERouter,
routed_input_transform: torch.nn.Module | None,
gate: torch.nn.Module | None,
shared_experts: torch.nn.Module | None,
quant_method: FusedMoEMethodBase,
reduce_results: bool,
enable_dbo: bool,
):
super().__init__(
layer,
moe_config,
router,
routed_input_transform,
gate,
shared_experts,
quant_method,
reduce_results,
enable_dbo,
)
if self.shared_experts is None:
self.moe_forward = torch.ops.vllm.moe_forward
else:
self.moe_forward = torch.ops.vllm.moe_forward_shared
def forward_impl(
self,
layer: torch.nn.Module,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
shared_input: torch.Tensor | None,
):
"""
Default implementation of the MoE runner for executing Mixture of Experts layers.
This class provides a comprehensive implementation for running MoE computations
with support for:
- Expert routing and token dispatching
- Shared experts computation with optional parallel execution using CUDA streams
- Data parallel (DP) chunking for large batch processing
- Tensor model parallel and expert parallel operations
- Various quantization methods and custom operators
- Both monolithic and decomposed expert execution paths
The runner handles the complete MoE forward pass including routing tokens to
experts, executing expert computations, and combining results. It supports
advanced features like overlapped execution of shared experts and optimized
kernels for different parallel execution modes.
Eventually, this class will be split up and specialized for different
configurations, e.g. the presence or absence of shared experts, a gate, etc.
Override the default forward_impl to use Ascend-specific implementation.
This delegates to the layer's forward_impl method which contains the
Ascend-specific MoE computation logic.
"""
def __init__(
self,
layer: torch.nn.Module,
moe_config: FusedMoEConfig,
router: FusedMoERouter,
routed_input_transform: torch.nn.Module | None,
gate: torch.nn.Module | None,
shared_experts: torch.nn.Module | None,
quant_method: FusedMoEMethodBase,
reduce_results: bool,
enable_dbo: bool,
):
super().__init__(
layer,
moe_config,
router,
routed_input_transform,
gate,
shared_experts,
quant_method,
reduce_results,
enable_dbo,
)
if self.shared_experts is None:
self.moe_forward = torch.ops.vllm.moe_forward
else:
self.moe_forward = torch.ops.vllm.moe_forward_shared
def forward_impl(
self,
layer: torch.nn.Module,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
shared_input: torch.Tensor | None,
):
"""
Override the default forward_impl to use Ascend-specific implementation.
This delegates to the layer's forward_impl method which contains the
Ascend-specific MoE computation logic.
"""
result = layer.forward_impl(hidden_states, router_logits)
# If the layer has shared experts, forward_impl returns a tuple (shared_out, routed_out)
# Otherwise, it returns just routed_out
# The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared
return result
result = layer.forward_impl(hidden_states, router_logits)
# If the layer has shared experts, forward_impl returns a tuple (shared_out, routed_out)
# Otherwise, it returns just routed_out
# The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared
return result
class AscendFusedMoE(FusedMoE):
@@ -328,26 +322,23 @@ class AscendFusedMoE(FusedMoE):
setup_moe_comm_method(self.moe_config)
self.quant_type = self._get_quant_type()
if not vllm_version_is("0.16.0"):
self.runner = self._init_runner()
self.runner = self._init_runner()
if not vllm_version_is("0.16.0"):
def _init_runner(self):
# Storing the runner in the FusedMoE is an intermediate state, eventually
# the runner will own the FusedMoE layer and provide the execution interface
# for MoE ops.
return AscendMoERunner(
layer=self,
moe_config=self.moe_config,
router=self.router,
routed_input_transform=self._routed_input_transform,
gate=self.gate,
shared_experts=self.shared_experts,
quant_method=self.quant_method,
reduce_results=self.reduce_results,
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
)
def _init_runner(self):
# Storing the runner in the FusedMoE is an intermediate state, eventually
# the runner will own the FusedMoE layer and provide the execution interface
# for MoE ops.
return AscendMoERunner(
layer=self,
moe_config=self.moe_config,
router=self.router,
routed_input_transform=self._routed_input_transform,
gate=self.gate,
shared_experts=self.shared_experts,
quant_method=self.quant_method,
reduce_results=self.reduce_results,
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
)
def _get_quant_type(self) -> QuantType:
quant_type = QuantType.NONE
@@ -379,18 +370,16 @@ class AscendFusedMoE(FusedMoE):
"""
return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states)
if not vllm_version_is("0.16.0"):
def forward(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
self.ensure_moe_quant_config_init()
return self.runner.forward(
hidden_states,
router_logits,
)
def forward(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
self.ensure_moe_quant_config_init()
return self.runner.forward(
hidden_states,
router_logits,
)
def forward_impl( # type: ignore[override]
self, hidden_states: torch.Tensor, router_logits: torch.Tensor, return_with_event: bool = False
@@ -551,10 +540,9 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.")
self._gate = gate
if not vllm_version_is("0.16.0"):
# Recreate the runner with the correct shared_experts parameter
# The parent class created the runner before self._shared_experts was set
self.runner = self._init_runner()
# Recreate the runner with the correct shared_experts parameter
# The parent class created the runner before self._shared_experts was set
self.runner = self._init_runner()
if self.multistream_overlap_shared_expert:
# Wrap the quant_method's process_weights_after_loading to validate that

View File

@@ -17,13 +17,9 @@
from vllm.triton_utils import HAS_TRITON
from vllm_ascend.utils import vllm_version_is
if HAS_TRITON:
import vllm_ascend.patch.worker.patch_triton
if not vllm_version_is("v0.16.0"):
import vllm_ascend.patch.worker.patch_qwen3_5 # noqa
# isort: off
import vllm_ascend.patch.platform.patch_sched_yield # noqa
@@ -35,6 +31,7 @@ import vllm_ascend.patch.worker.patch_minimax_m2_linear_attn # noqa
import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
import vllm_ascend.patch.worker.patch_qwen3_next # noqa
import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa
import vllm_ascend.patch.worker.patch_qwen3_5 # noqa
import vllm_ascend.patch.worker.patch_rejection_sampler # noqa
import vllm_ascend.patch.worker.patch_v2_eagle # noqa
import vllm_ascend.patch.worker.patch_v2_uva # noqa

View File

@@ -21,14 +21,7 @@ import vllm
from vllm.v1.worker.gpu.attn_utils import build_slot_mappings_by_layer
from vllm.v1.worker.gpu.input_batch import InputBatch
from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("v0.16.0"):
from vllm.v1.worker.gpu.spec_decode.eagle import prepare_eagle_decode, prepare_eagle_inputs
else:
from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
from vllm_ascend.worker.v2.attn_utils import build_attn_metadata
@@ -175,7 +168,4 @@ def propose(
return self.draft_tokens[:num_reqs]
if vllm_version_is("v0.16.0"):
vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose = propose
else:
vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose
vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose

View File

@@ -46,7 +46,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params
from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel
from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled
from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled, vllm_version_is
# Currently we will fix block size to a small one since `num_reqs` can't be too large
_PREPARE_INPUTS_BLOCK_SIZE = 4
@@ -183,30 +183,25 @@ class SpecDecodeBaseProposer(EagleProposer):
def load_model(self, model: nn.Module) -> None:
target_attn_layer_names = set(get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase).keys())
target_indexer_layer_names = set(get_layers_from_vllm_config(self.vllm_config, DeepseekV32IndexerCache).keys())
with self.maybe_eager_context:
self.model = self._get_model()
indexer_layers = get_layers_from_vllm_config(self.vllm_config, DeepseekV32IndexerCache).keys()
# Find draft layers (attention layers added by draft model)
all_attn_layers = get_layers_from_vllm_config(
self.vllm_config,
AttentionLayerBase, # type: ignore[type-abstract]
)
all_indexer_layer_names = set(get_layers_from_vllm_config(self.vllm_config, DeepseekV32IndexerCache).keys())
self._draft_attn_layer_names = set(all_attn_layers.keys()) - target_attn_layer_names - all_indexer_layer_names
assert len(self._draft_attn_layer_names) == 1
self.attn_layer_names = list(sorted(self._draft_attn_layer_names))
draft_attn_layers_dict = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase)
draft_attn_layers = draft_attn_layers_dict.keys()
draft_attn_layer_names = draft_attn_layers - target_attn_layer_names
draft_indexer_layer_names = indexer_layers - target_indexer_layer_names
draft_attn_layer_names = draft_attn_layer_names - draft_indexer_layer_names
self.attn_layer_names = list(sorted(draft_attn_layer_names))
self.kernel_block_size = (
draft_attn_layers_dict[self.attn_layer_names[0]].get_attn_backend().get_supported_kernel_block_sizes()[0]
)
self.piece_all_attn_layer_name = []
for _ in range(self.num_speculative_tokens):
self.piece_all_attn_layer_name.append([name for name in self.attn_layer_names])
self.attn_layer_names = list(sorted(draft_attn_layer_names))
self.piece_all_attn_layer_name = []
for _ in range(self.num_speculative_tokens):
self.piece_all_attn_layer_name.append([name for name in self.attn_layer_names])
@@ -668,6 +663,46 @@ class SpecDecodeBaseProposer(EagleProposer):
# Copy the old attn_metadata and update
if not self.parallel_drafting:
for draft_step in range(1, self.num_speculative_tokens):
per_layer_attn_metadata = dict()
if vllm_version_is("0.17.0"):
for attn_group in self.draft_attn_groups:
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
draft_step,
attn_metadata,
common_attn_metadata,
batch_size,
num_input_tokens,
used_update_positions,
aclgraph_runtime_mode,
ori_seq_len,
slot_indices,
mtp_slot_mapping,
attn_group=attn_group,
)
for layer_name in self.attn_layer_names:
per_layer_attn_metadata[layer_name] = attn_metadata
else:
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
draft_step,
attn_metadata,
common_attn_metadata,
batch_size,
num_input_tokens,
used_update_positions,
aclgraph_runtime_mode,
ori_seq_len,
slot_indices,
mtp_slot_mapping,
)
for layer_name in self.attn_layer_names:
per_layer_attn_metadata[layer_name] = attn_metadata
multi_steps_attn_metadata.append(per_layer_attn_metadata)
else:
# Copy the old attn_metadata and update
for draft_step in range(1, self.num_speculative_tokens):
per_layer_attn_metadata = dict()
if vllm_version_is("0.17.0"):
for attn_group in self.draft_attn_groups:
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
draft_step,
attn_metadata,
@@ -676,18 +711,11 @@ class SpecDecodeBaseProposer(EagleProposer):
num_input_tokens,
used_update_positions,
aclgraph_runtime_mode,
ori_seq_len,
slot_indices,
mtp_slot_mapping,
attn_group=attn_group,
)
per_layer_attn_metadata = dict()
for layer_name in self.attn_layer_names:
per_layer_attn_metadata[layer_name] = attn_metadata
multi_steps_attn_metadata.append(per_layer_attn_metadata)
else:
# Copy the old attn_metadata and update
if not self.parallel_drafting:
for draft_step in range(1, self.num_speculative_tokens):
else:
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
draft_step,
attn_metadata,
@@ -697,10 +725,9 @@ class SpecDecodeBaseProposer(EagleProposer):
used_update_positions,
aclgraph_runtime_mode,
)
per_layer_attn_metadata = dict()
for layer_name in self.attn_layer_names:
per_layer_attn_metadata[layer_name] = attn_metadata
multi_steps_attn_metadata.append(per_layer_attn_metadata)
multi_steps_attn_metadata.append(per_layer_attn_metadata)
token_indices_to_sample_len = token_indices_to_sample.shape[0]
self.token_indices_to_sample[:token_indices_to_sample_len].copy_(token_indices_to_sample)
@@ -1077,8 +1104,11 @@ class SpecDecodeBaseProposer(EagleProposer):
ori_seq_len=None,
slot_indices=None,
mtp_slot_mapping=None,
attn_group=None,
):
assert draft_step > 0
if vllm_version_is("0.17.0"):
assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group"
common_attn_metadata = self.shallow_copy_metadata(old_common_metadata)
if draft_step == 1:
@@ -1150,11 +1180,6 @@ class SpecDecodeBaseProposer(EagleProposer):
else:
common_attn_metadata.positions[:batch_size].copy_(clamped_positions)
if self.attn_metadata_builder is None:
attn_metadata_builder = self._get_attention_metadata_builder()
else:
attn_metadata_builder = self.attn_metadata_builder
if self.pcp_size * self.dcp_size > 1:
num_computed_tokens_of_pcp_dcp = self.runner.pcp_manager._get_cp_local_seq_lens(
ori_seq_len + draft_step + 1,
@@ -1194,8 +1219,15 @@ class SpecDecodeBaseProposer(EagleProposer):
# Set the address of the attn_metadata.slot_mapping to the self.slot_mapping_group[idx]
common_attn_metadata.slot_mapping = self.slot_mapping_group[draft_step]
# Rebuild attention metadata
attn_metadata = attn_metadata_builder.build_for_drafting( # type: ignore
if vllm_version_is("0.17.0"):
attn_metadata_builder = attn_group.get_metadata_builder()
else:
if self.attn_metadata_builder is None:
attn_metadata_builder = self._get_attention_metadata_builder()
else:
attn_metadata_builder = self.attn_metadata_builder
attn_metadata = attn_metadata_builder.build_for_drafting(
common_attn_metadata=common_attn_metadata,
draft_index=draft_step,
)

View File

@@ -74,6 +74,7 @@ from vllm.v1.outputs import (
from vllm.v1.sample.logits_processor import build_logitsprocs
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import RejectionSampler
from vllm.v1.spec_decode.draft_model import DraftModelProposer
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm.v1.structured_output.utils import apply_grammar_bitmask
from vllm.v1.utils import record_function_or_nullcontext
@@ -407,6 +408,16 @@ class NPUModelRunner(GPUModelRunner):
self.cpu_slot_mapping = None
self.sampling_done_event: torch.npu.Event | None = None
if vllm_version_is("0.17.0"):
# self.cudagraph_batch_sizes sorts in ascending order.
if (
self.compilation_config.cudagraph_capture_sizes
and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
):
self.cudagraph_batch_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
else:
self.cudagraph_batch_sizes = []
@property
def use_cp(self) -> bool:
return self.pcp_size * self.dcp_size > 1
@@ -1327,48 +1338,27 @@ class NPUModelRunner(GPUModelRunner):
# Run forward pass
clear_kv_metadata = self.speculative_config is None
if vllm_version_is("0.16.0"):
with (
record_function_or_nullcontext("forward"),
set_ascend_forward_context(
attn_metadata,
self.vllm_config,
num_tokens=num_tokens_padded,
num_tokens_across_dp=num_tokens_across_dp,
aclgraph_runtime_mode=cudagraph_mode,
batch_descriptor=batch_desc,
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
model_instance=self.model,
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
skip_compiled=has_encoder_input,
),
self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output,
):
hidden_states = self._model_forward(
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
)
else:
with (
record_function_or_nullcontext("forward"),
set_ascend_forward_context(
attn_metadata,
self.vllm_config,
num_tokens=num_tokens_padded,
num_tokens_across_dp=num_tokens_across_dp,
aclgraph_runtime_mode=cudagraph_mode,
batch_descriptor=batch_desc,
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
model_instance=self.model,
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
skip_compiled=has_encoder_input,
),
self.maybe_get_kv_connector_output(
scheduler_output, clear_metadata=clear_kv_metadata
) as kv_connector_output,
):
hidden_states = self._model_forward(
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
)
with (
record_function_or_nullcontext("forward"),
set_ascend_forward_context(
attn_metadata,
self.vllm_config,
num_tokens=num_tokens_padded,
num_tokens_across_dp=num_tokens_across_dp,
aclgraph_runtime_mode=cudagraph_mode,
batch_descriptor=batch_desc,
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
model_instance=self.model,
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
skip_compiled=has_encoder_input,
),
self.maybe_get_kv_connector_output(
scheduler_output, clear_metadata=clear_kv_metadata
) as kv_connector_output,
):
hidden_states = self._model_forward(
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
)
with record_function_or_nullcontext("post process"):
aux_hidden_states = None
if self.use_aux_hidden_state_outputs:
@@ -1926,23 +1916,14 @@ class NPUModelRunner(GPUModelRunner):
if force_eager:
return (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded))
if vllm_version_is("0.16.0"):
return self.cudagraph_dispatcher.dispatch(
num_tokens=num_tokens,
has_lora=has_lora,
uniform_decode=uniform_decode,
disable_full=disable_full,
num_active_loras=num_active_loras,
)
else:
return self.cudagraph_dispatcher.dispatch(
num_tokens=num_tokens,
has_lora=has_lora,
uniform_decode=uniform_decode,
valid_modes=valid_modes,
invalid_modes={CUDAGraphMode.FULL} if disable_full else None,
num_active_loras=num_active_loras,
)
return self.cudagraph_dispatcher.dispatch(
num_tokens=num_tokens,
has_lora=has_lora,
uniform_decode=uniform_decode,
valid_modes=valid_modes,
invalid_modes={CUDAGraphMode.FULL} if disable_full else None,
num_active_loras=num_active_loras,
)
cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded, use_cascade_attn or has_encoder_output)
num_tokens_padded = batch_descriptor.num_tokens
@@ -1964,16 +1945,10 @@ class NPUModelRunner(GPUModelRunner):
dp_rank = self.parallel_config.data_parallel_rank
num_tokens_padded = int(num_tokens_across_dp[dp_rank].item())
# Re-dispatch with DP padding
if vllm_version_is("0.16.0"):
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
num_tokens_padded,
disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value,
)
else:
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
num_tokens_padded,
valid_modes={CUDAGraphMode(synced_cudagraph_mode)},
)
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
num_tokens_padded,
valid_modes={CUDAGraphMode(synced_cudagraph_mode)},
)
# Assert to make sure the agreed upon token count is correct otherwise
# num_tokens_across_dp will no-longer be valid
assert batch_descriptor.num_tokens == num_tokens_padded
@@ -2580,6 +2555,14 @@ class NPUModelRunner(GPUModelRunner):
self.may_reinitialize_input_batch(kv_cache_config)
kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
if vllm_version_is("0.17.0"):
# TODO: refactor the logic of attention
# Initialize drafter attention group initialization
if self.speculative_config and (
self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
):
assert isinstance(self.drafter, AscendEagleProposer | DraftModelProposer)
self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)
if has_kv_transfer_group():
get_kv_transfer_group().register_kv_caches(kv_caches)
@@ -2966,7 +2949,7 @@ class NPUModelRunner(GPUModelRunner):
# For attention backends that support virtual block splitting,
# use the supported block sizes from the backend
# For other backends (like Mamba), use [0] (no splitting)
kernel_block_sizes = []
self.kernel_block_sizes = []
for kv_cache_group_id, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups):
kv_cache_spec = kv_cache_group.kv_cache_spec
if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):
@@ -2993,15 +2976,15 @@ class NPUModelRunner(GPUModelRunner):
else:
# Fallback to cache config block_size if no backend found
kernel_block_size_list = [self.cache_config.block_size]
kernel_block_sizes.append(kernel_block_size_list)
self.kernel_block_sizes.append(kernel_block_size_list)
else:
# This is likely Mamba or other non-attention cache,
# no splitting.
# NOTE: set kernel_block_sizes to 0 to disable slotmapping computation
# of mamba block. In this case, BlockTable.block_size will never equal
# to kernel_block_sizes[0]
kernel_block_sizes.append([0])
if block_sizes != [self.cache_config.block_size] or kernel_block_sizes != [[self.cache_config.block_size]]:
self.kernel_block_sizes.append([0])
if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
assert self.cache_config.cpu_offload_gb == 0, (
"Cannot re-initialize the input batch when CPU weight "
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
@@ -3023,7 +3006,7 @@ class NPUModelRunner(GPUModelRunner):
if self.vllm_config.speculative_config
else 0
),
kernel_block_sizes=kernel_block_sizes,
kernel_block_sizes=self.kernel_block_sizes,
)
def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: