[Version] Drop 0.16.0 support (#7153)
### What this PR does / why we need it?
Drop 0.16.0 support in main
- Fix the Eagle proposer breakage introduced by
https://github.com/vllm-project/vllm/pull/34552. The main change is to use
the draft attention group to initialize the attention metadata builder
(a minimal sketch of this version-gated selection follows the version notes below).
- Fix the `ModelRunner` has no attribute `cudagraph_capture_sizes` error,
a bug in vLLM v0.17.0 that was fixed by the later PR
https://github.com/vllm-project/vllm/pull/30515; this PR adds a local fallback
for the v0.17.0 tag (a minimal sketch follows the sign-off below).
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
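
A minimal sketch (not the exact vllm-ascend code) of the version-gated builder selection behind the first fix: on the v0.17.0 tag the drafter takes its attention metadata builder from the draft attention group, otherwise it keeps using the builder cached on the proposer. `vllm_version_is` is re-implemented here only so the sketch is self-contained, and `proposer`/`attn_group` are hypothetical stand-ins for the real objects touched in this PR.

```python
def vllm_version_is(version: str) -> bool:
    # Stand-in for vllm_ascend.utils.vllm_version_is: compare the installed
    # vLLM version against the requested tag.
    import vllm
    return vllm.__version__.lstrip("v") == version.lstrip("v")


def select_drafting_metadata_builder(proposer, attn_group=None):
    # Sketch only: mirrors the gate added in attn_update_stack_num_spec_norm.
    if vllm_version_is("0.17.0"):
        # v0.17.0: the builder is owned by the draft attention group.
        assert attn_group is not None, "the v0.17.0 path requires the draft attention group"
        return attn_group.get_metadata_builder()
    # Newer vLLM main: keep using the builder cached on the proposer.
    if proposer.attn_metadata_builder is None:
        return proposer._get_attention_metadata_builder()
    return proposer.attn_metadata_builder
```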
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
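
For the second fix, a minimal sketch of the v0.17.0-only fallback added in `NPUModelRunner.__init__`: the runner recomputes `cudagraph_batch_sizes` from the compilation config so nothing relies on the `cudagraph_capture_sizes` attribute missing in vLLM v0.17.0 (upstream removed the need for this in PR #30515). `CUDAGraphMode` and `CompilationConfig` below are simplified stand-ins for the real vLLM objects.

```python
from dataclasses import dataclass, field
from enum import Enum


class CUDAGraphMode(Enum):
    # Simplified stand-in for vLLM's CUDAGraphMode.
    NONE = 0
    PIECEWISE = 1
    FULL = 2


@dataclass
class CompilationConfig:
    # Only the two fields the fallback reads.
    cudagraph_capture_sizes: list[int] = field(default_factory=list)
    cudagraph_mode: CUDAGraphMode = CUDAGraphMode.NONE


def resolve_cudagraph_batch_sizes(compilation_config: CompilationConfig) -> list[int]:
    # cudagraph_batch_sizes sorts in ascending order, matching the PR.
    if (
        compilation_config.cudagraph_capture_sizes
        and compilation_config.cudagraph_mode != CUDAGraphMode.NONE
    ):
        return sorted(compilation_config.cudagraph_capture_sizes)
    return []
```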
@@ -32,7 +32,7 @@ on:
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
vllm_version:
required: false
default: "v0.16.0"
default: "v0.17.0"
type: string
description: vllm version to use
vllm_ascend_remote_url:

@@ -39,7 +39,7 @@ on:
vllm_version:
required: false
type: string
default: "v0.16.0"
default: "v0.17.0"
is_pr_test:
required: true
type: boolean
.github/workflows/pr_test_full.yaml

@@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.16.0]
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
.github/workflows/pr_test_light.yaml

@@ -90,7 +90,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.16.0]
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}

@@ -102,7 +102,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.16.0]
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
@@ -276,7 +276,7 @@ jobs:
- Qwen3-Omni-30B-A3B-Instruct
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
with:
vllm: v0.16.0
vllm: v0.17.0
runner: ${{ matrix.test_config.os }}
model_list: ${{ toJson(matrix.test_config.model_list) }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11'
@@ -51,7 +51,7 @@ jobs:
strategy:
matrix:
include:
- vllm_branch: v0.16.0
- vllm_branch: v0.17.0
vllm_ascend_branch: main
max-parallel: 1
container:
@@ -50,7 +50,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -40,7 +40,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -36,7 +36,7 @@ COPY . /vllm-workspace/vllm-ascend/
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -49,7 +49,7 @@ RUN apt-get update -y && \
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -50,7 +50,7 @@ RUN yum update -y && \
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -50,7 +50,7 @@ RUN yum update -y && \
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
@@ -75,9 +75,9 @@ myst_substitutions = {
"pip_vllm_ascend_version": "0.16.0rc1",
"pip_vllm_version": "0.16.0",
# CANN image tag
"cann_image_tag": "8.5.0-910b-ubuntu22.04-py3.11",
"cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
# vllm version in ci
"ci_vllm_version": "v0.16.0",
"ci_vllm_version": "v0.17.0",
}
# For cross-file header anchors
@@ -1,4 +1,5 @@
from unittest.mock import MagicMock, patch
import unittest
import numpy as np
import torch

@@ -137,7 +138,7 @@ class TestEagleProposerInitialization(TestBase):
expected_max_num_tokens = proposer.max_num_tokens
self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048))
@unittest.skip("Skip due to the changes in #7153, fix me later")
class TestEagleProposerLoadModel(TestBase):
def setUp(self):
self.vllm_config = MagicMock(spec=VllmConfig)
@@ -26,7 +26,6 @@ from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType
from vllm_ascend.ops.fused_moe.experts_selector import zero_experts_compute
from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods
from vllm_ascend.quantization.methods.base import QuantType
from vllm_ascend.utils import vllm_version_is
from .experts_selector import select_experts
from .moe_comm_method import AllGatherCommImpl310

@@ -152,25 +151,22 @@ class AscendFusedMoE310(FusedMoE):
self.quant_type = self.get_quant_type()
_MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config)
if not vllm_version_is("0.16.0"):
self.runner = self._init_runner()
self.runner = self._init_runner()
if not vllm_version_is("0.16.0"):
def _init_runner(self):
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
def _init_runner(self):
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
return AscendMoERunner(
layer=self,
moe_config=self.moe_config,
router=self.router,
routed_input_transform=self._routed_input_transform,
gate=self.gate,
shared_experts=self.shared_experts,
quant_method=self.quant_method,
reduce_results=self.reduce_results,
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
)
return AscendMoERunner(
layer=self,
moe_config=self.moe_config,
router=self.router,
routed_input_transform=self._routed_input_transform,
gate=self.gate,
shared_experts=self.shared_experts,
quant_method=self.quant_method,
reduce_results=self.reduce_results,
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
)
def init_experts_map(self, moe_config):
"""
@@ -25,17 +25,13 @@ from vllm.distributed import get_dp_group, get_ep_group, get_tp_group, tensor_mo
from vllm.forward_context import get_forward_context
from vllm.logger import logger
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase # type: ignore
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer
from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter # type: ignore
from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore
from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
from vllm_ascend.utils import vllm_version_is
if not vllm_version_is("0.16.0"):
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase # type: ignore
from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter # type: ignore
from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType
from vllm_ascend.distributed.parallel_state import get_mc2_group

@@ -50,7 +46,6 @@ from vllm_ascend.utils import (
npu_stream_switch,
shared_expert_dp_enabled,
shared_experts_calculation_stream,
vllm_version_is,
)
@@ -169,75 +164,74 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
return final_hidden_states
if not vllm_version_is("0.16.0"):
# Please remove this inheritance after extending vllm, todo(wxs)
class AscendMoERunner(DefaultMoERunner):
# Please remove this inheritance after extending vllm, todo(wxs)
class AscendMoERunner(DefaultMoERunner):
"""
Default implementation of the MoE runner for executing Mixture of Experts layers.
This class provides a comprehensive implementation for running MoE computations
with support for:
- Expert routing and token dispatching
- Shared experts computation with optional parallel execution using CUDA streams
- Data parallel (DP) chunking for large batch processing
- Tensor model parallel and expert parallel operations
- Various quantization methods and custom operators
- Both monolithic and decomposed expert execution paths
The runner handles the complete MoE forward pass including routing tokens to
experts, executing expert computations, and combining results. It supports
advanced features like overlapped execution of shared experts and optimized
kernels for different parallel execution modes.
Eventually, this class will be split up and specialized for different
configurations, e.g. the presence or absence of shared experts, a gate, etc.
"""
def __init__(
self,
layer: torch.nn.Module,
moe_config: FusedMoEConfig,
router: FusedMoERouter,
routed_input_transform: torch.nn.Module | None,
gate: torch.nn.Module | None,
shared_experts: torch.nn.Module | None,
quant_method: FusedMoEMethodBase,
reduce_results: bool,
enable_dbo: bool,
):
super().__init__(
layer,
moe_config,
router,
routed_input_transform,
gate,
shared_experts,
quant_method,
reduce_results,
enable_dbo,
)
if self.shared_experts is None:
self.moe_forward = torch.ops.vllm.moe_forward
else:
self.moe_forward = torch.ops.vllm.moe_forward_shared
def forward_impl(
self,
layer: torch.nn.Module,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
shared_input: torch.Tensor | None,
):
"""
Default implementation of the MoE runner for executing Mixture of Experts layers.
This class provides a comprehensive implementation for running MoE computations
with support for:
- Expert routing and token dispatching
- Shared experts computation with optional parallel execution using CUDA streams
- Data parallel (DP) chunking for large batch processing
- Tensor model parallel and expert parallel operations
- Various quantization methods and custom operators
- Both monolithic and decomposed expert execution paths
The runner handles the complete MoE forward pass including routing tokens to
experts, executing expert computations, and combining results. It supports
advanced features like overlapped execution of shared experts and optimized
kernels for different parallel execution modes.
Eventually, this class will be split up and specialized for different
configurations, e.g. the presence or absence of shared experts, a gate, etc.
Override the default forward_impl to use Ascend-specific implementation.
This delegates to the layer's forward_impl method which contains the
Ascend-specific MoE computation logic.
"""
def __init__(
self,
layer: torch.nn.Module,
moe_config: FusedMoEConfig,
router: FusedMoERouter,
routed_input_transform: torch.nn.Module | None,
gate: torch.nn.Module | None,
shared_experts: torch.nn.Module | None,
quant_method: FusedMoEMethodBase,
reduce_results: bool,
enable_dbo: bool,
):
super().__init__(
layer,
moe_config,
router,
routed_input_transform,
gate,
shared_experts,
quant_method,
reduce_results,
enable_dbo,
)
if self.shared_experts is None:
self.moe_forward = torch.ops.vllm.moe_forward
else:
self.moe_forward = torch.ops.vllm.moe_forward_shared
def forward_impl(
self,
layer: torch.nn.Module,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
shared_input: torch.Tensor | None,
):
"""
Override the default forward_impl to use Ascend-specific implementation.
This delegates to the layer's forward_impl method which contains the
Ascend-specific MoE computation logic.
"""
result = layer.forward_impl(hidden_states, router_logits)
# If the layer has shared experts, forward_impl returns a tuple (shared_out, routed_out)
# Otherwise, it returns just routed_out
# The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared
return result
result = layer.forward_impl(hidden_states, router_logits)
# If the layer has shared experts, forward_impl returns a tuple (shared_out, routed_out)
# Otherwise, it returns just routed_out
# The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared
return result
class AscendFusedMoE(FusedMoE):
@@ -328,26 +322,23 @@ class AscendFusedMoE(FusedMoE):
setup_moe_comm_method(self.moe_config)
self.quant_type = self._get_quant_type()
if not vllm_version_is("0.16.0"):
self.runner = self._init_runner()
self.runner = self._init_runner()
if not vllm_version_is("0.16.0"):
def _init_runner(self):
# Storing the runner in the FusedMoE is an intermediate state, eventually
# the runner will own the FusedMoE layer and provide the execution interface
# for MoE ops.
return AscendMoERunner(
layer=self,
moe_config=self.moe_config,
router=self.router,
routed_input_transform=self._routed_input_transform,
gate=self.gate,
shared_experts=self.shared_experts,
quant_method=self.quant_method,
reduce_results=self.reduce_results,
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
)
def _init_runner(self):
# Storing the runner in the FusedMoE is an intermediate state, eventually
# the runner will own the FusedMoE layer and provide the execution interface
# for MoE ops.
return AscendMoERunner(
layer=self,
moe_config=self.moe_config,
router=self.router,
routed_input_transform=self._routed_input_transform,
gate=self.gate,
shared_experts=self.shared_experts,
quant_method=self.quant_method,
reduce_results=self.reduce_results,
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
)
def _get_quant_type(self) -> QuantType:
quant_type = QuantType.NONE

@@ -379,18 +370,16 @@ class AscendFusedMoE(FusedMoE):
"""
return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states)
if not vllm_version_is("0.16.0"):
def forward(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
self.ensure_moe_quant_config_init()
return self.runner.forward(
hidden_states,
router_logits,
)
def forward(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
self.ensure_moe_quant_config_init()
return self.runner.forward(
hidden_states,
router_logits,
)
def forward_impl( # type: ignore[override]
self, hidden_states: torch.Tensor, router_logits: torch.Tensor, return_with_event: bool = False

@@ -551,10 +540,9 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.")
self._gate = gate
if not vllm_version_is("0.16.0"):
# Recreate the runner with the correct shared_experts parameter
# The parent class created the runner before self._shared_experts was set
self.runner = self._init_runner()
# Recreate the runner with the correct shared_experts parameter
# The parent class created the runner before self._shared_experts was set
self.runner = self._init_runner()
if self.multistream_overlap_shared_expert:
# Wrap the quant_method's process_weights_after_loading to validate that
@@ -17,13 +17,9 @@
from vllm.triton_utils import HAS_TRITON
from vllm_ascend.utils import vllm_version_is
if HAS_TRITON:
import vllm_ascend.patch.worker.patch_triton
if not vllm_version_is("v0.16.0"):
import vllm_ascend.patch.worker.patch_qwen3_5 # noqa
# isort: off
import vllm_ascend.patch.platform.patch_sched_yield # noqa

@@ -35,6 +31,7 @@ import vllm_ascend.patch.worker.patch_minimax_m2_linear_attn # noqa
import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
import vllm_ascend.patch.worker.patch_qwen3_next # noqa
import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa
import vllm_ascend.patch.worker.patch_qwen3_5 # noqa
import vllm_ascend.patch.worker.patch_rejection_sampler # noqa
import vllm_ascend.patch.worker.patch_v2_eagle # noqa
import vllm_ascend.patch.worker.patch_v2_uva # noqa
@@ -21,14 +21,7 @@ import vllm
from vllm.v1.worker.gpu.attn_utils import build_slot_mappings_by_layer
from vllm.v1.worker.gpu.input_batch import InputBatch
from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("v0.16.0"):
from vllm.v1.worker.gpu.spec_decode.eagle import prepare_eagle_decode, prepare_eagle_inputs
else:
from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
from vllm_ascend.worker.v2.attn_utils import build_attn_metadata

@@ -175,7 +168,4 @@ def propose(
return self.draft_tokens[:num_reqs]
if vllm_version_is("v0.16.0"):
vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose = propose
else:
vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose
vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose
@@ -46,7 +46,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params
from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel
from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled
from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled, vllm_version_is
# Currently we will fix block size to a small one since `num_reqs` can't be too large
_PREPARE_INPUTS_BLOCK_SIZE = 4
@@ -183,30 +183,25 @@ class SpecDecodeBaseProposer(EagleProposer):
def load_model(self, model: nn.Module) -> None:
target_attn_layer_names = set(get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase).keys())
target_indexer_layer_names = set(get_layers_from_vllm_config(self.vllm_config, DeepseekV32IndexerCache).keys())
with self.maybe_eager_context:
self.model = self._get_model()
indexer_layers = get_layers_from_vllm_config(self.vllm_config, DeepseekV32IndexerCache).keys()
# Find draft layers (attention layers added by draft model)
all_attn_layers = get_layers_from_vllm_config(
self.vllm_config,
AttentionLayerBase, # type: ignore[type-abstract]
)
all_indexer_layer_names = set(get_layers_from_vllm_config(self.vllm_config, DeepseekV32IndexerCache).keys())
self._draft_attn_layer_names = set(all_attn_layers.keys()) - target_attn_layer_names - all_indexer_layer_names
assert len(self._draft_attn_layer_names) == 1
self.attn_layer_names = list(sorted(self._draft_attn_layer_names))
draft_attn_layers_dict = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase)
draft_attn_layers = draft_attn_layers_dict.keys()
draft_attn_layer_names = draft_attn_layers - target_attn_layer_names
draft_indexer_layer_names = indexer_layers - target_indexer_layer_names
draft_attn_layer_names = draft_attn_layer_names - draft_indexer_layer_names
self.attn_layer_names = list(sorted(draft_attn_layer_names))
self.kernel_block_size = (
draft_attn_layers_dict[self.attn_layer_names[0]].get_attn_backend().get_supported_kernel_block_sizes()[0]
)
self.piece_all_attn_layer_name = []
for _ in range(self.num_speculative_tokens):
self.piece_all_attn_layer_name.append([name for name in self.attn_layer_names])
self.attn_layer_names = list(sorted(draft_attn_layer_names))
self.piece_all_attn_layer_name = []
for _ in range(self.num_speculative_tokens):
self.piece_all_attn_layer_name.append([name for name in self.attn_layer_names])
@@ -668,6 +663,46 @@ class SpecDecodeBaseProposer(EagleProposer):
# Copy the old attn_metadata and update
if not self.parallel_drafting:
for draft_step in range(1, self.num_speculative_tokens):
per_layer_attn_metadata = dict()
if vllm_version_is("0.17.0"):
for attn_group in self.draft_attn_groups:
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
draft_step,
attn_metadata,
common_attn_metadata,
batch_size,
num_input_tokens,
used_update_positions,
aclgraph_runtime_mode,
ori_seq_len,
slot_indices,
mtp_slot_mapping,
attn_group=attn_group,
)
for layer_name in self.attn_layer_names:
per_layer_attn_metadata[layer_name] = attn_metadata
else:
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
draft_step,
attn_metadata,
common_attn_metadata,
batch_size,
num_input_tokens,
used_update_positions,
aclgraph_runtime_mode,
ori_seq_len,
slot_indices,
mtp_slot_mapping,
)
for layer_name in self.attn_layer_names:
per_layer_attn_metadata[layer_name] = attn_metadata
multi_steps_attn_metadata.append(per_layer_attn_metadata)
else:
# Copy the old attn_metadata and update
for draft_step in range(1, self.num_speculative_tokens):
per_layer_attn_metadata = dict()
if vllm_version_is("0.17.0"):
for attn_group in self.draft_attn_groups:
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
draft_step,
attn_metadata,

@@ -676,18 +711,11 @@ class SpecDecodeBaseProposer(EagleProposer):
num_input_tokens,
used_update_positions,
aclgraph_runtime_mode,
ori_seq_len,
slot_indices,
mtp_slot_mapping,
attn_group=attn_group,
)
per_layer_attn_metadata = dict()
for layer_name in self.attn_layer_names:
per_layer_attn_metadata[layer_name] = attn_metadata
multi_steps_attn_metadata.append(per_layer_attn_metadata)
else:
# Copy the old attn_metadata and update
if not self.parallel_drafting:
for draft_step in range(1, self.num_speculative_tokens):
else:
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
draft_step,
attn_metadata,

@@ -697,10 +725,9 @@ class SpecDecodeBaseProposer(EagleProposer):
used_update_positions,
aclgraph_runtime_mode,
)
per_layer_attn_metadata = dict()
for layer_name in self.attn_layer_names:
per_layer_attn_metadata[layer_name] = attn_metadata
multi_steps_attn_metadata.append(per_layer_attn_metadata)
multi_steps_attn_metadata.append(per_layer_attn_metadata)
token_indices_to_sample_len = token_indices_to_sample.shape[0]
self.token_indices_to_sample[:token_indices_to_sample_len].copy_(token_indices_to_sample)

@@ -1077,8 +1104,11 @@ class SpecDecodeBaseProposer(EagleProposer):
ori_seq_len=None,
slot_indices=None,
mtp_slot_mapping=None,
attn_group=None,
):
assert draft_step > 0
if vllm_version_is("0.17.0"):
assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group"
common_attn_metadata = self.shallow_copy_metadata(old_common_metadata)
if draft_step == 1:

@@ -1150,11 +1180,6 @@ class SpecDecodeBaseProposer(EagleProposer):
else:
common_attn_metadata.positions[:batch_size].copy_(clamped_positions)
if self.attn_metadata_builder is None:
attn_metadata_builder = self._get_attention_metadata_builder()
else:
attn_metadata_builder = self.attn_metadata_builder
if self.pcp_size * self.dcp_size > 1:
num_computed_tokens_of_pcp_dcp = self.runner.pcp_manager._get_cp_local_seq_lens(
ori_seq_len + draft_step + 1,

@@ -1194,8 +1219,15 @@ class SpecDecodeBaseProposer(EagleProposer):
# Set the address of the attn_metadata.slot_mapping to the self.slot_mapping_group[idx]
common_attn_metadata.slot_mapping = self.slot_mapping_group[draft_step]
# Rebuild attention metadata
attn_metadata = attn_metadata_builder.build_for_drafting( # type: ignore
if vllm_version_is("0.17.0"):
attn_metadata_builder = attn_group.get_metadata_builder()
else:
if self.attn_metadata_builder is None:
attn_metadata_builder = self._get_attention_metadata_builder()
else:
attn_metadata_builder = self.attn_metadata_builder
attn_metadata = attn_metadata_builder.build_for_drafting(
common_attn_metadata=common_attn_metadata,
draft_index=draft_step,
)
@@ -74,6 +74,7 @@ from vllm.v1.outputs import (
from vllm.v1.sample.logits_processor import build_logitsprocs
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import RejectionSampler
from vllm.v1.spec_decode.draft_model import DraftModelProposer
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm.v1.structured_output.utils import apply_grammar_bitmask
from vllm.v1.utils import record_function_or_nullcontext
@@ -407,6 +408,16 @@ class NPUModelRunner(GPUModelRunner):
self.cpu_slot_mapping = None
self.sampling_done_event: torch.npu.Event | None = None
if vllm_version_is("0.17.0"):
# self.cudagraph_batch_sizes sorts in ascending order.
if (
self.compilation_config.cudagraph_capture_sizes
and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
):
self.cudagraph_batch_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
else:
self.cudagraph_batch_sizes = []
@property
def use_cp(self) -> bool:
return self.pcp_size * self.dcp_size > 1

@@ -1327,48 +1338,27 @@ class NPUModelRunner(GPUModelRunner):
# Run forward pass
clear_kv_metadata = self.speculative_config is None
if vllm_version_is("0.16.0"):
with (
record_function_or_nullcontext("forward"),
set_ascend_forward_context(
attn_metadata,
self.vllm_config,
num_tokens=num_tokens_padded,
num_tokens_across_dp=num_tokens_across_dp,
aclgraph_runtime_mode=cudagraph_mode,
batch_descriptor=batch_desc,
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
model_instance=self.model,
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
skip_compiled=has_encoder_input,
),
self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output,
):
hidden_states = self._model_forward(
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
)
else:
with (
record_function_or_nullcontext("forward"),
set_ascend_forward_context(
attn_metadata,
self.vllm_config,
num_tokens=num_tokens_padded,
num_tokens_across_dp=num_tokens_across_dp,
aclgraph_runtime_mode=cudagraph_mode,
batch_descriptor=batch_desc,
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
model_instance=self.model,
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
skip_compiled=has_encoder_input,
),
self.maybe_get_kv_connector_output(
scheduler_output, clear_metadata=clear_kv_metadata
) as kv_connector_output,
):
hidden_states = self._model_forward(
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
)
with (
record_function_or_nullcontext("forward"),
set_ascend_forward_context(
attn_metadata,
self.vllm_config,
num_tokens=num_tokens_padded,
num_tokens_across_dp=num_tokens_across_dp,
aclgraph_runtime_mode=cudagraph_mode,
batch_descriptor=batch_desc,
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
model_instance=self.model,
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
skip_compiled=has_encoder_input,
),
self.maybe_get_kv_connector_output(
scheduler_output, clear_metadata=clear_kv_metadata
) as kv_connector_output,
):
hidden_states = self._model_forward(
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
)
with record_function_or_nullcontext("post process"):
aux_hidden_states = None
if self.use_aux_hidden_state_outputs:
@@ -1926,23 +1916,14 @@ class NPUModelRunner(GPUModelRunner):
if force_eager:
return (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded))
if vllm_version_is("0.16.0"):
return self.cudagraph_dispatcher.dispatch(
num_tokens=num_tokens,
has_lora=has_lora,
uniform_decode=uniform_decode,
disable_full=disable_full,
num_active_loras=num_active_loras,
)
else:
return self.cudagraph_dispatcher.dispatch(
num_tokens=num_tokens,
has_lora=has_lora,
uniform_decode=uniform_decode,
valid_modes=valid_modes,
invalid_modes={CUDAGraphMode.FULL} if disable_full else None,
num_active_loras=num_active_loras,
)
return self.cudagraph_dispatcher.dispatch(
num_tokens=num_tokens,
has_lora=has_lora,
uniform_decode=uniform_decode,
valid_modes=valid_modes,
invalid_modes={CUDAGraphMode.FULL} if disable_full else None,
num_active_loras=num_active_loras,
)
cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded, use_cascade_attn or has_encoder_output)
num_tokens_padded = batch_descriptor.num_tokens

@@ -1964,16 +1945,10 @@ class NPUModelRunner(GPUModelRunner):
dp_rank = self.parallel_config.data_parallel_rank
num_tokens_padded = int(num_tokens_across_dp[dp_rank].item())
# Re-dispatch with DP padding
if vllm_version_is("0.16.0"):
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
num_tokens_padded,
disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value,
)
else:
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
num_tokens_padded,
valid_modes={CUDAGraphMode(synced_cudagraph_mode)},
)
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
num_tokens_padded,
valid_modes={CUDAGraphMode(synced_cudagraph_mode)},
)
# Assert to make sure the agreed upon token count is correct otherwise
# num_tokens_across_dp will no-longer be valid
assert batch_descriptor.num_tokens == num_tokens_padded

@@ -2580,6 +2555,14 @@ class NPUModelRunner(GPUModelRunner):
self.may_reinitialize_input_batch(kv_cache_config)
kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
if vllm_version_is("0.17.0"):
# TODO: refactor the logic of attention
# Initialize drafter attention group initialization
if self.speculative_config and (
self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
):
assert isinstance(self.drafter, AscendEagleProposer | DraftModelProposer)
self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)
if has_kv_transfer_group():
get_kv_transfer_group().register_kv_caches(kv_caches)

@@ -2966,7 +2949,7 @@ class NPUModelRunner(GPUModelRunner):
# For attention backends that support virtual block splitting,
# use the supported block sizes from the backend
# For other backends (like Mamba), use [0] (no splitting)
kernel_block_sizes = []
self.kernel_block_sizes = []
for kv_cache_group_id, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups):
kv_cache_spec = kv_cache_group.kv_cache_spec
if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):

@@ -2993,15 +2976,15 @@ class NPUModelRunner(GPUModelRunner):
else:
# Fallback to cache config block_size if no backend found
kernel_block_size_list = [self.cache_config.block_size]
kernel_block_sizes.append(kernel_block_size_list)
self.kernel_block_sizes.append(kernel_block_size_list)
else:
# This is likely Mamba or other non-attention cache,
# no splitting.
# NOTE: set kernel_block_sizes to 0 to disable slotmapping computation
# of mamba block. In this case, BlockTable.block_size will never equal
# to kernel_block_sizes[0]
kernel_block_sizes.append([0])
if block_sizes != [self.cache_config.block_size] or kernel_block_sizes != [[self.cache_config.block_size]]:
self.kernel_block_sizes.append([0])
if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
assert self.cache_config.cpu_offload_gb == 0, (
"Cannot re-initialize the input batch when CPU weight "
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501

@@ -3023,7 +3006,7 @@ class NPUModelRunner(GPUModelRunner):
if self.vllm_config.speculative_config
else 0
),
kernel_block_sizes=kernel_block_sizes,
kernel_block_sizes=self.kernel_block_sizes,
)
def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: