[Version] Drop 0.16.0 support (#7153)

### What this PR does / why we need it?
Drop 0.16.0 support in main
- Fix the eagle proposer breakage introduced by
https://github.com/vllm-project/vllm/pull/34552. The main change is to use
the draft attention group to initialize the attention metadata builder.
- Fix the `ModelRunner` has no attribute `cudagraph_capture_sizes`
error, which is a bug in vLLM v0.17.0 and is fixed by a later PR:
https://github.com/vllm-project/vllm/pull/30515

- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
Mengqing Cao
2026-03-13 16:14:15 +08:00
committed by GitHub
parent 7ed9e9de69
commit 986cd45397
20 changed files with 255 additions and 268 deletions

View File

@@ -32,7 +32,7 @@ on:
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
vllm_version: vllm_version:
required: false required: false
default: "v0.16.0" default: "v0.17.0"
type: string type: string
description: vllm version to use description: vllm version to use
vllm_ascend_remote_url: vllm_ascend_remote_url:

View File

@@ -39,7 +39,7 @@ on:
vllm_version: vllm_version:
required: false required: false
type: string type: string
default: "v0.16.0" default: "v0.17.0"
is_pr_test: is_pr_test:
required: true required: true
type: boolean type: boolean

View File

@@ -75,7 +75,7 @@ jobs:
name: e2e-full name: e2e-full
strategy: strategy:
matrix: matrix:
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.16.0] vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -90,7 +90,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy: strategy:
matrix: matrix:
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.16.0] vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
uses: ./.github/workflows/_unit_test.yaml uses: ./.github/workflows/_unit_test.yaml
with: with:
vllm: ${{ matrix.vllm_version }} vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
name: e2e-light name: e2e-light
strategy: strategy:
matrix: matrix:
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.16.0] vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs # Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes] needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request. # only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -276,7 +276,7 @@ jobs:
- Qwen3-Omni-30B-A3B-Instruct - Qwen3-Omni-30B-A3B-Instruct
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
with: with:
vllm: v0.16.0 vllm: v0.17.0
runner: ${{ matrix.test_config.os }} runner: ${{ matrix.test_config.os }}
model_list: ${{ toJson(matrix.test_config.model_list) }} model_list: ${{ toJson(matrix.test_config.model_list) }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11' image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11'

View File

@@ -51,7 +51,7 @@ jobs:
strategy: strategy:
matrix: matrix:
include: include:
- vllm_branch: v0.16.0 - vllm_branch: v0.17.0
vllm_ascend_branch: main vllm_ascend_branch: main
max-parallel: 1 max-parallel: 1
container: container:

View File

@@ -50,7 +50,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0 ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -40,7 +40,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0 ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -36,7 +36,7 @@ COPY . /vllm-workspace/vllm-ascend/
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0 ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -49,7 +49,7 @@ RUN apt-get update -y && \
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0 ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -50,7 +50,7 @@ RUN yum update -y && \
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0 ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -50,7 +50,7 @@ RUN yum update -y && \
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0 ARG VLLM_TAG=v0.17.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -75,9 +75,9 @@ myst_substitutions = {
"pip_vllm_ascend_version": "0.16.0rc1", "pip_vllm_ascend_version": "0.16.0rc1",
"pip_vllm_version": "0.16.0", "pip_vllm_version": "0.16.0",
# CANN image tag # CANN image tag
"cann_image_tag": "8.5.0-910b-ubuntu22.04-py3.11", "cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
# vllm version in ci # vllm version in ci
"ci_vllm_version": "v0.16.0", "ci_vllm_version": "v0.17.0",
} }
# For cross-file header anchors # For cross-file header anchors

View File

@@ -1,4 +1,5 @@
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
import unittest
import numpy as np import numpy as np
import torch import torch
@@ -137,7 +138,7 @@ class TestEagleProposerInitialization(TestBase):
expected_max_num_tokens = proposer.max_num_tokens expected_max_num_tokens = proposer.max_num_tokens
self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048)) self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048))
@unittest.skip("Skip due to the changes in #7153, fix me later")
class TestEagleProposerLoadModel(TestBase): class TestEagleProposerLoadModel(TestBase):
def setUp(self): def setUp(self):
self.vllm_config = MagicMock(spec=VllmConfig) self.vllm_config = MagicMock(spec=VllmConfig)

View File

@@ -26,7 +26,6 @@ from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType
from vllm_ascend.ops.fused_moe.experts_selector import zero_experts_compute from vllm_ascend.ops.fused_moe.experts_selector import zero_experts_compute
from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods
from vllm_ascend.quantization.methods.base import QuantType from vllm_ascend.quantization.methods.base import QuantType
from vllm_ascend.utils import vllm_version_is
from .experts_selector import select_experts from .experts_selector import select_experts
from .moe_comm_method import AllGatherCommImpl310 from .moe_comm_method import AllGatherCommImpl310
@@ -152,25 +151,22 @@ class AscendFusedMoE310(FusedMoE):
self.quant_type = self.get_quant_type() self.quant_type = self.get_quant_type()
_MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config) _MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config)
if not vllm_version_is("0.16.0"): self.runner = self._init_runner()
self.runner = self._init_runner()
if not vllm_version_is("0.16.0"): def _init_runner(self):
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
def _init_runner(self): return AscendMoERunner(
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner layer=self,
moe_config=self.moe_config,
return AscendMoERunner( router=self.router,
layer=self, routed_input_transform=self._routed_input_transform,
moe_config=self.moe_config, gate=self.gate,
router=self.router, shared_experts=self.shared_experts,
routed_input_transform=self._routed_input_transform, quant_method=self.quant_method,
gate=self.gate, reduce_results=self.reduce_results,
shared_experts=self.shared_experts, enable_dbo=self.vllm_config.parallel_config.enable_dbo,
quant_method=self.quant_method, )
reduce_results=self.reduce_results,
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
)
def init_experts_map(self, moe_config): def init_experts_map(self, moe_config):
""" """

View File

@@ -25,17 +25,13 @@ from vllm.distributed import get_dp_group, get_ep_group, get_tp_group, tensor_mo
from vllm.forward_context import get_forward_context from vllm.forward_context import get_forward_context
from vllm.logger import logger from vllm.logger import logger
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase # type: ignore
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer
from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter # type: ignore
from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore
from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
from vllm_ascend.utils import vllm_version_is
if not vllm_version_is("0.16.0"):
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase # type: ignore
from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter # type: ignore
from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore
from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType
from vllm_ascend.distributed.parallel_state import get_mc2_group from vllm_ascend.distributed.parallel_state import get_mc2_group
@@ -50,7 +46,6 @@ from vllm_ascend.utils import (
npu_stream_switch, npu_stream_switch,
shared_expert_dp_enabled, shared_expert_dp_enabled,
shared_experts_calculation_stream, shared_experts_calculation_stream,
vllm_version_is,
) )
@@ -169,75 +164,74 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
return final_hidden_states return final_hidden_states
if not vllm_version_is("0.16.0"): # Please remove this inheritance after extending vllm, todo(wxs)
# Please remove this inheritance after extending vllm, todo(wxs) class AscendMoERunner(DefaultMoERunner):
class AscendMoERunner(DefaultMoERunner): """
Default implementation of the MoE runner for executing Mixture of Experts layers.
This class provides a comprehensive implementation for running MoE computations
with support for:
- Expert routing and token dispatching
- Shared experts computation with optional parallel execution using CUDA streams
- Data parallel (DP) chunking for large batch processing
- Tensor model parallel and expert parallel operations
- Various quantization methods and custom operators
- Both monolithic and decomposed expert execution paths
The runner handles the complete MoE forward pass including routing tokens to
experts, executing expert computations, and combining results. It supports
advanced features like overlapped execution of shared experts and optimized
kernels for different parallel execution modes.
Eventually, this class will be split up and specialized for different
configurations, e.g. the presence or absence of shared experts, a gate, etc.
"""
def __init__(
self,
layer: torch.nn.Module,
moe_config: FusedMoEConfig,
router: FusedMoERouter,
routed_input_transform: torch.nn.Module | None,
gate: torch.nn.Module | None,
shared_experts: torch.nn.Module | None,
quant_method: FusedMoEMethodBase,
reduce_results: bool,
enable_dbo: bool,
):
super().__init__(
layer,
moe_config,
router,
routed_input_transform,
gate,
shared_experts,
quant_method,
reduce_results,
enable_dbo,
)
if self.shared_experts is None:
self.moe_forward = torch.ops.vllm.moe_forward
else:
self.moe_forward = torch.ops.vllm.moe_forward_shared
def forward_impl(
self,
layer: torch.nn.Module,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
shared_input: torch.Tensor | None,
):
""" """
Default implementation of the MoE runner for executing Mixture of Experts layers. Override the default forward_impl to use Ascend-specific implementation.
This delegates to the layer's forward_impl method which contains the
This class provides a comprehensive implementation for running MoE computations Ascend-specific MoE computation logic.
with support for:
- Expert routing and token dispatching
- Shared experts computation with optional parallel execution using CUDA streams
- Data parallel (DP) chunking for large batch processing
- Tensor model parallel and expert parallel operations
- Various quantization methods and custom operators
- Both monolithic and decomposed expert execution paths
The runner handles the complete MoE forward pass including routing tokens to
experts, executing expert computations, and combining results. It supports
advanced features like overlapped execution of shared experts and optimized
kernels for different parallel execution modes.
Eventually, this class will be split up and specialized for different
configurations, e.g. the presence or absence of shared experts, a gate, etc.
""" """
result = layer.forward_impl(hidden_states, router_logits)
def __init__( # If the layer has shared experts, forward_impl returns a tuple (shared_out, routed_out)
self, # Otherwise, it returns just routed_out
layer: torch.nn.Module, # The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared
moe_config: FusedMoEConfig, return result
router: FusedMoERouter,
routed_input_transform: torch.nn.Module | None,
gate: torch.nn.Module | None,
shared_experts: torch.nn.Module | None,
quant_method: FusedMoEMethodBase,
reduce_results: bool,
enable_dbo: bool,
):
super().__init__(
layer,
moe_config,
router,
routed_input_transform,
gate,
shared_experts,
quant_method,
reduce_results,
enable_dbo,
)
if self.shared_experts is None:
self.moe_forward = torch.ops.vllm.moe_forward
else:
self.moe_forward = torch.ops.vllm.moe_forward_shared
def forward_impl(
self,
layer: torch.nn.Module,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
shared_input: torch.Tensor | None,
):
"""
Override the default forward_impl to use Ascend-specific implementation.
This delegates to the layer's forward_impl method which contains the
Ascend-specific MoE computation logic.
"""
result = layer.forward_impl(hidden_states, router_logits)
# If the layer has shared experts, forward_impl returns a tuple (shared_out, routed_out)
# Otherwise, it returns just routed_out
# The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared
return result
class AscendFusedMoE(FusedMoE): class AscendFusedMoE(FusedMoE):
@@ -328,26 +322,23 @@ class AscendFusedMoE(FusedMoE):
setup_moe_comm_method(self.moe_config) setup_moe_comm_method(self.moe_config)
self.quant_type = self._get_quant_type() self.quant_type = self._get_quant_type()
if not vllm_version_is("0.16.0"): self.runner = self._init_runner()
self.runner = self._init_runner()
if not vllm_version_is("0.16.0"): def _init_runner(self):
# Storing the runner in the FusedMoE is an intermediate state, eventually
def _init_runner(self): # the runner will own the FusedMoE layer and provide the execution interface
# Storing the runner in the FusedMoE is an intermediate state, eventually # for MoE ops.
# the runner will own the FusedMoE layer and provide the execution interface return AscendMoERunner(
# for MoE ops. layer=self,
return AscendMoERunner( moe_config=self.moe_config,
layer=self, router=self.router,
moe_config=self.moe_config, routed_input_transform=self._routed_input_transform,
router=self.router, gate=self.gate,
routed_input_transform=self._routed_input_transform, shared_experts=self.shared_experts,
gate=self.gate, quant_method=self.quant_method,
shared_experts=self.shared_experts, reduce_results=self.reduce_results,
quant_method=self.quant_method, enable_dbo=self.vllm_config.parallel_config.enable_dbo,
reduce_results=self.reduce_results, )
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
)
def _get_quant_type(self) -> QuantType: def _get_quant_type(self) -> QuantType:
quant_type = QuantType.NONE quant_type = QuantType.NONE
@@ -379,18 +370,16 @@ class AscendFusedMoE(FusedMoE):
""" """
return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states) return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states)
if not vllm_version_is("0.16.0"): def forward(
self,
def forward( hidden_states: torch.Tensor,
self, router_logits: torch.Tensor,
hidden_states: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
router_logits: torch.Tensor, self.ensure_moe_quant_config_init()
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: return self.runner.forward(
self.ensure_moe_quant_config_init() hidden_states,
return self.runner.forward( router_logits,
hidden_states, )
router_logits,
)
def forward_impl( # type: ignore[override] def forward_impl( # type: ignore[override]
self, hidden_states: torch.Tensor, router_logits: torch.Tensor, return_with_event: bool = False self, hidden_states: torch.Tensor, router_logits: torch.Tensor, return_with_event: bool = False
@@ -551,10 +540,9 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.") logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.")
self._gate = gate self._gate = gate
if not vllm_version_is("0.16.0"): # Recreate the runner with the correct shared_experts parameter
# Recreate the runner with the correct shared_experts parameter # The parent class created the runner before self._shared_experts was set
# The parent class created the runner before self._shared_experts was set self.runner = self._init_runner()
self.runner = self._init_runner()
if self.multistream_overlap_shared_expert: if self.multistream_overlap_shared_expert:
# Wrap the quant_method's process_weights_after_loading to validate that # Wrap the quant_method's process_weights_after_loading to validate that

View File

@@ -17,13 +17,9 @@
from vllm.triton_utils import HAS_TRITON from vllm.triton_utils import HAS_TRITON
from vllm_ascend.utils import vllm_version_is
if HAS_TRITON: if HAS_TRITON:
import vllm_ascend.patch.worker.patch_triton import vllm_ascend.patch.worker.patch_triton
if not vllm_version_is("v0.16.0"):
import vllm_ascend.patch.worker.patch_qwen3_5 # noqa
# isort: off # isort: off
import vllm_ascend.patch.platform.patch_sched_yield # noqa import vllm_ascend.patch.platform.patch_sched_yield # noqa
@@ -35,6 +31,7 @@ import vllm_ascend.patch.worker.patch_minimax_m2_linear_attn # noqa
import vllm_ascend.patch.worker.patch_multimodal_merge # noqa import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
import vllm_ascend.patch.worker.patch_qwen3_next # noqa import vllm_ascend.patch.worker.patch_qwen3_next # noqa
import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa
import vllm_ascend.patch.worker.patch_qwen3_5 # noqa
import vllm_ascend.patch.worker.patch_rejection_sampler # noqa import vllm_ascend.patch.worker.patch_rejection_sampler # noqa
import vllm_ascend.patch.worker.patch_v2_eagle # noqa import vllm_ascend.patch.worker.patch_v2_eagle # noqa
import vllm_ascend.patch.worker.patch_v2_uva # noqa import vllm_ascend.patch.worker.patch_v2_uva # noqa

View File

@@ -21,14 +21,7 @@ import vllm
from vllm.v1.worker.gpu.attn_utils import build_slot_mappings_by_layer from vllm.v1.worker.gpu.attn_utils import build_slot_mappings_by_layer
from vllm.v1.worker.gpu.input_batch import InputBatch from vllm.v1.worker.gpu.input_batch import InputBatch
from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("v0.16.0"):
from vllm.v1.worker.gpu.spec_decode.eagle import prepare_eagle_decode, prepare_eagle_inputs
else:
from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
from vllm_ascend.worker.v2.attn_utils import build_attn_metadata from vllm_ascend.worker.v2.attn_utils import build_attn_metadata
@@ -175,7 +168,4 @@ def propose(
return self.draft_tokens[:num_reqs] return self.draft_tokens[:num_reqs]
if vllm_version_is("v0.16.0"): vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose
vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose = propose
else:
vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose

View File

@@ -46,7 +46,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params
from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel
from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled, vllm_version_is
# Currently we will fix block size to a small one since `num_reqs` can't be too large # Currently we will fix block size to a small one since `num_reqs` can't be too large
_PREPARE_INPUTS_BLOCK_SIZE = 4 _PREPARE_INPUTS_BLOCK_SIZE = 4
@@ -183,30 +183,25 @@ class SpecDecodeBaseProposer(EagleProposer):
def load_model(self, model: nn.Module) -> None: def load_model(self, model: nn.Module) -> None:
target_attn_layer_names = set(get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase).keys()) target_attn_layer_names = set(get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase).keys())
target_indexer_layer_names = set(get_layers_from_vllm_config(self.vllm_config, DeepseekV32IndexerCache).keys())
with self.maybe_eager_context: with self.maybe_eager_context:
self.model = self._get_model() self.model = self._get_model()
indexer_layers = get_layers_from_vllm_config(self.vllm_config, DeepseekV32IndexerCache).keys() # Find draft layers (attention layers added by draft model)
all_attn_layers = get_layers_from_vllm_config(
self.vllm_config,
AttentionLayerBase, # type: ignore[type-abstract]
)
all_indexer_layer_names = set(get_layers_from_vllm_config(self.vllm_config, DeepseekV32IndexerCache).keys())
self._draft_attn_layer_names = set(all_attn_layers.keys()) - target_attn_layer_names - all_indexer_layer_names
assert len(self._draft_attn_layer_names) == 1
self.attn_layer_names = list(sorted(self._draft_attn_layer_names))
draft_attn_layers_dict = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase) draft_attn_layers_dict = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase)
draft_attn_layers = draft_attn_layers_dict.keys()
draft_attn_layer_names = draft_attn_layers - target_attn_layer_names
draft_indexer_layer_names = indexer_layers - target_indexer_layer_names
draft_attn_layer_names = draft_attn_layer_names - draft_indexer_layer_names
self.attn_layer_names = list(sorted(draft_attn_layer_names))
self.kernel_block_size = ( self.kernel_block_size = (
draft_attn_layers_dict[self.attn_layer_names[0]].get_attn_backend().get_supported_kernel_block_sizes()[0] draft_attn_layers_dict[self.attn_layer_names[0]].get_attn_backend().get_supported_kernel_block_sizes()[0]
) )
self.piece_all_attn_layer_name = []
for _ in range(self.num_speculative_tokens):
self.piece_all_attn_layer_name.append([name for name in self.attn_layer_names])
self.attn_layer_names = list(sorted(draft_attn_layer_names))
self.piece_all_attn_layer_name = [] self.piece_all_attn_layer_name = []
for _ in range(self.num_speculative_tokens): for _ in range(self.num_speculative_tokens):
self.piece_all_attn_layer_name.append([name for name in self.attn_layer_names]) self.piece_all_attn_layer_name.append([name for name in self.attn_layer_names])
@@ -668,6 +663,46 @@ class SpecDecodeBaseProposer(EagleProposer):
# Copy the old attn_metadata and update # Copy the old attn_metadata and update
if not self.parallel_drafting: if not self.parallel_drafting:
for draft_step in range(1, self.num_speculative_tokens): for draft_step in range(1, self.num_speculative_tokens):
per_layer_attn_metadata = dict()
if vllm_version_is("0.17.0"):
for attn_group in self.draft_attn_groups:
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
draft_step,
attn_metadata,
common_attn_metadata,
batch_size,
num_input_tokens,
used_update_positions,
aclgraph_runtime_mode,
ori_seq_len,
slot_indices,
mtp_slot_mapping,
attn_group=attn_group,
)
for layer_name in self.attn_layer_names:
per_layer_attn_metadata[layer_name] = attn_metadata
else:
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
draft_step,
attn_metadata,
common_attn_metadata,
batch_size,
num_input_tokens,
used_update_positions,
aclgraph_runtime_mode,
ori_seq_len,
slot_indices,
mtp_slot_mapping,
)
for layer_name in self.attn_layer_names:
per_layer_attn_metadata[layer_name] = attn_metadata
multi_steps_attn_metadata.append(per_layer_attn_metadata)
else:
# Copy the old attn_metadata and update
for draft_step in range(1, self.num_speculative_tokens):
per_layer_attn_metadata = dict()
if vllm_version_is("0.17.0"):
for attn_group in self.draft_attn_groups:
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm( common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
draft_step, draft_step,
attn_metadata, attn_metadata,
@@ -676,18 +711,11 @@ class SpecDecodeBaseProposer(EagleProposer):
num_input_tokens, num_input_tokens,
used_update_positions, used_update_positions,
aclgraph_runtime_mode, aclgraph_runtime_mode,
ori_seq_len, attn_group=attn_group,
slot_indices,
mtp_slot_mapping,
) )
per_layer_attn_metadata = dict()
for layer_name in self.attn_layer_names: for layer_name in self.attn_layer_names:
per_layer_attn_metadata[layer_name] = attn_metadata per_layer_attn_metadata[layer_name] = attn_metadata
multi_steps_attn_metadata.append(per_layer_attn_metadata) else:
else:
# Copy the old attn_metadata and update
if not self.parallel_drafting:
for draft_step in range(1, self.num_speculative_tokens):
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm( common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
draft_step, draft_step,
attn_metadata, attn_metadata,
@@ -697,10 +725,9 @@ class SpecDecodeBaseProposer(EagleProposer):
used_update_positions, used_update_positions,
aclgraph_runtime_mode, aclgraph_runtime_mode,
) )
per_layer_attn_metadata = dict()
for layer_name in self.attn_layer_names: for layer_name in self.attn_layer_names:
per_layer_attn_metadata[layer_name] = attn_metadata per_layer_attn_metadata[layer_name] = attn_metadata
multi_steps_attn_metadata.append(per_layer_attn_metadata) multi_steps_attn_metadata.append(per_layer_attn_metadata)
token_indices_to_sample_len = token_indices_to_sample.shape[0] token_indices_to_sample_len = token_indices_to_sample.shape[0]
self.token_indices_to_sample[:token_indices_to_sample_len].copy_(token_indices_to_sample) self.token_indices_to_sample[:token_indices_to_sample_len].copy_(token_indices_to_sample)
@@ -1077,8 +1104,11 @@ class SpecDecodeBaseProposer(EagleProposer):
ori_seq_len=None, ori_seq_len=None,
slot_indices=None, slot_indices=None,
mtp_slot_mapping=None, mtp_slot_mapping=None,
attn_group=None,
): ):
assert draft_step > 0 assert draft_step > 0
if vllm_version_is("0.17.0"):
assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group"
common_attn_metadata = self.shallow_copy_metadata(old_common_metadata) common_attn_metadata = self.shallow_copy_metadata(old_common_metadata)
if draft_step == 1: if draft_step == 1:
@@ -1150,11 +1180,6 @@ class SpecDecodeBaseProposer(EagleProposer):
else: else:
common_attn_metadata.positions[:batch_size].copy_(clamped_positions) common_attn_metadata.positions[:batch_size].copy_(clamped_positions)
if self.attn_metadata_builder is None:
attn_metadata_builder = self._get_attention_metadata_builder()
else:
attn_metadata_builder = self.attn_metadata_builder
if self.pcp_size * self.dcp_size > 1: if self.pcp_size * self.dcp_size > 1:
num_computed_tokens_of_pcp_dcp = self.runner.pcp_manager._get_cp_local_seq_lens( num_computed_tokens_of_pcp_dcp = self.runner.pcp_manager._get_cp_local_seq_lens(
ori_seq_len + draft_step + 1, ori_seq_len + draft_step + 1,
@@ -1194,8 +1219,15 @@ class SpecDecodeBaseProposer(EagleProposer):
# Set the address of the attn_metadata.slot_mapping to the self.slot_mapping_group[idx] # Set the address of the attn_metadata.slot_mapping to the self.slot_mapping_group[idx]
common_attn_metadata.slot_mapping = self.slot_mapping_group[draft_step] common_attn_metadata.slot_mapping = self.slot_mapping_group[draft_step]
# Rebuild attention metadata if vllm_version_is("0.17.0"):
attn_metadata = attn_metadata_builder.build_for_drafting( # type: ignore attn_metadata_builder = attn_group.get_metadata_builder()
else:
if self.attn_metadata_builder is None:
attn_metadata_builder = self._get_attention_metadata_builder()
else:
attn_metadata_builder = self.attn_metadata_builder
attn_metadata = attn_metadata_builder.build_for_drafting(
common_attn_metadata=common_attn_metadata, common_attn_metadata=common_attn_metadata,
draft_index=draft_step, draft_index=draft_step,
) )

View File

@@ -74,6 +74,7 @@ from vllm.v1.outputs import (
from vllm.v1.sample.logits_processor import build_logitsprocs from vllm.v1.sample.logits_processor import build_logitsprocs
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import RejectionSampler from vllm.v1.sample.rejection_sampler import RejectionSampler
from vllm.v1.spec_decode.draft_model import DraftModelProposer
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm.v1.structured_output.utils import apply_grammar_bitmask from vllm.v1.structured_output.utils import apply_grammar_bitmask
from vllm.v1.utils import record_function_or_nullcontext from vllm.v1.utils import record_function_or_nullcontext
@@ -407,6 +408,16 @@ class NPUModelRunner(GPUModelRunner):
self.cpu_slot_mapping = None self.cpu_slot_mapping = None
self.sampling_done_event: torch.npu.Event | None = None self.sampling_done_event: torch.npu.Event | None = None
if vllm_version_is("0.17.0"):
# self.cudagraph_batch_sizes sorts in ascending order.
if (
self.compilation_config.cudagraph_capture_sizes
and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
):
self.cudagraph_batch_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
else:
self.cudagraph_batch_sizes = []
@property @property
def use_cp(self) -> bool: def use_cp(self) -> bool:
return self.pcp_size * self.dcp_size > 1 return self.pcp_size * self.dcp_size > 1
@@ -1327,48 +1338,27 @@ class NPUModelRunner(GPUModelRunner):
# Run forward pass # Run forward pass
clear_kv_metadata = self.speculative_config is None clear_kv_metadata = self.speculative_config is None
if vllm_version_is("0.16.0"): with (
with ( record_function_or_nullcontext("forward"),
record_function_or_nullcontext("forward"), set_ascend_forward_context(
set_ascend_forward_context( attn_metadata,
attn_metadata, self.vllm_config,
self.vllm_config, num_tokens=num_tokens_padded,
num_tokens=num_tokens_padded, num_tokens_across_dp=num_tokens_across_dp,
num_tokens_across_dp=num_tokens_across_dp, aclgraph_runtime_mode=cudagraph_mode,
aclgraph_runtime_mode=cudagraph_mode, batch_descriptor=batch_desc,
batch_descriptor=batch_desc, num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
num_actual_tokens=scheduler_output.total_num_scheduled_tokens, model_instance=self.model,
model_instance=self.model, max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp, skip_compiled=has_encoder_input,
skip_compiled=has_encoder_input, ),
), self.maybe_get_kv_connector_output(
self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output, scheduler_output, clear_metadata=clear_kv_metadata
): ) as kv_connector_output,
hidden_states = self._model_forward( ):
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs hidden_states = self._model_forward(
) num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
else: )
with (
record_function_or_nullcontext("forward"),
set_ascend_forward_context(
attn_metadata,
self.vllm_config,
num_tokens=num_tokens_padded,
num_tokens_across_dp=num_tokens_across_dp,
aclgraph_runtime_mode=cudagraph_mode,
batch_descriptor=batch_desc,
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
model_instance=self.model,
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
skip_compiled=has_encoder_input,
),
self.maybe_get_kv_connector_output(
scheduler_output, clear_metadata=clear_kv_metadata
) as kv_connector_output,
):
hidden_states = self._model_forward(
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
)
with record_function_or_nullcontext("post process"): with record_function_or_nullcontext("post process"):
aux_hidden_states = None aux_hidden_states = None
if self.use_aux_hidden_state_outputs: if self.use_aux_hidden_state_outputs:
@@ -1926,23 +1916,14 @@ class NPUModelRunner(GPUModelRunner):
if force_eager: if force_eager:
return (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)) return (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded))
if vllm_version_is("0.16.0"): return self.cudagraph_dispatcher.dispatch(
return self.cudagraph_dispatcher.dispatch( num_tokens=num_tokens,
num_tokens=num_tokens, has_lora=has_lora,
has_lora=has_lora, uniform_decode=uniform_decode,
uniform_decode=uniform_decode, valid_modes=valid_modes,
disable_full=disable_full, invalid_modes={CUDAGraphMode.FULL} if disable_full else None,
num_active_loras=num_active_loras, num_active_loras=num_active_loras,
) )
else:
return self.cudagraph_dispatcher.dispatch(
num_tokens=num_tokens,
has_lora=has_lora,
uniform_decode=uniform_decode,
valid_modes=valid_modes,
invalid_modes={CUDAGraphMode.FULL} if disable_full else None,
num_active_loras=num_active_loras,
)
cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded, use_cascade_attn or has_encoder_output) cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded, use_cascade_attn or has_encoder_output)
num_tokens_padded = batch_descriptor.num_tokens num_tokens_padded = batch_descriptor.num_tokens
@@ -1964,16 +1945,10 @@ class NPUModelRunner(GPUModelRunner):
dp_rank = self.parallel_config.data_parallel_rank dp_rank = self.parallel_config.data_parallel_rank
num_tokens_padded = int(num_tokens_across_dp[dp_rank].item()) num_tokens_padded = int(num_tokens_across_dp[dp_rank].item())
# Re-dispatch with DP padding # Re-dispatch with DP padding
if vllm_version_is("0.16.0"): cudagraph_mode, batch_descriptor = dispatch_cudagraph(
cudagraph_mode, batch_descriptor = dispatch_cudagraph( num_tokens_padded,
num_tokens_padded, valid_modes={CUDAGraphMode(synced_cudagraph_mode)},
disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value, )
)
else:
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
num_tokens_padded,
valid_modes={CUDAGraphMode(synced_cudagraph_mode)},
)
# Assert to make sure the agreed upon token count is correct otherwise # Assert to make sure the agreed upon token count is correct otherwise
# num_tokens_across_dp will no-longer be valid # num_tokens_across_dp will no-longer be valid
assert batch_descriptor.num_tokens == num_tokens_padded assert batch_descriptor.num_tokens == num_tokens_padded
@@ -2580,6 +2555,14 @@ class NPUModelRunner(GPUModelRunner):
self.may_reinitialize_input_batch(kv_cache_config) self.may_reinitialize_input_batch(kv_cache_config)
kv_caches = self.initialize_kv_cache_tensors(kv_cache_config) kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
if vllm_version_is("0.17.0"):
# TODO: refactor the logic of attention
# Initialize drafter attention group initialization
if self.speculative_config and (
self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
):
assert isinstance(self.drafter, AscendEagleProposer | DraftModelProposer)
self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)
if has_kv_transfer_group(): if has_kv_transfer_group():
get_kv_transfer_group().register_kv_caches(kv_caches) get_kv_transfer_group().register_kv_caches(kv_caches)
@@ -2966,7 +2949,7 @@ class NPUModelRunner(GPUModelRunner):
# For attention backends that support virtual block splitting, # For attention backends that support virtual block splitting,
# use the supported block sizes from the backend # use the supported block sizes from the backend
# For other backends (like Mamba), use [0] (no splitting) # For other backends (like Mamba), use [0] (no splitting)
kernel_block_sizes = [] self.kernel_block_sizes = []
for kv_cache_group_id, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups): for kv_cache_group_id, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups):
kv_cache_spec = kv_cache_group.kv_cache_spec kv_cache_spec = kv_cache_group.kv_cache_spec
if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs): if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):
@@ -2993,15 +2976,15 @@ class NPUModelRunner(GPUModelRunner):
else: else:
# Fallback to cache config block_size if no backend found # Fallback to cache config block_size if no backend found
kernel_block_size_list = [self.cache_config.block_size] kernel_block_size_list = [self.cache_config.block_size]
kernel_block_sizes.append(kernel_block_size_list) self.kernel_block_sizes.append(kernel_block_size_list)
else: else:
# This is likely Mamba or other non-attention cache, # This is likely Mamba or other non-attention cache,
# no splitting. # no splitting.
# NOTE: set kernel_block_sizes to 0 to disable slotmapping computation # NOTE: set kernel_block_sizes to 0 to disable slotmapping computation
# of mamba block. In this case, BlockTable.block_size will never equal # of mamba block. In this case, BlockTable.block_size will never equal
# to kernel_block_sizes[0] # to kernel_block_sizes[0]
kernel_block_sizes.append([0]) self.kernel_block_sizes.append([0])
if block_sizes != [self.cache_config.block_size] or kernel_block_sizes != [[self.cache_config.block_size]]: if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
assert self.cache_config.cpu_offload_gb == 0, ( assert self.cache_config.cpu_offload_gb == 0, (
"Cannot re-initialize the input batch when CPU weight " "Cannot re-initialize the input batch when CPU weight "
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501 "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
@@ -3023,7 +3006,7 @@ class NPUModelRunner(GPUModelRunner):
if self.vllm_config.speculative_config if self.vllm_config.speculative_config
else 0 else 0
), ),
kernel_block_sizes=kernel_block_sizes, kernel_block_sizes=self.kernel_block_sizes,
) )
def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: