upgrade to 0.18.0 (#7502)

### What this PR does / why we need it?
1. Upgrade to vLLM v0.18.0 and drop the now-dead `vllm_version_is("0.17.0")` compatibility branches.
2. Ensure the block size handed to the Eagle drafter is an `int` (`kernel_block_sizes` may arrive as a list).
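For reference, the drafter fix reduces to a one-line normalization before calling into the drafter (excerpted from the `model_runner_v1.py` change below):

```python
# kernel_block_sizes may arrive as a list of per-group block sizes
# (the v0.18.0 layout) or already as a plain int; the Eagle drafter
# expects a single int.
block_size = (self.kernel_block_sizes[0] if isinstance(
    self.kernel_block_sizes, list) else self.kernel_block_sizes)
self.drafter.initialize_attn_backend(kv_cache_config, block_size)
```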
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.17.0
- vLLM main: 8b6325758c

---------

Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
meihanc authored 2026-03-21 16:05:38 +08:00, committed by GitHub
parent 80a4265717
commit bff4fbfca5
16 changed files with 139 additions and 258 deletions

View File

@@ -32,7 +32,7 @@ on:
       description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
     vllm_version:
       required: false
-      default: "v0.17.0"
+      default: "v0.18.0"
       type: string
       description: vllm version to use
     vllm_ascend_remote_url:

View File

@@ -39,7 +39,7 @@ on:
     vllm_version:
       required: false
       type: string
-      default: "v0.17.0"
+      default: "v0.18.0"
     is_pr_test:
       required: true
       type: boolean

View File

@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
+        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -90,7 +90,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
+        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
+        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -277,7 +277,7 @@ jobs:
           - Qwen3-Omni-30B-A3B-Instruct
     uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
     with:
-      vllm: v0.17.0
+      vllm: v0.18.0
       runner: ${{ matrix.test_config.os }}
       model_list: ${{ toJson(matrix.test_config.model_list) }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11'

View File

@@ -51,7 +51,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - vllm_branch: v0.17.0
+          - vllm_branch: v0.18.0
             vllm_ascend_branch: main
       max-parallel: 1
     container:

View File

@@ -23,7 +23,7 @@ jobs:
     name: e2e-test
     strategy:
       matrix:
-        vllm_version: [v0.18.0]
+        vllm_version: [v0.18.0]
         type: [full, light]
-        vllm_version: [v0.17.0]
+        vllm_version: [v0.18.0]
     uses: ./.github/workflows/_e2e_test.yaml
     with:

View File

@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | 8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | 8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
 ## Release cadence

View File

@@ -32,12 +32,9 @@ TENSOR_PARALLELS = [1]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
from vllm_ascend.utils import vllm_version_is
if not vllm_version_is("0.17.0"):
pytest.skip(
"EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
)
pytest.skip(
"EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
)
encode_port = get_open_port()
pd_port = get_open_port()
vllm_server_args = [

View File

@@ -37,7 +37,7 @@ from vllm.v1.kv_cache_interface import (
 )
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, vllm_version_is
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
@@ -451,18 +451,11 @@ class NPUModelRunner310(NPUModelRunner):
             self.kernel_block_sizes.append([0])
         if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
-            if vllm_version_is("0.17.0"):
-                assert self.cache_config.cpu_offload_gb == 0, (
-                    "Cannot re-initialize the input batch when CPU weight "
-                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
-                    "for more details."
-                )
-            else:
-                assert self.offload_config.uva.cpu_offload_gb == 0, (
-                    "Cannot re-initialize the input batch when CPU weight "
-                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
-                    "for more details."
-                )
+            assert self.offload_config.uva.cpu_offload_gb == 0, (
+                "Cannot re-initialize the input batch when CPU weight "
+                "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
+                "for more details."
+            )
             self.input_batch = NPUInputBatch(
                 max_num_reqs=self.max_num_reqs,
                 max_model_len=max(self.model_config.max_model_len, self.max_encoder_len),

View File

@@ -161,21 +161,11 @@ class AscendConfig:
     @staticmethod
     def _get_compile_ranges(compilation_config):
-        from vllm_ascend.utils import vllm_version_is
-
-        if vllm_version_is("0.17.0"):
-            return compilation_config.compile_ranges_split_points
-        else:
-            return compilation_config.compile_ranges_endpoints
+        return compilation_config.compile_ranges_endpoints
 
     @staticmethod
     def _set_compile_ranges(compilation_config, value):
-        from vllm_ascend.utils import vllm_version_is
-
-        if vllm_version_is("0.17.0"):
-            compilation_config.compile_ranges_split_points = value
-        else:
-            compilation_config.compile_ranges_endpoints = value
+        compilation_config.compile_ranges_endpoints = value
 
     def update_compile_ranges_split_points(self):
         vllm_config = self.vllm_config

View File

@@ -32,7 +32,7 @@ from vllm.config import VllmConfig
 from vllm.config.utils import Range
 from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
-from vllm_ascend.utils import COMPILATION_PASS_KEY, vllm_version_is
+from vllm_ascend.utils import COMPILATION_PASS_KEY
 
 logger = logging.getLogger(__name__)
@@ -86,11 +86,10 @@ def npugraph_ex_compile(
     config.mode = "reduce-overhead"
     # execute FX graph in eager mode before graph mode to optimize FX graph.
     config.debug.run_eagerly = True
-    if not vllm_version_is("0.17.0"):
-        # This is a temporary fix to resolve issues with inplace operations in some testcases like test_whisper.
-        # Avoid to change torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default which will fallback to CPU
-        # and cause copy_between_host_and_device error.
-        config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
+    # This is a temporary fix to resolve issues with inplace operations in some testcases like test_whisper.
+    # Avoid to change torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default which will fallback to CPU
+    # and cause copy_between_host_and_device error.
+    config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
     if ascend_compilation_config.enable_static_kernel:
         config.experimental_config.aclgraph._aclnn_static_shape_kernel = True
     # According to the cudagraph_capture_size configuration, set the shapes
@@ -142,21 +141,20 @@ class AscendCompiler(CompilerInterface):
         # see https://github.com/pytorch/pytorch/issues/138980
         graph = copy.deepcopy(graph)
-        if not vllm_version_is("0.17.0"):
-            from torch._guards import detect_fake_mode
-
-            current_fake_mode = detect_fake_mode()
-            if current_fake_mode is not None:
-                example_inputs = [
-                    current_fake_mode.from_tensor(inp)
-                    if (
-                        isinstance(inp, torch.Tensor)
-                        and hasattr(inp, "fake_mode")
-                        and inp.fake_mode is not current_fake_mode
-                    )
-                    else inp
-                    for inp in example_inputs
-                ]
+        from torch._guards import detect_fake_mode
+
+        current_fake_mode = detect_fake_mode()
+        if current_fake_mode is not None:
+            example_inputs = [
+                current_fake_mode.from_tensor(inp)
+                if (
+                    isinstance(inp, torch.Tensor)
+                    and hasattr(inp, "fake_mode")
+                    and inp.fake_mode is not current_fake_mode
+                )
+                else inp
+                for inp in example_inputs
+            ]
 
         ascend_compilation_config = get_ascend_config().ascend_compilation_config
         if ascend_compilation_config.enable_npugraph_ex:
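Aside: `detect_fake_mode` and `FakeTensorMode.from_tensor` used above come from PyTorch's private fake-tensor machinery, so the snippet below is only a minimal sketch of the re-wrapping pattern and may break across torch releases:

```python
import torch
from torch._guards import detect_fake_mode
from torch._subclasses.fake_tensor import FakeTensorMode

fake_mode = FakeTensorMode()
with fake_mode:
    x = torch.empty(4, 4)  # a FakeTensor owned by fake_mode

# detect_fake_mode recovers the mode associated with the inputs (or the
# currently active one); from_tensor re-wraps a tensor under that mode,
# which is what the compiler does above to normalize example_inputs.
current = detect_fake_mode([x])
assert current is fake_mode
y = current.from_tensor(torch.randn(2, 2))
print(type(y).__name__)  # FakeTensor
```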

View File

@@ -12,7 +12,6 @@ from vllm.v1.kv_offload.spec import OffloadingSpec
 from vllm.v1.kv_offload.worker.worker import OffloadingHandler
 from vllm_ascend.kv_offload.cpu_npu import CpuNpuOffloadingHandler
-from vllm_ascend.utils import vllm_version_is
 
 
 class NPUOffloadingSpec(OffloadingSpec):
@@ -32,23 +31,15 @@ class NPUOffloadingSpec(OffloadingSpec):
     def get_manager(self) -> OffloadingManager:
         if not self._manager:
-            if vllm_version_is("0.17.0"):
-                kv_events_config = self.vllm_config.kv_events_config
-                enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
-                self._manager = LRUOffloadingManager(
-                    CPUBackend(block_size=self.offloaded_block_size, num_blocks=self.num_cpu_blocks),
-                    enable_events=enable_events,
-                )
-            else:
-                kv_events_config = self.vllm_config.kv_events_config
-                enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
-                assert len(self.gpu_block_size) == 1
-                gpu_block_size = self.gpu_block_size[0]
-                offloaded_block_size = gpu_block_size * self.block_size_factor
-                self._manager = LRUOffloadingManager(
-                    CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks),
-                    enable_events=enable_events,
-                )
+            kv_events_config = self.vllm_config.kv_events_config
+            enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
+            assert len(self.gpu_block_size) == 1
+            gpu_block_size = self.gpu_block_size[0]
+            offloaded_block_size = gpu_block_size * self.block_size_factor
+            self._manager = LRUOffloadingManager(
+                CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks),
+                enable_events=enable_events,
+            )
         return self._manager
 
     def get_handlers(
@@ -57,24 +48,15 @@ class NPUOffloadingSpec(OffloadingSpec):
         attn_backends: dict[str, type[AttentionBackend]],
     ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
         if not self._handler:
-            if vllm_version_is("0.17.0"):
-                self._handler = CpuNpuOffloadingHandler(
-                    attn_backends=attn_backends,
-                    gpu_block_size=self.gpu_block_size,
-                    cpu_block_size=self.offloaded_block_size,
-                    num_cpu_blocks=self.num_cpu_blocks,
-                    gpu_caches=kv_caches,
-                )
-            else:
-                assert len(self.gpu_block_size) == 1
-                gpu_block_size = self.gpu_block_size[0]
-                self._handler = CpuNpuOffloadingHandler(
-                    attn_backends=attn_backends,
-                    gpu_block_size=gpu_block_size,
-                    cpu_block_size=gpu_block_size * self.block_size_factor,
-                    num_cpu_blocks=self.num_cpu_blocks,
-                    gpu_caches=kv_caches,
-                )
+            assert len(self.gpu_block_size) == 1
+            gpu_block_size = self.gpu_block_size[0]
+            self._handler = CpuNpuOffloadingHandler(
+                attn_backends=attn_backends,
+                gpu_block_size=gpu_block_size,
+                cpu_block_size=gpu_block_size * self.block_size_factor,
+                num_cpu_blocks=self.num_cpu_blocks,
+                gpu_caches=kv_caches,
+            )
 
         assert self._handler is not None
         yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler
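Worth noting: `get_manager` and `get_handlers` now derive the CPU-side block size the same way, so the backend and the handler cannot drift apart. A tiny illustration of the arithmetic with hypothetical values (the real ones come from the vLLM config):

```python
gpu_block_size = 128      # the single NPU KV-cache block size (hypothetical)
block_size_factor = 4     # NPU blocks folded into one CPU block (hypothetical)
cpu_block_size = gpu_block_size * block_size_factor
assert cpu_block_size % gpu_block_size == 0  # offload blocks align to NPU blocks
print(cpu_block_size)     # 512
```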

View File

@@ -19,8 +19,7 @@ import os
 import vllm_ascend.patch.platform.patch_distributed  # noqa
 import vllm_ascend.patch.platform.patch_fusion_matcher_compat_ops  # noqa
 import vllm_ascend.patch.platform.patch_kv_cache_interface  # noqa
-from vllm_ascend import envs
-from vllm_ascend.utils import is_310p, vllm_version_is
+from vllm_ascend.utils import is_310p
 
 if not is_310p():
     import vllm_ascend.patch.platform.patch_mamba_config  # noqa
@@ -32,5 +31,3 @@ import vllm_ascend.patch.platform.patch_torch_accelerator  # noqa
 
 if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
     import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
-
-if envs.VLLM_ASCEND_BALANCE_SCHEDULING and vllm_version_is("0.17.0"):
-    import vllm_ascend.patch.platform.patch_balance_schedule  # noqa

View File

@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import threading
 import weakref
 from collections import deque
 from collections.abc import Callable
@@ -21,8 +20,6 @@ from vllm.v1.executor.multiproc_executor import (
     set_multiprocessing_worker_envs,
 )
-from vllm_ascend.utils import vllm_version_is
-
 
 class AscendMultiprocExecutor(MultiprocExecutor):
     def _init_executor(self) -> None:
@@ -30,8 +27,6 @@ class AscendMultiprocExecutor(MultiprocExecutor):
         # and ensure workers will be terminated.
         self._finalizer = weakref.finalize(self, self.shutdown)
         self.is_failed = False
-        if vllm_version_is("0.17.0"):
-            self.shutdown_event = threading.Event()
         self.failure_callback: FailureCallback | None = None
 
         tensor_parallel_size, pp_parallel_size, pcp_parallel_size = self._get_parallel_sizes()
@@ -71,44 +66,29 @@ class AscendMultiprocExecutor(MultiprocExecutor):
         success = False
         try:
             global_start_rank = self.local_world_size * self.parallel_config.node_rank_within_dp
-            if vllm_version_is("0.17.0"):
-                for local_rank in range(self.local_world_size):
-                    global_rank = global_start_rank + local_rank
-                    is_driver_worker = self._is_driver_worker(global_rank)
-                    unready_workers.append(
-                        AscendWorkerProc.make_worker_process(
-                            vllm_config=self.vllm_config,
-                            local_rank=local_rank,
-                            rank=global_rank,
-                            distributed_init_method=distributed_init_method,
-                            input_shm_handle=scheduler_output_handle,
-                            shared_worker_lock=shared_worker_lock,
-                            is_driver_worker=is_driver_worker,
-                        )
-                    )
-            else:
-                # When using fork, keep track of socket file descriptors that are
-                # inherited by the worker, so that we can close them in subsequent
-                # workers
-                inherited_fds: list[int] | None = [] if context.get_start_method() == "fork" else None
-                for local_rank in range(self.local_world_size):
-                    global_rank = global_start_rank + local_rank
-                    is_driver_worker = self._is_driver_worker(global_rank)
-                    unready_worker_handle = AscendWorkerProc.make_worker_process(
-                        vllm_config=self.vllm_config,
-                        local_rank=local_rank,
-                        rank=global_rank,
-                        distributed_init_method=distributed_init_method,
-                        input_shm_handle=scheduler_output_handle,
-                        shared_worker_lock=shared_worker_lock,
-                        is_driver_worker=is_driver_worker,
-                        inherited_fds=inherited_fds,
-                    )
-                    unready_workers.append(unready_worker_handle)
-                    if inherited_fds is not None:
-                        inherited_fds.append(unready_worker_handle.death_writer.fileno())
-                        inherited_fds.append(unready_worker_handle.ready_pipe.fileno())
+            # When using fork, keep track of socket file descriptors that are
+            # inherited by the worker, so that we can close them in subsequent
+            # workers
+            inherited_fds: list[int] | None = [] if context.get_start_method() == "fork" else None
+            for local_rank in range(self.local_world_size):
+                global_rank = global_start_rank + local_rank
+                is_driver_worker = self._is_driver_worker(global_rank)
+                unready_worker_handle = AscendWorkerProc.make_worker_process(
+                    vllm_config=self.vllm_config,
+                    local_rank=local_rank,
+                    rank=global_rank,
+                    distributed_init_method=distributed_init_method,
+                    input_shm_handle=scheduler_output_handle,
+                    shared_worker_lock=shared_worker_lock,
+                    is_driver_worker=is_driver_worker,
+                    inherited_fds=inherited_fds,
+                )
+                unready_workers.append(unready_worker_handle)
+                if inherited_fds is not None:
+                    inherited_fds.append(unready_worker_handle.death_writer.fileno())
+                    inherited_fds.append(unready_worker_handle.ready_pipe.fileno())
 
             # Workers must be created before wait_for_ready to avoid
             # deadlock, since worker.init_device() does a device sync.
@@ -153,8 +133,7 @@ class AscendMultiprocExecutor(MultiprocExecutor):
                 for uw in unready_workers:
                     if uw.death_writer is not None:
                         uw.death_writer.close()
-                        if not vllm_version_is("0.17.0"):
-                            uw.death_writer = None
+                        uw.death_writer = None
                 self._ensure_worker_termination([uw.proc for uw in unready_workers])
 
         self.output_rank = self._get_output_rank()
@@ -192,73 +171,41 @@ class AscendWorkerProc(WorkerProc):
         inherited_fds: list[int] | None = None,
     ) -> UnreadyWorkerProcHandle:
         context = get_mp_context()
-        if vllm_version_is("0.17.0"):
-            # (reader, writer)
-            reader, writer = context.Pipe(duplex=False)
-            # Create death pipe to detect parent process exit
-            death_reader, death_writer = context.Pipe(duplex=False)
-            process_kwargs = {
-                "vllm_config": vllm_config,
-                "local_rank": local_rank,
-                "rank": rank,
-                "distributed_init_method": distributed_init_method,
-                "input_shm_handle": input_shm_handle,
-                "ready_pipe": (reader, writer),
-                "death_pipe": death_reader,
-                "shared_worker_lock": shared_worker_lock,
-                "is_driver_worker": is_driver_worker,
-            }
-            # Run EngineCore busy loop in background process.
-            proc = context.Process(
-                target=WorkerProc.worker_main,
-                kwargs=process_kwargs,
-                name=f"VllmWorker-{rank}",
-                daemon=False,
-            )
-            proc.start()
-            writer.close()
-            # Keep death_writer open in parent - when parent exits,
-            # death_reader in child will get EOFError
-            return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
-        else:
-            # Ready pipe to communicate readiness from child to parent
-            ready_reader, ready_writer = context.Pipe(duplex=False)
-            # Death pipe to let child detect parent process exit
-            death_reader, death_writer = context.Pipe(duplex=False)
-            if inherited_fds is not None:
-                inherited_fds = inherited_fds.copy()
-                inherited_fds.extend((ready_reader.fileno(), death_writer.fileno()))
-            process_kwargs = {
-                "vllm_config": vllm_config,
-                "local_rank": local_rank,
-                "rank": rank,
-                "distributed_init_method": distributed_init_method,
-                "input_shm_handle": input_shm_handle,
-                "ready_pipe": ready_writer,
-                "death_pipe": death_reader,
-                "shared_worker_lock": shared_worker_lock,
-                "is_driver_worker": is_driver_worker,
-                # Have the worker close parent end of this worker's pipes too
-                "inherited_fds": inherited_fds if inherited_fds is not None else [],
-            }
-            # Run EngineCore busy loop in background process.
-            proc = context.Process(
-                target=WorkerProc.worker_main,
-                kwargs=process_kwargs,
-                name=f"VllmWorker-{rank}",
-                daemon=False,
-            )
-            proc.start()
-            # Close child ends of pipes here in the parent
-            ready_writer.close()
-            death_reader.close()
-            # Keep death_writer open in parent - when parent exits,
-            # death_reader in child will get EOFError
-            return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)
+        # Ready pipe to communicate readiness from child to parent
+        ready_reader, ready_writer = context.Pipe(duplex=False)
+        # Death pipe to let child detect parent process exit
+        death_reader, death_writer = context.Pipe(duplex=False)
+        if inherited_fds is not None:
+            inherited_fds = inherited_fds.copy()
+            inherited_fds.extend((ready_reader.fileno(), death_writer.fileno()))
+        process_kwargs = {
+            "vllm_config": vllm_config,
+            "local_rank": local_rank,
+            "rank": rank,
+            "distributed_init_method": distributed_init_method,
+            "input_shm_handle": input_shm_handle,
+            "ready_pipe": ready_writer,
+            "death_pipe": death_reader,
+            "shared_worker_lock": shared_worker_lock,
+            "is_driver_worker": is_driver_worker,
+            # Have the worker close parent end of this worker's pipes too
+            "inherited_fds": inherited_fds if inherited_fds is not None else [],
+        }
+        # Run EngineCore busy loop in background process.
+        proc = context.Process(
+            target=WorkerProc.worker_main,
+            kwargs=process_kwargs,
+            name=f"VllmWorker-{rank}",
+            daemon=False,
+        )
+        proc.start()
+        # Close child ends of pipes here in the parent
+        ready_writer.close()
+        death_reader.close()
+        # Keep death_writer open in parent - when parent exits,
+        # death_reader in child will get EOFError
+        return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)
vllm.v1.executor.multiproc_executor.MultiprocExecutor = AscendMultiprocExecutor
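The `inherited_fds` bookkeeping above exists because, under the "fork" start method, each new worker inherits every pipe fd the parent holds, including ends belonging to earlier siblings, and a stray inherited write end keeps a pipe from ever signalling EOF (which the death pipes rely on). A self-contained POSIX sketch of that failure mode, independent of vLLM's API:

```python
import os

r, w = os.pipe()
stray = os.dup(w)   # stands in for a write end leaked into a forked child

os.close(w)         # the real writer goes away...
os.set_blocking(r, False)
try:
    os.read(r, 1)   # ...but no EOF yet: `stray` keeps the pipe open
except BlockingIOError:
    print("no EOF while a stray duplicate of the write end exists")

os.close(stray)     # what closing the tracked inherited fds accomplishes
print("EOF now:", os.read(r, 1) == b"")  # True: read() returns b'' at EOF
```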

View File

@@ -126,7 +126,6 @@ from vllm_ascend.utils import (
     is_moe_model,
     lmhead_tp_enable,
     set_weight_prefetch_method,
-    vllm_version_is,
 )
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 from vllm_ascend.worker.pcp_utils import PCPManager
@@ -292,27 +291,15 @@ class NPUModelRunner(GPUModelRunner):
         if self.use_sparse_c8_indexer:
             self.c8_k_cache_dtype = torch.int8
             self.c8_k_scale_cache_dtype = torch.float16
-        from vllm_ascend.utils import vllm_version_is
-
-        if vllm_version_is("0.17.0"):
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                use_mla=self.model_config.use_mla,
-                use_sparse=self.use_sparse,
-                use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
-            )
-        else:
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                use_mla=self.model_config.use_mla,
-                use_sparse=self.use_sparse,
-                use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
-            )
+        self.attn_backend = get_attn_backend(
+            0,
+            self.dtype,
+            None,
+            use_mla=self.model_config.use_mla,
+            use_sparse=self.use_sparse,
+            use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
+        )
 
         try:
             self.dcp_size = get_dcp_group().world_size
@@ -1389,9 +1376,7 @@ class NPUModelRunner(GPUModelRunner):
             self.maybe_get_kv_connector_output(
                 scheduler_output,
                 **(
-                    {"clear_metadata": clear_kv_metadata}
-                    if vllm_version_is("0.17.0")
-                    else {"defer_finalize": not clear_kv_metadata}
+                    {"defer_finalize": not clear_kv_metadata}
                 ),
             ) as kv_connector_output,
         ):
@@ -2567,17 +2552,14 @@ class NPUModelRunner(GPUModelRunner):
             with get_tp_context(self.drafter):
                 self.drafter.load_model(self.model)
             if self.use_aux_hidden_state_outputs:
-                if vllm_version_is("0.17.0"):
-                    self.model.set_aux_hidden_state_layers(self.model.get_eagle3_aux_hidden_state_layers())
-                else:
-                    from vllm.model_executor.models.interfaces import supports_eagle3
-
-                    if not supports_eagle3(self.model):
-                        raise RuntimeError(
-                            "Model does not support EAGLE3 interface but "
-                            "aux_hidden_state_outputs was requested"
-                        )
-                    aux_layers = self.model.get_eagle3_default_aux_hidden_state_layers()
-                    self.model.set_aux_hidden_state_layers(aux_layers)
+                from vllm.model_executor.models.interfaces import supports_eagle3
+
+                if not supports_eagle3(self.model):
+                    raise RuntimeError(
+                        "Model does not support EAGLE3 interface but "
+                        "aux_hidden_state_outputs was requested"
+                    )
+                aux_layers = self.model.get_eagle3_default_aux_hidden_state_layers()
+                self.model.set_aux_hidden_state_layers(aux_layers)
 
         if self.lora_config:
             self.model = self.load_lora_model(self.model, self.vllm_config, self.device)
@@ -2617,7 +2599,9 @@ class NPUModelRunner(GPUModelRunner):
             self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
         ):
             assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer)
-            self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)
+            block_size = (self.kernel_block_sizes[0] if isinstance(
+                self.kernel_block_sizes, list) else self.kernel_block_sizes)
+            self.drafter.initialize_attn_backend(kv_cache_config, block_size)
 
         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(kv_caches)
@@ -3091,18 +3075,11 @@ class NPUModelRunner(GPUModelRunner):
             max_num_blocks.append(max_num_blocks_per_req)
         if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
-            if vllm_version_is("0.17.0"):
-                assert self.cache_config.cpu_offload_gb == 0, (
-                    "Cannot re-initialize the input batch when CPU weight "
-                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
-                    "for more details."
-                )
-            else:
-                assert self.offload_config.uva.cpu_offload_gb == 0, (
-                    "Cannot re-initialize the input batch when CPU weight "
-                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
-                    "for more details."
-                )
+            assert self.offload_config.uva.cpu_offload_gb == 0, (
+                "Cannot re-initialize the input batch when CPU weight "
+                "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
+                "for more details."
+            )
             self.input_batch = NPUInputBatch(
                 max_num_reqs=self.max_num_reqs,
                 max_model_len=max_model_len,