upgrade to 0.18.0 (#7502)
### What this PR does / why we need it?
1. Upgrade the supported vLLM release from v0.17.0 to v0.18.0.
2. Ensure `kernel_block_sizes` is an int for the Eagle drafter.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.17.0
- vLLM main: 8b6325758c
---------
Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
@@ -32,7 +32,7 @@ on:
|
||||
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
|
||||
vllm_version:
|
||||
required: false
|
||||
default: "v0.17.0"
|
||||
default: "v0.18.0"
|
||||
type: string
|
||||
description: vllm version to use
|
||||
vllm_ascend_remote_url:
|
||||
|
||||
@@ -39,7 +39,7 @@ on:
|
||||
vllm_version:
|
||||
required: false
|
||||
type: string
|
||||
default: "v0.17.0"
|
||||
default: "v0.18.0"
|
||||
is_pr_test:
|
||||
required: true
|
||||
type: boolean
|
||||
|
||||
2
.github/workflows/pr_test_full.yaml
vendored
2
.github/workflows/pr_test_full.yaml
vendored
@@ -75,7 +75,7 @@ jobs:
|
||||
name: e2e-full
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
|
||||
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
|
||||
needs: [changes]
|
||||
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
|
||||
uses: ./.github/workflows/_e2e_test.yaml
|
||||
|
||||
4
.github/workflows/pr_test_light.yaml
vendored
4
.github/workflows/pr_test_light.yaml
vendored
@@ -90,7 +90,7 @@ jobs:
|
||||
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
|
||||
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
|
||||
uses: ./.github/workflows/_unit_test.yaml
|
||||
with:
|
||||
vllm: ${{ matrix.vllm_version }}
|
||||
@@ -102,7 +102,7 @@ jobs:
|
||||
name: e2e-light
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
|
||||
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
|
||||
# Note (yikun): If CI resources are limited, we can split this job into two chained jobs
|
||||
needs: [lint, changes]
|
||||
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
||||
|
||||
@@ -277,7 +277,7 @@ jobs:
|
||||
- Qwen3-Omni-30B-A3B-Instruct
|
||||
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
|
||||
with:
|
||||
vllm: v0.17.0
|
||||
vllm: v0.18.0
|
||||
runner: ${{ matrix.test_config.os }}
|
||||
model_list: ${{ toJson(matrix.test_config.model_list) }}
|
||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11'
|
||||
|
||||
@@ -51,7 +51,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- vllm_branch: v0.17.0
|
||||
- vllm_branch: v0.18.0
|
||||
vllm_ascend_branch: main
|
||||
max-parallel: 1
|
||||
container:
|
||||
|
||||
@@ -23,7 +23,7 @@ jobs:
|
||||
name: e2e-test
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [v0.17.0]
|
||||
vllm_version: [v0.18.0]
|
||||
type: [full, light]
|
||||
uses: ./.github/workflows/_e2e_test.yaml
|
||||
with:
|
||||
|
||||
@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
|
||||
|
||||
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
||||
|-------------|--------------|------------------|-------------|--------------------|
|
||||
| main | 8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
|
||||
| main | 8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
|
||||
|
||||
## Release cadence
|
||||
|
||||
|
||||
@@ -32,12 +32,9 @@ TENSOR_PARALLELS = [1]
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
|
||||
async def test_models(model: str, tp_size: int) -> None:
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if not vllm_version_is("0.17.0"):
|
||||
pytest.skip(
|
||||
"EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
|
||||
)
|
||||
pytest.skip(
|
||||
"EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
|
||||
)
|
||||
encode_port = get_open_port()
|
||||
pd_port = get_open_port()
|
||||
vllm_server_args = [
|
||||
|
||||
@@ -37,7 +37,7 @@ from vllm.v1.kv_cache_interface import (
|
||||
)
|
||||
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, vllm_version_is
|
||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ
|
||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||
from vllm_ascend.worker.npu_input_batch import NPUInputBatch
|
||||
|
||||
@@ -451,18 +451,11 @@ class NPUModelRunner310(NPUModelRunner):
|
||||
self.kernel_block_sizes.append([0])
|
||||
|
||||
if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
|
||||
if vllm_version_is("0.17.0"):
|
||||
assert self.cache_config.cpu_offload_gb == 0, (
|
||||
"Cannot re-initialize the input batch when CPU weight "
|
||||
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
||||
"for more details."
|
||||
)
|
||||
else:
|
||||
assert self.offload_config.uva.cpu_offload_gb == 0, (
|
||||
"Cannot re-initialize the input batch when CPU weight "
|
||||
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
||||
"for more details."
|
||||
)
|
||||
assert self.offload_config.uva.cpu_offload_gb == 0, (
|
||||
"Cannot re-initialize the input batch when CPU weight "
|
||||
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
||||
"for more details."
|
||||
)
|
||||
self.input_batch = NPUInputBatch(
|
||||
max_num_reqs=self.max_num_reqs,
|
||||
max_model_len=max(self.model_config.max_model_len, self.max_encoder_len),
|
||||
|
||||
@@ -161,21 +161,11 @@ class AscendConfig:
|
||||
|
||||
@staticmethod
|
||||
def _get_compile_ranges(compilation_config):
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.17.0"):
|
||||
return compilation_config.compile_ranges_split_points
|
||||
else:
|
||||
return compilation_config.compile_ranges_endpoints
|
||||
return compilation_config.compile_ranges_endpoints
|
||||
|
||||
@staticmethod
|
||||
def _set_compile_ranges(compilation_config, value):
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.17.0"):
|
||||
compilation_config.compile_ranges_split_points = value
|
||||
else:
|
||||
compilation_config.compile_ranges_endpoints = value
|
||||
compilation_config.compile_ranges_endpoints = value
|
||||
|
||||
def update_compile_ranges_split_points(self):
|
||||
vllm_config = self.vllm_config
|
||||
|
||||
@@ -32,7 +32,7 @@ from vllm.config import VllmConfig
|
||||
from vllm.config.utils import Range
|
||||
|
||||
from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
|
||||
from vllm_ascend.utils import COMPILATION_PASS_KEY, vllm_version_is
|
||||
from vllm_ascend.utils import COMPILATION_PASS_KEY
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -86,11 +86,10 @@ def npugraph_ex_compile(
|
||||
config.mode = "reduce-overhead"
|
||||
# execute FX graph in eager mode before graph mode to optimize FX graph.
|
||||
config.debug.run_eagerly = True
|
||||
if not vllm_version_is("0.17.0"):
|
||||
# This is a temporary fix to resolve issues with inplace operations in some testcases like test_whisper.
|
||||
# Avoid to change torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default which will fallback to CPU
|
||||
# and cause copy_between_host_and_device error.
|
||||
config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
|
||||
# This is a temporary fix to resolve issues with inplace operations in some testcases like test_whisper.
|
||||
# Avoid rewriting torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default, which would fall back to CPU
|
||||
# and cause copy_between_host_and_device error.
|
||||
config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
|
||||
if ascend_compilation_config.enable_static_kernel:
|
||||
config.experimental_config.aclgraph._aclnn_static_shape_kernel = True
|
||||
# According to the cudagraph_capture_size configuration, set the shapes
|
||||
@@ -142,21 +141,20 @@ class AscendCompiler(CompilerInterface):
|
||||
# see https://github.com/pytorch/pytorch/issues/138980
|
||||
graph = copy.deepcopy(graph)
|
||||
|
||||
if not vllm_version_is("0.17.0"):
|
||||
from torch._guards import detect_fake_mode
|
||||
from torch._guards import detect_fake_mode
|
||||
|
||||
current_fake_mode = detect_fake_mode()
|
||||
if current_fake_mode is not None:
|
||||
example_inputs = [
|
||||
current_fake_mode.from_tensor(inp)
|
||||
if (
|
||||
isinstance(inp, torch.Tensor)
|
||||
and hasattr(inp, "fake_mode")
|
||||
and inp.fake_mode is not current_fake_mode
|
||||
)
|
||||
else inp
|
||||
for inp in example_inputs
|
||||
]
|
||||
current_fake_mode = detect_fake_mode()
|
||||
if current_fake_mode is not None:
|
||||
example_inputs = [
|
||||
current_fake_mode.from_tensor(inp)
|
||||
if (
|
||||
isinstance(inp, torch.Tensor)
|
||||
and hasattr(inp, "fake_mode")
|
||||
and inp.fake_mode is not current_fake_mode
|
||||
)
|
||||
else inp
|
||||
for inp in example_inputs
|
||||
]
|
||||
|
||||
ascend_compilation_config = get_ascend_config().ascend_compilation_config
|
||||
if ascend_compilation_config.enable_npugraph_ex:
|
||||
|
||||
@@ -12,7 +12,6 @@ from vllm.v1.kv_offload.spec import OffloadingSpec
|
||||
from vllm.v1.kv_offload.worker.worker import OffloadingHandler
|
||||
|
||||
from vllm_ascend.kv_offload.cpu_npu import CpuNpuOffloadingHandler
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class NPUOffloadingSpec(OffloadingSpec):
|
||||
@@ -32,23 +31,15 @@ class NPUOffloadingSpec(OffloadingSpec):
|
||||
|
||||
def get_manager(self) -> OffloadingManager:
|
||||
if not self._manager:
|
||||
if vllm_version_is("0.17.0"):
|
||||
kv_events_config = self.vllm_config.kv_events_config
|
||||
enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
|
||||
self._manager = LRUOffloadingManager(
|
||||
CPUBackend(block_size=self.offloaded_block_size, num_blocks=self.num_cpu_blocks),
|
||||
enable_events=enable_events,
|
||||
)
|
||||
else:
|
||||
kv_events_config = self.vllm_config.kv_events_config
|
||||
enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
|
||||
assert len(self.gpu_block_size) == 1
|
||||
gpu_block_size = self.gpu_block_size[0]
|
||||
offloaded_block_size = gpu_block_size * self.block_size_factor
|
||||
self._manager = LRUOffloadingManager(
|
||||
CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks),
|
||||
enable_events=enable_events,
|
||||
)
|
||||
kv_events_config = self.vllm_config.kv_events_config
|
||||
enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
|
||||
assert len(self.gpu_block_size) == 1
|
||||
gpu_block_size = self.gpu_block_size[0]
|
||||
offloaded_block_size = gpu_block_size * self.block_size_factor
|
||||
self._manager = LRUOffloadingManager(
|
||||
CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks),
|
||||
enable_events=enable_events,
|
||||
)
|
||||
return self._manager
|
||||
|
||||
def get_handlers(
|
||||
@@ -57,24 +48,15 @@ class NPUOffloadingSpec(OffloadingSpec):
|
||||
attn_backends: dict[str, type[AttentionBackend]],
|
||||
) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
|
||||
if not self._handler:
|
||||
if vllm_version_is("0.17.0"):
|
||||
self._handler = CpuNpuOffloadingHandler(
|
||||
attn_backends=attn_backends,
|
||||
gpu_block_size=self.gpu_block_size,
|
||||
cpu_block_size=self.offloaded_block_size,
|
||||
num_cpu_blocks=self.num_cpu_blocks,
|
||||
gpu_caches=kv_caches,
|
||||
)
|
||||
else:
|
||||
assert len(self.gpu_block_size) == 1
|
||||
gpu_block_size = self.gpu_block_size[0]
|
||||
self._handler = CpuNpuOffloadingHandler(
|
||||
attn_backends=attn_backends,
|
||||
gpu_block_size=gpu_block_size,
|
||||
cpu_block_size=gpu_block_size * self.block_size_factor,
|
||||
num_cpu_blocks=self.num_cpu_blocks,
|
||||
gpu_caches=kv_caches,
|
||||
)
|
||||
assert len(self.gpu_block_size) == 1
|
||||
gpu_block_size = self.gpu_block_size[0]
|
||||
self._handler = CpuNpuOffloadingHandler(
|
||||
attn_backends=attn_backends,
|
||||
gpu_block_size=gpu_block_size,
|
||||
cpu_block_size=gpu_block_size * self.block_size_factor,
|
||||
num_cpu_blocks=self.num_cpu_blocks,
|
||||
gpu_caches=kv_caches,
|
||||
)
|
||||
|
||||
assert self._handler is not None
|
||||
yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler
|
||||
|
||||
@@ -19,8 +19,7 @@ import os
|
||||
import vllm_ascend.patch.platform.patch_distributed # noqa
|
||||
import vllm_ascend.patch.platform.patch_fusion_matcher_compat_ops # noqa
|
||||
import vllm_ascend.patch.platform.patch_kv_cache_interface # noqa
|
||||
from vllm_ascend import envs
|
||||
from vllm_ascend.utils import is_310p, vllm_version_is
|
||||
from vllm_ascend.utils import is_310p
|
||||
|
||||
if not is_310p():
|
||||
import vllm_ascend.patch.platform.patch_mamba_config # noqa
|
||||
@@ -32,5 +31,3 @@ import vllm_ascend.patch.platform.patch_torch_accelerator # noqa
|
||||
|
||||
if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
|
||||
import vllm_ascend.patch.platform.patch_multiproc_executor # noqa
|
||||
if envs.VLLM_ASCEND_BALANCE_SCHEDULING and vllm_version_is("0.17.0"):
|
||||
import vllm_ascend.patch.platform.patch_balance_schedule # noqa
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import threading
|
||||
import weakref
|
||||
from collections import deque
|
||||
from collections.abc import Callable
|
||||
@@ -21,8 +20,6 @@ from vllm.v1.executor.multiproc_executor import (
|
||||
set_multiprocessing_worker_envs,
|
||||
)
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class AscendMultiprocExecutor(MultiprocExecutor):
|
||||
def _init_executor(self) -> None:
|
||||
@@ -30,8 +27,6 @@ class AscendMultiprocExecutor(MultiprocExecutor):
|
||||
# and ensure workers will be terminated.
|
||||
self._finalizer = weakref.finalize(self, self.shutdown)
|
||||
self.is_failed = False
|
||||
if vllm_version_is("0.17.0"):
|
||||
self.shutdown_event = threading.Event()
|
||||
self.failure_callback: FailureCallback | None = None
|
||||
|
||||
tensor_parallel_size, pp_parallel_size, pcp_parallel_size = self._get_parallel_sizes()
|
||||
@@ -71,44 +66,29 @@ class AscendMultiprocExecutor(MultiprocExecutor):
|
||||
success = False
|
||||
try:
|
||||
global_start_rank = self.local_world_size * self.parallel_config.node_rank_within_dp
|
||||
if vllm_version_is("0.17.0"):
|
||||
for local_rank in range(self.local_world_size):
|
||||
global_rank = global_start_rank + local_rank
|
||||
is_driver_worker = self._is_driver_worker(global_rank)
|
||||
unready_workers.append(
|
||||
AscendWorkerProc.make_worker_process(
|
||||
vllm_config=self.vllm_config,
|
||||
local_rank=local_rank,
|
||||
rank=global_rank,
|
||||
distributed_init_method=distributed_init_method,
|
||||
input_shm_handle=scheduler_output_handle,
|
||||
shared_worker_lock=shared_worker_lock,
|
||||
is_driver_worker=is_driver_worker,
|
||||
)
|
||||
)
|
||||
else:
|
||||
# When using fork, keep track of socket file descriptors that are
|
||||
# inherited by the worker, so that we can close them in subsequent
|
||||
# workers
|
||||
inherited_fds: list[int] | None = [] if context.get_start_method() == "fork" else None
|
||||
|
||||
for local_rank in range(self.local_world_size):
|
||||
global_rank = global_start_rank + local_rank
|
||||
is_driver_worker = self._is_driver_worker(global_rank)
|
||||
unready_worker_handle = AscendWorkerProc.make_worker_process(
|
||||
vllm_config=self.vllm_config,
|
||||
local_rank=local_rank,
|
||||
rank=global_rank,
|
||||
distributed_init_method=distributed_init_method,
|
||||
input_shm_handle=scheduler_output_handle,
|
||||
shared_worker_lock=shared_worker_lock,
|
||||
is_driver_worker=is_driver_worker,
|
||||
inherited_fds=inherited_fds,
|
||||
)
|
||||
unready_workers.append(unready_worker_handle)
|
||||
if inherited_fds is not None:
|
||||
inherited_fds.append(unready_worker_handle.death_writer.fileno())
|
||||
inherited_fds.append(unready_worker_handle.ready_pipe.fileno())
|
||||
# When using fork, keep track of socket file descriptors that are
|
||||
# inherited by the worker, so that we can close them in subsequent
|
||||
# workers
|
||||
inherited_fds: list[int] | None = [] if context.get_start_method() == "fork" else None
|
||||
|
||||
for local_rank in range(self.local_world_size):
|
||||
global_rank = global_start_rank + local_rank
|
||||
is_driver_worker = self._is_driver_worker(global_rank)
|
||||
unready_worker_handle = AscendWorkerProc.make_worker_process(
|
||||
vllm_config=self.vllm_config,
|
||||
local_rank=local_rank,
|
||||
rank=global_rank,
|
||||
distributed_init_method=distributed_init_method,
|
||||
input_shm_handle=scheduler_output_handle,
|
||||
shared_worker_lock=shared_worker_lock,
|
||||
is_driver_worker=is_driver_worker,
|
||||
inherited_fds=inherited_fds,
|
||||
)
|
||||
unready_workers.append(unready_worker_handle)
|
||||
if inherited_fds is not None:
|
||||
inherited_fds.append(unready_worker_handle.death_writer.fileno())
|
||||
inherited_fds.append(unready_worker_handle.ready_pipe.fileno())
|
||||
|
||||
# Workers must be created before wait_for_ready to avoid
|
||||
# deadlock, since worker.init_device() does a device sync.
|
||||
@@ -153,8 +133,7 @@ class AscendMultiprocExecutor(MultiprocExecutor):
|
||||
for uw in unready_workers:
|
||||
if uw.death_writer is not None:
|
||||
uw.death_writer.close()
|
||||
if not vllm_version_is("0.17.0"):
|
||||
uw.death_writer = None
|
||||
uw.death_writer = None
|
||||
self._ensure_worker_termination([uw.proc for uw in unready_workers])
|
||||
|
||||
self.output_rank = self._get_output_rank()
|
||||
@@ -192,73 +171,41 @@ class AscendWorkerProc(WorkerProc):
|
||||
inherited_fds: list[int] | None = None,
|
||||
) -> UnreadyWorkerProcHandle:
|
||||
context = get_mp_context()
|
||||
if vllm_version_is("0.17.0"):
|
||||
# (reader, writer)
|
||||
reader, writer = context.Pipe(duplex=False)
|
||||
# Ready pipe to communicate readiness from child to parent
|
||||
ready_reader, ready_writer = context.Pipe(duplex=False)
|
||||
# Death pipe to let child detect parent process exit
|
||||
death_reader, death_writer = context.Pipe(duplex=False)
|
||||
if inherited_fds is not None:
|
||||
inherited_fds = inherited_fds.copy()
|
||||
inherited_fds.extend((ready_reader.fileno(), death_writer.fileno()))
|
||||
process_kwargs = {
|
||||
"vllm_config": vllm_config,
|
||||
"local_rank": local_rank,
|
||||
"rank": rank,
|
||||
"distributed_init_method": distributed_init_method,
|
||||
"input_shm_handle": input_shm_handle,
|
||||
"ready_pipe": ready_writer,
|
||||
"death_pipe": death_reader,
|
||||
"shared_worker_lock": shared_worker_lock,
|
||||
"is_driver_worker": is_driver_worker,
|
||||
# Have the worker close parent end of this worker's pipes too
|
||||
"inherited_fds": inherited_fds if inherited_fds is not None else [],
|
||||
}
|
||||
# Run EngineCore busy loop in background process.
|
||||
proc = context.Process(
|
||||
target=WorkerProc.worker_main,
|
||||
kwargs=process_kwargs,
|
||||
name=f"VllmWorker-{rank}",
|
||||
daemon=False,
|
||||
)
|
||||
|
||||
# Create death pipe to detect parent process exit
|
||||
death_reader, death_writer = context.Pipe(duplex=False)
|
||||
|
||||
process_kwargs = {
|
||||
"vllm_config": vllm_config,
|
||||
"local_rank": local_rank,
|
||||
"rank": rank,
|
||||
"distributed_init_method": distributed_init_method,
|
||||
"input_shm_handle": input_shm_handle,
|
||||
"ready_pipe": (reader, writer),
|
||||
"death_pipe": death_reader,
|
||||
"shared_worker_lock": shared_worker_lock,
|
||||
"is_driver_worker": is_driver_worker,
|
||||
}
|
||||
# Run EngineCore busy loop in background process.
|
||||
proc = context.Process(
|
||||
target=WorkerProc.worker_main,
|
||||
kwargs=process_kwargs,
|
||||
name=f"VllmWorker-{rank}",
|
||||
daemon=False,
|
||||
)
|
||||
|
||||
proc.start()
|
||||
writer.close()
|
||||
# Keep death_writer open in parent - when parent exits,
|
||||
# death_reader in child will get EOFError
|
||||
return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
|
||||
else:
|
||||
# Ready pipe to communicate readiness from child to parent
|
||||
ready_reader, ready_writer = context.Pipe(duplex=False)
|
||||
# Death pipe to let child detect parent process exit
|
||||
death_reader, death_writer = context.Pipe(duplex=False)
|
||||
if inherited_fds is not None:
|
||||
inherited_fds = inherited_fds.copy()
|
||||
inherited_fds.extend((ready_reader.fileno(), death_writer.fileno()))
|
||||
process_kwargs = {
|
||||
"vllm_config": vllm_config,
|
||||
"local_rank": local_rank,
|
||||
"rank": rank,
|
||||
"distributed_init_method": distributed_init_method,
|
||||
"input_shm_handle": input_shm_handle,
|
||||
"ready_pipe": ready_writer,
|
||||
"death_pipe": death_reader,
|
||||
"shared_worker_lock": shared_worker_lock,
|
||||
"is_driver_worker": is_driver_worker,
|
||||
# Have the worker close parent end of this worker's pipes too
|
||||
"inherited_fds": inherited_fds if inherited_fds is not None else [],
|
||||
}
|
||||
# Run EngineCore busy loop in background process.
|
||||
proc = context.Process(
|
||||
target=WorkerProc.worker_main,
|
||||
kwargs=process_kwargs,
|
||||
name=f"VllmWorker-{rank}",
|
||||
daemon=False,
|
||||
)
|
||||
|
||||
proc.start()
|
||||
# Close child ends of pipes here in the parent
|
||||
ready_writer.close()
|
||||
death_reader.close()
|
||||
# Keep death_writer open in parent - when parent exits,
|
||||
# death_reader in child will get EOFError
|
||||
return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)
|
||||
proc.start()
|
||||
# Close child ends of pipes here in the parent
|
||||
ready_writer.close()
|
||||
death_reader.close()
|
||||
# Keep death_writer open in parent - when parent exits,
|
||||
# death_reader in child will get EOFError
|
||||
return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)
|
||||
|
||||
|
||||
vllm.v1.executor.multiproc_executor.MultiprocExecutor = AscendMultiprocExecutor
|
||||
|
||||
@@ -126,7 +126,6 @@ from vllm_ascend.utils import (
|
||||
is_moe_model,
|
||||
lmhead_tp_enable,
|
||||
set_weight_prefetch_method,
|
||||
vllm_version_is,
|
||||
)
|
||||
from vllm_ascend.worker.npu_input_batch import NPUInputBatch
|
||||
from vllm_ascend.worker.pcp_utils import PCPManager
|
||||
@@ -292,27 +291,15 @@ class NPUModelRunner(GPUModelRunner):
|
||||
if self.use_sparse_c8_indexer:
|
||||
self.c8_k_cache_dtype = torch.int8
|
||||
self.c8_k_scale_cache_dtype = torch.float16
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.17.0"):
|
||||
self.attn_backend = get_attn_backend(
|
||||
0,
|
||||
self.dtype,
|
||||
None,
|
||||
self.block_size,
|
||||
use_mla=self.model_config.use_mla,
|
||||
use_sparse=self.use_sparse,
|
||||
use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
|
||||
)
|
||||
else:
|
||||
self.attn_backend = get_attn_backend(
|
||||
0,
|
||||
self.dtype,
|
||||
None,
|
||||
use_mla=self.model_config.use_mla,
|
||||
use_sparse=self.use_sparse,
|
||||
use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
|
||||
)
|
||||
self.attn_backend = get_attn_backend(
|
||||
0,
|
||||
self.dtype,
|
||||
None,
|
||||
use_mla=self.model_config.use_mla,
|
||||
use_sparse=self.use_sparse,
|
||||
use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
|
||||
)
|
||||
|
||||
try:
|
||||
self.dcp_size = get_dcp_group().world_size
|
||||
@@ -1389,9 +1376,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
self.maybe_get_kv_connector_output(
|
||||
scheduler_output,
|
||||
**(
|
||||
{"clear_metadata": clear_kv_metadata}
|
||||
if vllm_version_is("0.17.0")
|
||||
else {"defer_finalize": not clear_kv_metadata}
|
||||
{"defer_finalize": not clear_kv_metadata}
|
||||
),
|
||||
) as kv_connector_output,
|
||||
):
|
||||
@@ -2567,17 +2552,14 @@ class NPUModelRunner(GPUModelRunner):
|
||||
with get_tp_context(self.drafter):
|
||||
self.drafter.load_model(self.model)
|
||||
if self.use_aux_hidden_state_outputs:
|
||||
if vllm_version_is("0.17.0"):
|
||||
self.model.set_aux_hidden_state_layers(self.model.get_eagle3_aux_hidden_state_layers())
|
||||
else:
|
||||
from vllm.model_executor.models.interfaces import supports_eagle3
|
||||
if not supports_eagle3(self.model):
|
||||
raise RuntimeError(
|
||||
"Model does not support EAGLE3 interface but "
|
||||
"aux_hidden_state_outputs was requested"
|
||||
)
|
||||
aux_layers = self.model.get_eagle3_default_aux_hidden_state_layers()
|
||||
self.model.set_aux_hidden_state_layers(aux_layers)
|
||||
from vllm.model_executor.models.interfaces import supports_eagle3
|
||||
if not supports_eagle3(self.model):
|
||||
raise RuntimeError(
|
||||
"Model does not support EAGLE3 interface but "
|
||||
"aux_hidden_state_outputs was requested"
|
||||
)
|
||||
aux_layers = self.model.get_eagle3_default_aux_hidden_state_layers()
|
||||
self.model.set_aux_hidden_state_layers(aux_layers)
|
||||
|
||||
if self.lora_config:
|
||||
self.model = self.load_lora_model(self.model, self.vllm_config, self.device)
|
||||
@@ -2617,7 +2599,9 @@ class NPUModelRunner(GPUModelRunner):
|
||||
self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
|
||||
):
|
||||
assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer)
|
||||
self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)
|
||||
block_size = (self.kernel_block_sizes[0] if isinstance(
|
||||
self.kernel_block_sizes, list) else self.kernel_block_sizes)
|
||||
self.drafter.initialize_attn_backend(kv_cache_config, block_size)
|
||||
|
||||
if has_kv_transfer_group():
|
||||
get_kv_transfer_group().register_kv_caches(kv_caches)
|
||||
@@ -3091,18 +3075,11 @@ class NPUModelRunner(GPUModelRunner):
|
||||
max_num_blocks.append(max_num_blocks_per_req)
|
||||
|
||||
if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
|
||||
if vllm_version_is("0.17.0"):
|
||||
assert self.cache_config.cpu_offload_gb == 0, (
|
||||
"Cannot re-initialize the input batch when CPU weight "
|
||||
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
||||
"for more details."
|
||||
)
|
||||
else:
|
||||
assert self.offload_config.uva.cpu_offload_gb == 0, (
|
||||
"Cannot re-initialize the input batch when CPU weight "
|
||||
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
||||
"for more details."
|
||||
)
|
||||
assert self.offload_config.uva.cpu_offload_gb == 0, (
|
||||
"Cannot re-initialize the input batch when CPU weight "
|
||||
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
||||
"for more details."
|
||||
)
|
||||
self.input_batch = NPUInputBatch(
|
||||
max_num_reqs=self.max_num_reqs,
|
||||
max_model_len=max_model_len,
|
||||
|
||||
Reference in New Issue
Block a user