upgrade to 0.18.0 (#7502)
### What this PR does / why we need it?
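1. Upgrade the supported vLLM release from v0.17.0 to v0.18.0 and remove the `vllm_version_is("0.17.0")` compatibility branches.
2. Ensure the `kernel_block_sizes` value passed to the Eagle drafter is an int (the first entry is used when it is a list).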
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.17.0
- vLLM main: 8b6325758c
---------
Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
@@ -32,7 +32,7 @@ on:
|
|||||||
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
|
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
|
||||||
vllm_version:
|
vllm_version:
|
||||||
required: false
|
required: false
|
||||||
default: "v0.17.0"
|
default: "v0.18.0"
|
||||||
type: string
|
type: string
|
||||||
description: vllm version to use
|
description: vllm version to use
|
||||||
vllm_ascend_remote_url:
|
vllm_ascend_remote_url:
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ on:
|
|||||||
vllm_version:
|
vllm_version:
|
||||||
required: false
|
required: false
|
||||||
type: string
|
type: string
|
||||||
default: "v0.17.0"
|
default: "v0.18.0"
|
||||||
is_pr_test:
|
is_pr_test:
|
||||||
required: true
|
required: true
|
||||||
type: boolean
|
type: boolean
|
||||||
|
|||||||
2
.github/workflows/pr_test_full.yaml
vendored
2
.github/workflows/pr_test_full.yaml
vendored
@@ -75,7 +75,7 @@ jobs:
|
|||||||
name: e2e-full
|
name: e2e-full
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
|
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
|
||||||
needs: [changes]
|
needs: [changes]
|
||||||
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
|
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
|
||||||
uses: ./.github/workflows/_e2e_test.yaml
|
uses: ./.github/workflows/_e2e_test.yaml
|
||||||
|
|||||||
4
.github/workflows/pr_test_light.yaml
vendored
4
.github/workflows/pr_test_light.yaml
vendored
@@ -90,7 +90,7 @@ jobs:
|
|||||||
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
|
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
|
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
|
||||||
uses: ./.github/workflows/_unit_test.yaml
|
uses: ./.github/workflows/_unit_test.yaml
|
||||||
with:
|
with:
|
||||||
vllm: ${{ matrix.vllm_version }}
|
vllm: ${{ matrix.vllm_version }}
|
||||||
@@ -102,7 +102,7 @@ jobs:
|
|||||||
name: e2e-light
|
name: e2e-light
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
|
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
|
||||||
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
||||||
needs: [lint, changes]
|
needs: [lint, changes]
|
||||||
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
||||||
|
|||||||
@@ -277,7 +277,7 @@ jobs:
|
|||||||
- Qwen3-Omni-30B-A3B-Instruct
|
- Qwen3-Omni-30B-A3B-Instruct
|
||||||
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
|
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
|
||||||
with:
|
with:
|
||||||
vllm: v0.17.0
|
vllm: v0.18.0
|
||||||
runner: ${{ matrix.test_config.os }}
|
runner: ${{ matrix.test_config.os }}
|
||||||
model_list: ${{ toJson(matrix.test_config.model_list) }}
|
model_list: ${{ toJson(matrix.test_config.model_list) }}
|
||||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11'
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11'
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
- vllm_branch: v0.17.0
|
- vllm_branch: v0.18.0
|
||||||
vllm_ascend_branch: main
|
vllm_ascend_branch: main
|
||||||
max-parallel: 1
|
max-parallel: 1
|
||||||
container:
|
container:
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ jobs:
|
|||||||
name: e2e-test
|
name: e2e-test
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [v0.17.0]
|
vllm_version: [v0.18.0]
|
||||||
type: [full, light]
|
type: [full, light]
|
||||||
uses: ./.github/workflows/_e2e_test.yaml
|
uses: ./.github/workflows/_e2e_test.yaml
|
||||||
with:
|
with:
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
|
|||||||
|
|
||||||
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
||||||
|-------------|--------------|------------------|-------------|--------------------|
|
|-------------|--------------|------------------|-------------|--------------------|
|
||||||
| main | 8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
|
| main | 8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
|
||||||
|
|
||||||
## Release cadence
|
## Release cadence
|
||||||
|
|
||||||
|
|||||||
@@ -32,9 +32,6 @@ TENSOR_PARALLELS = [1]
|
|||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
|
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
|
||||||
async def test_models(model: str, tp_size: int) -> None:
|
async def test_models(model: str, tp_size: int) -> None:
|
||||||
from vllm_ascend.utils import vllm_version_is
|
|
||||||
|
|
||||||
if not vllm_version_is("0.17.0"):
|
|
||||||
pytest.skip(
|
pytest.skip(
|
||||||
"EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
|
"EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ from vllm.v1.kv_cache_interface import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, vllm_version_is
|
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ
|
||||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||||
from vllm_ascend.worker.npu_input_batch import NPUInputBatch
|
from vllm_ascend.worker.npu_input_batch import NPUInputBatch
|
||||||
|
|
||||||
@@ -451,13 +451,6 @@ class NPUModelRunner310(NPUModelRunner):
|
|||||||
self.kernel_block_sizes.append([0])
|
self.kernel_block_sizes.append([0])
|
||||||
|
|
||||||
if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
|
if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
assert self.cache_config.cpu_offload_gb == 0, (
|
|
||||||
"Cannot re-initialize the input batch when CPU weight "
|
|
||||||
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
|
||||||
"for more details."
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
assert self.offload_config.uva.cpu_offload_gb == 0, (
|
assert self.offload_config.uva.cpu_offload_gb == 0, (
|
||||||
"Cannot re-initialize the input batch when CPU weight "
|
"Cannot re-initialize the input batch when CPU weight "
|
||||||
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
||||||
|
|||||||
@@ -161,20 +161,10 @@ class AscendConfig:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_compile_ranges(compilation_config):
|
def _get_compile_ranges(compilation_config):
|
||||||
from vllm_ascend.utils import vllm_version_is
|
|
||||||
|
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
return compilation_config.compile_ranges_split_points
|
|
||||||
else:
|
|
||||||
return compilation_config.compile_ranges_endpoints
|
return compilation_config.compile_ranges_endpoints
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _set_compile_ranges(compilation_config, value):
|
def _set_compile_ranges(compilation_config, value):
|
||||||
from vllm_ascend.utils import vllm_version_is
|
|
||||||
|
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
compilation_config.compile_ranges_split_points = value
|
|
||||||
else:
|
|
||||||
compilation_config.compile_ranges_endpoints = value
|
compilation_config.compile_ranges_endpoints = value
|
||||||
|
|
||||||
def update_compile_ranges_split_points(self):
|
def update_compile_ranges_split_points(self):
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ from vllm.config import VllmConfig
|
|||||||
from vllm.config.utils import Range
|
from vllm.config.utils import Range
|
||||||
|
|
||||||
from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
|
from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
|
||||||
from vllm_ascend.utils import COMPILATION_PASS_KEY, vllm_version_is
|
from vllm_ascend.utils import COMPILATION_PASS_KEY
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -86,7 +86,6 @@ def npugraph_ex_compile(
|
|||||||
config.mode = "reduce-overhead"
|
config.mode = "reduce-overhead"
|
||||||
# execute FX graph in eager mode before graph mode to optimize FX graph.
|
# execute FX graph in eager mode before graph mode to optimize FX graph.
|
||||||
config.debug.run_eagerly = True
|
config.debug.run_eagerly = True
|
||||||
if not vllm_version_is("0.17.0"):
|
|
||||||
# This is a temporary fix to resolve issues with inplace operations in some testcases like test_whisper.
|
# This is a temporary fix to resolve issues with inplace operations in some testcases like test_whisper.
|
||||||
# Avoid to change torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default which will fallback to CPU
|
# Avoid to change torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default which will fallback to CPU
|
||||||
# and cause copy_between_host_and_device error.
|
# and cause copy_between_host_and_device error.
|
||||||
@@ -142,7 +141,6 @@ class AscendCompiler(CompilerInterface):
|
|||||||
# see https://github.com/pytorch/pytorch/issues/138980
|
# see https://github.com/pytorch/pytorch/issues/138980
|
||||||
graph = copy.deepcopy(graph)
|
graph = copy.deepcopy(graph)
|
||||||
|
|
||||||
if not vllm_version_is("0.17.0"):
|
|
||||||
from torch._guards import detect_fake_mode
|
from torch._guards import detect_fake_mode
|
||||||
|
|
||||||
current_fake_mode = detect_fake_mode()
|
current_fake_mode = detect_fake_mode()
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ from vllm.v1.kv_offload.spec import OffloadingSpec
|
|||||||
from vllm.v1.kv_offload.worker.worker import OffloadingHandler
|
from vllm.v1.kv_offload.worker.worker import OffloadingHandler
|
||||||
|
|
||||||
from vllm_ascend.kv_offload.cpu_npu import CpuNpuOffloadingHandler
|
from vllm_ascend.kv_offload.cpu_npu import CpuNpuOffloadingHandler
|
||||||
from vllm_ascend.utils import vllm_version_is
|
|
||||||
|
|
||||||
|
|
||||||
class NPUOffloadingSpec(OffloadingSpec):
|
class NPUOffloadingSpec(OffloadingSpec):
|
||||||
@@ -32,14 +31,6 @@ class NPUOffloadingSpec(OffloadingSpec):
|
|||||||
|
|
||||||
def get_manager(self) -> OffloadingManager:
|
def get_manager(self) -> OffloadingManager:
|
||||||
if not self._manager:
|
if not self._manager:
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
kv_events_config = self.vllm_config.kv_events_config
|
|
||||||
enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
|
|
||||||
self._manager = LRUOffloadingManager(
|
|
||||||
CPUBackend(block_size=self.offloaded_block_size, num_blocks=self.num_cpu_blocks),
|
|
||||||
enable_events=enable_events,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
kv_events_config = self.vllm_config.kv_events_config
|
kv_events_config = self.vllm_config.kv_events_config
|
||||||
enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
|
enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
|
||||||
assert len(self.gpu_block_size) == 1
|
assert len(self.gpu_block_size) == 1
|
||||||
@@ -57,15 +48,6 @@ class NPUOffloadingSpec(OffloadingSpec):
|
|||||||
attn_backends: dict[str, type[AttentionBackend]],
|
attn_backends: dict[str, type[AttentionBackend]],
|
||||||
) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
|
) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
|
||||||
if not self._handler:
|
if not self._handler:
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
self._handler = CpuNpuOffloadingHandler(
|
|
||||||
attn_backends=attn_backends,
|
|
||||||
gpu_block_size=self.gpu_block_size,
|
|
||||||
cpu_block_size=self.offloaded_block_size,
|
|
||||||
num_cpu_blocks=self.num_cpu_blocks,
|
|
||||||
gpu_caches=kv_caches,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
assert len(self.gpu_block_size) == 1
|
assert len(self.gpu_block_size) == 1
|
||||||
gpu_block_size = self.gpu_block_size[0]
|
gpu_block_size = self.gpu_block_size[0]
|
||||||
self._handler = CpuNpuOffloadingHandler(
|
self._handler = CpuNpuOffloadingHandler(
|
||||||
|
|||||||
@@ -19,8 +19,7 @@ import os
|
|||||||
import vllm_ascend.patch.platform.patch_distributed # noqa
|
import vllm_ascend.patch.platform.patch_distributed # noqa
|
||||||
import vllm_ascend.patch.platform.patch_fusion_matcher_compat_ops # noqa
|
import vllm_ascend.patch.platform.patch_fusion_matcher_compat_ops # noqa
|
||||||
import vllm_ascend.patch.platform.patch_kv_cache_interface # noqa
|
import vllm_ascend.patch.platform.patch_kv_cache_interface # noqa
|
||||||
from vllm_ascend import envs
|
from vllm_ascend.utils import is_310p
|
||||||
from vllm_ascend.utils import is_310p, vllm_version_is
|
|
||||||
|
|
||||||
if not is_310p():
|
if not is_310p():
|
||||||
import vllm_ascend.patch.platform.patch_mamba_config # noqa
|
import vllm_ascend.patch.platform.patch_mamba_config # noqa
|
||||||
@@ -32,5 +31,3 @@ import vllm_ascend.patch.platform.patch_torch_accelerator # noqa
|
|||||||
|
|
||||||
if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
|
if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
|
||||||
import vllm_ascend.patch.platform.patch_multiproc_executor # noqa
|
import vllm_ascend.patch.platform.patch_multiproc_executor # noqa
|
||||||
if envs.VLLM_ASCEND_BALANCE_SCHEDULING and vllm_version_is("0.17.0"):
|
|
||||||
import vllm_ascend.patch.platform.patch_balance_schedule # noqa
|
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import threading
|
|
||||||
import weakref
|
import weakref
|
||||||
from collections import deque
|
from collections import deque
|
||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
@@ -21,8 +20,6 @@ from vllm.v1.executor.multiproc_executor import (
|
|||||||
set_multiprocessing_worker_envs,
|
set_multiprocessing_worker_envs,
|
||||||
)
|
)
|
||||||
|
|
||||||
from vllm_ascend.utils import vllm_version_is
|
|
||||||
|
|
||||||
|
|
||||||
class AscendMultiprocExecutor(MultiprocExecutor):
|
class AscendMultiprocExecutor(MultiprocExecutor):
|
||||||
def _init_executor(self) -> None:
|
def _init_executor(self) -> None:
|
||||||
@@ -30,8 +27,6 @@ class AscendMultiprocExecutor(MultiprocExecutor):
|
|||||||
# and ensure workers will be terminated.
|
# and ensure workers will be terminated.
|
||||||
self._finalizer = weakref.finalize(self, self.shutdown)
|
self._finalizer = weakref.finalize(self, self.shutdown)
|
||||||
self.is_failed = False
|
self.is_failed = False
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
self.shutdown_event = threading.Event()
|
|
||||||
self.failure_callback: FailureCallback | None = None
|
self.failure_callback: FailureCallback | None = None
|
||||||
|
|
||||||
tensor_parallel_size, pp_parallel_size, pcp_parallel_size = self._get_parallel_sizes()
|
tensor_parallel_size, pp_parallel_size, pcp_parallel_size = self._get_parallel_sizes()
|
||||||
@@ -71,22 +66,7 @@ class AscendMultiprocExecutor(MultiprocExecutor):
|
|||||||
success = False
|
success = False
|
||||||
try:
|
try:
|
||||||
global_start_rank = self.local_world_size * self.parallel_config.node_rank_within_dp
|
global_start_rank = self.local_world_size * self.parallel_config.node_rank_within_dp
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
for local_rank in range(self.local_world_size):
|
|
||||||
global_rank = global_start_rank + local_rank
|
|
||||||
is_driver_worker = self._is_driver_worker(global_rank)
|
|
||||||
unready_workers.append(
|
|
||||||
AscendWorkerProc.make_worker_process(
|
|
||||||
vllm_config=self.vllm_config,
|
|
||||||
local_rank=local_rank,
|
|
||||||
rank=global_rank,
|
|
||||||
distributed_init_method=distributed_init_method,
|
|
||||||
input_shm_handle=scheduler_output_handle,
|
|
||||||
shared_worker_lock=shared_worker_lock,
|
|
||||||
is_driver_worker=is_driver_worker,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# When using fork, keep track of socket file descriptors that are
|
# When using fork, keep track of socket file descriptors that are
|
||||||
# inherited by the worker, so that we can close them in subsequent
|
# inherited by the worker, so that we can close them in subsequent
|
||||||
# workers
|
# workers
|
||||||
@@ -153,7 +133,6 @@ class AscendMultiprocExecutor(MultiprocExecutor):
|
|||||||
for uw in unready_workers:
|
for uw in unready_workers:
|
||||||
if uw.death_writer is not None:
|
if uw.death_writer is not None:
|
||||||
uw.death_writer.close()
|
uw.death_writer.close()
|
||||||
if not vllm_version_is("0.17.0"):
|
|
||||||
uw.death_writer = None
|
uw.death_writer = None
|
||||||
self._ensure_worker_termination([uw.proc for uw in unready_workers])
|
self._ensure_worker_termination([uw.proc for uw in unready_workers])
|
||||||
|
|
||||||
@@ -192,38 +171,6 @@ class AscendWorkerProc(WorkerProc):
|
|||||||
inherited_fds: list[int] | None = None,
|
inherited_fds: list[int] | None = None,
|
||||||
) -> UnreadyWorkerProcHandle:
|
) -> UnreadyWorkerProcHandle:
|
||||||
context = get_mp_context()
|
context = get_mp_context()
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
# (reader, writer)
|
|
||||||
reader, writer = context.Pipe(duplex=False)
|
|
||||||
|
|
||||||
# Create death pipe to detect parent process exit
|
|
||||||
death_reader, death_writer = context.Pipe(duplex=False)
|
|
||||||
|
|
||||||
process_kwargs = {
|
|
||||||
"vllm_config": vllm_config,
|
|
||||||
"local_rank": local_rank,
|
|
||||||
"rank": rank,
|
|
||||||
"distributed_init_method": distributed_init_method,
|
|
||||||
"input_shm_handle": input_shm_handle,
|
|
||||||
"ready_pipe": (reader, writer),
|
|
||||||
"death_pipe": death_reader,
|
|
||||||
"shared_worker_lock": shared_worker_lock,
|
|
||||||
"is_driver_worker": is_driver_worker,
|
|
||||||
}
|
|
||||||
# Run EngineCore busy loop in background process.
|
|
||||||
proc = context.Process(
|
|
||||||
target=WorkerProc.worker_main,
|
|
||||||
kwargs=process_kwargs,
|
|
||||||
name=f"VllmWorker-{rank}",
|
|
||||||
daemon=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
proc.start()
|
|
||||||
writer.close()
|
|
||||||
# Keep death_writer open in parent - when parent exits,
|
|
||||||
# death_reader in child will get EOFError
|
|
||||||
return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
|
|
||||||
else:
|
|
||||||
# Ready pipe to communicate readiness from child to parent
|
# Ready pipe to communicate readiness from child to parent
|
||||||
ready_reader, ready_writer = context.Pipe(duplex=False)
|
ready_reader, ready_writer = context.Pipe(duplex=False)
|
||||||
# Death pipe to let child detect parent process exit
|
# Death pipe to let child detect parent process exit
|
||||||
|
|||||||
@@ -126,7 +126,6 @@ from vllm_ascend.utils import (
|
|||||||
is_moe_model,
|
is_moe_model,
|
||||||
lmhead_tp_enable,
|
lmhead_tp_enable,
|
||||||
set_weight_prefetch_method,
|
set_weight_prefetch_method,
|
||||||
vllm_version_is,
|
|
||||||
)
|
)
|
||||||
from vllm_ascend.worker.npu_input_batch import NPUInputBatch
|
from vllm_ascend.worker.npu_input_batch import NPUInputBatch
|
||||||
from vllm_ascend.worker.pcp_utils import PCPManager
|
from vllm_ascend.worker.pcp_utils import PCPManager
|
||||||
@@ -292,19 +291,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
if self.use_sparse_c8_indexer:
|
if self.use_sparse_c8_indexer:
|
||||||
self.c8_k_cache_dtype = torch.int8
|
self.c8_k_cache_dtype = torch.int8
|
||||||
self.c8_k_scale_cache_dtype = torch.float16
|
self.c8_k_scale_cache_dtype = torch.float16
|
||||||
from vllm_ascend.utils import vllm_version_is
|
|
||||||
|
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
self.attn_backend = get_attn_backend(
|
|
||||||
0,
|
|
||||||
self.dtype,
|
|
||||||
None,
|
|
||||||
self.block_size,
|
|
||||||
use_mla=self.model_config.use_mla,
|
|
||||||
use_sparse=self.use_sparse,
|
|
||||||
use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.attn_backend = get_attn_backend(
|
self.attn_backend = get_attn_backend(
|
||||||
0,
|
0,
|
||||||
self.dtype,
|
self.dtype,
|
||||||
@@ -1389,9 +1376,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
self.maybe_get_kv_connector_output(
|
self.maybe_get_kv_connector_output(
|
||||||
scheduler_output,
|
scheduler_output,
|
||||||
**(
|
**(
|
||||||
{"clear_metadata": clear_kv_metadata}
|
{"defer_finalize": not clear_kv_metadata}
|
||||||
if vllm_version_is("0.17.0")
|
|
||||||
else {"defer_finalize": not clear_kv_metadata}
|
|
||||||
),
|
),
|
||||||
) as kv_connector_output,
|
) as kv_connector_output,
|
||||||
):
|
):
|
||||||
@@ -2567,9 +2552,6 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
with get_tp_context(self.drafter):
|
with get_tp_context(self.drafter):
|
||||||
self.drafter.load_model(self.model)
|
self.drafter.load_model(self.model)
|
||||||
if self.use_aux_hidden_state_outputs:
|
if self.use_aux_hidden_state_outputs:
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
self.model.set_aux_hidden_state_layers(self.model.get_eagle3_aux_hidden_state_layers())
|
|
||||||
else:
|
|
||||||
from vllm.model_executor.models.interfaces import supports_eagle3
|
from vllm.model_executor.models.interfaces import supports_eagle3
|
||||||
if not supports_eagle3(self.model):
|
if not supports_eagle3(self.model):
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
@@ -2617,7 +2599,9 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
|
self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
|
||||||
):
|
):
|
||||||
assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer)
|
assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer)
|
||||||
self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)
|
block_size = (self.kernel_block_sizes[0] if isinstance(
|
||||||
|
self.kernel_block_sizes, list) else self.kernel_block_sizes)
|
||||||
|
self.drafter.initialize_attn_backend(kv_cache_config, block_size)
|
||||||
|
|
||||||
if has_kv_transfer_group():
|
if has_kv_transfer_group():
|
||||||
get_kv_transfer_group().register_kv_caches(kv_caches)
|
get_kv_transfer_group().register_kv_caches(kv_caches)
|
||||||
@@ -3091,13 +3075,6 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
max_num_blocks.append(max_num_blocks_per_req)
|
max_num_blocks.append(max_num_blocks_per_req)
|
||||||
|
|
||||||
if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
|
if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
assert self.cache_config.cpu_offload_gb == 0, (
|
|
||||||
"Cannot re-initialize the input batch when CPU weight "
|
|
||||||
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
|
||||||
"for more details."
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
assert self.offload_config.uva.cpu_offload_gb == 0, (
|
assert self.offload_config.uva.cpu_offload_gb == 0, (
|
||||||
"Cannot re-initialize the input batch when CPU weight "
|
"Cannot re-initialize the input batch when CPU weight "
|
||||||
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
||||||
|
|||||||
Reference in New Issue
Block a user