diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml
index 42f96e12..ff9513f6 100644
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -32,7 +32,7 @@ on:
         description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
       vllm_version:
         required: false
-        default: "v0.17.0"
+        default: "v0.18.0"
         type: string
         description: vllm version to use
       vllm_ascend_remote_url:
diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml
index b2f319c4..c2c55be6 100644
--- a/.github/workflows/_e2e_nightly_single_node.yaml
+++ b/.github/workflows/_e2e_nightly_single_node.yaml
@@ -39,7 +39,7 @@ on:
       vllm_version:
         required: false
         type: string
-        default: "v0.17.0"
+        default: "v0.18.0"
       is_pr_test:
         required: true
         type: boolean
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index ca15a121..a1c9f9a9 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
+        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 0c693fdd..9ee40f3a 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -90,7 +90,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
+        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
+        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/schedule_nightly_test_a2.yaml b/.github/workflows/schedule_nightly_test_a2.yaml
index f79d9496..11026061 100644
--- a/.github/workflows/schedule_nightly_test_a2.yaml
+++ b/.github/workflows/schedule_nightly_test_a2.yaml
@@ -277,7 +277,7 @@ jobs:
           - Qwen3-Omni-30B-A3B-Instruct
     uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
     with:
-      vllm: v0.17.0
+      vllm: v0.18.0
      runner: ${{ matrix.test_config.os }}
      model_list: ${{ toJson(matrix.test_config.model_list) }}
      image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11'
diff --git a/.github/workflows/schedule_test_benchmarks.yaml b/.github/workflows/schedule_test_benchmarks.yaml
index 60d5f14c..a62c4333 100644
--- a/.github/workflows/schedule_test_benchmarks.yaml
+++ b/.github/workflows/schedule_test_benchmarks.yaml
@@ -51,7 +51,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - vllm_branch: v0.17.0
+          - vllm_branch: v0.18.0
             vllm_ascend_branch: main
       max-parallel: 1
     container:
diff --git a/.github/workflows/schedule_update_estimated_time.yaml b/.github/workflows/schedule_update_estimated_time.yaml
index 4f2d2ebd..01175419 100644
--- a/.github/workflows/schedule_update_estimated_time.yaml
+++ b/.github/workflows/schedule_update_estimated_time.yaml
@@ -23,7 +23,7 @@ jobs:
     name: e2e-test
     strategy:
      matrix:
-        vllm_version: [v0.17.0]
+        vllm_version: [v0.18.0]
        type: [full, light]
     uses: ./.github/workflows/_e2e_test.yaml
     with:
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index 4cb47037..a221d657 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
 
 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | 8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | 8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
 
 ## Release cadence
diff --git a/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py b/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py
index 6635f491..8128a119 100644
--- a/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py
+++ b/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py
@@ -32,12 +32,9 @@ TENSOR_PARALLELS = [1]
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
 async def test_models(model: str, tp_size: int) -> None:
-    from vllm_ascend.utils import vllm_version_is
-
-    if not vllm_version_is("0.17.0"):
-        pytest.skip(
-            "EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
-        )
+    pytest.skip(
+        "EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
+    )
     encode_port = get_open_port()
     pd_port = get_open_port()
     vllm_server_args = [
diff --git a/vllm_ascend/_310p/model_runner_310p.py b/vllm_ascend/_310p/model_runner_310p.py
index 4a071f12..5c09c0e7 100644
--- a/vllm_ascend/_310p/model_runner_310p.py
+++ b/vllm_ascend/_310p/model_runner_310p.py
@@ -37,7 +37,7 @@ from vllm.v1.kv_cache_interface import (
 )
 
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, vllm_version_is
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
@@ -451,18 +451,11 @@ class NPUModelRunner310(NPUModelRunner):
                 self.kernel_block_sizes.append([0])
 
         if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
-            if vllm_version_is("0.17.0"):
-                assert self.cache_config.cpu_offload_gb == 0, (
-                    "Cannot re-initialize the input batch when CPU weight "
-                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
-                    "for more details."
-                )
-            else:
-                assert self.offload_config.uva.cpu_offload_gb == 0, (
-                    "Cannot re-initialize the input batch when CPU weight "
-                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
-                    "for more details."
-                )
+            assert self.offload_config.uva.cpu_offload_gb == 0, (
+                "Cannot re-initialize the input batch when CPU weight "
+                "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
+                "for more details."
+            )
             self.input_batch = NPUInputBatch(
                 max_num_reqs=self.max_num_reqs,
                 max_model_len=max(self.model_config.max_model_len, self.max_encoder_len),
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index 413c2bd0..158db86d 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -161,21 +161,11 @@ class AscendConfig:
 
     @staticmethod
     def _get_compile_ranges(compilation_config):
-        from vllm_ascend.utils import vllm_version_is
-
-        if vllm_version_is("0.17.0"):
-            return compilation_config.compile_ranges_split_points
-        else:
-            return compilation_config.compile_ranges_endpoints
+        return compilation_config.compile_ranges_endpoints
 
     @staticmethod
     def _set_compile_ranges(compilation_config, value):
-        from vllm_ascend.utils import vllm_version_is
-
-        if vllm_version_is("0.17.0"):
-            compilation_config.compile_ranges_split_points = value
-        else:
-            compilation_config.compile_ranges_endpoints = value
+        compilation_config.compile_ranges_endpoints = value
 
     def update_compile_ranges_split_points(self):
         vllm_config = self.vllm_config
diff --git a/vllm_ascend/compilation/compiler_interface.py b/vllm_ascend/compilation/compiler_interface.py
index d82259e5..8d29316d 100644
--- a/vllm_ascend/compilation/compiler_interface.py
+++ b/vllm_ascend/compilation/compiler_interface.py
@@ -32,7 +32,7 @@ from vllm.config import VllmConfig
 from vllm.config.utils import Range
 
 from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
-from vllm_ascend.utils import COMPILATION_PASS_KEY, vllm_version_is
+from vllm_ascend.utils import COMPILATION_PASS_KEY
 
 logger = logging.getLogger(__name__)
 
@@ -86,11 +86,10 @@ def npugraph_ex_compile(
     config.mode = "reduce-overhead"
     # execute FX graph in eager mode before graph mode to optimize FX graph.
     config.debug.run_eagerly = True
-    if not vllm_version_is("0.17.0"):
-        # This is a temporary fix to resolve issues with inplace operations in some testcases like test_whisper.
-        # Avoid to change torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default which will fallback to CPU
-        # and cause copy_between_host_and_device error.
-        config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
+    # This is a temporary fix to resolve issues with inplace operations in some testcases like test_whisper.
+    # Avoid to change torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default which will fallback to CPU
+    # and cause copy_between_host_and_device error.
+    config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
     if ascend_compilation_config.enable_static_kernel:
         config.experimental_config.aclgraph._aclnn_static_shape_kernel = True
     # According to the cudagraph_capture_size configuration, set the shapes
@@ -142,21 +141,20 @@ class AscendCompiler(CompilerInterface):
         # see https://github.com/pytorch/pytorch/issues/138980
         graph = copy.deepcopy(graph)
 
-        if not vllm_version_is("0.17.0"):
-            from torch._guards import detect_fake_mode
+        from torch._guards import detect_fake_mode
 
-            current_fake_mode = detect_fake_mode()
-            if current_fake_mode is not None:
-                example_inputs = [
-                    current_fake_mode.from_tensor(inp)
-                    if (
-                        isinstance(inp, torch.Tensor)
-                        and hasattr(inp, "fake_mode")
-                        and inp.fake_mode is not current_fake_mode
-                    )
-                    else inp
-                    for inp in example_inputs
-                ]
+        current_fake_mode = detect_fake_mode()
+        if current_fake_mode is not None:
+            example_inputs = [
+                current_fake_mode.from_tensor(inp)
+                if (
+                    isinstance(inp, torch.Tensor)
+                    and hasattr(inp, "fake_mode")
+                    and inp.fake_mode is not current_fake_mode
+                )
+                else inp
+                for inp in example_inputs
+            ]
 
         ascend_compilation_config = get_ascend_config().ascend_compilation_config
         if ascend_compilation_config.enable_npugraph_ex:
diff --git a/vllm_ascend/kv_offload/npu.py b/vllm_ascend/kv_offload/npu.py
index 828df509..bd68ed16 100644
--- a/vllm_ascend/kv_offload/npu.py
+++ b/vllm_ascend/kv_offload/npu.py
@@ -12,7 +12,6 @@ from vllm.v1.kv_offload.spec import OffloadingSpec
 from vllm.v1.kv_offload.worker.worker import OffloadingHandler
 
 from vllm_ascend.kv_offload.cpu_npu import CpuNpuOffloadingHandler
-from vllm_ascend.utils import vllm_version_is
 
 
 class NPUOffloadingSpec(OffloadingSpec):
@@ -32,23 +31,15 @@ class NPUOffloadingSpec(OffloadingSpec):
 
     def get_manager(self) -> OffloadingManager:
         if not self._manager:
-            if vllm_version_is("0.17.0"):
-                kv_events_config = self.vllm_config.kv_events_config
-                enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
-                self._manager = LRUOffloadingManager(
-                    CPUBackend(block_size=self.offloaded_block_size, num_blocks=self.num_cpu_blocks),
-                    enable_events=enable_events,
-                )
-            else:
-                kv_events_config = self.vllm_config.kv_events_config
-                enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
-                assert len(self.gpu_block_size) == 1
-                gpu_block_size = self.gpu_block_size[0]
-                offloaded_block_size = gpu_block_size * self.block_size_factor
-                self._manager = LRUOffloadingManager(
-                    CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks),
-                    enable_events=enable_events,
-                )
+            kv_events_config = self.vllm_config.kv_events_config
+            enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
+            assert len(self.gpu_block_size) == 1
+            gpu_block_size = self.gpu_block_size[0]
+            offloaded_block_size = gpu_block_size * self.block_size_factor
+            self._manager = LRUOffloadingManager(
+                CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks),
+                enable_events=enable_events,
+            )
         return self._manager
 
     def get_handlers(
@@ -57,24 +48,15 @@ class NPUOffloadingSpec(OffloadingSpec):
         attn_backends: dict[str, type[AttentionBackend]],
     ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
         if not self._handler:
-            if vllm_version_is("0.17.0"):
-                self._handler = CpuNpuOffloadingHandler(
-                    attn_backends=attn_backends,
-                    gpu_block_size=self.gpu_block_size,
-                    cpu_block_size=self.offloaded_block_size,
-                    num_cpu_blocks=self.num_cpu_blocks,
-                    gpu_caches=kv_caches,
-                )
-            else:
-                assert len(self.gpu_block_size) == 1
-                gpu_block_size = self.gpu_block_size[0]
-                self._handler = CpuNpuOffloadingHandler(
-                    attn_backends=attn_backends,
-                    gpu_block_size=gpu_block_size,
-                    cpu_block_size=gpu_block_size * self.block_size_factor,
-                    num_cpu_blocks=self.num_cpu_blocks,
-                    gpu_caches=kv_caches,
-                )
+            assert len(self.gpu_block_size) == 1
+            gpu_block_size = self.gpu_block_size[0]
+            self._handler = CpuNpuOffloadingHandler(
+                attn_backends=attn_backends,
+                gpu_block_size=gpu_block_size,
+                cpu_block_size=gpu_block_size * self.block_size_factor,
+                num_cpu_blocks=self.num_cpu_blocks,
+                gpu_caches=kv_caches,
+            )
         assert self._handler is not None
         yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler
 
diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py
index d304af1c..1e4e5b49 100644
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -19,8 +19,7 @@ import os
 import vllm_ascend.patch.platform.patch_distributed  # noqa
 import vllm_ascend.patch.platform.patch_fusion_matcher_compat_ops  # noqa
 import vllm_ascend.patch.platform.patch_kv_cache_interface  # noqa
-from vllm_ascend import envs
-from vllm_ascend.utils import is_310p, vllm_version_is
+from vllm_ascend.utils import is_310p
 
 if not is_310p():
     import vllm_ascend.patch.platform.patch_mamba_config  # noqa
@@ -32,5 +31,3 @@ import vllm_ascend.patch.platform.patch_torch_accelerator  # noqa
 
 if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
     import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
-if envs.VLLM_ASCEND_BALANCE_SCHEDULING and vllm_version_is("0.17.0"):
-    import vllm_ascend.patch.platform.patch_balance_schedule  # noqa
diff --git a/vllm_ascend/patch/platform/patch_multiproc_executor.py b/vllm_ascend/patch/platform/patch_multiproc_executor.py
index da2c93ac..56e64040 100644
--- a/vllm_ascend/patch/platform/patch_multiproc_executor.py
+++ b/vllm_ascend/patch/platform/patch_multiproc_executor.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import threading
 import weakref
 from collections import deque
 from collections.abc import Callable
@@ -21,8 +20,6 @@ from vllm.v1.executor.multiproc_executor import (
     set_multiprocessing_worker_envs,
 )
 
-from vllm_ascend.utils import vllm_version_is
-
 
 class AscendMultiprocExecutor(MultiprocExecutor):
     def _init_executor(self) -> None:
@@ -30,8 +27,6 @@ class AscendMultiprocExecutor(MultiprocExecutor):
         # Call self.shutdown at exit to clean up
         # and ensure workers will be terminated.
         self._finalizer = weakref.finalize(self, self.shutdown)
         self.is_failed = False
-        if vllm_version_is("0.17.0"):
-            self.shutdown_event = threading.Event()
         self.failure_callback: FailureCallback | None = None
         tensor_parallel_size, pp_parallel_size, pcp_parallel_size = self._get_parallel_sizes()
@@ -71,44 +66,29 @@ class AscendMultiprocExecutor(MultiprocExecutor):
         success = False
         try:
             global_start_rank = self.local_world_size * self.parallel_config.node_rank_within_dp
-            if vllm_version_is("0.17.0"):
-                for local_rank in range(self.local_world_size):
-                    global_rank = global_start_rank + local_rank
-                    is_driver_worker = self._is_driver_worker(global_rank)
-                    unready_workers.append(
-                        AscendWorkerProc.make_worker_process(
-                            vllm_config=self.vllm_config,
-                            local_rank=local_rank,
-                            rank=global_rank,
-                            distributed_init_method=distributed_init_method,
-                            input_shm_handle=scheduler_output_handle,
-                            shared_worker_lock=shared_worker_lock,
-                            is_driver_worker=is_driver_worker,
-                        )
-                    )
-            else:
-                # When using fork, keep track of socket file descriptors that are
-                # inherited by the worker, so that we can close them in subsequent
-                # workers
-                inherited_fds: list[int] | None = [] if context.get_start_method() == "fork" else None
-                for local_rank in range(self.local_world_size):
-                    global_rank = global_start_rank + local_rank
-                    is_driver_worker = self._is_driver_worker(global_rank)
-                    unready_worker_handle = AscendWorkerProc.make_worker_process(
-                        vllm_config=self.vllm_config,
-                        local_rank=local_rank,
-                        rank=global_rank,
-                        distributed_init_method=distributed_init_method,
-                        input_shm_handle=scheduler_output_handle,
-                        shared_worker_lock=shared_worker_lock,
-                        is_driver_worker=is_driver_worker,
-                        inherited_fds=inherited_fds,
-                    )
-                    unready_workers.append(unready_worker_handle)
-                    if inherited_fds is not None:
-                        inherited_fds.append(unready_worker_handle.death_writer.fileno())
-                        inherited_fds.append(unready_worker_handle.ready_pipe.fileno())
+            # When using fork, keep track of socket file descriptors that are
+            # inherited by the worker, so that we can close them in subsequent
+            # workers
+            inherited_fds: list[int] | None = [] if context.get_start_method() == "fork" else None
+
+            for local_rank in range(self.local_world_size):
+                global_rank = global_start_rank + local_rank
+                is_driver_worker = self._is_driver_worker(global_rank)
+                unready_worker_handle = AscendWorkerProc.make_worker_process(
+                    vllm_config=self.vllm_config,
+                    local_rank=local_rank,
+                    rank=global_rank,
+                    distributed_init_method=distributed_init_method,
+                    input_shm_handle=scheduler_output_handle,
+                    shared_worker_lock=shared_worker_lock,
+                    is_driver_worker=is_driver_worker,
+                    inherited_fds=inherited_fds,
+                )
+                unready_workers.append(unready_worker_handle)
+                if inherited_fds is not None:
+                    inherited_fds.append(unready_worker_handle.death_writer.fileno())
+                    inherited_fds.append(unready_worker_handle.ready_pipe.fileno())
 
             # Workers must be created before wait_for_ready to avoid
             # deadlock, since worker.init_device() does a device sync.
@@ -153,8 +133,7 @@ class AscendMultiprocExecutor(MultiprocExecutor):
                 for uw in unready_workers:
                     if uw.death_writer is not None:
                         uw.death_writer.close()
-                        if not vllm_version_is("0.17.0"):
-                            uw.death_writer = None
+                        uw.death_writer = None
                 self._ensure_worker_termination([uw.proc for uw in unready_workers])
 
         self.output_rank = self._get_output_rank()
@@ -192,73 +171,41 @@ class AscendWorkerProc(WorkerProc):
         inherited_fds: list[int] | None = None,
     ) -> UnreadyWorkerProcHandle:
         context = get_mp_context()
-        if vllm_version_is("0.17.0"):
-            # (reader, writer)
-            reader, writer = context.Pipe(duplex=False)
+        # Ready pipe to communicate readiness from child to parent
+        ready_reader, ready_writer = context.Pipe(duplex=False)
+        # Death pipe to let child detect parent process exit
+        death_reader, death_writer = context.Pipe(duplex=False)
+        if inherited_fds is not None:
+            inherited_fds = inherited_fds.copy()
+            inherited_fds.extend((ready_reader.fileno(), death_writer.fileno()))
+        process_kwargs = {
+            "vllm_config": vllm_config,
+            "local_rank": local_rank,
+            "rank": rank,
+            "distributed_init_method": distributed_init_method,
+            "input_shm_handle": input_shm_handle,
+            "ready_pipe": ready_writer,
+            "death_pipe": death_reader,
+            "shared_worker_lock": shared_worker_lock,
+            "is_driver_worker": is_driver_worker,
+            # Have the worker close parent end of this worker's pipes too
+            "inherited_fds": inherited_fds if inherited_fds is not None else [],
+        }
+        # Run EngineCore busy loop in background process.
+        proc = context.Process(
+            target=WorkerProc.worker_main,
+            kwargs=process_kwargs,
+            name=f"VllmWorker-{rank}",
+            daemon=False,
+        )
-
-            # Create death pipe to detect parent process exit
-            death_reader, death_writer = context.Pipe(duplex=False)
-
-            process_kwargs = {
-                "vllm_config": vllm_config,
-                "local_rank": local_rank,
-                "rank": rank,
-                "distributed_init_method": distributed_init_method,
-                "input_shm_handle": input_shm_handle,
-                "ready_pipe": (reader, writer),
-                "death_pipe": death_reader,
-                "shared_worker_lock": shared_worker_lock,
-                "is_driver_worker": is_driver_worker,
-            }
-            # Run EngineCore busy loop in background process.
-            proc = context.Process(
-                target=WorkerProc.worker_main,
-                kwargs=process_kwargs,
-                name=f"VllmWorker-{rank}",
-                daemon=False,
-            )
-
-            proc.start()
-            writer.close()
-            # Keep death_writer open in parent - when parent exits,
-            # death_reader in child will get EOFError
-            return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
-        else:
-            # Ready pipe to communicate readiness from child to parent
-            ready_reader, ready_writer = context.Pipe(duplex=False)
-            # Death pipe to let child detect parent process exit
-            death_reader, death_writer = context.Pipe(duplex=False)
-            if inherited_fds is not None:
-                inherited_fds = inherited_fds.copy()
-                inherited_fds.extend((ready_reader.fileno(), death_writer.fileno()))
-            process_kwargs = {
-                "vllm_config": vllm_config,
-                "local_rank": local_rank,
-                "rank": rank,
-                "distributed_init_method": distributed_init_method,
-                "input_shm_handle": input_shm_handle,
-                "ready_pipe": ready_writer,
-                "death_pipe": death_reader,
-                "shared_worker_lock": shared_worker_lock,
-                "is_driver_worker": is_driver_worker,
-                # Have the worker close parent end of this worker's pipes too
-                "inherited_fds": inherited_fds if inherited_fds is not None else [],
-            }
-            # Run EngineCore busy loop in background process.
-            proc = context.Process(
-                target=WorkerProc.worker_main,
-                kwargs=process_kwargs,
-                name=f"VllmWorker-{rank}",
-                daemon=False,
-            )
-
-            proc.start()
-            # Close child ends of pipes here in the parent
-            ready_writer.close()
-            death_reader.close()
-            # Keep death_writer open in parent - when parent exits,
-            # death_reader in child will get EOFError
-            return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)
+
+        proc.start()
+        # Close child ends of pipes here in the parent
+        ready_writer.close()
+        death_reader.close()
+        # Keep death_writer open in parent - when parent exits,
+        # death_reader in child will get EOFError
+        return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)
 
 
 vllm.v1.executor.multiproc_executor.MultiprocExecutor = AscendMultiprocExecutor
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index d1f86bf0..f8227f3f 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -126,7 +126,6 @@ from vllm_ascend.utils import (
     is_moe_model,
     lmhead_tp_enable,
     set_weight_prefetch_method,
-    vllm_version_is,
 )
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 from vllm_ascend.worker.pcp_utils import PCPManager
@@ -292,27 +291,15 @@ class NPUModelRunner(GPUModelRunner):
         if self.use_sparse_c8_indexer:
             self.c8_k_cache_dtype = torch.int8
             self.c8_k_scale_cache_dtype = torch.float16
-        from vllm_ascend.utils import vllm_version_is
 
-        if vllm_version_is("0.17.0"):
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                use_mla=self.model_config.use_mla,
-                use_sparse=self.use_sparse,
-                use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
-            )
-        else:
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                use_mla=self.model_config.use_mla,
-                use_sparse=self.use_sparse,
-                use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
-            )
+        self.attn_backend = get_attn_backend(
+            0,
+            self.dtype,
+            None,
+            use_mla=self.model_config.use_mla,
+            use_sparse=self.use_sparse,
+            use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
+        )
 
         try:
             self.dcp_size = get_dcp_group().world_size
@@ -1389,9 +1376,7 @@ class NPUModelRunner(GPUModelRunner):
             self.maybe_get_kv_connector_output(
                 scheduler_output,
                 **(
-                    {"clear_metadata": clear_kv_metadata}
-                    if vllm_version_is("0.17.0")
-                    else {"defer_finalize": not clear_kv_metadata}
+                    {"defer_finalize": not clear_kv_metadata}
                 ),
             ) as kv_connector_output,
         ):
@@ -2567,17 +2552,14 @@ class NPUModelRunner(GPUModelRunner):
             with get_tp_context(self.drafter):
                 self.drafter.load_model(self.model)
             if self.use_aux_hidden_state_outputs:
-                if vllm_version_is("0.17.0"):
-                    self.model.set_aux_hidden_state_layers(self.model.get_eagle3_aux_hidden_state_layers())
-                else:
-                    from vllm.model_executor.models.interfaces import supports_eagle3
+                from vllm.model_executor.models.interfaces import supports_eagle3
 
-                    if not supports_eagle3(self.model):
-                        raise RuntimeError(
-                            "Model does not support EAGLE3 interface but "
-                            "aux_hidden_state_outputs was requested"
-                        )
-                    aux_layers = self.model.get_eagle3_default_aux_hidden_state_layers()
-                    self.model.set_aux_hidden_state_layers(aux_layers)
+                if not supports_eagle3(self.model):
+                    raise RuntimeError(
+                        "Model does not support EAGLE3 interface but "
+                        "aux_hidden_state_outputs was requested"
+                    )
+                aux_layers = self.model.get_eagle3_default_aux_hidden_state_layers()
+                self.model.set_aux_hidden_state_layers(aux_layers)
 
         if self.lora_config:
             self.model = self.load_lora_model(self.model, self.vllm_config, self.device)
@@ -2617,7 +2599,9 @@ class NPUModelRunner(GPUModelRunner):
             self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
         ):
             assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer)
-            self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)
+            block_size = (self.kernel_block_sizes[0] if isinstance(
+                self.kernel_block_sizes, list) else self.kernel_block_sizes)
+            self.drafter.initialize_attn_backend(kv_cache_config, block_size)
 
         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(kv_caches)
@@ -3091,18 +3075,11 @@ class NPUModelRunner(GPUModelRunner):
             max_num_blocks.append(max_num_blocks_per_req)
 
         if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
-            if vllm_version_is("0.17.0"):
-                assert self.cache_config.cpu_offload_gb == 0, (
-                    "Cannot re-initialize the input batch when CPU weight "
-                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
-                    "for more details."
-                )
-            else:
-                assert self.offload_config.uva.cpu_offload_gb == 0, (
-                    "Cannot re-initialize the input batch when CPU weight "
-                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
-                    "for more details."
-                )
+            assert self.offload_config.uva.cpu_offload_gb == 0, (
+                "Cannot re-initialize the input batch when CPU weight "
+                "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
+                "for more details."
+            )
             self.input_batch = NPUInputBatch(
                 max_num_reqs=self.max_num_reqs,
                 max_model_len=max_model_len,