diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 01ce509f..5404cfb0 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -110,7 +110,7 @@ jobs:
       - name: Upload timing data
         uses: actions/upload-artifact@v4
-        if: ${{ inputs.continue_on_error == true }}
+        if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
         with:
           name: timing-data-singlecard-light-part${{ matrix.part }}
           path: test_timing_data.json
@@ -200,7 +200,7 @@ jobs:
       - name: Upload timing data
         uses: actions/upload-artifact@v4
-        if: ${{ inputs.continue_on_error == true }}
+        if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
         with:
           name: timing-data-singlecard-full-part${{ matrix.part }}
           path: test_timing_data.json
@@ -289,7 +289,7 @@ jobs:
       - name: Upload timing data
         uses: actions/upload-artifact@v4
-        if: ${{ inputs.continue_on_error == true }}
+        if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
         with:
           name: timing-data-2card-light-part${{ matrix.part }}
           path: test_timing_data.json
@@ -378,7 +378,7 @@ jobs:
       - name: Upload timing data
         uses: actions/upload-artifact@v4
-        if: ${{ inputs.continue_on_error == true }}
+        if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
         with:
           name: timing-data-2card-full-part${{ matrix.part }}
           path: test_timing_data.json
@@ -475,7 +475,7 @@ jobs:
      - name: Upload timing data
        uses: actions/upload-artifact@v4
-        if: ${{ inputs.continue_on_error == true }}
+        if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
        with:
          name: timing-data-4card-full-part${{ matrix.part }}
          path: test_timing_data.json
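The five hunks above add the same guard to every timing-data upload step: the artifact is kept only when `continue_on_error` is set and the run was not triggered by a pull request. Restated as plain Python for readability (the helper name is illustrative, not part of the workflow):

```python
def should_upload_timing_data(continue_on_error: bool, event_name: str) -> bool:
    """Mirror of the workflow expression:
    inputs.continue_on_error == true && github.event_name != 'pull_request'"""
    return continue_on_error and event_name != "pull_request"


assert should_upload_timing_data(True, "schedule")
assert not should_upload_timing_data(True, "pull_request")
assert not should_upload_timing_data(False, "push")
```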
diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml
index 4bf3fc08..357a1e32 100644
--- a/.github/workflows/bot_pr_create.yaml
+++ b/.github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=4034c3d32e30d01639459edd3ab486f56993876d
+          VLLM_COMMIT=4497431df654e46fb1fb5e64bf8611e762ae5d87
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

       - name: Checkout repository
diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint
index 64068c64..9116b5a6 100644
--- a/.github/workflows/dockerfiles/Dockerfile.lint
+++ b/.github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=4034c3d32e30d01639459edd3ab486f56993876d
+ARG VLLM_COMMIT=4497431df654e46fb1fb5e64bf8611e762ae5d87
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index dfa0b74b..b7f35825 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
+        vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 76664e0f..0ce86dfa 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 4034c3d32e30d01639459edd3ab486f56993876d
+      vllm: 4497431df654e46fb1fb5e64bf8611e762ae5d87
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
+        vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
     name: e2e-light
     strategy:
      matrix:
-        vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
+        vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
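The same vLLM commit pin appears in several workflow files here and again below in the codecov workflow and the docs table. A hypothetical one-off check like the following could catch a spot missed during a bump; the script and its location are assumptions, not part of this change:

```python
#!/usr/bin/env python3
"""Hypothetical helper: fail if any tracked file still references the old pin."""
import pathlib
import sys

STALE_PIN = "4034c3d32e30d01639459edd3ab486f56993876d"
SELF = pathlib.Path(__file__).resolve()


def main() -> int:
    offenders = []
    for path in pathlib.Path(".").rglob("*"):
        # Skip git internals, directories, and this script itself.
        if ".git" in path.parts or not path.is_file() or path.resolve() == SELF:
            continue
        try:
            text = path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            continue  # binary or unreadable file
        if STALE_PIN in text:
            offenders.append(str(path))
    for name in offenders:
        print(f"stale vLLM pin in {name}")
    return 1 if offenders else 0


if __name__ == "__main__":
    sys.exit(main())
```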
diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml
index dd4f2c84..a50a9c16 100644
--- a/.github/workflows/schedule_codecov_refresh.yaml
+++ b/.github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d]
+        vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index 65bcad44..9bab96b9 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
 | vLLM Ascend | vLLM         | Python          | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | 4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | 4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

 ## Release cadence
diff --git a/tests/ut/eplb/core/test_eplb_utils.py b/tests/ut/eplb/core/test_eplb_utils.py
index 1265ddba..f5388680 100644
--- a/tests/ut/eplb/core/test_eplb_utils.py
+++ b/tests/ut/eplb/core/test_eplb_utils.py
@@ -9,7 +9,6 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig, FusedMoE
 from vllm_ascend.ascend_config import init_ascend_config
 from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
-from vllm_ascend.utils import vllm_version_is

 # isort: on

@@ -22,38 +21,22 @@ class TestAscendConfig(unittest.TestCase):
             "eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2},
         }
         from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
-        if vllm_version_is("0.16.0"):
-            moe_parallel_config = FusedMoEParallelConfig(
-                2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", is_sequence_parallel=True, enable_eplb=True)
-            moe_config = FusedMoEConfig(
-                num_experts=8,
-                experts_per_token=8,
-                hidden_dim=8192,
-                intermediate_size_per_partition=5,
-                num_local_experts=8,
-                activation="silu",
-                device="npu",
-                routing_method=RoutingMethodType.Simulated,
-                moe_parallel_config=moe_parallel_config,
-                in_dtype=torch.float16,
-            )
-        else:
-            moe_parallel_config = FusedMoEParallelConfig(
-                2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
-                enable_eplb=True)
-            moe_config = FusedMoEConfig(
-                num_experts=8,
-                experts_per_token=8,
-                hidden_dim=8192,
-                intermediate_size_per_partition=5,
-                num_local_experts=8,
-                num_logical_experts=8,
-                activation="silu",
-                device="npu",
-                routing_method=RoutingMethodType.Simulated,
-                moe_parallel_config=moe_parallel_config,
-                in_dtype=torch.float16,
-            )
+        moe_parallel_config = FusedMoEParallelConfig(
+            2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
+            enable_eplb=True)
+        moe_config = FusedMoEConfig(
+            num_experts=8,
+            experts_per_token=8,
+            hidden_dim=8192,
+            intermediate_size_per_partition=5,
+            num_local_experts=8,
+            num_logical_experts=8,
+            activation="silu",
+            device="npu",
+            routing_method=RoutingMethodType.Simulated,
+            moe_parallel_config=moe_parallel_config,
+            in_dtype=torch.float16,
+        )
         moe_config.supports_eplb = True
         self.vllm_config = vllm_config
         self.moe_config = moe_config
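The test cleanup above drops a `vllm_version_is("0.16.0")` branch now that main no longer tracks 0.16.0. For readers unfamiliar with the helper: it gates code on the installed vLLM version. A minimal sketch of such a gate, an assumption about its shape only; the real implementation lives in `vllm_ascend.utils`:

```python
from importlib.metadata import version


def vllm_version_is_sketch(target: str) -> bool:
    """Exact-match gate on the installed vLLM version (sketch only)."""
    return version("vllm") == target
```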
diff --git a/vllm_ascend/_310p/model_runner_310p.py b/vllm_ascend/_310p/model_runner_310p.py
index 9e1e3985..19e3acde 100644
--- a/vllm_ascend/_310p/model_runner_310p.py
+++ b/vllm_ascend/_310p/model_runner_310p.py
@@ -152,6 +152,7 @@ class NPUModelRunner310(NPUModelRunner):
         remove_lora: bool = True,
         is_graph_capturing: bool = False,
         num_active_loras: int = 0,
+        profile_seq_lens: int | None = None,
     ):
         temporary_context = self.temporary_modify_uniform_decode_query_len() if uniform_decode else nullcontext()
         with temporary_context:
@@ -168,6 +169,7 @@ class NPUModelRunner310(NPUModelRunner):
                 remove_lora=remove_lora,
                 is_graph_capturing=is_graph_capturing,
                 num_active_loras=num_active_loras,
+                profile_seq_lens=profile_seq_lens,
             )

     def _check_and_update_cudagraph_mode(
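The 310P override above only forwards the new `profile_seq_lens` argument; the interesting part of the surrounding code is how it enables a temporary query-len override solely for uniform-decode runs by picking the context manager conditionally, with `contextlib.nullcontext()` as the no-op branch. A standalone sketch of that pattern (all names here are illustrative):

```python
from contextlib import contextmanager, nullcontext


@contextmanager
def temporary_override(obj, attr, value):
    """Set obj.attr to value for the duration of the with-block."""
    old = getattr(obj, attr)
    setattr(obj, attr, value)
    try:
        yield
    finally:
        setattr(obj, attr, old)


class Runner:
    query_len = 128

    def run(self, uniform_decode: bool) -> int:
        # Pick the context manager conditionally, exactly like the hunk above.
        ctx = temporary_override(self, "query_len", 1) if uniform_decode else nullcontext()
        with ctx:
            return self.query_len


assert Runner().run(uniform_decode=True) == 1
assert Runner().run(uniform_decode=False) == 128
```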
diff --git a/vllm_ascend/compilation/compiler_interface.py b/vllm_ascend/compilation/compiler_interface.py
index 22b6f8a1..d82259e5 100644
--- a/vllm_ascend/compilation/compiler_interface.py
+++ b/vllm_ascend/compilation/compiler_interface.py
@@ -17,6 +17,7 @@
 #
 import copy
 import functools
+import logging
 from collections.abc import Callable
 from typing import Any

@@ -31,7 +32,9 @@ from vllm.config import VllmConfig
 from vllm.config.utils import Range

 from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
-from vllm_ascend.utils import COMPILATION_PASS_KEY
+from vllm_ascend.utils import COMPILATION_PASS_KEY, vllm_version_is
+
+logger = logging.getLogger(__name__)


 def compile_fx(graph: GraphModule, example_inputs: list, inner_compile: Callable, decompositions: dict) -> Callable:
@@ -83,6 +86,11 @@ def npugraph_ex_compile(
     config.mode = "reduce-overhead"
     # execute FX graph in eager mode before graph mode to optimize FX graph.
     config.debug.run_eagerly = True
+    if not vllm_version_is("0.17.0"):
+        # Temporary fix for issues with inplace operations in some test cases such as test_whisper.
+        # Avoid rewriting torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default, which would fall back to CPU
+        # and cause a copy_between_host_and_device error.
+        config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
     if ascend_compilation_config.enable_static_kernel:
         config.experimental_config.aclgraph._aclnn_static_shape_kernel = True
     # According to the cudagraph_capture_size configuration, set the shapes
@@ -134,6 +142,22 @@ class AscendCompiler(CompilerInterface):
         # see https://github.com/pytorch/pytorch/issues/138980
         graph = copy.deepcopy(graph)

+        if not vllm_version_is("0.17.0"):
+            from torch._guards import detect_fake_mode
+
+            current_fake_mode = detect_fake_mode()
+            if current_fake_mode is not None:
+                example_inputs = [
+                    current_fake_mode.from_tensor(inp)
+                    if (
+                        isinstance(inp, torch.Tensor)
+                        and hasattr(inp, "fake_mode")
+                        and inp.fake_mode is not current_fake_mode
+                    )
+                    else inp
+                    for inp in example_inputs
+                ]
+
         ascend_compilation_config = get_ascend_config().ascend_compilation_config
         if ascend_compilation_config.enable_npugraph_ex:
             assert hasattr(self, "vllm_config")
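The new `AscendCompiler` hunk re-associates `example_inputs` with the currently active fake tensor mode before compilation, since tensors created under a stale `FakeTensorMode` cause mixed-mode failures inside the backend. A runnable sketch of the conversion using the same `from_tensor` API; the production code additionally filters on a tensor's existing `fake_mode` attribute:

```python
import torch
from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode

mode = FakeTensorMode()
example_inputs = [torch.randn(2, 3), 1.0]  # tensors mixed with plain scalars

# Convert tensor inputs under the active mode; pass everything else through.
remapped = [
    mode.from_tensor(x) if isinstance(x, torch.Tensor) else x
    for x in example_inputs
]

assert isinstance(remapped[0], FakeTensor)
assert remapped[0].fake_mode is mode  # tensor now belongs to the active mode
assert remapped[1] == 1.0             # non-tensors are untouched
```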
diff --git a/vllm_ascend/compilation/passes/muls_add_pass.py b/vllm_ascend/compilation/passes/muls_add_pass.py
index 0a379d17..74612a7a 100644
--- a/vllm_ascend/compilation/passes/muls_add_pass.py
+++ b/vllm_ascend/compilation/passes/muls_add_pass.py
@@ -18,17 +18,12 @@ from __future__ import annotations

 import torch
 from torch._inductor.pattern_matcher import PatternMatcherPass
+from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
 from vllm.config import VllmConfig
 from vllm.config.compilation import Range
 from vllm.logger import logger

 from vllm_ascend.compilation.passes.base_pattern import BasePattern
-from vllm_ascend.utils import vllm_version_is
-
-if vllm_version_is("0.15.0"):
-    from vllm.compilation.vllm_inductor_pass import VllmInductorPass  # type: ignore
-else:
-    from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass


 class MulsAddPattern(BasePattern):
diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py
index ba7a8f3d..6bda63f0 100644
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -22,6 +22,7 @@ import vllm_ascend.patch.platform.patch_kv_cache_interface  # noqa
 import vllm_ascend.patch.platform.patch_mamba_config  # noqa
 import vllm_ascend.patch.platform.patch_minimax_m2_config  # noqa
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
+import vllm_ascend.patch.platform.patch_torch_accelerator  # noqa

 if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
     import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
diff --git a/vllm_ascend/patch/platform/patch_multiproc_executor.py b/vllm_ascend/patch/platform/patch_multiproc_executor.py
index 50f74e60..da2c93ac 100644
--- a/vllm_ascend/patch/platform/patch_multiproc_executor.py
+++ b/vllm_ascend/patch/platform/patch_multiproc_executor.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import threading
 import weakref
 from collections import deque
@@ -19,6 +21,8 @@ from vllm.v1.executor.multiproc_executor import (
     set_multiprocessing_worker_envs,
 )

+from vllm_ascend.utils import vllm_version_is
+

 class AscendMultiprocExecutor(MultiprocExecutor):
     def _init_executor(self) -> None:
         # Call self.shutdown at exit to clean up
         # and ensure workers will be terminated.
         self._finalizer = weakref.finalize(self, self.shutdown)
         self.is_failed = False
-        self.shutdown_event = threading.Event()
+        if vllm_version_is("0.17.0"):
+            self.shutdown_event = threading.Event()
         self.failure_callback: FailureCallback | None = None

         tensor_parallel_size, pp_parallel_size, pcp_parallel_size = self._get_parallel_sizes()
@@ -66,11 +71,31 @@ class AscendMultiprocExecutor(MultiprocExecutor):
         success = False
         try:
             global_start_rank = self.local_world_size * self.parallel_config.node_rank_within_dp
-            for local_rank in range(self.local_world_size):
-                global_rank = global_start_rank + local_rank
-                is_driver_worker = self._is_driver_worker(global_rank)
-                unready_workers.append(
-                    AscendWorkerProc.make_worker_process(
+            if vllm_version_is("0.17.0"):
+                for local_rank in range(self.local_world_size):
+                    global_rank = global_start_rank + local_rank
+                    is_driver_worker = self._is_driver_worker(global_rank)
+                    unready_workers.append(
+                        AscendWorkerProc.make_worker_process(
+                            vllm_config=self.vllm_config,
+                            local_rank=local_rank,
+                            rank=global_rank,
+                            distributed_init_method=distributed_init_method,
+                            input_shm_handle=scheduler_output_handle,
+                            shared_worker_lock=shared_worker_lock,
+                            is_driver_worker=is_driver_worker,
+                        )
+                    )
+            else:
+                # When using fork, keep track of socket file descriptors that are
+                # inherited by the worker, so that we can close them in subsequent
+                # workers
+                inherited_fds: list[int] | None = [] if context.get_start_method() == "fork" else None
+
+                for local_rank in range(self.local_world_size):
+                    global_rank = global_start_rank + local_rank
+                    is_driver_worker = self._is_driver_worker(global_rank)
+                    unready_worker_handle = AscendWorkerProc.make_worker_process(
                         vllm_config=self.vllm_config,
                         local_rank=local_rank,
                         rank=global_rank,
@@ -78,8 +103,12 @@ class AscendMultiprocExecutor(MultiprocExecutor):
                         input_shm_handle=scheduler_output_handle,
                         shared_worker_lock=shared_worker_lock,
                         is_driver_worker=is_driver_worker,
+                        inherited_fds=inherited_fds,
                     )
-                )
+                    unready_workers.append(unready_worker_handle)
+                    if inherited_fds is not None:
+                        inherited_fds.append(unready_worker_handle.death_writer.fileno())
+                        inherited_fds.append(unready_worker_handle.ready_pipe.fileno())

             # Workers must be created before wait_for_ready to avoid
             # deadlock, since worker.init_device() does a device sync.
@@ -124,6 +153,8 @@ class AscendMultiprocExecutor(MultiprocExecutor):
                 for uw in unready_workers:
                     if uw.death_writer is not None:
                         uw.death_writer.close()
+                        if not vllm_version_is("0.17.0"):
+                            uw.death_writer = None
                 self._ensure_worker_termination([uw.proc for uw in unready_workers])

         self.output_rank = self._get_output_rank()
@@ -158,38 +189,76 @@ class AscendWorkerProc(WorkerProc):
         input_shm_handle,  # Receive SchedulerOutput
         shared_worker_lock: LockType,
         is_driver_worker: bool = False,
+        inherited_fds: list[int] | None = None,
     ) -> UnreadyWorkerProcHandle:
         context = get_mp_context()
-        # (reader, writer)
-        reader, writer = context.Pipe(duplex=False)
+        if vllm_version_is("0.17.0"):
+            # (reader, writer)
+            reader, writer = context.Pipe(duplex=False)
-        # Create death pipe to detect parent process exit
-        death_reader, death_writer = context.Pipe(duplex=False)
+            # Create death pipe to detect parent process exit
+            death_reader, death_writer = context.Pipe(duplex=False)
-        process_kwargs = {
-            "vllm_config": vllm_config,
-            "local_rank": local_rank,
-            "rank": rank,
-            "distributed_init_method": distributed_init_method,
-            "input_shm_handle": input_shm_handle,
-            "ready_pipe": (reader, writer),
-            "death_pipe": death_reader,
-            "shared_worker_lock": shared_worker_lock,
-            "is_driver_worker": is_driver_worker,
-        }
-        # Run EngineCore busy loop in background process.
-        proc = context.Process(
-            target=WorkerProc.worker_main,
-            kwargs=process_kwargs,
-            name=f"VllmWorker-{rank}",
-            daemon=False,
-        )
+            process_kwargs = {
+                "vllm_config": vllm_config,
+                "local_rank": local_rank,
+                "rank": rank,
+                "distributed_init_method": distributed_init_method,
+                "input_shm_handle": input_shm_handle,
+                "ready_pipe": (reader, writer),
+                "death_pipe": death_reader,
+                "shared_worker_lock": shared_worker_lock,
+                "is_driver_worker": is_driver_worker,
+            }
+            # Run EngineCore busy loop in background process.
+            proc = context.Process(
+                target=WorkerProc.worker_main,
+                kwargs=process_kwargs,
+                name=f"VllmWorker-{rank}",
+                daemon=False,
+            )
-        proc.start()
-        writer.close()
-        # Keep death_writer open in parent - when parent exits,
-        # death_reader in child will get EOFError
-        return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
+            proc.start()
+            writer.close()
+            # Keep death_writer open in parent - when parent exits,
+            # death_reader in child will get EOFError
+            return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
+        else:
+            # Ready pipe to communicate readiness from child to parent
+            ready_reader, ready_writer = context.Pipe(duplex=False)
+            # Death pipe to let child detect parent process exit
+            death_reader, death_writer = context.Pipe(duplex=False)
+            if inherited_fds is not None:
+                inherited_fds = inherited_fds.copy()
+                inherited_fds.extend((ready_reader.fileno(), death_writer.fileno()))
+            process_kwargs = {
+                "vllm_config": vllm_config,
+                "local_rank": local_rank,
+                "rank": rank,
+                "distributed_init_method": distributed_init_method,
+                "input_shm_handle": input_shm_handle,
+                "ready_pipe": ready_writer,
+                "death_pipe": death_reader,
+                "shared_worker_lock": shared_worker_lock,
+                "is_driver_worker": is_driver_worker,
+                # Have the worker close parent end of this worker's pipes too
+                "inherited_fds": inherited_fds if inherited_fds is not None else [],
+            }
+            # Run EngineCore busy loop in background process.
+            proc = context.Process(
+                target=WorkerProc.worker_main,
+                kwargs=process_kwargs,
+                name=f"VllmWorker-{rank}",
+                daemon=False,
+            )
+
+            proc.start()
+            # Close child ends of pipes here in the parent
+            ready_writer.close()
+            death_reader.close()
+            # Keep death_writer open in parent - when parent exits,
+            # death_reader in child will get EOFError
+            return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)


 vllm.v1.executor.multiproc_executor.MultiprocExecutor = AscendMultiprocExecutor
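The non-0.17.0 path above mirrors upstream vLLM's fork-safety bookkeeping: with the fork start method, each new worker inherits the parent's open pipe ends for all earlier workers, so the parent records those file descriptors and each child closes them at startup. A toy standalone model of that bookkeeping (the worker body and pipe usage are illustrative, not the vLLM `WorkerProc`):

```python
import multiprocessing as mp
import os


def toy_worker(rank: int, inherited_fds: list[int]) -> None:
    # Drop pipe ends that belong to earlier workers; without this, EOF-based
    # parent-death detection on those pipes would never fire.
    for fd in inherited_fds:
        try:
            os.close(fd)
        except OSError:
            pass


if __name__ == "__main__":
    ctx = mp.get_context("fork")
    inherited_fds: list[int] = []
    procs = []
    for rank in range(2):
        reader, writer = ctx.Pipe(duplex=False)
        # Pass a snapshot so this worker closes only *earlier* workers' fds.
        p = ctx.Process(target=toy_worker, args=(rank, inherited_fds.copy()))
        p.start()
        procs.append(p)
        # Record this worker's parent-side fds for the workers forked after it.
        inherited_fds.extend((reader.fileno(), writer.fileno()))
    for p in procs:
        p.join()
```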
diff --git a/vllm_ascend/patch/platform/patch_torch_accelerator.py b/vllm_ascend/patch/platform/patch_torch_accelerator.py
new file mode 100644
index 00000000..431dce4e
--- /dev/null
+++ b/vllm_ascend/patch/platform/patch_torch_accelerator.py
@@ -0,0 +1,8 @@
+import torch
+
+
+def patch_empty_cache() -> None:
+    torch.npu.empty_cache()
+
+
+torch.accelerator.empty_cache = patch_empty_cache
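The new platform patch redirects `torch.accelerator.empty_cache` to the NPU backend unconditionally. A slightly more defensive variant of the same monkey-patching pattern, keeping the original callable around; a sketch assuming a PyTorch build that ships `torch.accelerator.empty_cache`, as the pinned torch 2.9 does:

```python
import torch

# Keep a handle to the stock implementation so the patch stays reversible.
_original_empty_cache = torch.accelerator.empty_cache


def patched_empty_cache() -> None:
    # On Ascend builds torch.npu is present and owns the cache; elsewhere,
    # defer to whatever PyTorch shipped.
    if hasattr(torch, "npu"):
        torch.npu.empty_cache()
    else:
        _original_empty_cache()


torch.accelerator.empty_cache = patched_empty_cache
```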
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 62e7fe80..b9b6c644 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -46,7 +46,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params
 from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel
 from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
-from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled, vllm_version_is
+from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled

 # Currently we will fix block size to a small one since `num_reqs` can't be too large
 _PREPARE_INPUTS_BLOCK_SIZE = 4
@@ -615,24 +615,7 @@ class SpecDecodeBaseProposer(EagleProposer):
         if not self.parallel_drafting:
             for draft_step in range(1, self.num_speculative_tokens):
                 per_layer_attn_metadata = dict()
-                if vllm_version_is("0.17.0"):
-                    for attn_group in self.draft_attn_groups:
-                        common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
-                            draft_step,
-                            attn_metadata,
-                            common_attn_metadata,
-                            batch_size,
-                            num_input_tokens,
-                            used_update_positions,
-                            aclgraph_runtime_mode,
-                            ori_seq_len,
-                            slot_indices,
-                            mtp_slot_mapping,
-                            attn_group=attn_group,
-                        )
-                    for layer_name in self.attn_layer_names:
-                        per_layer_attn_metadata[layer_name] = attn_metadata
-                else:
+                for attn_group in self.draft_attn_groups:
                     common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
                         draft_step,
                         attn_metadata,
@@ -644,6 +627,7 @@ class SpecDecodeBaseProposer(EagleProposer):
                         num_input_tokens,
                         used_update_positions,
                         aclgraph_runtime_mode,
                         ori_seq_len,
                         slot_indices,
                         mtp_slot_mapping,
+                        attn_group=attn_group,
                     )
                 for layer_name in self.attn_layer_names:
                     per_layer_attn_metadata[layer_name] = attn_metadata
@@ -653,21 +637,7 @@ class SpecDecodeBaseProposer(EagleProposer):
         if not self.parallel_drafting:
             for draft_step in range(1, self.num_speculative_tokens):
                 per_layer_attn_metadata = dict()
-                if vllm_version_is("0.17.0"):
-                    for attn_group in self.draft_attn_groups:
-                        common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
-                            draft_step,
-                            attn_metadata,
-                            common_attn_metadata,
-                            batch_size,
-                            num_input_tokens,
-                            used_update_positions,
-                            aclgraph_runtime_mode,
-                            attn_group=attn_group,
-                        )
-                    for layer_name in self.attn_layer_names:
-                        per_layer_attn_metadata[layer_name] = attn_metadata
-                else:
+                for attn_group in self.draft_attn_groups:
                     common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
                         draft_step,
                         attn_metadata,
@@ -676,6 +646,7 @@ class SpecDecodeBaseProposer(EagleProposer):
                         num_input_tokens,
                         used_update_positions,
                         aclgraph_runtime_mode,
+                        attn_group=attn_group,
                     )
                 for layer_name in self.attn_layer_names:
                     per_layer_attn_metadata[layer_name] = attn_metadata
@@ -1082,16 +1053,11 @@ class SpecDecodeBaseProposer(EagleProposer):
         # 2.
         # Recompute the slot mapping based on the new positions and
         # rejection mask.
-        if vllm_version_is("0.17.0"):
-            # Use the first draft attention group's kv_cache_spec for block_size
-            # (all draft layers share the same kv-cache group)
-            assert len(self.draft_attn_groups) > 0
-            block_size = self.draft_attn_groups[0].kv_cache_spec.block_size
-        else:
-            if self.attn_metadata_builder is None:
-                block_size = self._get_attention_metadata_builder().kv_cache_spec.block_size
-            else:
-                block_size = self.attn_metadata_builder.kv_cache_spec.block_size
+        # Use the first draft attention group's kv_cache_spec for block_size
+        # (all draft layers share the same kv-cache group)
+        assert len(self.draft_attn_groups) > 0
+        block_size = self.draft_attn_groups[0].kv_cache_spec.block_size
+
         new_slot_mapping = compute_new_slot_mapping(
             cad=cad,
             new_positions=self.positions[:total_num_output_tokens],
@@ -1130,8 +1096,7 @@ class SpecDecodeBaseProposer(EagleProposer):
         attn_group=None,
     ):
         assert draft_step > 0
-        if vllm_version_is("0.17.0"):
-            assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group"
+        assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group"

         common_attn_metadata = self.shallow_copy_metadata(old_common_metadata)
         if draft_step == 1:
@@ -1243,13 +1208,7 @@ class SpecDecodeBaseProposer(EagleProposer):
         # Set the address of the attn_metadata.slot_mapping to the self.slot_mapping_group[idx]
         common_attn_metadata.slot_mapping = self.slot_mapping_group[draft_step]

-        if vllm_version_is("0.17.0"):
-            attn_metadata_builder = attn_group.get_metadata_builder()
-        else:
-            if self.attn_metadata_builder is None:
-                attn_metadata_builder = self._get_attention_metadata_builder()
-            else:
-                attn_metadata_builder = self.attn_metadata_builder
+        attn_metadata_builder = attn_group.get_metadata_builder()

         attn_metadata = attn_metadata_builder.build_for_drafting(
             common_attn_metadata=common_attn_metadata,
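With the version gates gone, every draft step follows one shape: ask each draft attention group's builder for fresh metadata, then fan the result out to all attention layer names. A toy model of that control flow (the dataclasses are illustrative stand-ins for the real attention groups and builders):

```python
from dataclasses import dataclass


@dataclass
class ToyBuilder:
    block_size: int

    def build_for_drafting(self, draft_step: int) -> dict:
        return {"step": draft_step, "block_size": self.block_size}


@dataclass
class ToyAttnGroup:
    builder: ToyBuilder

    def get_metadata_builder(self) -> ToyBuilder:
        return self.builder


draft_attn_groups = [ToyAttnGroup(ToyBuilder(block_size=128))]
attn_layer_names = ["model.layers.0.attn", "model.layers.1.attn"]

per_layer_attn_metadata: dict[str, dict] = {}
for draft_step in range(1, 3):
    for attn_group in draft_attn_groups:
        metadata = attn_group.get_metadata_builder().build_for_drafting(draft_step)
    for layer_name in attn_layer_names:
        per_layer_attn_metadata[layer_name] = metadata

assert per_layer_attn_metadata["model.layers.0.attn"]["step"] == 2
```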
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index f2ab3074..96f0f78b 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -412,15 +412,14 @@ class NPUModelRunner(GPUModelRunner):
         self.cpu_slot_mapping = None
         self.sampling_done_event: torch.npu.Event | None = None

-        if vllm_version_is("0.17.0"):
-            # self.cudagraph_batch_sizes sorts in ascending order.
-            if (
-                self.compilation_config.cudagraph_capture_sizes
-                and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-            ):
-                self.cudagraph_batch_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
-            else:
-                self.cudagraph_batch_sizes = []
+        # self.cudagraph_batch_sizes is sorted in ascending order.
+        if (
+            self.compilation_config.cudagraph_capture_sizes
+            and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+        ):
+            self.cudagraph_batch_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
+        else:
+            self.cudagraph_batch_sizes = []

         self.mamba_state_idx: dict[str, int] = {}
         self._mamba_copy_bufs: mamba_utils.MambaCopyBuffers | None = None
@@ -1376,7 +1375,12 @@ class NPUModelRunner(GPUModelRunner):
                 skip_compiled=has_encoder_input,
             ),
             self.maybe_get_kv_connector_output(
-                scheduler_output, clear_metadata=clear_kv_metadata
+                scheduler_output,
+                **(
+                    {"clear_metadata": clear_kv_metadata}
+                    if vllm_version_is("0.17.0")
+                    else {"defer_finalize": not clear_kv_metadata}
+                ),
             ) as kv_connector_output,
         ):
             hidden_states = self._model_forward(
@@ -2253,6 +2257,7 @@ class NPUModelRunner(GPUModelRunner):
         remove_lora: bool = True,
         is_graph_capturing: bool = False,
         num_active_loras: int = 0,
+        profile_seq_lens: int | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # only support eager mode and piecewise graph now
         assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes()
@@ -2359,11 +2364,14 @@ class NPUModelRunner(GPUModelRunner):
         # seq_lens. We use this seq_len only when capturing graph, and still use max_query_len
         # in inference. This will be removed once npu_fused_infer_attention_score
         # outperforms _npu_paged_attention on all cases.
-        seq_lens = (
-            SEQ_LEN_WITH_MAX_PA_WORKSPACE
-            if is_graph_capturing and using_paged_attention(num_tokens, self.vllm_config)
-            else max_query_len
-        )  # type: ignore[assignment]
+        if profile_seq_lens is not None:
+            seq_lens = profile_seq_lens
+        else:
+            seq_lens = (
+                SEQ_LEN_WITH_MAX_PA_WORKSPACE
+                if is_graph_capturing and using_paged_attention(num_tokens, self.vllm_config)
+                else max_query_len
+            )  # type: ignore[assignment]
         self.seq_lens.np[:num_reqs_padded] = seq_lens
         self.seq_lens.np[num_reqs_padded:] = 0
         self.seq_lens.copy_to_gpu()
@@ -2579,14 +2587,13 @@ class NPUModelRunner(GPUModelRunner):
         self.may_reinitialize_input_batch(kv_cache_config)
         kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)

-        if vllm_version_is("0.17.0"):
-            # TODO: refactor the logic of attention
-            # Initialize drafter attention group initialization
-            if self.speculative_config and (
-                self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
-            ):
-                assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer)
-                self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)
+        # TODO: refactor the logic of attention
+        # Initialize the drafter's attention groups
+        if self.speculative_config and (
+            self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
+        ):
+            assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer)
+            self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)

         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(kv_caches)
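The `maybe_get_kv_connector_output` call above selects its keyword argument by vLLM version with a dict-unpacking expression, passing `clear_metadata` on 0.17.0 and `defer_finalize` (with inverted meaning) otherwise. The same expression shape, stripped down to a runnable toy with illustrative names:

```python
def connector(output, *, clear_metadata=None, defer_finalize=None):
    return {"clear_metadata": clear_metadata, "defer_finalize": defer_finalize}


def call(output, clear_kv_metadata: bool, is_v017: bool):
    # Exactly one of the two keywords is supplied, chosen at call time.
    return connector(
        output,
        **(
            {"clear_metadata": clear_kv_metadata}
            if is_v017
            else {"defer_finalize": not clear_kv_metadata}
        ),
    )


assert call("out", True, is_v017=True) == {"clear_metadata": True, "defer_finalize": None}
assert call("out", True, is_v017=False) == {"clear_metadata": None, "defer_finalize": False}
```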
@@ -3031,11 +3038,18 @@ class NPUModelRunner(GPUModelRunner):
             max_num_blocks.append(max_num_blocks_per_req)

         if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
-            assert self.cache_config.cpu_offload_gb == 0, (
-                "Cannot re-initialize the input batch when CPU weight "
-                "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
-                "for more details."
-            )
+            if vllm_version_is("0.17.0"):
+                assert self.cache_config.cpu_offload_gb == 0, (
+                    "Cannot re-initialize the input batch when CPU weight "
+                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
+                    "for more details."
+                )
+            else:
+                assert self.offload_config.uva.cpu_offload_gb == 0, (
+                    "Cannot re-initialize the input batch when CPU weight "
+                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
+                    "for more details."
+                )
             self.input_batch = NPUInputBatch(
                 max_num_reqs=self.max_num_reqs,
                 max_model_len=max_model_len,
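The final hunk handles vLLM relocating the CPU offload setting from `cache_config.cpu_offload_gb` to `offload_config.uva.cpu_offload_gb` by branching on the version. Where a version check is not convenient, the same compatibility can be expressed structurally; a hedged sketch with illustrative names:

```python
def get_cpu_offload_gb(runner) -> float:
    """Read the CPU offload size from whichever config layout is present."""
    offload_config = getattr(runner, "offload_config", None)
    if offload_config is not None and hasattr(offload_config, "uva"):
        return offload_config.uva.cpu_offload_gb  # newer layout
    return runner.cache_config.cpu_offload_gb  # 0.17.0 layout
```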