diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml index 90a3d5ce..1dea09f8 100644 --- a/.github/workflows/bot_pr_create.yaml +++ b/.github/workflows/bot_pr_create.yaml @@ -37,7 +37,7 @@ jobs: steps: - name: Get vLLM version run: | - VLLM_COMMIT=d68209402ddab3f54a09bc1f4de9a9495a283b60 + VLLM_COMMIT=dc917cceb877dfd13f98c538c4c96158047d98bd echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV - name: Checkout repository diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index b01e7b7f..9bb2a5b8 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. -ARG VLLM_COMMIT=d68209402ddab3f54a09bc1f4de9a9495a283b60 +ARG VLLM_COMMIT=dc917cceb877dfd13f98c538c4c96158047d98bd RUN git clone $VLLM_REPO /vllm-workspace/vllm && \ cd /vllm-workspace/vllm && \ git checkout $VLLM_COMMIT diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 5e875325..5937d221 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -75,7 +75,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.1] + vllm_version: [dc917cceb877dfd13f98c538c4c96158047d98bd, v0.14.1] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 390f6343..79d812cc 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: d68209402ddab3f54a09bc1f4de9a9495a283b60 + vllm: 
dc917cceb877dfd13f98c538c4c96158047d98bd changes: runs-on: linux-aarch64-a2-0 outputs: @@ -85,7 +85,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.1] + vllm_version: [dc917cceb877dfd13f98c538c4c96158047d98bd, v0.14.1] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -97,7 +97,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.1] + vllm_version: [dc917cceb877dfd13f98c538c4c96158047d98bd, v0.14.1] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml index 89f11f02..98f86291 100644 --- a/.github/workflows/schedule_codecov_refresh.yaml +++ b/.github/workflows/schedule_codecov_refresh.yaml @@ -33,7 +33,7 @@ jobs: name: refresh codecov strategy: matrix: - vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60] + vllm_version: [dc917cceb877dfd13f98c538c4c96158047d98bd] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index b5cabf41..801c8a2a 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -55,7 +55,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |-------------|--------------|------------------|-------------|--------------------| -| main | d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.1 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | +| main | 
dc917cceb877dfd13f98c538c4c96158047d98bd, v0.14.1 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | ## Release cadence diff --git a/tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py b/tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py index 0c384606..8f39f5a5 100644 --- a/tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py +++ b/tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py @@ -109,8 +109,6 @@ def _run_worker_process( quantization="ascend" if "W8A8" in model_path else None, enable_expert_parallel=True if "DeepSeek" in model_path else False, trust_remote_code=True, - # vllm enables async scheduling by default, remove below when vllm >= 0.14.0 - async_scheduling=False, ) # Expose model config to the main test process @@ -206,7 +204,8 @@ def test_models_aclgraph_capture_replay_metrics_dp2( # 2. Generation steps (max_tokens) # 3. Final step (likely EOS/idle step), no replay here total_steps = max_tokens + 1 # this includes the 1 and 2 above - expected_exec_model = (total_steps + 1) * dp_size + # vLLM enables async scheduling by default, which takes 1 more step + expected_exec_model = (total_steps + 1 + 1) * dp_size assert ( num_execute_model == expected_exec_model diff --git a/tests/e2e/multicard/2-cards/test_qwen3_performance.py b/tests/e2e/multicard/2-cards/test_qwen3_performance.py index ae93940b..ef30db68 100644 --- a/tests/e2e/multicard/2-cards/test_qwen3_performance.py +++ b/tests/e2e/multicard/2-cards/test_qwen3_performance.py @@ -42,6 +42,7 @@ vllm_bench_cases = { "random_input_len": 128, "max_concurrency": 40, "random_output_len": 100, + "temperature": 0.0, } # NOTE: Any changes for the baseline throughput should be approved by team members. 
diff --git a/tests/ut/attention/test_mla_v1.py b/tests/ut/attention/test_mla_v1.py index eb5234e6..1121ee7d 100755 --- a/tests/ut/attention/test_mla_v1.py +++ b/tests/ut/attention/test_mla_v1.py @@ -17,6 +17,7 @@ from vllm_ascend.attention.mla_v1 import (AscendMLABackend, AscendMLAPrefillMetadata, ChunkedContextMetadata) from vllm_ascend.attention.utils import AscendCommonAttentionMetadata +from vllm_ascend.utils import vllm_version_is class TestAscendMLABackend(TestBase): @@ -223,7 +224,9 @@ class TestAscendMLAMetadataBuilder(TestBase): ) self.parent_init_patcher = patch( - "vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__", + ("vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__" + if vllm_version_is('0.14.1') else + "vllm.model_executor.layers.attention.mla_attention.MLACommonMetadataBuilder.__init__"), mock_parent_init) self.parent_init_patcher.start() @@ -449,7 +452,9 @@ class TestAscendMLAMetadataBuilderBuild(TestBase): ) self.parent_init_patcher = patch( - "vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__", + ("vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__" + if vllm_version_is('0.14.1') else + "vllm.model_executor.layers.attention.mla_attention.MLACommonMetadataBuilder.__init__"), mock_parent_init) self.parent_init_patcher.start() diff --git a/tests/ut/attention/test_sfa_v1.py b/tests/ut/attention/test_sfa_v1.py index 015779a1..e1ad2a1e 100644 --- a/tests/ut/attention/test_sfa_v1.py +++ b/tests/ut/attention/test_sfa_v1.py @@ -13,7 +13,7 @@ if 'torch_npu._inductor' not in sys.modules: from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl, AscendSFAMetadata, AscendSFAMetadataBuilder) -from vllm_ascend.utils import enable_dsa_cp +from vllm_ascend.utils import enable_dsa_cp, vllm_version_is class TestAscendSFABackend(TestBase): @@ -124,7 +124,9 @@ class TestAscendSFAMetadataBuilder(TestBase): ) self.parent_init_patcher = patch( - 
"vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__", + ("vllm.v1.attention.backends.mla.common.MLACommonMetadataBuilder.__init__" + if vllm_version_is('0.14.1') else + "vllm.model_executor.layers.attention.mla_attention.MLACommonMetadataBuilder.__init__"), mock_parent_init) self.parent_init_patcher.start() diff --git a/tests/ut/eplb/core/test_eplb_utils.py b/tests/ut/eplb/core/test_eplb_utils.py index a20fa893..515715b9 100644 --- a/tests/ut/eplb/core/test_eplb_utils.py +++ b/tests/ut/eplb/core/test_eplb_utils.py @@ -9,6 +9,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig, FusedMoE from vllm_ascend.ascend_config import init_ascend_config from vllm_ascend.eplb.core.eplb_utils import init_eplb_config +from vllm_ascend.utils import vllm_version_is # isort: on @@ -20,8 +21,24 @@ class TestAscendConfig(unittest.TestCase): "refresh": True, "eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2}, } - moe_parallel_config = FusedMoEParallelConfig(2, 0, 1, 2, 1, 1, 1, 1, True, "hccl") - moe_config = FusedMoEConfig(8, 8, 8192, 5, moe_parallel_config, torch.float16) + if vllm_version_is('0.14.1'): + moe_parallel_config = FusedMoEParallelConfig(2, 0, 1, 2, 1, 1, 1, 1, True, "hccl") + moe_config = FusedMoEConfig(8, 8, 8192, 5, moe_parallel_config, torch.float16) + else: + from vllm.model_executor.layers.fused_moe.config import RoutingMethodType + moe_parallel_config = FusedMoEParallelConfig(2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True) + moe_config = FusedMoEConfig( + num_experts=8, + experts_per_token=8, + hidden_dim=8192, + intermediate_size_per_partition=5, + num_local_experts=8, + activation="silu", + device="npu", + routing_method=RoutingMethodType.Simulated, + moe_parallel_config=moe_parallel_config, + in_dtype=torch.float16, + ) moe_config.supports_eplb = True self.vllm_config = vllm_config self.moe_config = moe_config diff --git a/tests/ut/spec_decode/test_eagle_proposer.py 
b/tests/ut/spec_decode/test_eagle_proposer.py index 5f3c8eb8..0a6cbfb5 100644 --- a/tests/ut/spec_decode/test_eagle_proposer.py +++ b/tests/ut/spec_decode/test_eagle_proposer.py @@ -51,6 +51,7 @@ class TestEagleProposerInitialization(TestBase): def test_initialization_eagle_graph(self): self.vllm_config.speculative_config.method = "eagle" self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096 + self.vllm_config.speculative_config.draft_model_config.uses_mrope = False self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE self.vllm_config.model_config.enforce_eager = False self.vllm_config.model_config.uses_mrope = False @@ -65,10 +66,11 @@ class TestEagleProposerInitialization(TestBase): self.assertEqual(proposer.hidden_size, 4096) self.assertTrue(proposer.use_cuda_graph) - self.assertEqual(proposer.input_ids.shape, (1024, )) - self.assertEqual(proposer.positions.shape, (1024, )) - self.assertEqual(proposer.hidden_states.shape, (1024, 4096)) - self.assertEqual(proposer.arange.shape, (1024, )) + expected_max_num_tokens = proposer.max_num_tokens + self.assertEqual(proposer.input_ids.shape, (expected_max_num_tokens, )) + self.assertEqual(proposer.positions.shape, (expected_max_num_tokens, )) + self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 4096)) + self.assertEqual(proposer.arange.shape, (expected_max_num_tokens, )) def test_initialization_eagle3_enforce_eager(self): self.vllm_config.speculative_config.method = "eagle3" @@ -83,7 +85,8 @@ class TestEagleProposerInitialization(TestBase): self.assertEqual(proposer.hidden_size, 2048) self.assertFalse(proposer.use_cuda_graph) - self.assertEqual(proposer.hidden_states.shape, (1024, 2048)) + expected_max_num_tokens = proposer.max_num_tokens + self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048)) def test_initialization_eagle3_full_graph_async(self): self.vllm_config.speculative_config.method = "eagle3" @@ -100,7 +103,8 @@ 
class TestEagleProposerInitialization(TestBase): self.assertEqual(proposer.hidden_size, 2048) self.assertTrue(proposer.use_cuda_graph) - self.assertEqual(proposer.hidden_states.shape, (1024, 2048)) + expected_max_num_tokens = proposer.max_num_tokens + self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048)) def test_initialization_mtp_full_graph_async(self): self.vllm_config.speculative_config.method = "mtp" @@ -117,7 +121,8 @@ class TestEagleProposerInitialization(TestBase): self.assertEqual(proposer.hidden_size, 2048) self.assertFalse(proposer.use_cuda_graph) - self.assertEqual(proposer.hidden_states.shape, (1024, 2048)) + expected_max_num_tokens = proposer.max_num_tokens + self.assertEqual(proposer.hidden_states.shape, (expected_max_num_tokens, 2048)) class TestEagleProposerLoadModel(TestBase): diff --git a/tests/ut/spec_decode/test_mtp_proposer.py b/tests/ut/spec_decode/test_mtp_proposer.py index e800a8d5..29a55c06 100644 --- a/tests/ut/spec_decode/test_mtp_proposer.py +++ b/tests/ut/spec_decode/test_mtp_proposer.py @@ -33,6 +33,7 @@ class TestMtpProposer: config.speculative_config.method = "mtp" config.speculative_config.draft_model_config = MagicMock() config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096 + config.speculative_config.draft_model_config.uses_mrope = False config.speculative_config.speculative_token_tree = str([ (i + 1) * (0, ) for i in range(2) ]) diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py index d59f02be..faa11d2d 100644 --- a/vllm_ascend/ascend_forward_context.py +++ b/vllm_ascend/ascend_forward_context.py @@ -19,6 +19,7 @@ from vllm_ascend.utils import ( is_drafter_moe_model, is_moe_model, speculative_enable_dispatch_gmm_combine_decode, + vllm_version_is, ) @@ -42,20 +43,26 @@ def set_ascend_forward_context( batch_descriptor: BatchDescriptor | None = None, model_instance: torch.nn.Module = None, is_draft_model=False, + skip_compiled: bool = 
False, ): """A context manager that stores the current forward context, can be attention metadata, etc. We add some additional param into forward_context. """ - with set_forward_context( - attn_metadata, - vllm_config, - virtual_engine=virtual_engine, - num_tokens=num_tokens, - num_tokens_across_dp=num_tokens_across_dp, - cudagraph_runtime_mode=aclgraph_runtime_mode, - batch_descriptor=batch_descriptor, - ): + forward_context_kwargs = { + "attn_metadata": attn_metadata, + "vllm_config": vllm_config, + "virtual_engine": virtual_engine, + "num_tokens": num_tokens, + "num_tokens_across_dp": num_tokens_across_dp, + "cudagraph_runtime_mode": aclgraph_runtime_mode, + "batch_descriptor": batch_descriptor, + } + + if not vllm_version_is("0.14.1"): + forward_context_kwargs["skip_compiled"] = skip_compiled + + with set_forward_context(**forward_context_kwargs): forward_context = get_forward_context() from vllm_ascend.ops.fused_moe.moe_comm_method import get_moe_comm_method diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 5b81f3ba..4c8831bf 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -11,7 +11,6 @@ from vllm.logger import logger from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.utils.math_utils import cdiv, round_down from vllm.v1.attention.backend import AttentionBackend, AttentionCGSupport, MLAAttentionImpl # type: ignore -from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec @@ -45,12 +44,18 @@ from vllm_ascend.ops.layer_shard_linear import ( from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch from vllm_ascend.quantization.methods import AscendW8A8LinearMethod -from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, maybe_trans_nz, 
weak_ref_tensors +from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, maybe_trans_nz, vllm_version_is, weak_ref_tensors from vllm_ascend.worker.npu_input_batch import NPUInputBatch if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput +# isort: off +if vllm_version_is("0.14.1"): + from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder # type: ignore +else: + from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder +# isort: on MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024 BUILD_METADATA_STEP_PREFILL = 0 diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py index 5a01b1ca..56730cc5 100644 --- a/vllm_ascend/attention/sfa_v1.py +++ b/vllm_ascend/attention/sfa_v1.py @@ -12,7 +12,6 @@ from vllm.logger import logger from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.triton_utils import HAS_TRITON from vllm.v1.attention.backend import AttentionBackend, AttentionCGSupport, MLAAttentionImpl # type: ignore -from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder from vllm.v1.kv_cache_interface import AttentionSpec from vllm_ascend import envs @@ -46,11 +45,17 @@ from vllm_ascend.utils import ( enable_dsa_cp, enable_dsa_cp_with_layer_shard, maybe_trans_nz, + vllm_version_is, ) from vllm_ascend.worker.npu_input_batch import NPUInputBatch if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput +if vllm_version_is("0.14.1"): + from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder # type: ignore +else: + from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder +# isort: on # token count limits within bmm_transpose operator BMM_TRANS_MAX_SUPPORTED_TOKENS = 1024 diff --git a/vllm_ascend/patch/platform/patch_multiproc_executor.py b/vllm_ascend/patch/platform/patch_multiproc_executor.py index 540ad238..abc955e5 100644 --- a/vllm_ascend/patch/platform/patch_multiproc_executor.py 
+++ b/vllm_ascend/patch/platform/patch_multiproc_executor.py @@ -19,6 +19,8 @@ from vllm.v1.executor.multiproc_executor import ( set_multiprocessing_worker_envs, ) +from vllm_ascend.utils import vllm_version_is + class AscendMultiprocExecutor(MultiprocExecutor): def _init_executor(self) -> None: @@ -29,16 +31,7 @@ class AscendMultiprocExecutor(MultiprocExecutor): self.shutdown_event = threading.Event() self.failure_callback: FailureCallback | None = None - self.world_size = self.parallel_config.world_size - assert self.world_size % self.parallel_config.nnodes_within_dp == 0, ( - f"global world_size ({self.parallel_config.world_size}) must be " - f"divisible by nnodes_within_dp " - f"({self.parallel_config.nnodes_within_dp}). " - ) - self.local_world_size = self.parallel_config.local_world_size - tensor_parallel_size = self.parallel_config.tensor_parallel_size - pp_parallel_size = self.parallel_config.pipeline_parallel_size - pcp_parallel_size = self.parallel_config.prefill_context_parallel_size + tensor_parallel_size, pp_parallel_size, pcp_parallel_size = self._get_parallel_sizes() assert self.world_size == tensor_parallel_size * pp_parallel_size * pcp_parallel_size, ( f"world_size ({self.world_size}) must be equal to the " f"tensor_parallel_size ({tensor_parallel_size}) x pipeline" @@ -77,6 +70,7 @@ class AscendMultiprocExecutor(MultiprocExecutor): global_start_rank = self.local_world_size * self.parallel_config.node_rank_within_dp for local_rank in range(self.local_world_size): global_rank = global_start_rank + local_rank + is_driver_worker = self._is_driver_worker(global_rank) unready_workers.append( AscendWorkerProc.make_worker_process( vllm_config=self.vllm_config, @@ -85,6 +79,7 @@ class AscendMultiprocExecutor(MultiprocExecutor): distributed_init_method=distributed_init_method, input_shm_handle=scheduler_output_handle, shared_worker_lock=shared_worker_lock, + is_driver_worker=is_driver_worker, ) ) @@ -120,6 +115,9 @@ class 
AscendMultiprocExecutor(MultiprocExecutor): # Wait for all remote response mqs to be ready. for response_mq in self.response_mqs: response_mq.wait_until_ready() + self.futures_queue = deque[tuple[FutureWrapper, Callable]]() + self._post_init_executor() + success = True finally: if not success: @@ -130,10 +128,27 @@ class AscendMultiprocExecutor(MultiprocExecutor): uw.death_writer.close() self._ensure_worker_termination([uw.proc for uw in unready_workers]) - self.futures_queue = deque[tuple[FutureWrapper, Callable]]() - self.output_rank = self._get_output_rank() + def _get_parallel_sizes(self) -> tuple[int, int, int]: + self.world_size = self.parallel_config.world_size + assert self.world_size % self.parallel_config.nnodes_within_dp == 0, ( + f"global world_size ({self.parallel_config.world_size}) must be " + f"divisible by nnodes_within_dp " + f"({self.parallel_config.nnodes_within_dp}). " + ) + self.local_world_size = self.parallel_config.local_world_size + tp_size = self.parallel_config.tensor_parallel_size + pp_size = self.parallel_config.pipeline_parallel_size + pcp_size = self.parallel_config.prefill_context_parallel_size + return tp_size, pp_size, pcp_size + + def _post_init_executor(self) -> None: + pass + + def _is_driver_worker(self, rank: int) -> bool: + return rank % self.parallel_config.tensor_parallel_size == 0 + class AscendWorkerProc(WorkerProc): @staticmethod @@ -144,6 +159,7 @@ class AscendWorkerProc(WorkerProc): distributed_init_method: str, input_shm_handle, # Receive SchedulerOutput shared_worker_lock: LockType, + is_driver_worker: bool = False, ) -> UnreadyWorkerProcHandle: context = get_mp_context() # (reader, writer) @@ -162,6 +178,8 @@ class AscendWorkerProc(WorkerProc): "death_pipe": death_reader, "shared_worker_lock": shared_worker_lock, } + if not vllm_version_is("0.14.1"): + process_kwargs["is_driver_worker"] = is_driver_worker # Run EngineCore busy loop in background process. 
proc = context.Process( target=WorkerProc.worker_main, diff --git a/vllm_ascend/patch/worker/patch_v2_egale.py b/vllm_ascend/patch/worker/patch_v2_egale.py index 108df8cc..24470e63 100644 --- a/vllm_ascend/patch/worker/patch_v2_egale.py +++ b/vllm_ascend/patch/worker/patch_v2_egale.py @@ -21,7 +21,7 @@ import torch import vllm from vllm.v1.worker.gpu.input_batch import InputBatch from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample -from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata +from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.gpu.spec_decode.eagle import (prepare_eagle_decode, prepare_eagle_inputs) diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index a6fc106a..0f6e7c3c 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -41,7 +41,7 @@ from vllm_ascend.ops.rotary_embedding import update_cos_sin from vllm_ascend.ops.triton.spec_decode.utils import \ prepare_inputs_padded_kernel from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num -from vllm_ascend.utils import enable_sp, shared_expert_dp_enabled +from vllm_ascend.utils import enable_sp, shared_expert_dp_enabled, vllm_version_is # Currently we will fix block size to a small one since `num_reqs` can't be too large _PREPARE_INPUTS_BLOCK_SIZE = 4 @@ -455,7 +455,11 @@ class EagleProposer(VllmEagleProposer): self.input_ids[last_token_indices] = next_token_ids if self.use_cuda_graph and \ num_tokens <= self.runner.cudagraph_batch_sizes[-1]: - num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) + if vllm_version_is('0.14.1'): + num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) + else: + num_input_tokens = self.runner.cudagraph_dispatcher._bs_to_padded_graph_size[ + num_tokens] else: num_input_tokens = num_tokens diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index 05f03389..314d745e 
100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -17,7 +17,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata from vllm_ascend.compilation.acl_graph import ACLGraphWrapper from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla from vllm_ascend.spec_decode.eagle_proposer import EagleProposer -from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable +from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable, vllm_version_is class MtpProposer(EagleProposer): @@ -97,7 +97,7 @@ class MtpProposer(EagleProposer): attn_metadata = None input_ids = self.input_ids[:num_tokens] - positions = self.positions[:num_tokens] + positions = self._get_positions(num_tokens) previous_hidden_states = self.hidden_states[:num_tokens] for i in range(self.num_speculative_tokens): if i > 0 and not in_graph_capturing and aclgraph_runtime_mode == CUDAGraphMode.FULL: @@ -244,14 +244,18 @@ class MtpProposer(EagleProposer): # Note(qcs): We may need to refactor these check logics. 
if self.use_cuda_graph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[ -1]: - num_input_tokens = self.vllm_config.pad_for_cudagraph( - num_scheduled_tokens) + if vllm_version_is('0.14.1'): + num_input_tokens = self.vllm_config.pad_for_cudagraph( + num_scheduled_tokens) + else: + num_input_tokens = self.runner.cudagraph_dispatcher._bs_to_padded_graph_size[ + num_scheduled_tokens] else: # Eager mode, no padding needed num_input_tokens = num_tokens # copy inputs to buffer for cudagraph - self.positions[:num_tokens] = target_positions + self._set_positions(num_tokens, target_positions) self.hidden_states[:num_tokens] = target_hidden_states # eager/acl piecewise mode need to update num_tokens_across_dp (num_input_tokens, num_tokens_across_dp, @@ -311,7 +315,7 @@ class MtpProposer(EagleProposer): model_kwargs = {} model_kwargs["attn_metadata"] = attn_metadata input_ids = self.input_ids[:num_input_tokens] - positions = self.positions[:num_input_tokens] + positions = self._get_positions(num_input_tokens) hidden_states = self.hidden_states[:num_input_tokens] hidden_states, positions = self.maybe_pad_and_reduce( @@ -474,7 +478,7 @@ class MtpProposer(EagleProposer): # copy inputs to buffer for cudagraph self.input_ids[:batch_size] = input_ids - self.positions[:batch_size] = clamped_positions + self._set_positions(batch_size, clamped_positions) self.hidden_states[:hidden_states.shape[0]] = hidden_states if self.pcp_size * self.dcp_size > 1: # update local seq_len @@ -495,7 +499,10 @@ class MtpProposer(EagleProposer): else: attn_metadata_i.slot_mapping[:batch_size] = slot_mapping if self.speculative_config.disable_padded_drafter_batch: - self.positions[batch_size:num_input_tokens] = 0 + if self.uses_mrope: + self.mrope_positions[:, batch_size:num_input_tokens] = 0 + else: + self.positions[batch_size:num_input_tokens] = 0 self.input_ids[batch_size:num_input_tokens] = 0 self.hidden_states[batch_size:num_input_tokens].fill_(0) @@ -504,8 +511,8 @@ class 
MtpProposer(EagleProposer): prefill_metadata.seq_lens_list = prefill_metadata.seq_lens.tolist( ) prefill_metadata.context_lens = attn_metadata_i.seq_lens - prefill_metadata.input_positions = self.positions[: - num_input_tokens] + prefill_metadata.input_positions = self._get_positions( + num_input_tokens) prefill_metadata.max_seq_lens += 1 prefill_metadata.max_seq_lens = min( prefill_metadata.max_seq_lens, @@ -520,8 +527,8 @@ class MtpProposer(EagleProposer): decode_metadata.seq_lens_list = decode_seq_lens_list + [ 0 ] * (graph_pad_size - len(decode_seq_lens_list)) - decode_metadata.input_positions = self.positions[: - num_input_tokens] + decode_metadata.input_positions = self._get_positions( + num_input_tokens) decode_metadata.max_seq_lens += 1 decode_metadata.max_seq_lens = min( decode_metadata.max_seq_lens, diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index c63717c1..23947740 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -103,7 +103,7 @@ from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration, enable_sp, get_ascend_device_type, is_drafter_moe_model, is_moe_model, lmhead_tp_enable, maybe_trans_nz, - set_weight_prefetch_method) + set_weight_prefetch_method, vllm_version_is) from vllm_ascend.worker.npu_input_batch import NPUInputBatch from vllm_ascend.worker.pcp_utils import PCPManager @@ -587,8 +587,12 @@ class NPUModelRunner(GPUModelRunner): if (self.use_aclgraph and total_num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): # Add padding to the batch size. 
- num_input_tokens = self.vllm_config.pad_for_cudagraph( - total_num_scheduled_tokens) + if vllm_version_is('0.14.1'): + num_input_tokens = self.vllm_config.pad_for_cudagraph( + total_num_scheduled_tokens) + else: + num_input_tokens = self.cudagraph_dispatcher._bs_to_padded_graph_size[ + total_num_scheduled_tokens] elif self.use_aclgraph and enable_sp(self.vllm_config): # When using aclgraph, if total_num_scheduled_tokens exceeds the maximum graph size, # the model will fall back to running its FX graph in eager mode. @@ -1403,9 +1407,17 @@ class NPUModelRunner(GPUModelRunner): head_dim=self.model_config.get_vocab_size(), generators=self.input_batch.sampling_metadata.generators) + # Encoder-decoder models can only compile the pure decode steps where no + # encoder inputs are present. Use eager for the first pass. + num_encoder_reqs = len(scheduler_output.scheduled_encoder_inputs) + has_encoder_input = ( + self.model_config.is_encoder_decoder and num_encoder_reqs > 0 + ) + # Run forward pass with ProfileExecuteDuration().capture_async("forward"): - with set_ascend_forward_context( + with ( + set_ascend_forward_context( attn_metadata, self.vllm_config, num_tokens=num_input_tokens, @@ -1414,26 +1426,18 @@ class NPUModelRunner(GPUModelRunner): batch_descriptor=batch_descriptor, num_actual_tokens=scheduler_output. 
total_num_scheduled_tokens, - model_instance=self.model): - self.maybe_setup_kv_connector(scheduler_output) - + model_instance=self.model, + skip_compiled=has_encoder_input), + self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output, + ): hidden_states = self._generate_process_reqs_hidden_states( num_input_tokens, input_ids, positions, intermediate_tensors, inputs_embeds, model_kwargs) - self.maybe_wait_for_kv_save() - finished_sending, finished_recving = self.get_finished_kv_transfer( - scheduler_output) - aux_hidden_states = None if self.use_aux_hidden_state_outputs: hidden_states, aux_hidden_states = hidden_states - kv_connector_output = KVConnectorOutput( - finished_sending=finished_sending, - finished_recving=finished_recving) - finished_sending = None - finished_recving = None with ProfileExecuteDuration().capture_async("post process"): # Broadcast PP output for external_launcher (torchrun) # to make sure we are synced across pp ranks diff --git a/vllm_ascend/worker/v2/aclgraph_utils.py b/vllm_ascend/worker/v2/aclgraph_utils.py index 1fab82d2..ba39f040 100644 --- a/vllm_ascend/worker/v2/aclgraph_utils.py +++ b/vllm_ascend/worker/v2/aclgraph_utils.py @@ -22,7 +22,6 @@ from typing import Any import torch import torch.nn as nn from vllm.config import VllmConfig -from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.worker.gpu.block_table import BlockTables from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager @@ -31,6 +30,12 @@ from vllm.v1.worker.gpu.cudagraph_utils import \ from vllm.v1.worker.gpu.input_batch import InputBuffers from vllm_ascend.worker.v2.utils import torch_cuda_wrapper +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is('0.14.1'): + from vllm.v1.attention.backends.utils import AttentionMetadataBuilder +else: + from vllm.v1.attention.backend import AttentionMetadataBuilder class AclGraphManager(CudaGraphManager): 
diff --git a/vllm_ascend/worker/v2/attn_utils.py b/vllm_ascend/worker/v2/attn_utils.py index e8ed5a28..b058c54d 100644 --- a/vllm_ascend/worker/v2/attn_utils.py +++ b/vllm_ascend/worker/v2/attn_utils.py @@ -23,13 +23,18 @@ from typing import Any, Tuple import numpy as np import torch from vllm.config import VllmConfig -from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.kv_cache_interface import EncoderOnlyAttentionSpec, KVCacheConfig from vllm_ascend.attention.attention_mask import AttentionMaskBuilder from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata, AscendPrefillContextParallelMetadata) +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is('0.14.1'): + from vllm.v1.attention.backends.utils import AttentionMetadataBuilder +else: + from vllm.v1.attention.backend import AttentionMetadataBuilder _ATTENTION_MASK_BUILDER = None diff --git a/vllm_ascend/worker/v2/sample/penalties.py b/vllm_ascend/worker/v2/sample/penalties.py index aaec6cee..59aff72a 100644 --- a/vllm_ascend/worker/v2/sample/penalties.py +++ b/vllm_ascend/worker/v2/sample/penalties.py @@ -20,7 +20,7 @@ import torch from vllm.triton_utils import tl, triton -from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata +from vllm.v1.sample.metadata import SamplingMetadata @triton.jit diff --git a/vllm_ascend/worker/v2/sample/sampler.py b/vllm_ascend/worker/v2/sample/sampler.py index e54536c7..8989363b 100644 --- a/vllm_ascend/worker/v2/sample/sampler.py +++ b/vllm_ascend/worker/v2/sample/sampler.py @@ -17,7 +17,7 @@ import torch from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p -from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata +from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.gpu.sample.min_p import apply_min_p from vllm.v1.worker.gpu.sample.sampler import Sampler