diff --git a/.github/workflows/_pre_commit.yml b/.github/workflows/_pre_commit.yml index dc848c2a..0e6b6ddb 100644 --- a/.github/workflows/_pre_commit.yml +++ b/.github/workflows/_pre_commit.yml @@ -38,6 +38,7 @@ jobs: repository: vllm-project/vllm path: ./vllm-empty ref: ${{ inputs.vllm }} + - uses: dorny/paths-filter@v3 id: filter with: @@ -62,10 +63,11 @@ jobs: run: | git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend pre-commit run --all-files --hook-stage manual --show-diff-on-failure + - name: Run mypy run: | PYTHONPATH="$PYTHONPATH:$(pwd)/vllm-empty" export PYTHONPATH git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend # Run mypy for Python 3.10, 3.11, 3.12 manually # Note: We are now separating mypy from pre-commit hooks for performance reasons. diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml index dfc8047f..776931bf 100644 --- a/.github/workflows/bot_pr_create.yaml +++ b/.github/workflows/bot_pr_create.yaml @@ -37,7 +37,7 @@ jobs: steps: - name: Get vLLM version run: | - VLLM_COMMIT=v0.15.0 + VLLM_COMMIT=d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV" - name: Checkout repository diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index 17801c1c..d8d52935 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. 
-ARG VLLM_COMMIT=v0.15.0 +ARG VLLM_COMMIT=d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a RUN git clone $VLLM_REPO /vllm-workspace/vllm && \ cd /vllm-workspace/vllm && \ git checkout $VLLM_COMMIT diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 99817ed1..5bd82e18 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -75,7 +75,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [v0.15.0] + vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 7a5eb5b9..4f8086b8 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: v0.15.0 + vllm: d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a changes: runs-on: linux-aarch64-a2-0 outputs: @@ -87,7 +87,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [v0.15.0] + vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -99,7 +99,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [v0.15.0] + vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. 
diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml index 6c761099..614cb78b 100644 --- a/.github/workflows/schedule_codecov_refresh.yaml +++ b/.github/workflows/schedule_codecov_refresh.yaml @@ -33,7 +33,7 @@ jobs: name: refresh codecov strategy: matrix: - vllm_version: [v0.15.0] + vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index 6976f24e..44c6b84e 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -55,7 +55,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |-------------|--------------|------------------|-------------|--------------------| -| main | v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | +| main | d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | ## Release cadence diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index fdad87df..365a36c9 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -922,4 +922,7 @@ PROMPT_CONFIGS = { @pytest.fixture(params=PROMPT_CONFIGS.keys()) def vl_config(request): - return PROMPT_CONFIGS[request.param] + config = PROMPT_CONFIGS[request.param] + if "skip" in config: + pytest.skip(config["skip"]) + return config diff --git a/tests/ut/eplb/core/test_eplb_utils.py b/tests/ut/eplb/core/test_eplb_utils.py index 553c715f..51133d80 100644 --- a/tests/ut/eplb/core/test_eplb_utils.py +++ b/tests/ut/eplb/core/test_eplb_utils.py @@ -9,6 +9,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig, FusedMoE from vllm_ascend.ascend_config import init_ascend_config from vllm_ascend.eplb.core.eplb_utils import init_eplb_config +from 
vllm_ascend.utils import vllm_version_is # isort: on @@ -21,7 +22,13 @@ class TestAscendConfig(unittest.TestCase): "eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2}, } from vllm.model_executor.layers.fused_moe.config import RoutingMethodType - moe_parallel_config = FusedMoEParallelConfig(2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True) + if vllm_version_is("0.15.0"): + moe_parallel_config = FusedMoEParallelConfig( + 2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True) + else: + moe_parallel_config = FusedMoEParallelConfig( + 2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", + is_sequence_parallel=False, enable_eplb=True) moe_config = FusedMoEConfig( num_experts=8, experts_per_token=8, diff --git a/tests/ut/ops/test_mla.py b/tests/ut/ops/test_mla.py index d4501145..22503a57 100644 --- a/tests/ut/ops/test_mla.py +++ b/tests/ut/ops/test_mla.py @@ -82,8 +82,13 @@ class TestAscendMultiHeadLatentAttention(TestBase): @patch("vllm_ascend.ops.mla.get_tensor_model_parallel_world_size") def test_initialization(self, mock_tp_size, mock_ascend_config, mock_get_vllm_config): + # Create a proper mock for MLAAttention that has the required attributes + mock_mla_attn = MagicMock() + mock_mla_attn.process_weights_after_loading = MagicMock() + mock_mla_attn.impl = MagicMock() + mock_mla_attn.impl.process_weights_after_loading = MagicMock() - with patch("vllm_ascend.ops.mla.MLAAttention", return_value=True): + with patch("vllm_ascend.ops.mla.MLAAttention", return_value=mock_mla_attn): mock_tp_size.return_value = 2 mock_ascend_config.return_value.enable_shared_expert_dp = True mock_vllm_config = MagicMock(spec=VllmConfig) @@ -126,7 +131,14 @@ class TestAscendMultiHeadLatentAttention(TestBase): num_hidden_layers=32, first_k_dense_replace=False) mock_get_vllm_config.return_value = mock_vllm_config mock_vllm_config.compilation_config = CompilationConfig() - with patch("vllm_ascend.ops.mla.MLAAttention", return_value=True): + + # Create a proper mock for MLAAttention that has 
the required attributes + mock_mla_attn = MagicMock() + mock_mla_attn.process_weights_after_loading = MagicMock() + mock_mla_attn.impl = MagicMock() + mock_mla_attn.impl.process_weights_after_loading = MagicMock() + + with patch("vllm_ascend.ops.mla.MLAAttention", return_value=mock_mla_attn): attn = AscendMultiHeadLatentAttention( hidden_size=self.hidden_size, num_heads=self.num_heads, diff --git a/tests/ut/quantization/test_modelslim_config.py b/tests/ut/quantization/test_modelslim_config.py index 667a7c0d..2a9e0215 100644 --- a/tests/ut/quantization/test_modelslim_config.py +++ b/tests/ut/quantization/test_modelslim_config.py @@ -1,6 +1,5 @@ from unittest.mock import MagicMock, patch -from vllm.attention.layer import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig from vllm.model_executor.layers.linear import LinearBase @@ -8,7 +7,12 @@ from vllm.model_executor.layers.linear import LinearBase from tests.ut.base import TestBase from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod from vllm_ascend.quantization.modelslim_config import AscendModelSlimConfig -from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD +from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is + +if vllm_version_is("v0.15.0"): + from vllm.attention.layer import Attention # type: ignore +else: + from vllm.model_executor.layers.attention import Attention class TestAscendModelSlimConfig(TestBase): diff --git a/tests/ut/spec_decode/test_eagle_proposer.py b/tests/ut/spec_decode/test_eagle_proposer.py index 0a6cbfb5..57eabef5 100644 --- a/tests/ut/spec_decode/test_eagle_proposer.py +++ b/tests/ut/spec_decode/test_eagle_proposer.py @@ -28,12 +28,15 @@ class TestEagleProposerInitialization(TestBase): self.vllm_config.model_config.dtype = torch.float16 self.vllm_config.model_config.max_model_len = 2048 self.vllm_config.model_config.uses_mrope = False + 
self.vllm_config.model_config.uses_xdrope_dim = 0 self.vllm_config.parallel_config.tensor_parallel_size = 1 + self.vllm_config.parallel_config.data_parallel_rank = 0 self.vllm_config.speculative_config.draft_tensor_parallel_size = 1 self.vllm_config.speculative_config.num_speculative_tokens = 2 self.vllm_config.speculative_config.speculative_token_tree = str([ (i + 1) * (0, ) for i in range(2) ]) + self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0 self.vllm_config.additional_config = None self.mock_cpugpubuffer = patch( @@ -141,12 +144,15 @@ class TestEagleProposerLoadModel(TestBase): self.vllm_config.model_config.dtype = torch.float16 self.vllm_config.model_config.max_model_len = 2048 self.vllm_config.model_config.uses_mrope = False + self.vllm_config.model_config.uses_xdrope_dim = 0 self.vllm_config.parallel_config.tensor_parallel_size = 1 + self.vllm_config.parallel_config.data_parallel_rank = 0 self.vllm_config.speculative_config.draft_tensor_parallel_size = 1 self.vllm_config.speculative_config.num_speculative_tokens = 2 self.vllm_config.speculative_config.speculative_token_tree = str([ (i + 1) * (0, ) for i in range(2) ]) + self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0 self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) @@ -285,12 +291,15 @@ class TestEagleProposerDummyRun(TestBase): self.vllm_config.model_config.dtype = torch.float16 self.vllm_config.model_config.max_model_len = 2048 self.vllm_config.model_config.uses_mrope = False + self.vllm_config.model_config.uses_xdrope_dim = 0 self.vllm_config.model_config.use_mla = False self.vllm_config.parallel_config.tensor_parallel_size = 1 + self.vllm_config.parallel_config.data_parallel_rank = 0 self.vllm_config.speculative_config.draft_tensor_parallel_size = 1 self.vllm_config.speculative_config.speculative_token_tree = str([ (i + 1) * (0, ) for i in range(4) ]) + 
self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0 self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) @@ -404,12 +413,15 @@ class TestEagleProposerHelperMethods(TestBase): self.vllm_config.model_config.dtype = torch.float16 self.vllm_config.model_config.max_model_len = 2048 self.vllm_config.model_config.uses_mrope = False + self.vllm_config.model_config.uses_xdrope_dim = 0 self.vllm_config.parallel_config.tensor_parallel_size = 1 + self.vllm_config.parallel_config.data_parallel_rank = 0 self.vllm_config.speculative_config.draft_tensor_parallel_size = 1 self.vllm_config.speculative_config.num_speculative_tokens = 2 self.vllm_config.speculative_config.speculative_token_tree = str([ (i + 1) * (0, ) for i in range(2) ]) + self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0 self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) diff --git a/tests/ut/spec_decode/test_mtp_proposer.py b/tests/ut/spec_decode/test_mtp_proposer.py index 29a55c06..c6d28185 100644 --- a/tests/ut/spec_decode/test_mtp_proposer.py +++ b/tests/ut/spec_decode/test_mtp_proposer.py @@ -34,6 +34,7 @@ class TestMtpProposer: config.speculative_config.draft_model_config = MagicMock() config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096 config.speculative_config.draft_model_config.uses_mrope = False + config.speculative_config.draft_model_config.uses_xdrope_dim = 0 config.speculative_config.speculative_token_tree = str([ (i + 1) * (0, ) for i in range(2) ]) @@ -42,9 +43,11 @@ class TestMtpProposer: config.model_config.dtype = torch.float16 config.model_config.max_model_len = 2048 config.model_config.uses_mrope = False + config.model_config.uses_xdrope_dim = 0 config.model_config.hf_text_config = None config.model_config.hf_config = None config.parallel_config.tensor_parallel_size = 1 + config.parallel_config.data_parallel_rank = 0 
config.speculative_config.draft_tensor_parallel_size = 1 config.load_config = None diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index e3c4b0e8..7d936ab2 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -1450,6 +1450,28 @@ class AscendMLAImpl(MLAAttentionImpl): def get_num_actual_tokens(self, attn_metadata: M): return attn_metadata.num_actual_tokens + def forward_mha( + self, + layer_name: str, + hidden_states: torch.Tensor, + kv_cache: tuple[torch.Tensor], + attn_metadata: M, + need_gather_q_kv: bool = False, + output: torch.Tensor | None = None, + ) -> torch.Tensor: + raise NotImplementedError("forward_mha is not supported for MLA attention. Use forward() instead.") + + def forward_mqa( + self, + layer_name: str, + hidden_states: torch.Tensor, + kv_cache: tuple[torch.Tensor], + attn_metadata: M, + need_gather_q_kv: bool = False, + output: torch.Tensor | None = None, + ) -> torch.Tensor: + raise NotImplementedError("forward_mqa is not supported for MLA attention. Use forward() instead.") + def forward( self, layer_name, diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py index 26f8c927..e2271956 100644 --- a/vllm_ascend/attention/sfa_v1.py +++ b/vllm_ascend/attention/sfa_v1.py @@ -1062,3 +1062,24 @@ class AscendSFAImpl(MLAAttentionImpl): torch.distributed.all_to_all_single(attn_output, send, group=get_tp_group().device_group) return attn_output, True + + def forward_mha( + self, + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + kv_c_and_k_pe_cache: torch.Tensor, + attn_metadata: M, + k_scale: torch.Tensor, + output: torch.Tensor, + ) -> None: + raise NotImplementedError("forward_mha is not supported for SFA attention. 
Use forward() instead.") + + def forward_mqa( + self, + q: torch.Tensor | tuple[torch.Tensor, torch.Tensor], + kv_c_and_k_pe_cache: torch.Tensor, + attn_metadata: M, + layer, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + raise NotImplementedError("forward_mqa is not supported for SFA attention. Use forward() instead.") diff --git a/vllm_ascend/compilation/npugraph_ex_passes/graphex_qknorm_rope_fusion_pass.py b/vllm_ascend/compilation/npugraph_ex_passes/graphex_qknorm_rope_fusion_pass.py index 8586e6d9..984a0579 100644 --- a/vllm_ascend/compilation/npugraph_ex_passes/graphex_qknorm_rope_fusion_pass.py +++ b/vllm_ascend/compilation/npugraph_ex_passes/graphex_qknorm_rope_fusion_pass.py @@ -18,7 +18,6 @@ import torch import torchair -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.config.compilation import Range from vllm.logger import logger @@ -27,6 +26,12 @@ from vllm_ascend.compilation.npugraph_ex_passes.utils.npugraph_ex_utils_check im check_and_register_fusion_pass, extra_stream_scope_check, ) +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("v0.15.0"): + from vllm.attention.layer import Attention # type: ignore +else: + from vllm.model_executor.layers.attention import Attention class GraphEXQKNormRopeFusionPattern: diff --git a/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py b/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py index f9dbf768..29b8ed84 100644 --- a/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py +++ b/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py @@ -18,12 +18,18 @@ import torch import torch._inductor.pattern_matcher as pm from torch._inductor.pattern_matcher import PatternMatcherPass, PatternPrettyPrinter -from vllm.attention.layer import Attention from vllm.compilation.vllm_inductor_pass import VllmInductorPass from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.config.compilation import 
Range from vllm.logger import logger +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("v0.15.0"): + from vllm.attention.layer import Attention # type: ignore +else: + from vllm.model_executor.layers.attention import Attention + class QKNormRopeFusionPattern: def __init__(self, vllm_config, head_dim, num_heads, num_kv_heads, eps=1e-6): diff --git a/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py b/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py index c9d2cc1d..614372da 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py @@ -10,7 +10,6 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional import torch -from vllm.attention.layer import Attention, MLAAttention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole @@ -27,6 +26,7 @@ from vllm_ascend.distributed.kv_transfer.kv_pool.cpu_offload.metadata import ( MetadataServerProc, MLAConfig, ) +from vllm_ascend.utils import vllm_version_is if TYPE_CHECKING: from vllm.forward_context import ForwardContext @@ -35,6 +35,11 @@ if TYPE_CHECKING: from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request +if vllm_version_is("v0.15.0"): + from vllm.attention.layer import Attention, MLAAttention # type: ignore +else: + from vllm.model_executor.layers.attention import Attention, MLAAttention + @dataclass class ReqMeta: diff --git a/vllm_ascend/kv_offload/cpu_npu.py b/vllm_ascend/kv_offload/cpu_npu.py index 76b1926f..98a4d892 100644 --- a/vllm_ascend/kv_offload/cpu_npu.py +++ b/vllm_ascend/kv_offload/cpu_npu.py @@ -6,6 +6,8 @@ from vllm.v1.attention.backend 
import AttentionBackend # type: ignore from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.worker.worker import OffloadingHandler, TransferResult, TransferSpec +from vllm_ascend.utils import vllm_version_is + logger = init_logger(__name__) @@ -153,12 +155,30 @@ class CpuNpuOffloadingHandler(OffloadingHandler): def get_finished(self) -> list[TransferResult]: results: list[TransferResult] = [] - for job_id, event in self.transfer_events.items(): - if event.query(): - results.append((job_id, True)) - self.events_pool.append(event) - for job_id, _ in results: - del self.transfer_events[job_id] + if vllm_version_is("v0.15.0"): + for job_id, event in self.transfer_events.items(): + if event.query(): + results.append((job_id, True)) + self.events_pool.append(event) + for job_id, _ in results: + del self.transfer_events[job_id] + else: + finished_job_ids = [] + for job_id, event in self.transfer_events.items(): + if event.query(): + results.append( + TransferResult( + job_id=job_id, + success=True, + transfer_size=None, + transfer_time=None, + transfer_type=None, + ) + ) + finished_job_ids.append(job_id) + self.events_pool.append(event) + for job_id in finished_job_ids: + del self.transfer_events[job_id] return results def wait(self, job_ids: set[int]) -> None: diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index d301b402..86be6d99 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -46,7 +46,8 @@ from vllm_ascend.ops.fused_moe.prepare_finalize import QuantType from vllm_ascend.utils import (AscendDeviceType, enable_sp, get_ascend_device_type, maybe_trans_nz, npu_stream_switch, shared_expert_dp_enabled, - shared_experts_calculation_stream) + shared_experts_calculation_stream, + vllm_version_is) @dataclass class FusedMoEResult: @@ -407,10 +408,13 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE): shared_experts: 
torch.nn.Module, gate: Optional[torch.nn.Module] = None, use_overlapped: bool = True, + routed_input_transform: Optional[torch.nn.Module] = None, **kwargs, ): AscendFusedMoE.__init__(self, **kwargs) + if not vllm_version_is("0.15.0"): + self._routed_input_transform = routed_input_transform self._shared_experts = shared_experts self.use_overlapped = use_overlapped self.shared_expert_stream = None diff --git a/vllm_ascend/ops/mla.py b/vllm_ascend/ops/mla.py index c2a3f576..bf3bda6c 100644 --- a/vllm_ascend/ops/mla.py +++ b/vllm_ascend/ops/mla.py @@ -23,7 +23,6 @@ from typing import Optional import torch from torch import nn -from vllm.attention.layer import MLAAttention from vllm.config import CacheConfig, get_current_vllm_config from vllm.distributed import get_tensor_model_parallel_world_size from vllm.forward_context import ForwardContext, get_forward_context @@ -34,6 +33,12 @@ from vllm.utils.torch_utils import direct_register_custom_op from vllm.v1.attention.backend import AttentionMetadata # type: ignore from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("v0.15.0"): + from vllm.attention.layer import MLAAttention # type: ignore +else: + from vllm.model_executor.layers.attention import MLAAttention class IndexerWrapper(nn.Module): @@ -125,6 +130,16 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper): o_proj=mla_modules.o_proj, ) + original_process_weights = self.mla_attn.process_weights_after_loading + + def wrapped_process_weights(act_dtype: torch.dtype): + from vllm_ascend.attention.sfa_v1 import AscendSFAImpl + if not isinstance(self.mla_attn.impl, AscendSFAImpl): + original_process_weights(act_dtype) + self.mla_attn.impl.process_weights_after_loading(act_dtype) + + self.mla_attn.process_weights_after_loading = wrapped_process_weights + compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise 
ValueError(f"Duplicate layer name: {prefix}") diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index d214dbad..2fd0498f 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -33,3 +33,4 @@ import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa import vllm_ascend.patch.worker.patch_rejection_sampler # noqa import vllm_ascend.patch.worker.patch_qwen3_next # noqa import vllm_ascend.patch.worker.patch_v2_egale # noqa +import vllm_ascend.patch.worker.patch_hunyuan_vl # noqa diff --git a/vllm_ascend/patch/worker/patch_hunyuan_vl.py b/vllm_ascend/patch/worker/patch_hunyuan_vl.py new file mode 100644 index 00000000..76371776 --- /dev/null +++ b/vllm_ascend/patch/worker/patch_hunyuan_vl.py @@ -0,0 +1,27 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#from collections.abc import Iterable + +from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor + +_original_call = HunYuanVLProcessor.__call__ + +def _patched_call(self, images=None, text=None, videos=None, **kwargs): + """Remove add_special_tokens requirement.""" + kwargs.pop("add_special_tokens", None) + return _original_call(self, images=images, text=text, videos=videos, **kwargs) + +HunYuanVLProcessor.__call__ = _patched_call \ No newline at end of file diff --git a/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py b/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py index e150d36f..cf97bc14 100644 --- a/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py +++ b/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py @@ -1,8 +1,12 @@ import torch import vllm.v1.worker.utils as utils -from vllm.attention.layer import Attention from vllm.v1.worker.utils import defaultdict, extract_layer_index +from vllm_ascend.utils import vllm_version_is +if vllm_version_is("v0.15.0"): + from vllm.attention.layer import Attention # type: ignore +else: + from vllm.model_executor.layers.attention import Attention # Without this patch, it will raise an exception when initialize kv_cache. # TODO To remove the patch, we need check why the original bind_kv_cache raises an NotImplementedError. 
diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py index 0303b0dc..227b97fb 100644 --- a/vllm_ascend/quantization/modelslim_config.py +++ b/vllm_ascend/quantization/modelslim_config.py @@ -401,7 +401,13 @@ class AscendModelSlimConfig(QuantizationConfig): self.packed_modules_mapping = packed_modules_model_mapping[ model_type] prefix = self.quant_prefix_mapper(model_type, prefix) - from vllm.attention.layer import Attention + + from vllm_ascend.utils import vllm_version_is + if vllm_version_is("v0.15.0"): + from vllm.attention.layer import Attention # type: ignore + else: + from vllm.model_executor.layers.attention import Attention + if prefix.startswith("language_model"): prefix = prefix.split('.', 1)[-1] if isinstance(layer, LinearBase): diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 8864155b..986d7a71 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -41,7 +41,7 @@ from vllm_ascend.ops.rotary_embedding import update_cos_sin from vllm_ascend.ops.triton.spec_decode.utils import \ prepare_inputs_padded_kernel from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num -from vllm_ascend.utils import enable_sp, shared_expert_dp_enabled, lmhead_tp_enable +from vllm_ascend.utils import enable_sp, shared_expert_dp_enabled, lmhead_tp_enable, vllm_version_is # Currently we will fix block size to a small one since `num_reqs` can't be too large _PREPARE_INPUTS_BLOCK_SIZE = 4 @@ -400,6 +400,12 @@ class EagleProposer(VllmEagleProposer): is_draft_model=True, draft_attn_metadatas=multi_steps_attn_metadata): + if not vllm_version_is("v0.15.0"): + # Reset MOE layer index before first model call + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 + self._runnable( num_input_tokens=num_tokens, batch_size=batch_size, @@ -559,6 +565,12 @@ class 
EagleProposer(VllmEagleProposer): is_draft_model=True, draft_attn_metadatas=multi_steps_attn_metadata): + if not vllm_version_is("v0.15.0"): + # Reset MOE layer index for forward pass + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 + draft_token_ids = self._runnable( num_input_tokens=num_input_tokens, batch_size=batch_size, @@ -660,6 +672,12 @@ class EagleProposer(VllmEagleProposer): forward_context.num_accept_tokens = batch_size for draft_step in range(self.num_speculative_tokens - 1): + if not vllm_version_is("v0.15.0"): + # Reset MOE layer index for each draft step iteration + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 + # Update the inputs. # cast to int32 is crucial when eagle model is compiled. # tensor.argmax() returns int64 by default. diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index 5a4326ab..03f21fed 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -18,7 +18,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata from vllm_ascend.compilation.acl_graph import ACLGraphWrapper from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla from vllm_ascend.spec_decode.eagle_proposer import EagleProposer -from vllm_ascend.utils import lmhead_tp_enable +from vllm_ascend.utils import lmhead_tp_enable, vllm_version_is class MtpProposer(EagleProposer): @@ -122,6 +122,11 @@ class MtpProposer(EagleProposer): batch_descriptor=batch_descriptor, is_draft_model=True, in_profile_run=is_profile): + if not vllm_version_is("v0.15.0"): + # Reset MOE layer index for each MTP step iteration + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 previous_hidden_states, positions = self.maybe_pad_and_reduce( previous_hidden_states, positions) 
self.model(input_ids=input_ids, @@ -330,6 +335,13 @@ class MtpProposer(EagleProposer): batch_descriptor=batch_descriptor, num_actual_tokens=num_tokens, is_draft_model=True): + + if not vllm_version_is("v0.15.0"): + # Reset MOE layer index for each MTP step to match all_moe_layers registration + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 + with record_function_or_nullcontext('mtp_forward'): model_kwargs = {} model_kwargs["attn_metadata"] = attn_metadata diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index fb73bf98..1ffefd0b 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -30,7 +30,6 @@ import numpy as np import torch import torch.distributed as dist import torch.nn as nn -from vllm.attention.layer import Attention, MLAAttention from vllm.compilation.cuda_graph import CUDAGraphStat from vllm.config import CompilationMode, CUDAGraphMode, VllmConfig, get_layers_from_vllm_config from vllm.distributed import get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather @@ -137,6 +136,12 @@ if TYPE_CHECKING: else: xgr = LazyLoader("xgr", globals(), "xgrammar") +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("v0.15.0"): + from vllm.attention.layer import Attention, MLAAttention # type: ignore +else: + from vllm.model_executor.layers.attention import Attention, MLAAttention # if true, allow tensor initialization and casting with internal format (e.g., NZ) torch.npu.config.allow_internal_format = True @@ -2026,6 +2031,7 @@ class NPUModelRunner(GPUModelRunner): remove_lora: bool = True, activate_lora: bool = False, is_graph_capturing: bool = False, + num_active_loras: int = 0, ) -> tuple[torch.Tensor, torch.Tensor]: # only support eager mode and piecewise graph now assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes()