diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml
index 420da50..2d74528 100644
--- a/.github/workflows/format_pr_body.yaml
+++ b/.github/workflows/format_pr_body.yaml
@@ -36,7 +36,7 @@ jobs:
 
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=9607d5eb449711b349d4c2bee0a9c94afcc7ed14
+          VLLM_COMMIT=5aeb9254521023f97aca292b3478aa7ff485ffb2
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
 
       - name: Checkout repository
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index 5a9d857..fbaeecb 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/pre-commit.yml
     with:
-      vllm: 9607d5eb449711b349d4c2bee0a9c94afcc7ed14
+      vllm: 5aeb9254521023f97aca292b3478aa7ff485ffb2
 
   changes:
     runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
       VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [9607d5eb449711b349d4c2bee0a9c94afcc7ed14, v0.10.2]
+        vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
     steps:
       - name: Install packages
         run: |
@@ -138,7 +138,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [9607d5eb449711b349d4c2bee0a9c94afcc7ed14, v0.10.2]
+        vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/vllm_ascend_test_full.yaml b/.github/workflows/vllm_ascend_test_full.yaml
index e98723c..1505096 100644
--- a/.github/workflows/vllm_ascend_test_full.yaml
+++ b/.github/workflows/vllm_ascend_test_full.yaml
@@ -68,7 +68,7 @@ jobs:
     name: e2e-full
     strategy:
      matrix:
-        vllm_version: [9607d5eb449711b349d4c2bee0a9c94afcc7ed14, v0.10.2]
+        vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/docs/source/developer_guide/modeling/adding_a_new_model.md b/docs/source/developer_guide/modeling/adding_a_new_model.md
index 117f559..5762fde 100644
--- a/docs/source/developer_guide/modeling/adding_a_new_model.md
+++ b/docs/source/developer_guide/modeling/adding_a_new_model.md
@@ -61,7 +61,6 @@ from torch import nn
 from vllm.attention import Attention
 from vllm.config import VllmConfig
 from vllm.sequence import IntermediateTensors
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 
 class CustomAttention(nn.Module):
     def __init__(self, vllm_config: VllmConfig, prefix: str):
diff --git a/tests/e2e/model_utils.py b/tests/e2e/model_utils.py
index 1a3ea5b..e5b353e 100644
--- a/tests/e2e/model_utils.py
+++ b/tests/e2e/model_utils.py
@@ -19,7 +19,12 @@
 
 from typing import Dict, List, Optional, Sequence, Tuple, Union
 
-from vllm.sequence import PromptLogprobs, SampleLogprobs
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.10.2"):
+    from vllm.sequence import PromptLogprobs, SampleLogprobs
+else:
+    from vllm.logprobs import PromptLogprobs, SampleLogprobs
 
 TokensText = Tuple[List[int], str]
diff --git a/tests/ut/core/test_schedule_config.py b/tests/ut/core/test_schedule_config.py
index 17c162f..b0942da 100644
--- a/tests/ut/core/test_schedule_config.py
+++ b/tests/ut/core/test_schedule_config.py
@@ -27,7 +27,6 @@ class TestAscendSchedulerConfig(TestBase):
             max_model_len=8192,
             is_multimodal_model=False,
             send_delta_data=False,
-            scheduler_delay_factor=0,
         )
 
     def test_initialize_from_config_with_default(self):
@@ -90,21 +89,6 @@ class TestAscendSchedulerConfig(TestBase):
             str(context.exception),
         )
 
-    def test_not_implemented_delay_factor(self):
-        with self.assertRaises(NotImplementedError) as context:
-            AscendSchedulerConfig.initialize_from_config(
-                self.basic_scheduler_config,
-                AscendSchedulerConfig(
-                    delay_factor=1,
-                    max_num_batched_tokens=2048,
-                    max_model_len=2048,
-                ),
-            )
-        self.assertIn(
-            "currently AscendScheduler doesn't support scheduler_delay_factor",
-            str(context.exception),
-        )
-
     def test_no_override(self):
         ascend_config = AscendSchedulerConfig.initialize_from_config(
             self.basic_scheduler_config, {})
diff --git a/tests/ut/models/test_deepseek_mtp.py b/tests/ut/models/test_deepseek_mtp.py
index 80525f2..1dc7c9c 100644
--- a/tests/ut/models/test_deepseek_mtp.py
+++ b/tests/ut/models/test_deepseek_mtp.py
@@ -168,8 +168,6 @@ class TestCustomDeepSeekMTP(PytestBase):
         mocker.patch(
             "vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__",
             return_value=None)
-        mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
-                     return_value=None)
         mocker.patch(
             "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
             return_value=None)
diff --git a/tests/ut/torchair/models/test_torchair_deepseek_mtp.py b/tests/ut/torchair/models/test_torchair_deepseek_mtp.py
index 7aafdfc..7feeba9 100644
--- a/tests/ut/torchair/models/test_torchair_deepseek_mtp.py
+++ b/tests/ut/torchair/models/test_torchair_deepseek_mtp.py
@@ -165,8 +165,6 @@ class TestTorchairDeepSeekMTP(PytestBase):
         mocker.patch(
             "vllm_ascend.torchair.models.torchair_deepseek_mtp.TorchairDeepSeekMultiTokenPredictorLayer.__call__",
             return_value=None)
-        mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
-                     return_value=None)
         mocker.patch(
             "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
             return_value=None)
diff --git a/vllm_ascend/core/schedule_config.py b/vllm_ascend/core/schedule_config.py
index 83d0675..dcd5d05 100644
--- a/vllm_ascend/core/schedule_config.py
+++ b/vllm_ascend/core/schedule_config.py
@@ -74,7 +74,7 @@ class AscendSchedulerConfig(SchedulerConfig):
         if self.send_delta_data:
             raise NotImplementedError(
                 "currently AscendScheduler doesn't support send_delta_data.")
-        if self.delay_factor > 0:
+        if getattr(self, "scheduler_delay_factor", 0) > 0:
             raise NotImplementedError(
                 "currently AscendScheduler doesn't support scheduler_delay_factor."
             )
diff --git a/vllm_ascend/models/__init__.py b/vllm_ascend/models/__init__.py
index 34529d5..fceaf2f 100644
--- a/vllm_ascend/models/__init__.py
+++ b/vllm_ascend/models/__init__.py
@@ -25,7 +25,7 @@ def register_model():
 
     ModelRegistry.register_model(
         "DeepseekV3ForCausalLM",
-        "vllm_ascend.models.deepseek_v3:CustomDeepseekV3ForCausalLM")
+        "vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM")
 
     ModelRegistry.register_model(
         "DeepSeekMTPModel",
diff --git a/vllm_ascend/models/deepseek_mtp.py b/vllm_ascend/models/deepseek_mtp.py
index 80bc66e..b3daa6c 100644
--- a/vllm_ascend/models/deepseek_mtp.py
+++ b/vllm_ascend/models/deepseek_mtp.py
@@ -28,7 +28,6 @@ from vllm.config import (CacheConfig, ModelConfig, VllmConfig,
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.models.deepseek_mtp import (
@@ -36,7 +35,6 @@ from vllm.model_executor.models.deepseek_mtp import (
     SharedHead)
 from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer
 from vllm.model_executor.models.utils import maybe_prefix
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
 
@@ -168,7 +166,7 @@ class CustomDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
+        sampling_metadata,  # type: ignore
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         current_step_idx = (spec_step_idx % self.num_mtp_layers)
@@ -188,8 +186,6 @@ class CustomDeepSeekMTP(DeepSeekMTP):
                                                    prefix=maybe_prefix(
                                                        prefix, "model"))
 
-        self.sampler = get_sampler()
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -204,4 +200,4 @@ class CustomDeepSeekMTP(DeepSeekMTP):
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata, previous_hidden_states,
                                    inputs_embeds, spec_step_idx)
-        return hidden_states
\ No newline at end of file
+        return hidden_states
diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py
index 33e5145..988de33 100644
--- a/vllm_ascend/models/deepseek_v2.py
+++ b/vllm_ascend/models/deepseek_v2.py
@@ -479,4 +479,8 @@ class CustomDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
         return loaded_params
 
 
+class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
+    pass
+
+
 DeepseekV2DecoderLayer.__init__ = CustomDeepseekV2DecoderLayer.__init__
diff --git a/vllm_ascend/models/deepseek_v3.py b/vllm_ascend/models/deepseek_v3.py
index 4d09ef0..e69de29 100644
--- a/vllm_ascend/models/deepseek_v3.py
+++ b/vllm_ascend/models/deepseek_v3.py
@@ -1,27 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2ForCausalLM
-
-
-class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
-    pass
diff --git a/vllm_ascend/models/qwen3_next.py b/vllm_ascend/models/qwen3_next.py
index 1606f61..b4e71bc 100644
--- a/vllm_ascend/models/qwen3_next.py
+++ b/vllm_ascend/models/qwen3_next.py
@@ -50,7 +50,6 @@ from vllm.model_executor.models.utils import (
     AutoWeightsLoader, PPMissingLayer, extract_layer_index,
     is_pp_missing_parameter, make_empty_intermediate_tensors_factory,
     make_layers, maybe_prefix)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
@@ -1079,9 +1078,9 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
                              use_v1=True)
 
     def compute_logits(
-            self,
-            hidden_states: torch.Tensor,
-            sampling_metadata: SamplingMetadata,
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata,  # type: ignore
     ) -> Optional[torch.Tensor]:
         return self.logits_processor(self.lm_head, hidden_states,
                                      sampling_metadata)
diff --git a/vllm_ascend/ops/vocab_parallel_embedding.py b/vllm_ascend/ops/vocab_parallel_embedding.py
index 0a7d7ef..6e317e9 100644
--- a/vllm_ascend/ops/vocab_parallel_embedding.py
+++ b/vllm_ascend/ops/vocab_parallel_embedding.py
@@ -253,3 +253,15 @@ class AscendLogitsProcessor(LogitsProcessor):
             logits = logits[..., :self.org_vocab_size]
         return logits
+
+    def forward(
+        self,
+        lm_head: VocabParallelEmbedding,
+        hidden_states: torch.Tensor,
+        # keep this for version compatibility
+        sampling_metadata=None,  # type: ignore
+        embedding_bias: Optional[torch.Tensor] = None,
+    ) -> Optional[torch.Tensor]:
+        return super().forward(lm_head,
+                               hidden_states,
+                               embedding_bias=embedding_bias)
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 7104621..4bd29b1 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -142,7 +142,7 @@ class NPUPlatform(Platform):
                 "functionality is currently suboptimal.")
             if not model_config.is_multimodal_model and \
                 structured_outputs_config.backend == "auto" and \
-                not scheduler_config.delay_factor > 0 and \
+                not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
                 not scheduler_config.send_delta_data and \
                 scheduler_config.policy == "fcfs":
                 ascend_scheduler_config.enabled = True
diff --git a/vllm_ascend/torchair/models/qwen2.py b/vllm_ascend/torchair/models/qwen2.py
index 3537aa8..56620dc 100644
--- a/vllm_ascend/torchair/models/qwen2.py
+++ b/vllm_ascend/torchair/models/qwen2.py
@@ -40,7 +40,6 @@ from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM  # noqa: F401
 from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Model
 from vllm.model_executor.models.utils import (AutoWeightsLoader,
                                               PPMissingLayer, maybe_prefix)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
 from vllm_ascend.ascend_config import get_ascend_config
@@ -343,9 +342,9 @@ class CustomQwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         return hidden_states
 
     def compute_logits(
-            self,
-            hidden_states: torch.Tensor,
-            sampling_metadata: SamplingMetadata,
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata,  # type: ignore
     ) -> Optional[torch.Tensor]:
         logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
diff --git a/vllm_ascend/torchair/models/torchair_deepseek_mtp.py b/vllm_ascend/torchair/models/torchair_deepseek_mtp.py
index 6cb98a5..6ef90f4 100644
--- a/vllm_ascend/torchair/models/torchair_deepseek_mtp.py
+++ b/vllm_ascend/torchair/models/torchair_deepseek_mtp.py
@@ -27,14 +27,12 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.models.deepseek_mtp import (
     DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer,
     SharedHead)
 from vllm.model_executor.models.utils import maybe_prefix
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
 from vllm_ascend.torchair.models.torchair_deepseek_v2 import \
@@ -172,7 +170,7 @@ class TorchairDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
+        sampling_metadata,  # type: ignore
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         current_step_idx = (spec_step_idx % self.num_mtp_layers)
@@ -199,8 +197,6 @@ class TorchairDeepSeekMTP(DeepSeekMTP):
         self.model = TorchairDeepSeekMultiTokenPredictor(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"))
 
-        self.sampler = get_sampler()
-
     def forward(
         self,
         input_ids: torch.Tensor,
diff --git a/vllm_ascend/torchair/models/torchair_deepseek_v2.py b/vllm_ascend/torchair/models/torchair_deepseek_v2.py
index 70877cb..1b34e84 100644
--- a/vllm_ascend/torchair/models/torchair_deepseek_v2.py
+++ b/vllm_ascend/torchair/models/torchair_deepseek_v2.py
@@ -52,7 +52,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -945,7 +944,6 @@ class TorchairDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
         else:
             self.lm_head = PPMissingLayer()
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
diff --git a/vllm_ascend/torchair/models/torchair_pangu_moe.py b/vllm_ascend/torchair/models/torchair_pangu_moe.py
index eb05760..e38dc78 100644
--- a/vllm_ascend/torchair/models/torchair_pangu_moe.py
+++ b/vllm_ascend/torchair/models/torchair_pangu_moe.py
@@ -45,7 +45,6 @@ from vllm.model_executor.layers.linear import (LinearBase,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -53,9 +52,9 @@ from vllm.model_executor.models.interfaces import SupportsPP
 from vllm.model_executor.models.utils import (
     extract_layer_index, is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import IntermediateTensors
+from vllm.v1.sample.sampler import Sampler
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
@@ -913,7 +912,7 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
+        self.sampler = Sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
@@ -935,19 +934,19 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
         return hidden_states
 
     def compute_logits(
-            self,
-            hidden_states: torch.Tensor,
-            sampling_metadata: SamplingMetadata,
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata,  # type: ignore
     ) -> Optional[torch.Tensor]:
         logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
     def sample(
-            self,
-            logits: Optional[torch.Tensor],
-            sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
+        self,
+        logits: Optional[torch.Tensor],
+        sampling_metadata,  # type: ignore
+    ):
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 3d2e6f3..242ff6e 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2022,7 +2022,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 num_scheduled_tokens_np, finished_sending,
                 finished_recving, kv_connector_output)
         sample_hidden_states = hidden_states[logits_indices]
-        logits = self.model.compute_logits(sample_hidden_states, None)
+        logits = self._compute_logits_wrapper(sample_hidden_states,
+                                              None)
         if broadcast_pp_output:
             model_output_broadcast_data = {
                 "logits": logits.contiguous(),
@@ -2469,7 +2470,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                        dtype=torch.int32)
 
         def dummy_compute_logits(hidden_states):
-            return self.model.compute_logits(
+            return self._compute_logits_wrapper(
                 hidden_states[dummy_indices], None)
 
         with set_ascend_forward_context(
@@ -2539,13 +2540,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         logit_indices = np.cumsum(num_scheduled_tokens) - 1
         # TODO: need to rum a dummy sampler for generate task
         hidden_states = hidden_states[logit_indices]
-        output = self.model.compute_logits(hidden_states, None)
+        output = self._compute_logits_wrapper(hidden_states, None)
 
         NPUPlatform.synchronize()
         del hidden_states, output
         self.encoder_cache.clear()
         gc.collect()
 
+    def _compute_logits_wrapper(self, hidden_states, sampling_metadata):
+        if vllm_version_is("0.10.2"):
+            return self.model.compute_logits(hidden_states, sampling_metadata)
+        return self.model.compute_logits(hidden_states)
+
     def _dummy_pooler_run_task(
         self,
         hidden_states: torch.Tensor,
@@ -3516,7 +3522,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             req_idx = self.input_batch.req_id_to_index[req_id]
             offset = self.query_start_loc_np[req_idx].item()
             prompt_hidden_states = hidden_states[offset:offset + num_logits]
-            logits = self.model.compute_logits(prompt_hidden_states, None)
+            logits = self._compute_logits_wrapper(prompt_hidden_states, None)
 
             # Get the "target" tokens for each index. For prompt at index i,
             # the token at prompt index i+1 is the "sampled" token we want