[CI] Update vllm version to 20250922(5aeb925) (#3091)

### What this PR does / why we need it?

This PR bumps the vLLM commit hash to 5aeb925452 and fixes the issues introduced by the following upstream changes:

1. https://github.com/vllm-project/vllm/pull/25345 removed the v0 metadata.
2. https://github.com/vllm-project/vllm/pull/25332
3. https://github.com/vllm-project/vllm/pull/25334
4. https://github.com/vllm-project/vllm/pull/23558 updated the model registration logic: it now checks that every registered model has the `vllm.model_executor.models` path, which breaks our custom registration of the deepseek_v3 model (it doesn't exist in the vLLM model path). As a temporary workaround, the deepseek_v3 model registration is moved into deepseek_v2.

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main: 9607d5eb44

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-09-22 22:18:13 +08:00
parent 1c9f0fe26f
commit 02f89d166f
21 changed files with 58 additions and 92 deletions
--- a/vllm_ascend/core/schedule_config.py
+++ b/vllm_ascend/core/schedule_config.py
@@ -74,7 +74,7 @@ class AscendSchedulerConfig(SchedulerConfig):
        if self.send_delta_data:
            raise NotImplementedError(
                "currently AscendScheduler doesn't support send_delta_data.")
-        if self.delay_factor > 0:
+        if getattr(self, "scheduler_delay_factor", 0) > 0:
            raise NotImplementedError(
                "currently AscendScheduler doesn't support scheduler_delay_factor."
            )
--- a/vllm_ascend/models/__init__.py
+++ b/vllm_ascend/models/__init__.py
@@ -25,7 +25,7 @@ def register_model():

    ModelRegistry.register_model(
        "DeepseekV3ForCausalLM",
-        "vllm_ascend.models.deepseek_v3:CustomDeepseekV3ForCausalLM")
+        "vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM")

    ModelRegistry.register_model(
        "DeepSeekMTPModel",
--- a/vllm_ascend/models/deepseek_mtp.py
+++ b/vllm_ascend/models/deepseek_mtp.py
@@ -28,7 +28,6 @@ from vllm.config import (CacheConfig, ModelConfig, VllmConfig,
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.models.deepseek_mtp import (
@@ -36,7 +35,6 @@ from vllm.model_executor.models.deepseek_mtp import (
    SharedHead)
 from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer
 from vllm.model_executor.models.utils import maybe_prefix
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors


@@ -168,7 +166,7 @@ class CustomDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
    def compute_logits(
        self,
        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
+        sampling_metadata,  # type: ignore
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
        current_step_idx = (spec_step_idx % self.num_mtp_layers)
@@ -188,8 +186,6 @@ class CustomDeepSeekMTP(DeepSeekMTP):
                                                       prefix=maybe_prefix(
                                                           prefix, "model"))

-        self.sampler = get_sampler()
-
    def forward(
        self,
        input_ids: torch.Tensor,
@@ -204,4 +200,4 @@ class CustomDeepSeekMTP(DeepSeekMTP):
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   attn_metadata, previous_hidden_states,
                                   inputs_embeds, spec_step_idx)
-        return hidden_states
+        return hidden_states
--- a/vllm_ascend/models/deepseek_v2.py
+++ b/vllm_ascend/models/deepseek_v2.py
@@ -479,4 +479,8 @@ class CustomDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
        return loaded_params


+class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
+    pass
+
+
 DeepseekV2DecoderLayer.__init__ = CustomDeepseekV2DecoderLayer.__init__
--- a/vllm_ascend/models/deepseek_v3.py
+++ b/vllm_ascend/models/deepseek_v3.py
@@ -1,27 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2ForCausalLM
-
-
-class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
-    pass
--- a/vllm_ascend/models/qwen3_next.py
+++ b/vllm_ascend/models/qwen3_next.py
@@ -50,7 +50,6 @@ from vllm.model_executor.models.utils import (
    AutoWeightsLoader, PPMissingLayer, extract_layer_index,
    is_pp_missing_parameter, make_empty_intermediate_tensors_factory,
    make_layers, maybe_prefix)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
@@ -1079,9 +1078,9 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
            use_v1=True)

    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
+            self,
+            hidden_states: torch.Tensor,
+            sampling_metadata,  # type: ignore
    ) -> Optional[torch.Tensor]:
        return self.logits_processor(self.lm_head, hidden_states,
                                     sampling_metadata)
--- a/vllm_ascend/ops/vocab_parallel_embedding.py
+++ b/vllm_ascend/ops/vocab_parallel_embedding.py
@@ -253,3 +253,15 @@ class AscendLogitsProcessor(LogitsProcessor):
            logits = logits[..., :self.org_vocab_size]

        return logits
+
+    def forward(
+        self,
+        lm_head: VocabParallelEmbedding,
+        hidden_states: torch.Tensor,
+        # keep this for version compatibility
+        sampling_metadata=None,  # type: ignore
+        embedding_bias: Optional[torch.Tensor] = None,
+    ) -> Optional[torch.Tensor]:
+        return super().forward(lm_head,
+                               hidden_states,
+                               embedding_bias=embedding_bias)
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -142,7 +142,7 @@ class NPUPlatform(Platform):
                "functionality is currently suboptimal.")
            if not model_config.is_multimodal_model and \
                structured_outputs_config.backend == "auto" and \
-                not scheduler_config.delay_factor > 0 and \
+                not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
                not scheduler_config.send_delta_data and \
                scheduler_config.policy == "fcfs":
                ascend_scheduler_config.enabled = True
--- a/vllm_ascend/torchair/models/qwen2.py
+++ b/vllm_ascend/torchair/models/qwen2.py
@@ -40,7 +40,6 @@ from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM  # noqa: F401
 from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Model
 from vllm.model_executor.models.utils import (AutoWeightsLoader,
                                              PPMissingLayer, maybe_prefix)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

 from vllm_ascend.ascend_config import get_ascend_config
@@ -343,9 +342,9 @@ class CustomQwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
        return hidden_states

    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
+            self,
+            hidden_states: torch.Tensor,
+            sampling_metadata,  # type: ignore
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
--- a/vllm_ascend/torchair/models/torchair_deepseek_mtp.py
+++ b/vllm_ascend/torchair/models/torchair_deepseek_mtp.py
@@ -27,14 +27,12 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.models.deepseek_mtp import (
    DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer,
    SharedHead)
 from vllm.model_executor.models.utils import maybe_prefix
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

 from vllm_ascend.torchair.models.torchair_deepseek_v2 import \
@@ -172,7 +170,7 @@ class TorchairDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
    def compute_logits(
        self,
        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
+        sampling_metadata,  # type: ignore
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
        current_step_idx = (spec_step_idx % self.num_mtp_layers)
@@ -199,8 +197,6 @@ class TorchairDeepSeekMTP(DeepSeekMTP):
        self.model = TorchairDeepSeekMultiTokenPredictor(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"))

-        self.sampler = get_sampler()
-
    def forward(
        self,
        input_ids: torch.Tensor,
--- a/vllm_ascend/torchair/models/torchair_deepseek_v2.py
+++ b/vllm_ascend/torchair/models/torchair_deepseek_v2.py
@@ -52,7 +52,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -945,7 +944,6 @@ class TorchairDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
        else:
            self.lm_head = PPMissingLayer()
        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

--- a/vllm_ascend/torchair/models/torchair_pangu_moe.py
+++ b/vllm_ascend/torchair/models/torchair_pangu_moe.py
@@ -45,7 +45,6 @@ from vllm.model_executor.layers.linear import (LinearBase,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -53,9 +52,9 @@ from vllm.model_executor.models.interfaces import SupportsPP
 from vllm.model_executor.models.utils import (
    extract_layer_index, is_pp_missing_parameter,
    make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import IntermediateTensors
+from vllm.v1.sample.sampler import Sampler

 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
@@ -913,7 +912,7 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
        if self.config.tie_word_embeddings:
            self.lm_head.weight = self.model.embed_tokens.weight
        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
+        self.sampler = Sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

@@ -935,19 +934,19 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
        return hidden_states

    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
+            self,
+            hidden_states: torch.Tensor,
+            sampling_metadata,  # type: ignore
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
+            self,
+            logits: Optional[torch.Tensor],
+            sampling_metadata,  # type: ignore
+    ):
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2022,7 +2022,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                        num_scheduled_tokens_np, finished_sending,
                        finished_recving, kv_connector_output)
                sample_hidden_states = hidden_states[logits_indices]
-                logits = self.model.compute_logits(sample_hidden_states, None)
+                logits = self._compute_logits_wrapper(sample_hidden_states,
+                                                      None)
            if broadcast_pp_output:
                model_output_broadcast_data = {
                    "logits": logits.contiguous(),
@@ -2469,7 +2470,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                            dtype=torch.int32)

                def dummy_compute_logits(hidden_states):
-                    return self.model.compute_logits(
+                    return self._compute_logits_wrapper(
                        hidden_states[dummy_indices], None)

            with set_ascend_forward_context(
@@ -2539,13 +2540,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                logit_indices = np.cumsum(num_scheduled_tokens) - 1
                # TODO: need to rum a dummy sampler for generate task
                hidden_states = hidden_states[logit_indices]
-                output = self.model.compute_logits(hidden_states, None)
+                output = self._compute_logits_wrapper(hidden_states, None)

        NPUPlatform.synchronize()
        del hidden_states, output
        self.encoder_cache.clear()
        gc.collect()

+    def _compute_logits_wrapper(self, hidden_states, sampling_metadata):
+        if vllm_version_is("0.10.2"):
+            return self.model.compute_logits(hidden_states, sampling_metadata)
+        return self.model.compute_logits(hidden_states)
+
    def _dummy_pooler_run_task(
        self,
        hidden_states: torch.Tensor,
@@ -3516,7 +3522,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
            req_idx = self.input_batch.req_id_to_index[req_id]
            offset = self.query_start_loc_np[req_idx].item()
            prompt_hidden_states = hidden_states[offset:offset + num_logits]
-            logits = self.model.compute_logits(prompt_hidden_states, None)
+            logits = self._compute_logits_wrapper(prompt_hidden_states, None)

            # Get the "target" tokens for each index. For prompt at index i,
            # the token at prompt index i+1 is the "sampled" token we want