[CI] Update vllm version to 20250922(5aeb925) (#3091)

### What this PR does / why we need it?
This PR bumps the vLLM commit hash to
5aeb925452
and fixes issues related to the following upstream PRs:
1. https://github.com/vllm-project/vllm/pull/25345 has removed the v0
metadata
2. https://github.com/vllm-project/vllm/pull/25332
3. https://github.com/vllm-project/vllm/pull/25334
4. https://github.com/vllm-project/vllm/pull/23558. Note that this vLLM
commit updates the model registration logic so that it checks that every
registered model has the `vllm.model_executor.models` path, which breaks our
custom registration of the deepseek_v3 model (it doesn't exist in the
vLLM model path). So I moved the deepseek_v3 model registration into
deepseek_v2 as a temporary workaround.

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-09-22 22:18:13 +08:00
committed by GitHub
parent 1c9f0fe26f
commit 02f89d166f
21 changed files with 58 additions and 92 deletions

View File

@@ -36,7 +36,7 @@ jobs:
- name: Get vLLM version - name: Get vLLM version
run: | run: |
VLLM_COMMIT=9607d5eb449711b349d4c2bee0a9c94afcc7ed14 VLLM_COMMIT=5aeb9254521023f97aca292b3478aa7ff485ffb2
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
- name: Checkout repository - name: Checkout repository

View File

@@ -42,7 +42,7 @@ jobs:
lint: lint:
uses: ./.github/workflows/pre-commit.yml uses: ./.github/workflows/pre-commit.yml
with: with:
vllm: 9607d5eb449711b349d4c2bee0a9c94afcc7ed14 vllm: 5aeb9254521023f97aca292b3478aa7ff485ffb2
changes: changes:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
VLLM_USE_MODELSCOPE: True VLLM_USE_MODELSCOPE: True
strategy: strategy:
matrix: matrix:
vllm_version: [9607d5eb449711b349d4c2bee0a9c94afcc7ed14, v0.10.2] vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
steps: steps:
- name: Install packages - name: Install packages
run: | run: |
@@ -138,7 +138,7 @@ jobs:
name: e2e-light name: e2e-light
strategy: strategy:
matrix: matrix:
vllm_version: [9607d5eb449711b349d4c2bee0a9c94afcc7ed14, v0.10.2] vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
# Note (yikun): If CI resource are limited we can split job into two chain jobs # Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes] needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request. # only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -68,7 +68,7 @@ jobs:
name: e2e-full name: e2e-full
strategy: strategy:
matrix: matrix:
vllm_version: [9607d5eb449711b349d4c2bee0a9c94afcc7ed14, v0.10.2] vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -61,7 +61,6 @@ from torch import nn
from vllm.attention import Attention from vllm.attention import Attention
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.model_executor.sampling_metadata import SamplingMetadata
class CustomAttention(nn.Module): class CustomAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str): def __init__(self, vllm_config: VllmConfig, prefix: str):

View File

@@ -19,7 +19,12 @@
from typing import Dict, List, Optional, Sequence, Tuple, Union from typing import Dict, List, Optional, Sequence, Tuple, Union
from vllm.sequence import PromptLogprobs, SampleLogprobs from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.10.2"):
from vllm.sequence import PromptLogprobs, SampleLogprobs
else:
from vllm.logprobs import PromptLogprobs, SampleLogprobs
TokensText = Tuple[List[int], str] TokensText = Tuple[List[int], str]

View File

@@ -27,7 +27,6 @@ class TestAscendSchedulerConfig(TestBase):
max_model_len=8192, max_model_len=8192,
is_multimodal_model=False, is_multimodal_model=False,
send_delta_data=False, send_delta_data=False,
scheduler_delay_factor=0,
) )
def test_initialize_from_config_with_default(self): def test_initialize_from_config_with_default(self):
@@ -90,21 +89,6 @@ class TestAscendSchedulerConfig(TestBase):
str(context.exception), str(context.exception),
) )
def test_not_implemented_delay_factor(self):
with self.assertRaises(NotImplementedError) as context:
AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config,
AscendSchedulerConfig(
delay_factor=1,
max_num_batched_tokens=2048,
max_model_len=2048,
),
)
self.assertIn(
"currently AscendScheduler doesn't support scheduler_delay_factor",
str(context.exception),
)
def test_no_override(self): def test_no_override(self):
ascend_config = AscendSchedulerConfig.initialize_from_config( ascend_config = AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config, {}) self.basic_scheduler_config, {})

View File

@@ -168,8 +168,6 @@ class TestCustomDeepSeekMTP(PytestBase):
mocker.patch( mocker.patch(
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__", "vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__",
return_value=None) return_value=None)
mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
return_value=None)
mocker.patch( mocker.patch(
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__", "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
return_value=None) return_value=None)

View File

@@ -165,8 +165,6 @@ class TestTorchairDeepSeekMTP(PytestBase):
mocker.patch( mocker.patch(
"vllm_ascend.torchair.models.torchair_deepseek_mtp.TorchairDeepSeekMultiTokenPredictorLayer.__call__", "vllm_ascend.torchair.models.torchair_deepseek_mtp.TorchairDeepSeekMultiTokenPredictorLayer.__call__",
return_value=None) return_value=None)
mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
return_value=None)
mocker.patch( mocker.patch(
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__", "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
return_value=None) return_value=None)

View File

@@ -74,7 +74,7 @@ class AscendSchedulerConfig(SchedulerConfig):
if self.send_delta_data: if self.send_delta_data:
raise NotImplementedError( raise NotImplementedError(
"currently AscendScheduler doesn't support send_delta_data.") "currently AscendScheduler doesn't support send_delta_data.")
if self.delay_factor > 0: if getattr(self, "scheduler_delay_factor", 0) > 0:
raise NotImplementedError( raise NotImplementedError(
"currently AscendScheduler doesn't support scheduler_delay_factor." "currently AscendScheduler doesn't support scheduler_delay_factor."
) )

View File

@@ -25,7 +25,7 @@ def register_model():
ModelRegistry.register_model( ModelRegistry.register_model(
"DeepseekV3ForCausalLM", "DeepseekV3ForCausalLM",
"vllm_ascend.models.deepseek_v3:CustomDeepseekV3ForCausalLM") "vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM")
ModelRegistry.register_model( ModelRegistry.register_model(
"DeepSeekMTPModel", "DeepSeekMTPModel",

View File

@@ -28,7 +28,6 @@ from vllm.config import (CacheConfig, ModelConfig, VllmConfig,
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding) ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.models.deepseek_mtp import ( from vllm.model_executor.models.deepseek_mtp import (
@@ -36,7 +35,6 @@ from vllm.model_executor.models.deepseek_mtp import (
SharedHead) SharedHead)
from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer
from vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.models.utils import maybe_prefix
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
@@ -168,7 +166,7 @@ class CustomDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
def compute_logits( def compute_logits(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata, sampling_metadata, # type: ignore
spec_step_idx: int = 0, spec_step_idx: int = 0,
) -> torch.Tensor: ) -> torch.Tensor:
current_step_idx = (spec_step_idx % self.num_mtp_layers) current_step_idx = (spec_step_idx % self.num_mtp_layers)
@@ -188,8 +186,6 @@ class CustomDeepSeekMTP(DeepSeekMTP):
prefix=maybe_prefix( prefix=maybe_prefix(
prefix, "model")) prefix, "model"))
self.sampler = get_sampler()
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,

View File

@@ -479,4 +479,8 @@ class CustomDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
return loaded_params return loaded_params
class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
pass
DeepseekV2DecoderLayer.__init__ = CustomDeepseekV2DecoderLayer.__init__ DeepseekV2DecoderLayer.__init__ = CustomDeepseekV2DecoderLayer.__init__

View File

@@ -1,27 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2ForCausalLM
class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
pass

View File

@@ -50,7 +50,6 @@ from vllm.model_executor.models.utils import (
AutoWeightsLoader, PPMissingLayer, extract_layer_index, AutoWeightsLoader, PPMissingLayer, extract_layer_index,
is_pp_missing_parameter, make_empty_intermediate_tensors_factory, is_pp_missing_parameter, make_empty_intermediate_tensors_factory,
make_layers, maybe_prefix) make_layers, maybe_prefix)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
@@ -1079,9 +1078,9 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
use_v1=True) use_v1=True)
def compute_logits( def compute_logits(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata, sampling_metadata, # type: ignore
) -> Optional[torch.Tensor]: ) -> Optional[torch.Tensor]:
return self.logits_processor(self.lm_head, hidden_states, return self.logits_processor(self.lm_head, hidden_states,
sampling_metadata) sampling_metadata)

View File

@@ -253,3 +253,15 @@ class AscendLogitsProcessor(LogitsProcessor):
logits = logits[..., :self.org_vocab_size] logits = logits[..., :self.org_vocab_size]
return logits return logits
def forward(
self,
lm_head: VocabParallelEmbedding,
hidden_states: torch.Tensor,
# keep this for version compatibility
sampling_metadata=None, # type: ignore
embedding_bias: Optional[torch.Tensor] = None,
) -> Optional[torch.Tensor]:
return super().forward(lm_head,
hidden_states,
embedding_bias=embedding_bias)

View File

@@ -142,7 +142,7 @@ class NPUPlatform(Platform):
"functionality is currently suboptimal.") "functionality is currently suboptimal.")
if not model_config.is_multimodal_model and \ if not model_config.is_multimodal_model and \
structured_outputs_config.backend == "auto" and \ structured_outputs_config.backend == "auto" and \
not scheduler_config.delay_factor > 0 and \ not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
not scheduler_config.send_delta_data and \ not scheduler_config.send_delta_data and \
scheduler_config.policy == "fcfs": scheduler_config.policy == "fcfs":
ascend_scheduler_config.enabled = True ascend_scheduler_config.enabled = True

View File

@@ -40,7 +40,6 @@ from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM # noqa: F401
from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Model from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Model
from vllm.model_executor.models.utils import (AutoWeightsLoader, from vllm.model_executor.models.utils import (AutoWeightsLoader,
PPMissingLayer, maybe_prefix) PPMissingLayer, maybe_prefix)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_config import get_ascend_config
@@ -343,9 +342,9 @@ class CustomQwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
return hidden_states return hidden_states
def compute_logits( def compute_logits(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata, sampling_metadata, # type: ignore
) -> Optional[torch.Tensor]: ) -> Optional[torch.Tensor]:
logits = self.logits_processor(self.lm_head, hidden_states, logits = self.logits_processor(self.lm_head, hidden_states,
sampling_metadata) sampling_metadata)

View File

@@ -27,14 +27,12 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding) ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.models.deepseek_mtp import ( from vllm.model_executor.models.deepseek_mtp import (
DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer, DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer,
SharedHead) SharedHead)
from vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.models.utils import maybe_prefix
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm_ascend.torchair.models.torchair_deepseek_v2 import \ from vllm_ascend.torchair.models.torchair_deepseek_v2 import \
@@ -172,7 +170,7 @@ class TorchairDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
def compute_logits( def compute_logits(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata, sampling_metadata, # type: ignore
spec_step_idx: int = 0, spec_step_idx: int = 0,
) -> torch.Tensor: ) -> torch.Tensor:
current_step_idx = (spec_step_idx % self.num_mtp_layers) current_step_idx = (spec_step_idx % self.num_mtp_layers)
@@ -199,8 +197,6 @@ class TorchairDeepSeekMTP(DeepSeekMTP):
self.model = TorchairDeepSeekMultiTokenPredictor( self.model = TorchairDeepSeekMultiTokenPredictor(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"))
self.sampler = get_sampler()
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,

View File

@@ -52,7 +52,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding) ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.model_loader.weight_utils import (
@@ -945,7 +944,6 @@ class TorchairDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
else: else:
self.lm_head = PPMissingLayer() self.lm_head = PPMissingLayer()
self.logits_processor = LogitsProcessor(config.vocab_size) self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = get_sampler()
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors) self.model.make_empty_intermediate_tensors)

View File

@@ -45,7 +45,6 @@ from vllm.model_executor.layers.linear import (LinearBase,
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding) ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -53,9 +52,9 @@ from vllm.model_executor.models.interfaces import SupportsPP
from vllm.model_executor.models.utils import ( from vllm.model_executor.models.utils import (
extract_layer_index, is_pp_missing_parameter, extract_layer_index, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.v1.sample.sampler import Sampler
from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
@@ -913,7 +912,7 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
if self.config.tie_word_embeddings: if self.config.tie_word_embeddings:
self.lm_head.weight = self.model.embed_tokens.weight self.lm_head.weight = self.model.embed_tokens.weight
self.logits_processor = LogitsProcessor(config.vocab_size) self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = get_sampler() self.sampler = Sampler()
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors) self.model.make_empty_intermediate_tensors)
@@ -935,19 +934,19 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
return hidden_states return hidden_states
def compute_logits( def compute_logits(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata, sampling_metadata, # type: ignore
) -> Optional[torch.Tensor]: ) -> Optional[torch.Tensor]:
logits = self.logits_processor(self.lm_head, hidden_states, logits = self.logits_processor(self.lm_head, hidden_states,
sampling_metadata) sampling_metadata)
return logits return logits
def sample( def sample(
self, self,
logits: Optional[torch.Tensor], logits: Optional[torch.Tensor],
sampling_metadata: SamplingMetadata, sampling_metadata, # type: ignore
) -> Optional[SamplerOutput]: ):
next_tokens = self.sampler(logits, sampling_metadata) next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens return next_tokens

View File

@@ -2022,7 +2022,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
num_scheduled_tokens_np, finished_sending, num_scheduled_tokens_np, finished_sending,
finished_recving, kv_connector_output) finished_recving, kv_connector_output)
sample_hidden_states = hidden_states[logits_indices] sample_hidden_states = hidden_states[logits_indices]
logits = self.model.compute_logits(sample_hidden_states, None) logits = self._compute_logits_wrapper(sample_hidden_states,
None)
if broadcast_pp_output: if broadcast_pp_output:
model_output_broadcast_data = { model_output_broadcast_data = {
"logits": logits.contiguous(), "logits": logits.contiguous(),
@@ -2469,7 +2470,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
dtype=torch.int32) dtype=torch.int32)
def dummy_compute_logits(hidden_states): def dummy_compute_logits(hidden_states):
return self.model.compute_logits( return self._compute_logits_wrapper(
hidden_states[dummy_indices], None) hidden_states[dummy_indices], None)
with set_ascend_forward_context( with set_ascend_forward_context(
@@ -2539,13 +2540,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
logit_indices = np.cumsum(num_scheduled_tokens) - 1 logit_indices = np.cumsum(num_scheduled_tokens) - 1
# TODO: need to rum a dummy sampler for generate task # TODO: need to rum a dummy sampler for generate task
hidden_states = hidden_states[logit_indices] hidden_states = hidden_states[logit_indices]
output = self.model.compute_logits(hidden_states, None) output = self._compute_logits_wrapper(hidden_states, None)
NPUPlatform.synchronize() NPUPlatform.synchronize()
del hidden_states, output del hidden_states, output
self.encoder_cache.clear() self.encoder_cache.clear()
gc.collect() gc.collect()
def _compute_logits_wrapper(self, hidden_states, sampling_metadata):
if vllm_version_is("0.10.2"):
return self.model.compute_logits(hidden_states, sampling_metadata)
return self.model.compute_logits(hidden_states)
def _dummy_pooler_run_task( def _dummy_pooler_run_task(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
@@ -3516,7 +3522,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
req_idx = self.input_batch.req_id_to_index[req_id] req_idx = self.input_batch.req_id_to_index[req_id]
offset = self.query_start_loc_np[req_idx].item() offset = self.query_start_loc_np[req_idx].item()
prompt_hidden_states = hidden_states[offset:offset + num_logits] prompt_hidden_states = hidden_states[offset:offset + num_logits]
logits = self.model.compute_logits(prompt_hidden_states, None) logits = self._compute_logits_wrapper(prompt_hidden_states, None)
# Get the "target" tokens for each index. For prompt at index i, # Get the "target" tokens for each index. For prompt at index i,
# the token at prompt index i+1 is the "sampled" token we want # the token at prompt index i+1 is the "sampled" token we want