[CI] Update vllm version to 20250922(5aeb925) (#3091)
### What this PR does / why we need it?
This PR bumps the vLLM commit hash to 5aeb925452 and fixes the following issues:
1. https://github.com/vllm-project/vllm/pull/25345 has removed the v0 metadata.
2. https://github.com/vllm-project/vllm/pull/25332
3. https://github.com/vllm-project/vllm/pull/25334
4. https://github.com/vllm-project/vllm/pull/23558. Note that this vLLM commit updates the model registration logic: it now checks that every registered model has the `vllm.model_executor.models` path, which breaks our custom registration of the deepseek_v3 model (it does not exist in the vLLM model path). As a temporary solution, I moved the deepseek_v3 model registration into deepseek_v2.

### How was this patch tested?
- vLLM version: v0.10.2
- vLLM main: 9607d5eb44

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
2
.github/workflows/format_pr_body.yaml
vendored
2
.github/workflows/format_pr_body.yaml
vendored
@@ -36,7 +36,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Get vLLM version
|
- name: Get vLLM version
|
||||||
run: |
|
run: |
|
||||||
VLLM_COMMIT=9607d5eb449711b349d4c2bee0a9c94afcc7ed14
|
VLLM_COMMIT=5aeb9254521023f97aca292b3478aa7ff485ffb2
|
||||||
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
|
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
|
||||||
|
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
|
|||||||
6
.github/workflows/vllm_ascend_test.yaml
vendored
6
.github/workflows/vllm_ascend_test.yaml
vendored
@@ -42,7 +42,7 @@ jobs:
|
|||||||
lint:
|
lint:
|
||||||
uses: ./.github/workflows/pre-commit.yml
|
uses: ./.github/workflows/pre-commit.yml
|
||||||
with:
|
with:
|
||||||
vllm: 9607d5eb449711b349d4c2bee0a9c94afcc7ed14
|
vllm: 5aeb9254521023f97aca292b3478aa7ff485ffb2
|
||||||
|
|
||||||
changes:
|
changes:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
@@ -83,7 +83,7 @@ jobs:
|
|||||||
VLLM_USE_MODELSCOPE: True
|
VLLM_USE_MODELSCOPE: True
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [9607d5eb449711b349d4c2bee0a9c94afcc7ed14, v0.10.2]
|
vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
|
||||||
steps:
|
steps:
|
||||||
- name: Install packages
|
- name: Install packages
|
||||||
run: |
|
run: |
|
||||||
@@ -138,7 +138,7 @@ jobs:
|
|||||||
name: e2e-light
|
name: e2e-light
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [9607d5eb449711b349d4c2bee0a9c94afcc7ed14, v0.10.2]
|
vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
|
||||||
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
||||||
needs: [lint, changes]
|
needs: [lint, changes]
|
||||||
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
||||||
|
|||||||
2
.github/workflows/vllm_ascend_test_full.yaml
vendored
2
.github/workflows/vllm_ascend_test_full.yaml
vendored
@@ -68,7 +68,7 @@ jobs:
|
|||||||
name: e2e-full
|
name: e2e-full
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [9607d5eb449711b349d4c2bee0a9c94afcc7ed14, v0.10.2]
|
vllm_version: [5aeb9254521023f97aca292b3478aa7ff485ffb2, v0.10.2]
|
||||||
needs: [changes]
|
needs: [changes]
|
||||||
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
||||||
uses: ./.github/workflows/_e2e_test.yaml
|
uses: ./.github/workflows/_e2e_test.yaml
|
||||||
|
|||||||
@@ -61,7 +61,6 @@ from torch import nn
|
|||||||
from vllm.attention import Attention
|
from vllm.attention import Attention
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
|
||||||
|
|
||||||
class CustomAttention(nn.Module):
|
class CustomAttention(nn.Module):
|
||||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||||
|
|||||||
@@ -19,7 +19,12 @@
|
|||||||
|
|
||||||
from typing import Dict, List, Optional, Sequence, Tuple, Union
|
from typing import Dict, List, Optional, Sequence, Tuple, Union
|
||||||
|
|
||||||
from vllm.sequence import PromptLogprobs, SampleLogprobs
|
from vllm_ascend.utils import vllm_version_is
|
||||||
|
|
||||||
|
if vllm_version_is("0.10.2"):
|
||||||
|
from vllm.sequence import PromptLogprobs, SampleLogprobs
|
||||||
|
else:
|
||||||
|
from vllm.logprobs import PromptLogprobs, SampleLogprobs
|
||||||
|
|
||||||
TokensText = Tuple[List[int], str]
|
TokensText = Tuple[List[int], str]
|
||||||
|
|
||||||
|
|||||||
@@ -27,7 +27,6 @@ class TestAscendSchedulerConfig(TestBase):
|
|||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
is_multimodal_model=False,
|
is_multimodal_model=False,
|
||||||
send_delta_data=False,
|
send_delta_data=False,
|
||||||
scheduler_delay_factor=0,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_initialize_from_config_with_default(self):
|
def test_initialize_from_config_with_default(self):
|
||||||
@@ -90,21 +89,6 @@ class TestAscendSchedulerConfig(TestBase):
|
|||||||
str(context.exception),
|
str(context.exception),
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_not_implemented_delay_factor(self):
|
|
||||||
with self.assertRaises(NotImplementedError) as context:
|
|
||||||
AscendSchedulerConfig.initialize_from_config(
|
|
||||||
self.basic_scheduler_config,
|
|
||||||
AscendSchedulerConfig(
|
|
||||||
delay_factor=1,
|
|
||||||
max_num_batched_tokens=2048,
|
|
||||||
max_model_len=2048,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
self.assertIn(
|
|
||||||
"currently AscendScheduler doesn't support scheduler_delay_factor",
|
|
||||||
str(context.exception),
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_no_override(self):
|
def test_no_override(self):
|
||||||
ascend_config = AscendSchedulerConfig.initialize_from_config(
|
ascend_config = AscendSchedulerConfig.initialize_from_config(
|
||||||
self.basic_scheduler_config, {})
|
self.basic_scheduler_config, {})
|
||||||
|
|||||||
@@ -168,8 +168,6 @@ class TestCustomDeepSeekMTP(PytestBase):
|
|||||||
mocker.patch(
|
mocker.patch(
|
||||||
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__",
|
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__",
|
||||||
return_value=None)
|
return_value=None)
|
||||||
mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
|
|
||||||
return_value=None)
|
|
||||||
mocker.patch(
|
mocker.patch(
|
||||||
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
|
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
|
||||||
return_value=None)
|
return_value=None)
|
||||||
|
|||||||
@@ -165,8 +165,6 @@ class TestTorchairDeepSeekMTP(PytestBase):
|
|||||||
mocker.patch(
|
mocker.patch(
|
||||||
"vllm_ascend.torchair.models.torchair_deepseek_mtp.TorchairDeepSeekMultiTokenPredictorLayer.__call__",
|
"vllm_ascend.torchair.models.torchair_deepseek_mtp.TorchairDeepSeekMultiTokenPredictorLayer.__call__",
|
||||||
return_value=None)
|
return_value=None)
|
||||||
mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
|
|
||||||
return_value=None)
|
|
||||||
mocker.patch(
|
mocker.patch(
|
||||||
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
|
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
|
||||||
return_value=None)
|
return_value=None)
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ class AscendSchedulerConfig(SchedulerConfig):
|
|||||||
if self.send_delta_data:
|
if self.send_delta_data:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"currently AscendScheduler doesn't support send_delta_data.")
|
"currently AscendScheduler doesn't support send_delta_data.")
|
||||||
if self.delay_factor > 0:
|
if getattr(self, "scheduler_delay_factor", 0) > 0:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"currently AscendScheduler doesn't support scheduler_delay_factor."
|
"currently AscendScheduler doesn't support scheduler_delay_factor."
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ def register_model():
|
|||||||
|
|
||||||
ModelRegistry.register_model(
|
ModelRegistry.register_model(
|
||||||
"DeepseekV3ForCausalLM",
|
"DeepseekV3ForCausalLM",
|
||||||
"vllm_ascend.models.deepseek_v3:CustomDeepseekV3ForCausalLM")
|
"vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM")
|
||||||
|
|
||||||
ModelRegistry.register_model(
|
ModelRegistry.register_model(
|
||||||
"DeepSeekMTPModel",
|
"DeepSeekMTPModel",
|
||||||
|
|||||||
@@ -28,7 +28,6 @@ from vllm.config import (CacheConfig, ModelConfig, VllmConfig,
|
|||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||||
from vllm.model_executor.layers.sampler import get_sampler
|
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||||
ParallelLMHead, VocabParallelEmbedding)
|
ParallelLMHead, VocabParallelEmbedding)
|
||||||
from vllm.model_executor.models.deepseek_mtp import (
|
from vllm.model_executor.models.deepseek_mtp import (
|
||||||
@@ -36,7 +35,6 @@ from vllm.model_executor.models.deepseek_mtp import (
|
|||||||
SharedHead)
|
SharedHead)
|
||||||
from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer
|
from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer
|
||||||
from vllm.model_executor.models.utils import maybe_prefix
|
from vllm.model_executor.models.utils import maybe_prefix
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
|
|
||||||
@@ -168,7 +166,7 @@ class CustomDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
|
|||||||
def compute_logits(
|
def compute_logits(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
sampling_metadata: SamplingMetadata,
|
sampling_metadata, # type: ignore
|
||||||
spec_step_idx: int = 0,
|
spec_step_idx: int = 0,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
current_step_idx = (spec_step_idx % self.num_mtp_layers)
|
current_step_idx = (spec_step_idx % self.num_mtp_layers)
|
||||||
@@ -188,8 +186,6 @@ class CustomDeepSeekMTP(DeepSeekMTP):
|
|||||||
prefix=maybe_prefix(
|
prefix=maybe_prefix(
|
||||||
prefix, "model"))
|
prefix, "model"))
|
||||||
|
|
||||||
self.sampler = get_sampler()
|
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.Tensor,
|
input_ids: torch.Tensor,
|
||||||
|
|||||||
@@ -479,4 +479,8 @@ class CustomDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
|
|||||||
return loaded_params
|
return loaded_params
|
||||||
|
|
||||||
|
|
||||||
|
class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
DeepseekV2DecoderLayer.__init__ = CustomDeepseekV2DecoderLayer.__init__
|
DeepseekV2DecoderLayer.__init__ = CustomDeepseekV2DecoderLayer.__init__
|
||||||
|
|||||||
@@ -1,27 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
||||||
# Copyright 2023 The vLLM team.
|
|
||||||
# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
|
|
||||||
#
|
|
||||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
|
||||||
# and OPT implementations in this library. It has been modified from its
|
|
||||||
# original forms to accommodate minor architectural differences compared
|
|
||||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2ForCausalLM
|
|
||||||
|
|
||||||
|
|
||||||
class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
|
|
||||||
pass
|
|
||||||
|
|||||||
@@ -50,7 +50,6 @@ from vllm.model_executor.models.utils import (
|
|||||||
AutoWeightsLoader, PPMissingLayer, extract_layer_index,
|
AutoWeightsLoader, PPMissingLayer, extract_layer_index,
|
||||||
is_pp_missing_parameter, make_empty_intermediate_tensors_factory,
|
is_pp_missing_parameter, make_empty_intermediate_tensors_factory,
|
||||||
make_layers, maybe_prefix)
|
make_layers, maybe_prefix)
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
|
||||||
from vllm.model_executor.utils import set_weight_attrs
|
from vllm.model_executor.utils import set_weight_attrs
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
@@ -1081,7 +1080,7 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
|
|||||||
def compute_logits(
|
def compute_logits(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
sampling_metadata: SamplingMetadata,
|
sampling_metadata, # type: ignore
|
||||||
) -> Optional[torch.Tensor]:
|
) -> Optional[torch.Tensor]:
|
||||||
return self.logits_processor(self.lm_head, hidden_states,
|
return self.logits_processor(self.lm_head, hidden_states,
|
||||||
sampling_metadata)
|
sampling_metadata)
|
||||||
|
|||||||
@@ -253,3 +253,15 @@ class AscendLogitsProcessor(LogitsProcessor):
|
|||||||
logits = logits[..., :self.org_vocab_size]
|
logits = logits[..., :self.org_vocab_size]
|
||||||
|
|
||||||
return logits
|
return logits
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
lm_head: VocabParallelEmbedding,
|
||||||
|
hidden_states: torch.Tensor,
|
||||||
|
# keep this for version compatibility
|
||||||
|
sampling_metadata=None, # type: ignore
|
||||||
|
embedding_bias: Optional[torch.Tensor] = None,
|
||||||
|
) -> Optional[torch.Tensor]:
|
||||||
|
return super().forward(lm_head,
|
||||||
|
hidden_states,
|
||||||
|
embedding_bias=embedding_bias)
|
||||||
|
|||||||
@@ -142,7 +142,7 @@ class NPUPlatform(Platform):
|
|||||||
"functionality is currently suboptimal.")
|
"functionality is currently suboptimal.")
|
||||||
if not model_config.is_multimodal_model and \
|
if not model_config.is_multimodal_model and \
|
||||||
structured_outputs_config.backend == "auto" and \
|
structured_outputs_config.backend == "auto" and \
|
||||||
not scheduler_config.delay_factor > 0 and \
|
not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
|
||||||
not scheduler_config.send_delta_data and \
|
not scheduler_config.send_delta_data and \
|
||||||
scheduler_config.policy == "fcfs":
|
scheduler_config.policy == "fcfs":
|
||||||
ascend_scheduler_config.enabled = True
|
ascend_scheduler_config.enabled = True
|
||||||
|
|||||||
@@ -40,7 +40,6 @@ from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM # noqa: F401
|
|||||||
from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Model
|
from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Model
|
||||||
from vllm.model_executor.models.utils import (AutoWeightsLoader,
|
from vllm.model_executor.models.utils import (AutoWeightsLoader,
|
||||||
PPMissingLayer, maybe_prefix)
|
PPMissingLayer, maybe_prefix)
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from vllm_ascend.ascend_config import get_ascend_config
|
from vllm_ascend.ascend_config import get_ascend_config
|
||||||
@@ -345,7 +344,7 @@ class CustomQwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
|||||||
def compute_logits(
|
def compute_logits(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
sampling_metadata: SamplingMetadata,
|
sampling_metadata, # type: ignore
|
||||||
) -> Optional[torch.Tensor]:
|
) -> Optional[torch.Tensor]:
|
||||||
logits = self.logits_processor(self.lm_head, hidden_states,
|
logits = self.logits_processor(self.lm_head, hidden_states,
|
||||||
sampling_metadata)
|
sampling_metadata)
|
||||||
|
|||||||
@@ -27,14 +27,12 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
|
|||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||||
from vllm.model_executor.layers.sampler import get_sampler
|
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||||
ParallelLMHead, VocabParallelEmbedding)
|
ParallelLMHead, VocabParallelEmbedding)
|
||||||
from vllm.model_executor.models.deepseek_mtp import (
|
from vllm.model_executor.models.deepseek_mtp import (
|
||||||
DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer,
|
DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer,
|
||||||
SharedHead)
|
SharedHead)
|
||||||
from vllm.model_executor.models.utils import maybe_prefix
|
from vllm.model_executor.models.utils import maybe_prefix
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from vllm_ascend.torchair.models.torchair_deepseek_v2 import \
|
from vllm_ascend.torchair.models.torchair_deepseek_v2 import \
|
||||||
@@ -172,7 +170,7 @@ class TorchairDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
|
|||||||
def compute_logits(
|
def compute_logits(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
sampling_metadata: SamplingMetadata,
|
sampling_metadata, # type: ignore
|
||||||
spec_step_idx: int = 0,
|
spec_step_idx: int = 0,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
current_step_idx = (spec_step_idx % self.num_mtp_layers)
|
current_step_idx = (spec_step_idx % self.num_mtp_layers)
|
||||||
@@ -199,8 +197,6 @@ class TorchairDeepSeekMTP(DeepSeekMTP):
|
|||||||
self.model = TorchairDeepSeekMultiTokenPredictor(
|
self.model = TorchairDeepSeekMultiTokenPredictor(
|
||||||
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"))
|
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"))
|
||||||
|
|
||||||
self.sampler = get_sampler()
|
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.Tensor,
|
input_ids: torch.Tensor,
|
||||||
|
|||||||
@@ -52,7 +52,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
|||||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||||
from vllm.model_executor.layers.sampler import get_sampler
|
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||||
ParallelLMHead, VocabParallelEmbedding)
|
ParallelLMHead, VocabParallelEmbedding)
|
||||||
from vllm.model_executor.model_loader.weight_utils import (
|
from vllm.model_executor.model_loader.weight_utils import (
|
||||||
@@ -945,7 +944,6 @@ class TorchairDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
|
|||||||
else:
|
else:
|
||||||
self.lm_head = PPMissingLayer()
|
self.lm_head = PPMissingLayer()
|
||||||
self.logits_processor = LogitsProcessor(config.vocab_size)
|
self.logits_processor = LogitsProcessor(config.vocab_size)
|
||||||
self.sampler = get_sampler()
|
|
||||||
self.make_empty_intermediate_tensors = (
|
self.make_empty_intermediate_tensors = (
|
||||||
self.model.make_empty_intermediate_tensors)
|
self.model.make_empty_intermediate_tensors)
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,6 @@ from vllm.model_executor.layers.linear import (LinearBase,
|
|||||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||||
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||||
ParallelLMHead, VocabParallelEmbedding)
|
ParallelLMHead, VocabParallelEmbedding)
|
||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||||
@@ -53,9 +52,9 @@ from vllm.model_executor.models.interfaces import SupportsPP
|
|||||||
from vllm.model_executor.models.utils import (
|
from vllm.model_executor.models.utils import (
|
||||||
extract_layer_index, is_pp_missing_parameter,
|
extract_layer_index, is_pp_missing_parameter,
|
||||||
make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
|
make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
|
||||||
from vllm.model_executor.utils import set_weight_attrs
|
from vllm.model_executor.utils import set_weight_attrs
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
from vllm.v1.sample.sampler import Sampler
|
||||||
|
|
||||||
from vllm_ascend.ascend_config import get_ascend_config
|
from vllm_ascend.ascend_config import get_ascend_config
|
||||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
|
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
|
||||||
@@ -913,7 +912,7 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
|
|||||||
if self.config.tie_word_embeddings:
|
if self.config.tie_word_embeddings:
|
||||||
self.lm_head.weight = self.model.embed_tokens.weight
|
self.lm_head.weight = self.model.embed_tokens.weight
|
||||||
self.logits_processor = LogitsProcessor(config.vocab_size)
|
self.logits_processor = LogitsProcessor(config.vocab_size)
|
||||||
self.sampler = get_sampler()
|
self.sampler = Sampler()
|
||||||
self.make_empty_intermediate_tensors = (
|
self.make_empty_intermediate_tensors = (
|
||||||
self.model.make_empty_intermediate_tensors)
|
self.model.make_empty_intermediate_tensors)
|
||||||
|
|
||||||
@@ -937,7 +936,7 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
|
|||||||
def compute_logits(
|
def compute_logits(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
sampling_metadata: SamplingMetadata,
|
sampling_metadata, # type: ignore
|
||||||
) -> Optional[torch.Tensor]:
|
) -> Optional[torch.Tensor]:
|
||||||
logits = self.logits_processor(self.lm_head, hidden_states,
|
logits = self.logits_processor(self.lm_head, hidden_states,
|
||||||
sampling_metadata)
|
sampling_metadata)
|
||||||
@@ -946,8 +945,8 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
|
|||||||
def sample(
|
def sample(
|
||||||
self,
|
self,
|
||||||
logits: Optional[torch.Tensor],
|
logits: Optional[torch.Tensor],
|
||||||
sampling_metadata: SamplingMetadata,
|
sampling_metadata, # type: ignore
|
||||||
) -> Optional[SamplerOutput]:
|
):
|
||||||
next_tokens = self.sampler(logits, sampling_metadata)
|
next_tokens = self.sampler(logits, sampling_metadata)
|
||||||
return next_tokens
|
return next_tokens
|
||||||
|
|
||||||
|
|||||||
@@ -2022,7 +2022,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
num_scheduled_tokens_np, finished_sending,
|
num_scheduled_tokens_np, finished_sending,
|
||||||
finished_recving, kv_connector_output)
|
finished_recving, kv_connector_output)
|
||||||
sample_hidden_states = hidden_states[logits_indices]
|
sample_hidden_states = hidden_states[logits_indices]
|
||||||
logits = self.model.compute_logits(sample_hidden_states, None)
|
logits = self._compute_logits_wrapper(sample_hidden_states,
|
||||||
|
None)
|
||||||
if broadcast_pp_output:
|
if broadcast_pp_output:
|
||||||
model_output_broadcast_data = {
|
model_output_broadcast_data = {
|
||||||
"logits": logits.contiguous(),
|
"logits": logits.contiguous(),
|
||||||
@@ -2469,7 +2470,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
dtype=torch.int32)
|
dtype=torch.int32)
|
||||||
|
|
||||||
def dummy_compute_logits(hidden_states):
|
def dummy_compute_logits(hidden_states):
|
||||||
return self.model.compute_logits(
|
return self._compute_logits_wrapper(
|
||||||
hidden_states[dummy_indices], None)
|
hidden_states[dummy_indices], None)
|
||||||
|
|
||||||
with set_ascend_forward_context(
|
with set_ascend_forward_context(
|
||||||
@@ -2539,13 +2540,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
logit_indices = np.cumsum(num_scheduled_tokens) - 1
|
logit_indices = np.cumsum(num_scheduled_tokens) - 1
|
||||||
# TODO: need to rum a dummy sampler for generate task
|
# TODO: need to rum a dummy sampler for generate task
|
||||||
hidden_states = hidden_states[logit_indices]
|
hidden_states = hidden_states[logit_indices]
|
||||||
output = self.model.compute_logits(hidden_states, None)
|
output = self._compute_logits_wrapper(hidden_states, None)
|
||||||
|
|
||||||
NPUPlatform.synchronize()
|
NPUPlatform.synchronize()
|
||||||
del hidden_states, output
|
del hidden_states, output
|
||||||
self.encoder_cache.clear()
|
self.encoder_cache.clear()
|
||||||
gc.collect()
|
gc.collect()
|
||||||
|
|
||||||
|
def _compute_logits_wrapper(self, hidden_states, sampling_metadata):
|
||||||
|
if vllm_version_is("0.10.2"):
|
||||||
|
return self.model.compute_logits(hidden_states, sampling_metadata)
|
||||||
|
return self.model.compute_logits(hidden_states)
|
||||||
|
|
||||||
def _dummy_pooler_run_task(
|
def _dummy_pooler_run_task(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
@@ -3516,7 +3522,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
req_idx = self.input_batch.req_id_to_index[req_id]
|
req_idx = self.input_batch.req_id_to_index[req_id]
|
||||||
offset = self.query_start_loc_np[req_idx].item()
|
offset = self.query_start_loc_np[req_idx].item()
|
||||||
prompt_hidden_states = hidden_states[offset:offset + num_logits]
|
prompt_hidden_states = hidden_states[offset:offset + num_logits]
|
||||||
logits = self.model.compute_logits(prompt_hidden_states, None)
|
logits = self._compute_logits_wrapper(prompt_hidden_states, None)
|
||||||
|
|
||||||
# Get the "target" tokens for each index. For prompt at index i,
|
# Get the "target" tokens for each index. For prompt at index i,
|
||||||
# the token at prompt index i+1 is the "sampled" token we want
|
# the token at prompt index i+1 is the "sampled" token we want
|
||||||
|
|||||||
Reference in New Issue
Block a user