[CI] Update vllm version to 20250922(5aeb925) (#3091)

### What this PR does / why we need it?
This pr bump vllm commit hash to
5aeb925452
fix issues:  
1. https://github.com/vllm-project/vllm/pull/25345 has remove v0
metadata
2. https://github.com/vllm-project/vllm/pull/25332
3. https://github.com/vllm-project/vllm/pull/25334
4. https://github.com/vllm-project/vllm/pull/23558, note that this vllm
commit update the model register logic, which will check all the model
registered have the `vllm.model_executor.models` path , which breaks our
custom registration of the deepseek_v3 model (it doesn't exist in the
vllm model path). so I move deepseek_v3 model registy to deepseek_v2 to
solve temporary

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-09-22 22:18:13 +08:00
committed by GitHub
parent 1c9f0fe26f
commit 02f89d166f
21 changed files with 58 additions and 92 deletions

View File

@@ -40,7 +40,6 @@ from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM # noqa: F401
from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Model
from vllm.model_executor.models.utils import (AutoWeightsLoader,
PPMissingLayer, maybe_prefix)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors
from vllm_ascend.ascend_config import get_ascend_config
@@ -343,9 +342,9 @@ class CustomQwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
return hidden_states
def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
self,
hidden_states: torch.Tensor,
sampling_metadata, # type: ignore
) -> Optional[torch.Tensor]:
logits = self.logits_processor(self.lm_head, hidden_states,
sampling_metadata)

View File

@@ -27,14 +27,12 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.models.deepseek_mtp import (
DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer,
SharedHead)
from vllm.model_executor.models.utils import maybe_prefix
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors
from vllm_ascend.torchair.models.torchair_deepseek_v2 import \
@@ -172,7 +170,7 @@ class TorchairDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
sampling_metadata, # type: ignore
spec_step_idx: int = 0,
) -> torch.Tensor:
current_step_idx = (spec_step_idx % self.num_mtp_layers)
@@ -199,8 +197,6 @@ class TorchairDeepSeekMTP(DeepSeekMTP):
self.model = TorchairDeepSeekMultiTokenPredictor(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"))
self.sampler = get_sampler()
def forward(
self,
input_ids: torch.Tensor,

View File

@@ -52,7 +52,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import (
@@ -945,7 +944,6 @@ class TorchairDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
else:
self.lm_head = PPMissingLayer()
self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = get_sampler()
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)

View File

@@ -45,7 +45,6 @@ from vllm.model_executor.layers.linear import (LinearBase,
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -53,9 +52,9 @@ from vllm.model_executor.models.interfaces import SupportsPP
from vllm.model_executor.models.utils import (
extract_layer_index, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs
from vllm.sequence import IntermediateTensors
from vllm.v1.sample.sampler import Sampler
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
@@ -913,7 +912,7 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
if self.config.tie_word_embeddings:
self.lm_head.weight = self.model.embed_tokens.weight
self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = get_sampler()
self.sampler = Sampler()
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)
@@ -935,19 +934,19 @@ class PanguProMoEForCausalLM(nn.Module, SupportsPP):
return hidden_states
def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
self,
hidden_states: torch.Tensor,
sampling_metadata, # type: ignore
) -> Optional[torch.Tensor]:
logits = self.logits_processor(self.lm_head, hidden_states,
sampling_metadata)
return logits
def sample(
self,
logits: Optional[torch.Tensor],
sampling_metadata: SamplingMetadata,
) -> Optional[SamplerOutput]:
self,
logits: Optional[torch.Tensor],
sampling_metadata, # type: ignore
):
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens