[CI] Update vllm version to 20250922(5aeb925) (#3091)

### What this PR does / why we need it?
This PR bumps the vLLM commit hash to
5aeb925452
It fixes the following issues:
1. https://github.com/vllm-project/vllm/pull/25345 has removed the v0
metadata
2. https://github.com/vllm-project/vllm/pull/25332
3. https://github.com/vllm-project/vllm/pull/25334
4. https://github.com/vllm-project/vllm/pull/23558 — note that this vLLM
commit updates the model registration logic, which now checks that every
registered model lives under the `vllm.model_executor.models` path. This
breaks our custom registration of the deepseek_v3 model (it doesn't exist
in the vLLM model path), so I moved the deepseek_v3 model registry into
deepseek_v2 as a temporary workaround.

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-09-22 22:18:13 +08:00
committed by GitHub
parent 1c9f0fe26f
commit 02f89d166f
21 changed files with 58 additions and 92 deletions

View File

@@ -2022,7 +2022,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
num_scheduled_tokens_np, finished_sending,
finished_recving, kv_connector_output)
sample_hidden_states = hidden_states[logits_indices]
logits = self.model.compute_logits(sample_hidden_states, None)
logits = self._compute_logits_wrapper(sample_hidden_states,
None)
if broadcast_pp_output:
model_output_broadcast_data = {
"logits": logits.contiguous(),
@@ -2469,7 +2470,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
dtype=torch.int32)
def dummy_compute_logits(hidden_states):
return self.model.compute_logits(
return self._compute_logits_wrapper(
hidden_states[dummy_indices], None)
with set_ascend_forward_context(
@@ -2539,13 +2540,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
logit_indices = np.cumsum(num_scheduled_tokens) - 1
# TODO: need to rum a dummy sampler for generate task
hidden_states = hidden_states[logit_indices]
output = self.model.compute_logits(hidden_states, None)
output = self._compute_logits_wrapper(hidden_states, None)
NPUPlatform.synchronize()
del hidden_states, output
self.encoder_cache.clear()
gc.collect()
def _compute_logits_wrapper(self, hidden_states, sampling_metadata):
if vllm_version_is("0.10.2"):
return self.model.compute_logits(hidden_states, sampling_metadata)
return self.model.compute_logits(hidden_states)
def _dummy_pooler_run_task(
self,
hidden_states: torch.Tensor,
@@ -3516,7 +3522,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
req_idx = self.input_batch.req_id_to_index[req_id]
offset = self.query_start_loc_np[req_idx].item()
prompt_hidden_states = hidden_states[offset:offset + num_logits]
logits = self.model.compute_logits(prompt_hidden_states, None)
logits = self._compute_logits_wrapper(prompt_hidden_states, None)
# Get the "target" tokens for each index. For prompt at index i,
# the token at prompt index i+1 is the "sampled" token we want