[CI] Update vllm version to 20250922(5aeb925) (#3091)
### What this PR does / why we need it?
This PR bumps the vLLM commit hash to 5aeb925452 to fix the following issues:
1. https://github.com/vllm-project/vllm/pull/25345 removed the v0 sampling metadata, so `compute_logits` no longer takes a metadata argument.
2. https://github.com/vllm-project/vllm/pull/25332
3. https://github.com/vllm-project/vllm/pull/25334
4. https://github.com/vllm-project/vllm/pull/23558. Note that this vLLM commit updates the model registration logic, which now checks that every registered model resolves under the `vllm.model_executor.models` path. This breaks our custom registration of the deepseek_v3 model (it does not exist under the vLLM model path), so I moved the deepseek_v3 model registration into deepseek_v2 as a temporary workaround.

### How was this patch tested?
- vLLM version: v0.10.2
- vLLM main: 9607d5eb44

---------
Signed-off-by: wangli <wangli858794774@gmail.com>
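For context, here is a minimal sketch of the compatibility shim this commit introduces (see the `_compute_logits_wrapper` hunk in the diff below). The import path for `vllm_version_is` is an assumption for illustration; the diff only shows the call sites and the wrapper body.

```python
# Sketch of the version-gated shim pattern used in this commit. The
# helper's import path is assumed here; only the call appears in the diff.
from vllm_ascend.utils import vllm_version_is  # assumed location


def _compute_logits_wrapper(self, hidden_states, sampling_metadata):
    if vllm_version_is("0.10.2"):
        # Released v0.10.2 still expects the v0 sampling metadata argument.
        return self.model.compute_logits(hidden_states, sampling_metadata)
    # On newer vLLM main (after vllm-project/vllm#25345) the metadata
    # argument is gone, so compute_logits takes hidden_states only.
    return self.model.compute_logits(hidden_states)
```

Routing every call site through this wrapper keeps vllm-ascend working against both the released v0.10.2 and the pinned main commit, instead of hard-switching to the new signature.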
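The deepseek_v3 registration workaround can be illustrated roughly as follows. The module and class names are hypothetical placeholders, not the exact vllm-ascend layout; the point is that the V3 architecture now resolves through the deepseek_v2 module, mirroring vLLM's own layout where the V3 architecture is defined alongside V2.

```python
# Hedged illustration of the temporary registry workaround. The module
# path and class name below are placeholders for illustration only.
from vllm import ModelRegistry


def register_models():
    # The DeepseekV3 class is re-exported from the deepseek_v2 module,
    # so the registration points at a module that the updated vLLM
    # registry check accepts.
    ModelRegistry.register_model(
        "DeepseekV3ForCausalLM",
        "vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM")
```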
@@ -2022,7 +2022,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     num_scheduled_tokens_np, finished_sending,
                     finished_recving, kv_connector_output)
             sample_hidden_states = hidden_states[logits_indices]
-            logits = self.model.compute_logits(sample_hidden_states, None)
+            logits = self._compute_logits_wrapper(sample_hidden_states,
+                                                  None)
             if broadcast_pp_output:
                 model_output_broadcast_data = {
                     "logits": logits.contiguous(),
@@ -2469,7 +2470,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                     dtype=torch.int32)
 
         def dummy_compute_logits(hidden_states):
-            return self.model.compute_logits(
+            return self._compute_logits_wrapper(
                 hidden_states[dummy_indices], None)
 
         with set_ascend_forward_context(
@@ -2539,13 +2540,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             logit_indices = np.cumsum(num_scheduled_tokens) - 1
             # TODO: need to rum a dummy sampler for generate task
             hidden_states = hidden_states[logit_indices]
-            output = self.model.compute_logits(hidden_states, None)
+            output = self._compute_logits_wrapper(hidden_states, None)
 
         NPUPlatform.synchronize()
         del hidden_states, output
         self.encoder_cache.clear()
         gc.collect()
 
+    def _compute_logits_wrapper(self, hidden_states, sampling_metadata):
+        if vllm_version_is("0.10.2"):
+            return self.model.compute_logits(hidden_states, sampling_metadata)
+        return self.model.compute_logits(hidden_states)
+
     def _dummy_pooler_run_task(
         self,
         hidden_states: torch.Tensor,
@@ -3516,7 +3522,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         req_idx = self.input_batch.req_id_to_index[req_id]
         offset = self.query_start_loc_np[req_idx].item()
         prompt_hidden_states = hidden_states[offset:offset + num_logits]
-        logits = self.model.compute_logits(prompt_hidden_states, None)
+        logits = self._compute_logits_wrapper(prompt_hidden_states, None)
 
         # Get the "target" tokens for each index. For prompt at index i,
         # the token at prompt index i+1 is the "sampled" token we want