[CI] Update vllm version to 20250922(5aeb925) (#3091)

### What this PR does / why we need it?
This PR bumps the vLLM commit hash to
5aeb925452
It fixes the following issues:
1. https://github.com/vllm-project/vllm/pull/25345 has removed the v0
metadata
2. https://github.com/vllm-project/vllm/pull/25332
3. https://github.com/vllm-project/vllm/pull/25334
4. https://github.com/vllm-project/vllm/pull/23558 — note that this vLLM
commit updates the model registration logic, which now checks that every
registered model lives under the `vllm.model_executor.models` path. This
breaks our custom registration of the deepseek_v3 model (it doesn't exist
in the vLLM model path), so I moved the deepseek_v3 model registry into
deepseek_v2 as a temporary workaround.

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-09-22 22:18:13 +08:00
committed by GitHub
parent 1c9f0fe26f
commit 02f89d166f
21 changed files with 58 additions and 92 deletions

View File

@@ -2022,7 +2022,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
num_scheduled_tokens_np, finished_sending,
finished_recving, kv_connector_output)
sample_hidden_states = hidden_states[logits_indices]
logits = self.model.compute_logits(sample_hidden_states, None)
logits = self._compute_logits_wrapper(sample_hidden_states,
None)
if broadcast_pp_output:
model_output_broadcast_data = {
"logits": logits.contiguous(),
@@ -2469,7 +2470,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
dtype=torch.int32)
def dummy_compute_logits(hidden_states):
return self.model.compute_logits(
return self._compute_logits_wrapper(
hidden_states[dummy_indices], None)
with set_ascend_forward_context(
@@ -2539,13 +2540,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
logit_indices = np.cumsum(num_scheduled_tokens) - 1
# TODO: need to rum a dummy sampler for generate task
hidden_states = hidden_states[logit_indices]
output = self.model.compute_logits(hidden_states, None)
output = self._compute_logits_wrapper(hidden_states, None)
NPUPlatform.synchronize()
del hidden_states, output
self.encoder_cache.clear()
gc.collect()
def _compute_logits_wrapper(self, hidden_states, sampling_metadata):
if vllm_version_is("0.10.2"):
return self.model.compute_logits(hidden_states, sampling_metadata)
return self.model.compute_logits(hidden_states)
def _dummy_pooler_run_task(
self,
hidden_states: torch.Tensor,
@@ -3516,7 +3522,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
req_idx = self.input_batch.req_id_to_index[req_id]
offset = self.query_start_loc_np[req_idx].item()
prompt_hidden_states = hidden_states[offset:offset + num_logits]
logits = self.model.compute_logits(prompt_hidden_states, None)
logits = self._compute_logits_wrapper(prompt_hidden_states, None)
# Get the "target" tokens for each index. For prompt at index i,
# the token at prompt index i+1 is the "sampled" token we want