[Worker] Implement update max_model_len interface for NPUWorker (#6193)

### What this PR does / why we need it? This patch adds the `update_max_model_len` interface to `NPUWorker`. - vLLM version: v0.14.0 - vLLM main: d68209402d --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2026-01-26 09:03:33 +08:00
parent ca297eb57f
commit 63adbedb7a
3 changed files with 114 additions and 0 deletions
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -426,6 +426,19 @@ class NPUWorker(WorkerBase):
    def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
        return self.model_runner.get_kv_cache_spec()

+    def update_max_model_len(self, max_model_len: int) -> None:
+        """Update max_model_len after auto-fit to NPU memory.
+
+        This is called when max_model_len=-1 is used and the engine
+        automatically determines the maximum context length that fits
+        in NPU memory. Workers need to update their cached max_model_len
+        to match the engine's decision.
+        """
+        self.model_config.max_model_len = max_model_len
+        if self.model_runner is not None:
+            self.model_runner.update_max_model_len(max_model_len)
+        logger.debug("Updated max_model_len to %d", max_model_len)
+
    def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
        """Allocate NPU KV cache with the specified kv_cache_config."""
        if self.vllm_config.model_config.enable_sleep_mode: