[Worker] Implement update max_model_len interface for NPUWorker (#6193)
### What this PR does / why we need it?
This patch adds the `update_max_model_len` interface to `NPUWorker`.
- vLLM version: v0.14.0
- vLLM main:
d68209402d
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -426,6 +426,19 @@ class NPUWorker(WorkerBase):
|
||||
def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
    """Return the KV cache spec mapping reported by the model runner."""
    # The worker keeps no spec of its own; it forwards the runner's answer.
    kv_cache_spec = self.model_runner.get_kv_cache_spec()
    return kv_cache_spec
|
||||
|
||||
def update_max_model_len(self, max_model_len: int) -> None:
    """Apply the engine's auto-fitted max_model_len to this worker.

    Called when max_model_len=-1 is used and the engine has determined
    the largest context length that fits in NPU memory. The worker then
    refreshes its cached max_model_len so it matches the engine's choice.

    Args:
        max_model_len: The context length chosen by the engine.
    """
    # Update the worker-side model config first.
    self.model_config.max_model_len = max_model_len

    # Forward the new value to the model runner, if one exists.
    runner = self.model_runner
    if runner is not None:
        runner.update_max_model_len(max_model_len)

    logger.debug("Updated max_model_len to %d", max_model_len)
|
||||
|
||||
def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
|
||||
"""Allocate NPU KV cache with the specified kv_cache_config."""
|
||||
if self.vllm_config.model_config.enable_sleep_mode:
|
||||
|
||||
Reference in New Issue
Block a user