From feb6bdb12e6e5c2dfce8dbd9232e97d435b52ecb Mon Sep 17 00:00:00 2001 From: whx <56632993+whx-sjtu@users.noreply.github.com> Date: Tue, 11 Mar 2025 18:50:28 +0800 Subject: [PATCH] [Platform][Model Runner] Add hash of request_ids; Change blocksize back to 128. (#293) This PR changes the default block size back to 128 and adds a hash of the request ID list in the model runner so that a sampling-parameter cache can be implemented in the sampler. Signed-off-by: hw_whx Co-authored-by: hw_whx --- vllm_ascend/platform.py | 3 +-- vllm_ascend/worker/model_runner.py | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 9e84a13..ce2dd90 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -108,8 +108,7 @@ class NPUPlatform(Platform): parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker" cache_config = vllm_config.cache_config if cache_config and cache_config.block_size is None: - # TODO: Set block_size to 128 will lead unexpected accuracy issue in mla case. Please set block_size to 128 back once the problem is fixed. - cache_config.block_size = 16 + cache_config.block_size = 128 @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, diff --git a/vllm_ascend/worker/model_runner.py b/vllm_ascend/worker/model_runner.py index c5cd522..c30831f 100644 --- a/vllm_ascend/worker/model_runner.py +++ b/vllm_ascend/worker/model_runner.py @@ -1061,6 +1061,11 @@ class NPUModelRunner(NPUModelRunnerBase[ModelInputForNPUWithSamplingMetadata]): # TODO (cmq): enable this after supported in vllm # pad_for_invariant_seq_len=True, ) + # Get hash value of request id list to perform sampling param cache in sampler. 
+ request_ids = model_input.request_ids_to_seq_ids.keys( # type: ignore + ) # type: ignore + request_ids_hash = hash("".join(request_ids)) + sampling_metadata.request_ids_hash = request_ids_hash # type: ignore else: sampling_metadata = None is_prompt = (seq_group_metadata_list[0].is_prompt