[Platform][Model Runner] Add hash of request_ids; Change blocksize back to 128. (#293)
This PR changes the initial value of block_size back to 128 and adds a hash of the request ID list in the model runner, so that the sampler can implement a sampling-parameter cache. Signed-off-by: hw_whx <wanghexiang7@huawei.com> Co-authored-by: hw_whx <wanghexiang7@huawei.com>
This commit is contained in:
@@ -108,8 +108,7 @@ class NPUPlatform(Platform):
|
|||||||
parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker"
|
parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker"
|
||||||
cache_config = vllm_config.cache_config
|
cache_config = vllm_config.cache_config
|
||||||
if cache_config and cache_config.block_size is None:
|
if cache_config and cache_config.block_size is None:
|
||||||
# TODO: Setting block_size to 128 leads to an unexpected accuracy issue in the MLA case. Please set block_size back to 128 once the problem is fixed.
|
cache_config.block_size = 128
|
||||||
cache_config.block_size = 16
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
|
def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
|
||||||
|
|||||||
@@ -1061,6 +1061,11 @@ class NPUModelRunner(NPUModelRunnerBase[ModelInputForNPUWithSamplingMetadata]):
|
|||||||
# TODO (cmq): enable this after supported in vllm
|
# TODO (cmq): enable this after supported in vllm
|
||||||
# pad_for_invariant_seq_len=True,
|
# pad_for_invariant_seq_len=True,
|
||||||
)
|
)
|
||||||
|
# Compute a hash of the request ID list so the sampler can cache sampling params.
|
||||||
|
request_ids = model_input.request_ids_to_seq_ids.keys( # type: ignore
|
||||||
|
) # type: ignore
|
||||||
|
request_ids_hash = hash("".join(request_ids))
|
||||||
|
sampling_metadata.request_ids_hash = request_ids_hash # type: ignore
|
||||||
else:
|
else:
|
||||||
sampling_metadata = None
|
sampling_metadata = None
|
||||||
is_prompt = (seq_group_metadata_list[0].is_prompt
|
is_prompt = (seq_group_metadata_list[0].is_prompt
|
||||||
|
|||||||
Reference in New Issue
Block a user