[Misc] fix initialize_kv_cache (#1102)

KV cache manager has been changed by
f8a1a2d108

This PR adapts vllm-ascend to that change so that CI passes again.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-06-06 16:46:23 +08:00
committed by GitHub
parent c94afd79ce
commit 973f993a13
3 changed files with 54 additions and 16 deletions

View File

@@ -25,7 +25,7 @@ from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec)
KVCacheGroupSpec, KVCacheTensor)
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
@@ -88,14 +88,26 @@ def create_scheduler(
model_config=model_config,
cache_config=cache_config)
kv_cache_config = KVCacheConfig(
num_blocks=10000, # A large number of blocks to hold all requests
tensors={},
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(16, 1, 1, torch.float32, False))
],
)
if vllm_version_is("0.9.0"):
kv_cache_config = KVCacheConfig(
num_blocks=10000, # A large number of blocks to hold all requests
tensors={},
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(16, 1, 1, torch.float32,
False))
],
)
else:
kv_cache_config = KVCacheConfig(
num_blocks=10000, # A large number of blocks to hold all requests
kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(16, 1, 1, torch.float32,
False, None))
],
)
cache_config.num_gpu_blocks = 10000
return AscendScheduler(
vllm_config,