HiCache Fix (#8288)
Co-authored-by: pansicheng <sicheng.pan.chn@gmail.com>
This commit is contained in:
@@ -358,6 +358,7 @@ class HiCacheController:
|
|||||||
if host_indices is None:
|
if host_indices is None:
|
||||||
return None
|
return None
|
||||||
self.mem_pool_host.protect_write(host_indices)
|
self.mem_pool_host.protect_write(host_indices)
|
||||||
|
torch.cuda.current_stream().synchronize()
|
||||||
self.write_queue.put(
|
self.write_queue.put(
|
||||||
CacheOperation(host_indices, device_indices, node_id, priority)
|
CacheOperation(host_indices, device_indices, node_id, priority)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -378,6 +378,7 @@ class ModelRunner:
|
|||||||
is_hopper_with_cuda_12_3()
|
is_hopper_with_cuda_12_3()
|
||||||
and is_no_spec_infer_or_topk_one(server_args)
|
and is_no_spec_infer_or_topk_one(server_args)
|
||||||
and is_fa3_default_architecture(self.model_config.hf_config)
|
and is_fa3_default_architecture(self.model_config.hf_config)
|
||||||
|
and (not server_args.enable_hierarchical_cache)
|
||||||
):
|
):
|
||||||
server_args.attention_backend = "fa3"
|
server_args.attention_backend = "fa3"
|
||||||
elif _is_hip:
|
elif _is_hip:
|
||||||
@@ -390,7 +391,9 @@ class ModelRunner:
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# MLA architecture
|
# MLA architecture
|
||||||
if is_hopper_with_cuda_12_3():
|
if is_hopper_with_cuda_12_3() and (
|
||||||
|
not server_args.enable_hierarchical_cache
|
||||||
|
):
|
||||||
server_args.attention_backend = "fa3"
|
server_args.attention_backend = "fa3"
|
||||||
elif is_sm100_supported():
|
elif is_sm100_supported():
|
||||||
server_args.attention_backend = "flashinfer"
|
server_args.attention_backend = "flashinfer"
|
||||||
|
|||||||
Reference in New Issue
Block a user