From f39037fffbeb463595a1e31d72c85e53b6e7d355 Mon Sep 17 00:00:00 2001 From: Zhiqiang Xie Date: Wed, 23 Jul 2025 01:51:32 -0700 Subject: [PATCH] HiCache Fix (#8288) Co-authored-by: pansicheng --- python/sglang/srt/managers/cache_controller.py | 1 + python/sglang/srt/model_executor/model_runner.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/managers/cache_controller.py b/python/sglang/srt/managers/cache_controller.py index 5f43a5e9a..a94fdec78 100644 --- a/python/sglang/srt/managers/cache_controller.py +++ b/python/sglang/srt/managers/cache_controller.py @@ -358,6 +358,7 @@ class HiCacheController: if host_indices is None: return None self.mem_pool_host.protect_write(host_indices) + torch.cuda.current_stream().synchronize() self.write_queue.put( CacheOperation(host_indices, device_indices, node_id, priority) ) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 9e6d14aac..919622cc7 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -378,6 +378,7 @@ class ModelRunner: is_hopper_with_cuda_12_3() and is_no_spec_infer_or_topk_one(server_args) and is_fa3_default_architecture(self.model_config.hf_config) + and (not server_args.enable_hierarchical_cache) ): server_args.attention_backend = "fa3" elif _is_hip: @@ -390,7 +391,9 @@ class ModelRunner: ) else: # MLA architecture - if is_hopper_with_cuda_12_3(): + if is_hopper_with_cuda_12_3() and ( + not server_args.enable_hierarchical_cache + ): server_args.attention_backend = "fa3" elif is_sm100_supported(): server_args.attention_backend = "flashinfer"