From 4fea040ca180556be144c8ddac5cb76277823ab4 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Thu, 13 Mar 2025 03:49:05 -0700
Subject: [PATCH] Fix a regression introduced by overlapping KV cache writing
 (#4375)

---
 python/sglang/srt/mem_cache/memory_pool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py
index d8ea694c5..b35a1ad1e 100644
--- a/python/sglang/srt/mem_cache/memory_pool.py
+++ b/python/sglang/srt/mem_cache/memory_pool.py
@@ -326,7 +326,7 @@ class MHATokenToKVPool(KVCache):
             cache_k = cache_k.view(self.store_dtype)
             cache_v = cache_v.view(self.store_dtype)
 
-        if self.capture_mode:
+        if self.capture_mode and cache_k.shape[0] < 4:
             self.alt_stream.wait_stream(torch.cuda.current_stream())
             with torch.cuda.stream(self.alt_stream):
                 self.k_buffer[layer_id][loc] = cache_k