From a4b424c6320cd680abf9d84f07de04458e66f26d Mon Sep 17 00:00:00 2001
From: Trevor Morris <tmorris@nvidia.com>
Date: Wed, 8 Oct 2025 23:59:46 -0700
Subject: [PATCH] [DeepSeek-V3.2] Include indexer kv cache when estimating kv
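 cache size (#11309)

For NSA models (DeepSeek-V3.2) the indexer K/scale cache is now counted
when sizing and reporting the KV cache:

- ModelRunner adds the per-token indexer bytes (index_head_dim plus
  4 bytes per quant block, times num_layers) to cell_size.
- NSATokenToKVPool.get_kv_size_bytes() includes index_k_with_scale_buffer.
- MLATokenToKVPool skips _finalize_allocation_log() when use_nsa is set;
  NSATokenToKVPool calls it after allocating the indexer buffers so the
  logged size is the combined total.
- quant_block_size and the indexer buffer dtype become class attributes of
  NSATokenToKVPool so ModelRunner can read them without an instance.
- server_args no longer forces mem_fraction_static to 0.8 for DeepSeek NSA.
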
---
 python/sglang/srt/mem_cache/memory_pool.py     | 18 ++++++++++++++----
 .../sglang/srt/model_executor/model_runner.py  | 11 +++++++++++
 python/sglang/srt/server_args.py               |  3 ---
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py
index b577646a0..eb863f4c8 100644
--- a/python/sglang/srt/mem_cache/memory_pool.py
+++ b/python/sglang/srt/mem_cache/memory_pool.py
@@ -1177,7 +1177,9 @@ class MLATokenToKVPool(KVCache):
             dtype=torch.uint64,
             device=self.device,
         )
-        self._finalize_allocation_log(size)
+        if not use_nsa:
+            # NSA will allocate indexer KV cache later and then log the total size
+            self._finalize_allocation_log(size)
 
     def get_kv_size_bytes(self):
         assert hasattr(self, "kv_buffer")
@@ -1298,6 +1300,9 @@ class MLATokenToKVPool(KVCache):
 
 
 class NSATokenToKVPool(MLATokenToKVPool):
+    quant_block_size = 128
+    index_k_with_scale_buffer_dtype = torch.uint8
+
     def __init__(
         self,
         size: int,
@@ -1331,8 +1336,6 @@ class NSATokenToKVPool(MLATokenToKVPool):
         # num head == 1 and head dim == 128 for index_k in NSA
         assert index_head_dim == 128
 
-        self.quant_block_size = 128
-
         assert self.page_size == 64
         self.index_k_with_scale_buffer = [
             torch.zeros(
@@ -1347,11 +1350,12 @@ class NSATokenToKVPool(MLATokenToKVPool):
                     self.page_size
                     * (index_head_dim + index_head_dim // self.quant_block_size * 4),
                 ),
-                dtype=torch.uint8,
+                dtype=self.index_k_with_scale_buffer_dtype,
                 device=device,
             )
             for _ in range(layer_num)
         ]
+        self._finalize_allocation_log(size)
 
     def get_index_k_with_scale_buffer(self, layer_id: int) -> torch.Tensor:
         if self.layer_transfer_counter is not None:
@@ -1393,6 +1397,12 @@ class NSATokenToKVPool(MLATokenToKVPool):
             pool=self, buf=buf, loc=loc, index_k=index_k, index_k_scale=index_k_scale
         )
 
+    def get_kv_size_bytes(self):
+        # Parent (MLA) KV buffer bytes plus every per-layer indexer buffer.
+        return super().get_kv_size_bytes() + sum(
+            get_tensor_size_bytes(buf) for buf in self.index_k_with_scale_buffer
+        )
+
 
 class AscendMLAPagedTokenToKVPool(MLATokenToKVPool):
     def __init__(
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 73e6ccc7f..a74f85d71 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -1280,6 +1280,17 @@ class ModelRunner:
                 * num_layers
                 * torch._utils._element_size(self.kv_cache_dtype)
             )
+            # Add indexer KV cache overhead for NSA models (DeepSeek V3.2)
+            if is_deepseek_nsa(self.model_config.hf_config):
+                index_head_dim = get_nsa_index_head_dim(self.model_config.hf_config)
+                indexer_size_per_token = (
+                    index_head_dim
+                    + index_head_dim // NSATokenToKVPool.quant_block_size * 4
+                )
+                element_size = torch._utils._element_size(
+                    NSATokenToKVPool.index_k_with_scale_buffer_dtype
+                )
+                cell_size += indexer_size_per_token * num_layers * element_size
         else:
             cell_size = (
                 self.model_config.get_num_kv_heads(get_attention_tp_size())
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 64d3371df..408d18dda 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -863,9 +863,6 @@ class ServerArgs:
                 self.page_size = 64
                 logger.warning("Setting page size to 64 for DeepSeek NSA.")
 
-                self.mem_fraction_static = 0.8
-                logger.warning("Setting mem fraction static to 0.8 for DeepSeek NSA.")
-
                 # For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
                 import torch