From 80572c8345d4404ad19e3b75bffdb84bec30ba04 Mon Sep 17 00:00:00 2001
From: brayden-hai <brayden@hippocraticai.com>
Date: Mon, 15 Sep 2025 20:16:49 -0700
Subject: [PATCH] [ModelOpt] Respect `kv_cache_quant_algo` in ModelOpt
 checkpoints (#10336)

Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
---
 python/sglang/srt/model_executor/model_runner.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 2cb27bcbe..75e493475 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -135,6 +135,7 @@ from sglang.srt.utils import (
     is_no_spec_infer_or_topk_one,
     is_npu,
     is_sm100_supported,
+    log_info_on_rank0,
     monkey_patch_p2p_access_check,
     monkey_patch_vllm_gguf_config,
     parse_connector_type,
@@ -1352,7 +1353,18 @@ class ModelRunner:
     ):
         # Determine the kv cache dtype
         if self.server_args.kv_cache_dtype == "auto":
-            self.kv_cache_dtype = self.dtype
+            quant_config = getattr(self.model, "quant_config", None)
+            kv_cache_quant_algo = getattr(quant_config, "kv_cache_quant_algo", None)
+            if (
+                isinstance(kv_cache_quant_algo, str)
+                and kv_cache_quant_algo.upper() == "FP8"
+            ):
+                if _is_hip:
+                    self.kv_cache_dtype = torch.float8_e4m3fnuz
+                else:
+                    self.kv_cache_dtype = torch.float8_e4m3fn
+            else:
+                self.kv_cache_dtype = self.dtype
         elif self.server_args.kv_cache_dtype == "fp8_e5m2":
             if _is_hip:  # Using natively supported format
                 self.kv_cache_dtype = torch.float8_e5m2fnuz
@@ -1368,6 +1380,8 @@ class ModelRunner:
                 f"Unsupported kv_cache_dtype: {self.server_args.kv_cache_dtype}."
             )
 
+        log_info_on_rank0(logger, f"Using KV cache dtype: {self.kv_cache_dtype}")
+
         self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory)
         if SGLANG_CI_SMALL_KV_SIZE:
             self.max_total_num_tokens = int(SGLANG_CI_SMALL_KV_SIZE)