From 80572c8345d4404ad19e3b75bffdb84bec30ba04 Mon Sep 17 00:00:00 2001
From: brayden-hai
Date: Mon, 15 Sep 2025 20:16:49 -0700
Subject: [PATCH] [ModelOpt] Respect `kv_cache_quant_algo` in ModelOpt checkpoints (#10336)

Co-authored-by: Baizhou Zhang
---
 python/sglang/srt/model_executor/model_runner.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 2cb27bcbe..75e493475 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -135,6 +135,7 @@ from sglang.srt.utils import (
     is_no_spec_infer_or_topk_one,
     is_npu,
     is_sm100_supported,
+    log_info_on_rank0,
     monkey_patch_p2p_access_check,
     monkey_patch_vllm_gguf_config,
     parse_connector_type,
@@ -1352,7 +1353,18 @@ class ModelRunner:
     ):
         # Determine the kv cache dtype
         if self.server_args.kv_cache_dtype == "auto":
-            self.kv_cache_dtype = self.dtype
+            quant_config = getattr(self.model, "quant_config", None)
+            kv_cache_quant_algo = getattr(quant_config, "kv_cache_quant_algo", None)
+            if (
+                isinstance(kv_cache_quant_algo, str)
+                and kv_cache_quant_algo.upper() == "FP8"
+            ):
+                if _is_hip:
+                    self.kv_cache_dtype = torch.float8_e4m3fnuz
+                else:
+                    self.kv_cache_dtype = torch.float8_e4m3fn
+            else:
+                self.kv_cache_dtype = self.dtype
         elif self.server_args.kv_cache_dtype == "fp8_e5m2":
             if _is_hip:  # Using natively supported format
                 self.kv_cache_dtype = torch.float8_e5m2fnuz
@@ -1368,6 +1380,8 @@ class ModelRunner:
                 f"Unsupported kv_cache_dtype: {self.server_args.kv_cache_dtype}."
             )
 
+        log_info_on_rank0(logger, f"Using KV cache dtype: {self.kv_cache_dtype}")
+
         self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory)
         if SGLANG_CI_SMALL_KV_SIZE:
             self.max_total_num_tokens = int(SGLANG_CI_SMALL_KV_SIZE)