Fix loading KV quantization scale; enable ModelOpt KV cache (#4686)

Co-authored-by: qingquansong <ustcsqq@gmail.com>
2025-04-08 09:11:35 -07:00
parent 88d6fd9a11
commit 2695ab0537
38 changed files with 151 additions and 76 deletions
--- a/python/sglang/srt/models/gemma3_causal.py
+++ b/python/sglang/srt/models/gemma3_causal.py
@@ -193,6 +193,7 @@ class Gemma3Attention(nn.Module):
            # Module must also define `get_attention_sliding_window_size` to correctly initialize
            # attention backend in `ForwardBatch`.
            sliding_window_size=self.sliding_window,
+            quant_config=quant_config,
            prefix=add_prefix("attn", prefix),
        )