From 7f8fcd39cd405dbb5667265eb4171ae68935b47d Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Thu, 21 Nov 2024 12:19:49 -0800
Subject: [PATCH] Turn off autotune for scaled mm for fp8 dynamic quant in
 torchao (#2116)

---
 python/sglang/srt/models/torch_native_llama.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/sglang/srt/models/torch_native_llama.py b/python/sglang/srt/models/torch_native_llama.py
index ae4d48434..835158288 100644
--- a/python/sglang/srt/models/torch_native_llama.py
+++ b/python/sglang/srt/models/torch_native_llama.py
@@ -401,6 +401,10 @@ class TorchNativeLlamaForCausalLM(nn.Module):
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
+        # Turn off autotune for scaled mm under torchao fp8 dynamic quant (fp8dq):
+        # it gives no speedup there and significantly increases compile time.
+        torch._inductor.config.max_autotune_gemm_backends = "ATEN"
+
     @torch.no_grad()
     def forward(
         self,