Turn off autotune for scaled mm for fp8 dynamic quant in torchao (#2116)
This commit is contained in:
@@ -401,6 +401,10 @@ class TorchNativeLlamaForCausalLM(nn.Module):
|
|||||||
self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
|
self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
|
||||||
self.logits_processor = LogitsProcessor(config)
|
self.logits_processor = LogitsProcessor(config)
|
||||||
|
|
||||||
|
# turning off autotune for fp8dq since it doesn't give speedup and
|
||||||
|
# increases compile time significantly
|
||||||
|
torch._inductor.config.max_autotune_gemm_backends = "ATEN"
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Reference in New Issue
Block a user