Support Llama4 fp8 inference (#5194)

Co-authored-by: laixinn <xielx@shanghaitech.edu.cn> Co-authored-by: sleepcoo <sleepcoo@gmail.com> Co-authored-by: zhyncs <me@zhyncs.com>
2025-04-09 20:14:34 +08:00
parent 86a876d883
commit 4065248214
14 changed files with 537 additions and 106 deletions
--- a/test/srt/test_int8_kernel.py
+++ b/test/srt/test_int8_kernel.py
@@ -124,6 +124,7 @@ class TestW8A8Int8FusedMoE(CustomTestCase):
                use_fp8_w8a8=False,  # Not using fp8
                use_int8_w8a16=False,  # Not using int8-w8a16
                use_int8_w8a8=True,  # Using int8-w8a8
+                per_channel_quant=True,
                w1_scale=w1_s,
                w2_scale=w2_s,
                block_shape=None,  # Not using block quantization