refactor apply_w8a8_block_fp8_linear in fp (#6545)

2025-05-29 00:15:11 -07:00
parent 7e41290082
commit 485a023bd8
5 changed files with 283 additions and 120 deletions
--- a/benchmark/kernels/deepseek/benchmark_deepgemm_fp8_gemm.py
+++ b/benchmark/kernels/deepseek/benchmark_deepgemm_fp8_gemm.py
@@ -10,7 +10,9 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    w8a8_block_fp8_matmul as vllm_w8a8_block_fp8_matmul,
 )

-from sglang.srt.layers.quantization.fp8_kernel import w8a8_block_fp8_matmul
+from sglang.srt.layers.quantization.fp8_kernel import (
+    w8a8_block_fp8_matmul_deepgemm as w8a8_block_fp8_matmul,
+)


 # Adapted from https://github.com/tile-ai/tilelang/blob/a8cfdce92795cb861c9033573534653ee040b5ed/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py#L1