From 35b65cf0ca3db721e1c463d8022503508831d78a Mon Sep 17 00:00:00 2001
From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
Date: Fri, 6 Jun 2025 02:37:05 +0800
Subject: [PATCH] Use deepgemm instead of triton for fused_qkv_a_proj_with_mqa (#6890)

---
 python/sglang/srt/layers/quantization/fp8_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py
index c180c0a77..e105e50c3 100644
--- a/python/sglang/srt/layers/quantization/fp8_utils.py
+++ b/python/sglang/srt/layers/quantization/fp8_utils.py
@@ -227,8 +227,8 @@ def deepgemm_w8a8_block_fp8_linear_with_fallback(
     output_dtype = input.dtype
     dtype_supported = output_dtype == torch.bfloat16
 
-    # TODO: add more robust shape check here
-    shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
+    # TODO: https://github.com/sgl-project/sglang/pull/6890#issuecomment-2943395737
+    shape_supported = weight.shape[0] % 64 == 0 and weight.shape[1] % 128 == 0
 
     if not (shape_supported and dtype_supported):
         # fall back to triton