From 405780bcf0d12b4d8f301b07bd285758c9cc0319 Mon Sep 17 00:00:00 2001 From: kk <43161300+kkHuang-amd@users.noreply.github.com> Date: Tue, 17 Jun 2025 13:26:51 +0800 Subject: [PATCH] [amd] Opt dsv3 moe (#7160) Co-authored-by: wunhuang --- python/sglang/srt/layers/quantization/fp8.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index c779f1f1d..80a5971a0 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -82,6 +82,7 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip if _is_hip: from aiter import ActivationType, QuantType + from aiter.fused_moe import fused_moe from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages from aiter.ops.shuffle import shuffle_weight @@ -1062,19 +1063,20 @@ class Fp8MoEMethod: if _use_aiter: assert not no_combine, f"{no_combine=} is not supported." if self.block_quant: - # TODO(_use_aiter): FP8 block_quant only supports 'silu' for the time-being. - assert ( - activation == "silu" - ), f"_use_aiter: FP8 bloack_quant {activation=} will be supported later, unset _use_aiter" - return asm_moe( + return fused_moe( x, layer.w13_weight, layer.w2_weight, topk_weights, topk_ids, - layer.w13_weight_scale_inv, - layer.w2_weight_scale_inv, - block_shape=tuple(self.quant_config.weight_block_size), + w1_scale=layer.w13_weight_scale_inv, + w2_scale=layer.w2_weight_scale_inv, + quant_type=QuantType.per_128x128, + activation=( + ActivationType.Silu + if activation == "silu" + else ActivationType.Gelu + ), expert_mask=None, ) else: