diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
index 92f46f009..bb39e2d9d 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -18,6 +18,7 @@ from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
 from sglang.srt.layers.quantization.int8_kernel import per_token_group_quant_int8
 from sglang.srt.utils import (
     direct_register_custom_op,
+    get_bool_env_var,
     get_device_name,
     is_cuda_available,
     is_hip,
@@ -941,7 +942,11 @@ def fused_experts_impl(
     no_combine: bool = False,
 ):
     padded_size = padding_size
-    if not use_fp8_w8a8 or not use_int8_w8a8 or block_shape is not None:
+    if (
+        not (use_fp8_w8a8 or use_int8_w8a8)
+        or block_shape is not None
+        or (is_hip_ and get_bool_env_var("CK_MOE"))
+    ):
         padded_size = 0

     # Check constraints.