diff --git a/python/sglang/srt/layers/quantization/blockwise_int8.py b/python/sglang/srt/layers/quantization/blockwise_int8.py
index ef03a3610..1470ca427 100644
--- a/python/sglang/srt/layers/quantization/blockwise_int8.py
+++ b/python/sglang/srt/layers/quantization/blockwise_int8.py
@@ -371,6 +371,8 @@ class BlockInt8MoEMethod:
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
+        inplace: bool = True,
+        no_combine: bool = False,
     ) -> torch.Tensor:
         from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
         from sglang.srt.layers.moe.topk import select_experts
@@ -395,7 +397,7 @@ class BlockInt8MoEMethod:
             layer.w2_weight,
             topk_weights=topk_weights,
             topk_ids=topk_ids,
-            inplace=True,
+            inplace=inplace,
             activation=activation,
             use_int8_w8a8=True,
             w1_scale=(layer.w13_weight_scale_inv),
@@ -403,4 +405,5 @@ class BlockInt8MoEMethod:
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             block_shape=self.quant_config.weight_block_size,
+            no_combine=no_combine,
         )
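
For context, a minimal sketch of how a caller might exercise the new flags after this change. The diff only shows that `inplace` and `no_combine` are plumbed from `BlockInt8MoEMethod.apply` into `fused_experts`; the rest of the `apply()` signature used here (`router_logits`, `top_k`, `renormalize`) and the tensor shapes are illustrative assumptions, not taken from this diff:

```python
import torch

from sglang.srt.layers.quantization.blockwise_int8 import BlockInt8MoEMethod


def moe_forward_uncombined(
    quant_method: BlockInt8MoEMethod,
    layer: torch.nn.Module,
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
) -> torch.Tensor:
    # New in this diff: inplace/no_combine are forwarded through apply()
    # into fused_experts(). no_combine=True asks the kernel to skip the
    # final weighted combine of top-k expert outputs, and inplace=False
    # keeps hidden_states untouched so the caller can still read it.
    return quant_method.apply(
        layer,
        hidden_states,
        router_logits=router_logits,
        top_k=2,            # assumed routing arguments; not part of this diff
        renormalize=True,
        inplace=False,
        no_combine=True,
    )
```

Because the new parameters default to `inplace=True` and `no_combine=False`, existing call sites keep their prior behavior unchanged; only callers that explicitly need the uncombined expert outputs (and therefore a separate output buffer) opt in.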