Fix #3988: add `inplace` and `no_combine` parameters to BlockInt8MoEMethod (blockwise_int8) (#4023)

2025-03-04 15:49:58 +08:00
parent 95575aa76a
commit 12f2e6c3f1
1 changed file with 4 additions and 1 deletion
--- a/python/sglang/srt/layers/quantization/blockwise_int8.py
+++ b/python/sglang/srt/layers/quantization/blockwise_int8.py
@@ -371,6 +371,8 @@ class BlockInt8MoEMethod:
        custom_routing_function: Optional[Callable] = None,
        correction_bias: Optional[torch.Tensor] = None,
        activation: str = "silu",
+        inplace: bool = True,
+        no_combine: bool = False,
    ) -> torch.Tensor:
        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
        from sglang.srt.layers.moe.topk import select_experts
@@ -395,7 +397,7 @@ class BlockInt8MoEMethod:
            layer.w2_weight,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
-            inplace=True,
+            inplace=inplace,
            activation=activation,
            use_int8_w8a8=True,
            w1_scale=(layer.w13_weight_scale_inv),
@@ -403,4 +405,5 @@ class BlockInt8MoEMethod:
            a1_scale=layer.w13_input_scale,
            a2_scale=layer.w2_input_scale,
            block_shape=self.quant_config.weight_block_size,
+            no_combine=no_combine,
        )