Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>
2025-03-03 00:12:04 -08:00
parent 0194948fd9
commit ac2387279e
86 changed files with 4116 additions and 2015 deletions
--- a/python/sglang/srt/layers/quantization/fp8.py
+++ b/python/sglang/srt/layers/quantization/fp8.py
@@ -771,6 +771,8 @@ class Fp8MoEMethod:
        custom_routing_function: Optional[Callable] = None,
        correction_bias: Optional[torch.Tensor] = None,
        activation: str = "silu",
+        inplace: bool = True,
+        no_combine: bool = False,
    ) -> torch.Tensor:
        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
        from sglang.srt.layers.moe.topk import select_experts
@@ -793,6 +795,7 @@ class Fp8MoEMethod:
            from aiter.fused_moe import fused_experts_ck

            assert activation == "silu", f"{activation=} is not supported."
+            assert not no_combine, f"{no_combine=} is not supported."

            return fused_experts_ck(
                x,
@@ -823,7 +826,7 @@ class Fp8MoEMethod:
                layer.w2_weight,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
-                inplace=True,
+                inplace=inplace and not no_combine,
                activation=activation,
                use_fp8_w8a8=True,
                w1_scale=(
@@ -839,6 +842,7 @@ class Fp8MoEMethod:
                a1_scale=layer.w13_input_scale,
                a2_scale=layer.w2_input_scale,
                block_shape=self.quant_config.weight_block_size,
+                no_combine=no_combine,
            )