Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>
This commit is contained in:
Lianmin Zheng
2025-03-03 00:12:04 -08:00
parent 0194948fd9
commit ac2387279e
86 changed files with 4116 additions and 2015 deletions

View File

@@ -771,6 +771,8 @@ class Fp8MoEMethod:
custom_routing_function: Optional[Callable] = None,
correction_bias: Optional[torch.Tensor] = None,
activation: str = "silu",
inplace: bool = True,
no_combine: bool = False,
) -> torch.Tensor:
from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
from sglang.srt.layers.moe.topk import select_experts
@@ -793,6 +795,7 @@ class Fp8MoEMethod:
from aiter.fused_moe import fused_experts_ck
assert activation == "silu", f"{activation=} is not supported."
assert not no_combine, f"{no_combine=} is not supported."
return fused_experts_ck(
x,
@@ -823,7 +826,7 @@ class Fp8MoEMethod:
layer.w2_weight,
topk_weights=topk_weights,
topk_ids=topk_ids,
inplace=True,
inplace=inplace and not no_combine,
activation=activation,
use_fp8_w8a8=True,
w1_scale=(
@@ -839,6 +842,7 @@ class Fp8MoEMethod:
a1_scale=layer.w13_input_scale,
a2_scale=layer.w2_input_scale,
block_shape=self.quant_config.weight_block_size,
no_combine=no_combine,
)