Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>
This commit is contained in:
Lianmin Zheng
2025-03-03 00:12:04 -08:00
parent 0194948fd9
commit ac2387279e
86 changed files with 4116 additions and 2015 deletions

View File

@@ -111,6 +111,8 @@ else:
"cublas_grouped_gemm",
"custom_dispose",
"custom_reduce",
"build_tree_kernel_efficient",
"build_tree_kernel",
"fp8_blockwise_scaled_mm",
"fp8_scaled_mm",
"fused_add_rmsnorm",
@@ -127,12 +129,10 @@ else:
"register_graph_buffers",
"rmsnorm",
"sampling_scaling_penalties",
"sgl_per_token_group_quant_fp8",
"silu_and_mul",
"top_k_renorm_prob",
"top_k_top_p_sampling_from_probs",
"top_p_renorm_prob",
"tree_speculative_sampling_target_only",
"build_tree_kernel_efficient",
"build_tree_kernel",
"sgl_per_token_group_quant_fp8",
]