[benchmark] Add fused_moe_triton benchmark and tuning tools (#2225)

Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com>
Co-authored-by: HAI <hixiao@gmail.com>
This commit is contained in:
Xiaoyu Zhang
2024-11-30 05:36:45 +08:00
committed by GitHub
parent 419a57e771
commit 262e370f78
4 changed files with 732 additions and 3 deletions

View File

@@ -169,10 +169,11 @@ class ServerArgs:
gpu_mem = get_amdgpu_memory_capacity()
else:
gpu_mem = get_nvgpu_memory_capacity()
if gpu_mem < 25000:
self.chunked_prefill_size //= 4 # make it 2048
self.cuda_graph_max_bs = 4
logger.info("Automatically adjust --chunked-prefill-size for small GPUs.")
logger.warning(
"Your GPU has less than 25GB memory. You may want to set a smaller --chunked-prefill-size (e.g., 512) to improve performance."
)
# Choose kernel backends
if not is_flashinfer_available():