[benchmark] Add fused_moe_triton benchmark and tuning tools (#2225)

Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com>
Co-authored-by: HAI <hixiao@gmail.com>
2024-11-30 05:36:45 +08:00
parent 419a57e771
commit 262e370f78
4 changed files with 732 additions and 3 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -169,10 +169,11 @@ class ServerArgs:
            gpu_mem = get_amdgpu_memory_capacity()
        else:
            gpu_mem = get_nvgpu_memory_capacity()
+
        if gpu_mem < 25000:
-            self.chunked_prefill_size //= 4  # make it 2048
-            self.cuda_graph_max_bs = 4
-            logger.info("Automatically adjust --chunked-prefill-size for small GPUs.")
+            logger.warning(
+                "Your GPU has less than 25GB memory. You may want to set a smaller --chunked-prefill-size (e.g., 512) to improve performance."
+            )

        # Choose kernel backends
        if not is_flashinfer_available():