diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 033111e45..75f085ae4 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -269,10 +269,10 @@ class CudaGraphRunner: raise Exception( f"Capture cuda graph failed: {e}\n" "Possible solutions:\n" - "1. disable cuda graph by --disable-cuda-graph\n" - "2. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n" + "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n" + "2. set --cuda-graph-max-bs to a smaller value (e.g., 32)\n" "3. disable torch compile by not using --enable-torch-compile\n" - "4. set --cuda-graph-max-bs to a smaller value (e.g., 32)\n" + "4. disable cuda graph by --disable-cuda-graph\n" "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n" ) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index e7fae3a68..7c9ec61da 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -924,6 +924,12 @@ class ModelRunner: return if self.server_args.disable_cuda_graph: + logger.warning( + "\n\nCUDA Graph is DISABLED.\n" + "This will cause significant performance degradation.\n" + "CUDA Graph should almost never be disabled in most usage scenarios.\n" + "If you encounter OOM issues, please try setting --mem-fraction-static to a lower value (such as 0.8 or 0.7) instead of disabling CUDA Graph.\n" + ) return tic = time.time() diff --git a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py index d3ad50060..19866291b 100644 --- a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +++ b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py @@ -84,10 +84,10 @@ class EAGLEDraftCudaGraphRunner: raise Exception( f"Capture cuda graph failed: {e}\n" "Possible solutions:\n" - "1. disable cuda graph by --disable-cuda-graph\n" - "2. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n" - "3. disable torch compile by not using --enable-torch-compile\n" - "4. specify --dtype to the same dtype (e.g. bfloat16)\n" + "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n" + "2. disable torch compile by not using --enable-torch-compile\n" + "3. specify --dtype to the same dtype (e.g. bfloat16)\n" + "4. disable cuda graph by --disable-cuda-graph\n" "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n" )