From 60468da4e2d7bda65ee3ad04857d7e29db9396af Mon Sep 17 00:00:00 2001 From: Garry Fang Date: Sun, 20 Jul 2025 05:41:27 +0800 Subject: [PATCH] bugfix: fix sglang crash in NVIDIA MIG container (#8167) Signed-off-by: Garrybest --- python/sglang/srt/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index dc6e72d75..7123722eb 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -1422,6 +1422,13 @@ def get_nvgpu_memory_capacity(): ] if not memory_values: + # Fall back to torch.cuda.mem_get_info() when nvidia-smi reports no memory capacity, + # which typically happens in NVIDIA MIG mode. + if torch.cuda.is_available(): + logger.warning( + "Failed to get GPU memory capacity from nvidia-smi, falling back to torch.cuda.mem_get_info()." + ) + return torch.cuda.mem_get_info()[1] // 1024 // 1024 # unit: MB raise ValueError("No GPU memory values found.") # Return the minimum memory value