Warn users when release_memory_occupation is called without memory saver enabled (#4566)

This commit is contained in:
fzyzcjy
2025-03-26 15:18:14 +08:00
committed by GitHub
parent 34e07a65f1
commit 26f07294f1
10 changed files with 50 additions and 12 deletions

View File

@@ -287,7 +287,14 @@ class ModelRunner:
def init_torch_distributed(self):
logger.info("Init torch distributed begin.")
torch.get_device_module(self.device).set_device(self.gpu_id)
try:
torch.get_device_module(self.device).set_device(self.gpu_id)
except Exception:
logger.warning(
f"Context: {self.device=} {self.gpu_id=} {os.environ.get('CUDA_VISIBLE_DEVICES')=} {self.tp_rank=} {self.tp_size=}"
)
raise
if self.device == "cuda":
backend = "nccl"
elif self.device == "xpu":