From 26f07294f1559d093e322c713478e785f8b684bf Mon Sep 17 00:00:00 2001
From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
Date: Wed, 26 Mar 2025 15:18:14 +0800
Subject: [PATCH] Warn users when release_memory_occupation is called without
 memory saver enabled (#4566)

---
 .github/workflows/pr-test-amd.yml             | 12 +++++-----
 .github/workflows/release-docs.yml            |  2 +-
 python/pyproject.toml                         |  2 +-
 python/sglang/srt/managers/scheduler.py       |  4 ++++
 .../sglang/srt/model_executor/model_runner.py |  9 +++++++-
 .../sglang/srt/torch_memory_saver_adapter.py  | 22 +++++++++++++++++++
 .../test/attention/test_flashattn_backend.py  |  3 ++-
 python/sglang/test/test_utils.py              |  5 +++++
 test/srt/test_mla_int8_deepseek_v3.py         |  2 +-
 test/srt/test_vision_openai_server.py         |  1 -
 10 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index 0ba7994ff..03b9c433c 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -22,7 +22,7 @@ concurrency:
 jobs:
   accuracy-test-1-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
-        github.event.pull_request.draft == false
+      github.event.pull_request.draft == false
     runs-on: linux-mi300-gpu-1
     steps:
       - name: Checkout code
@@ -56,13 +56,13 @@ jobs:
       - name: Evaluate Accuracy
         timeout-minutes: 20
         run: |
-          docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_eval_accuracy_large.py
-          docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_eval_fp8_accuracy.py
-          docker exec -w /sglang-checkout/test/srt ci_sglang python3 models/test_qwen_models.py
+          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_accuracy_large.py
+          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_fp8_accuracy.py
+          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_qwen_models.py
 
   mla-test-1-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
-        github.event.pull_request.draft == false
+      github.event.pull_request.draft == false
     runs-on: linux-mi300-gpu-1
     steps:
       - name: Checkout code
@@ -96,7 +96,7 @@ jobs:
       - name: MLA TEST
         timeout-minutes: 20
         run: |
-          docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_mla.py
+          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_mla.py
 
   finish:
     if: always()
diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml
index 661865ef8..00a6a3b7b 100644
--- a/.github/workflows/release-docs.yml
+++ b/.github/workflows/release-docs.yml
@@ -33,7 +33,7 @@ jobs:
           pip install -r docs/requirements.txt
           apt-get update
           apt-get install -y pandoc
-          apt-get update && apt-get install -y parallel
+          apt-get update && apt-get install -y parallel retry
 
       - name: Setup Jupyter Kernel
         run: |
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 736a7dfcb..3a682804e 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -72,7 +72,7 @@ srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
-torch_memory_saver = ["torch_memory_saver"]
+torch_memory_saver = ["torch_memory_saver>=0.0.3"]
 test = [
     "jsonlines",
     "matplotlib",
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index f1f0b896f..28d875015 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -1790,6 +1790,9 @@ class Scheduler(
         return GetWeightsByNameReqOutput(parameter)
 
     def release_memory_occupation(self, recv_req: ReleaseMemoryOccupationReqInput):
+        self.memory_saver_adapter.check_validity(
+            caller_name="release_memory_occupation"
+        )
         self.stashed_model_static_state = _export_static_state(
             self.tp_worker.worker.model_runner.model
         )
@@ -1798,6 +1801,7 @@ class Scheduler(
         return ReleaseMemoryOccupationReqOutput()
 
     def resume_memory_occupation(self, recv_req: ResumeMemoryOccupationReqInput):
+        self.memory_saver_adapter.check_validity(caller_name="resume_memory_occupation")
         self.memory_saver_adapter.resume()
         _import_static_state(
             self.tp_worker.worker.model_runner.model, self.stashed_model_static_state
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 7bee2cb8a..e6d74998e 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -287,7 +287,14 @@ class ModelRunner:
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
 
-        torch.get_device_module(self.device).set_device(self.gpu_id)
+        try:
+            torch.get_device_module(self.device).set_device(self.gpu_id)
+        except Exception:
+            logger.warning(
+                f"Context: {self.device=} {self.gpu_id=} {os.environ.get('CUDA_VISIBLE_DEVICES')=} {self.tp_rank=} {self.tp_size=}"
+            )
+            raise
+
         if self.device == "cuda":
             backend = "nccl"
         elif self.device == "xpu":
diff --git a/python/sglang/srt/torch_memory_saver_adapter.py b/python/sglang/srt/torch_memory_saver_adapter.py
index 31f8ebf2f..4fd0611e1 100644
--- a/python/sglang/srt/torch_memory_saver_adapter.py
+++ b/python/sglang/srt/torch_memory_saver_adapter.py
@@ -1,3 +1,4 @@
+import logging
 from abc import ABC
 from contextlib import contextmanager
 
@@ -8,6 +9,8 @@ try:
 except ImportError:
     pass
 
+logger = logging.getLogger(__name__)
+
 
 class TorchMemorySaverAdapter(ABC):
     @staticmethod
@@ -16,6 +19,13 @@ class TorchMemorySaverAdapter(ABC):
             _TorchMemorySaverAdapterReal() if enable else _TorchMemorySaverAdapterNoop()
         )
 
+    def check_validity(self, caller_name):  # log-only: warns if the saver is disabled, caller still proceeds
+        if not self.enabled:
+            logger.warning(
+                f"`{caller_name}` will not save memory because torch_memory_saver is not enabled. "
+                "Potential causes: `enable_memory_saver` is false, or torch_memory_saver has installation issues."
+            )
+
     def configure_subprocess(self):
         raise NotImplementedError
 
@@ -28,6 +38,10 @@ class TorchMemorySaverAdapter(ABC):
     def resume(self):
         raise NotImplementedError
 
+    @property
+    def enabled(self):  # read by check_validity(); concrete adapters must override
+        raise NotImplementedError
+
 
 class _TorchMemorySaverAdapterReal(TorchMemorySaverAdapter):
     def configure_subprocess(self):
@@ -42,6 +56,10 @@ class _TorchMemorySaverAdapterReal(TorchMemorySaverAdapter):
     def resume(self):
         return _primary_memory_saver.resume()
 
+    @property
+    def enabled(self):  # forwards the `enabled` attribute of `_primary_memory_saver`
+        return _primary_memory_saver.enabled
+
 
 class _TorchMemorySaverAdapterNoop(TorchMemorySaverAdapter):
     @contextmanager
@@ -57,3 +75,7 @@ class _TorchMemorySaverAdapterNoop(TorchMemorySaverAdapter):
 
     def resume(self):
         pass
+
+    @property
+    def enabled(self):
+        return False  # always False, so check_validity() warns on every call
diff --git a/python/sglang/test/attention/test_flashattn_backend.py b/python/sglang/test/attention/test_flashattn_backend.py
index 4c37a8758..41fd3727c 100644
--- a/python/sglang/test/attention/test_flashattn_backend.py
+++ b/python/sglang/test/attention/test_flashattn_backend.py
@@ -6,6 +6,7 @@ from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBac
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.test.test_utils import CustomTestCase
 
 
 class MockModelRunner:
@@ -39,7 +40,7 @@ class MockReqToTokenPool:
 
 
 @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA")
-class TestFlashAttentionBackend(unittest.TestCase):
+class TestFlashAttentionBackend(CustomTestCase):
     def setUp(self):
         """Set up test fixtures before each test method."""
         self.model_runner = MockModelRunner()
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index f13986e61..095aed0fc 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -3,6 +3,7 @@
 import argparse
 import asyncio
 import copy
+import logging
 import os
 import random
 import subprocess
@@ -922,6 +923,10 @@ def run_mulit_request_test(
 
 
 def write_github_step_summary(content):
+    if not os.environ.get("GITHUB_STEP_SUMMARY"):  # unset or empty: no path to append to
+        logging.warning("GITHUB_STEP_SUMMARY environment variable not set")
+        return
+
     with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
         f.write(content)
 
diff --git a/test/srt/test_mla_int8_deepseek_v3.py b/test/srt/test_mla_int8_deepseek_v3.py
index 27b7e0af4..643700020 100644
--- a/test/srt/test_mla_int8_deepseek_v3.py
+++ b/test/srt/test_mla_int8_deepseek_v3.py
@@ -46,7 +46,7 @@ class TestMLADeepseekV3ChannelInt8(CustomTestCase):
         metrics = run_eval_few_shot_gsm8k(args)
         print(metrics)
 
-        self.assertGreater(metrics["accuracy"], 0.62)
+        self.assertGreaterEqual(metrics["accuracy"], 0.61)
 
 
 class TestDeepseekV3MTPChannelInt8(CustomTestCase):
diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py
index 4ca61d448..2e866d24e 100644
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -624,7 +624,6 @@ class TestMinicpmoServer(TestOpenAIVisionServer):
                 "minicpmo",
                 "--mem-fraction-static",
                 "0.7",
-                "--tp=2",
             ],
         )
         cls.base_url += "/v1"