From 26f07294f1559d093e322c713478e785f8b684bf Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Wed, 26 Mar 2025 15:18:14 +0800 Subject: [PATCH] Warn users when release_memory_occupation is called without memory saver enabled (#4566) --- .github/workflows/pr-test-amd.yml | 12 +++++----- .github/workflows/release-docs.yml | 2 +- python/pyproject.toml | 2 +- python/sglang/srt/managers/scheduler.py | 4 ++++ .../sglang/srt/model_executor/model_runner.py | 9 +++++++- .../sglang/srt/torch_memory_saver_adapter.py | 22 +++++++++++++++++++ .../test/attention/test_flashattn_backend.py | 3 ++- python/sglang/test/test_utils.py | 5 +++++ test/srt/test_mla_int8_deepseek_v3.py | 2 +- test/srt/test_vision_openai_server.py | 1 - 10 files changed, 50 insertions(+), 12 deletions(-) diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index 0ba7994ff..03b9c433c 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -22,7 +22,7 @@ concurrency: jobs: accuracy-test-1-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false + github.event.pull_request.draft == false runs-on: linux-mi300-gpu-1 steps: - name: Checkout code @@ -56,13 +56,13 @@ jobs: - name: Evaluate Accuracy timeout-minutes: 20 run: | - docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_eval_accuracy_large.py - docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_eval_fp8_accuracy.py - docker exec -w /sglang-checkout/test/srt ci_sglang python3 models/test_qwen_models.py + docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_accuracy_large.py + docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_fp8_accuracy.py + docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_qwen_models.py mla-test-1-gpu-amd: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false + github.event.pull_request.draft == false runs-on: linux-mi300-gpu-1 steps: - name: Checkout code @@ -96,7 +96,7 @@ jobs: - name: MLA TEST timeout-minutes: 20 run: | - docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_mla.py + docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_mla.py finish: if: always() diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml index 661865ef8..00a6a3b7b 100644 --- a/.github/workflows/release-docs.yml +++ b/.github/workflows/release-docs.yml @@ -33,7 +33,7 @@ jobs: pip install -r docs/requirements.txt apt-get update apt-get install -y pandoc - apt-get update && apt-get install -y parallel + apt-get update && apt-get install -y parallel retry - name: Setup Jupyter Kernel run: | diff --git a/python/pyproject.toml b/python/pyproject.toml index 736a7dfcb..3a682804e 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -72,7 +72,7 @@ srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"] openai = ["openai>=1.0", "tiktoken"] anthropic = ["anthropic>=0.20.0"] litellm = ["litellm>=1.0.0"] -torch_memory_saver = ["torch_memory_saver"] +torch_memory_saver = ["torch_memory_saver>=0.0.3"] test = [ "jsonlines", "matplotlib", diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index f1f0b896f..28d875015 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1790,6 +1790,9 @@ class Scheduler( return GetWeightsByNameReqOutput(parameter) def release_memory_occupation(self, recv_req: ReleaseMemoryOccupationReqInput): + self.memory_saver_adapter.check_validity( + caller_name="release_memory_occupation" + ) self.stashed_model_static_state = _export_static_state( self.tp_worker.worker.model_runner.model ) @@ -1798,6 +1801,7 @@ class Scheduler( return ReleaseMemoryOccupationReqOutput() def resume_memory_occupation(self, recv_req: ResumeMemoryOccupationReqInput): + self.memory_saver_adapter.check_validity(caller_name="resume_memory_occupation") self.memory_saver_adapter.resume() _import_static_state( self.tp_worker.worker.model_runner.model, self.stashed_model_static_state diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 7bee2cb8a..e6d74998e 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -287,7 +287,14 @@ class ModelRunner: def init_torch_distributed(self): logger.info("Init torch distributed begin.") - torch.get_device_module(self.device).set_device(self.gpu_id) + try: + torch.get_device_module(self.device).set_device(self.gpu_id) + except Exception: + logger.warning( + f"Context: {self.device=} {self.gpu_id=} {os.environ.get('CUDA_VISIBLE_DEVICES')=} {self.tp_rank=} {self.tp_size=}" + ) + raise + if self.device == "cuda": backend = "nccl" elif self.device == "xpu": diff --git a/python/sglang/srt/torch_memory_saver_adapter.py b/python/sglang/srt/torch_memory_saver_adapter.py index 31f8ebf2f..4fd0611e1 100644 --- a/python/sglang/srt/torch_memory_saver_adapter.py +++ b/python/sglang/srt/torch_memory_saver_adapter.py @@ -1,3 +1,4 @@ +import logging from abc import ABC from contextlib import contextmanager @@ -8,6 +9,8 @@ try: except ImportError: pass +logger = logging.getLogger(__name__) + class TorchMemorySaverAdapter(ABC): @staticmethod @@ -16,6 +19,13 @@ class TorchMemorySaverAdapter(ABC): _TorchMemorySaverAdapterReal() if enable else _TorchMemorySaverAdapterNoop() ) + def check_validity(self, caller_name): + if not self.enabled: + logger.warning( + f"`{caller_name}` will not save memory because torch_memory_saver is not enabled. " + f"Potential causes: `enable_memory_saver` is false, or torch_memory_saver has installation issues." + ) + def configure_subprocess(self): raise NotImplementedError @@ -28,6 +38,10 @@ class TorchMemorySaverAdapter(ABC): def resume(self): raise NotImplementedError + @property + def enabled(self): + raise NotImplementedError + class _TorchMemorySaverAdapterReal(TorchMemorySaverAdapter): def configure_subprocess(self): @@ -42,6 +56,10 @@ class _TorchMemorySaverAdapterReal(TorchMemorySaverAdapter): def resume(self): return _primary_memory_saver.resume() + @property + def enabled(self): + return _primary_memory_saver.enabled + class _TorchMemorySaverAdapterNoop(TorchMemorySaverAdapter): @contextmanager @@ -57,3 +75,7 @@ class _TorchMemorySaverAdapterNoop(TorchMemorySaverAdapter): def resume(self): pass + + @property + def enabled(self): + return False diff --git a/python/sglang/test/attention/test_flashattn_backend.py b/python/sglang/test/attention/test_flashattn_backend.py index 4c37a8758..41fd3727c 100644 --- a/python/sglang/test/attention/test_flashattn_backend.py +++ b/python/sglang/test/attention/test_flashattn_backend.py @@ -6,6 +6,7 @@ from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBac from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode +from sglang.test.test_utils import CustomTestCase class MockModelRunner: @@ -39,7 +40,7 @@ class MockReqToTokenPool: @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA") -class TestFlashAttentionBackend(unittest.TestCase): +class TestFlashAttentionBackend(CustomTestCase): def setUp(self): """Set up test fixtures before each test method.""" self.model_runner = MockModelRunner() diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index f13986e61..095aed0fc 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -3,6 +3,7 @@ import argparse import asyncio import copy +import logging import os import random import subprocess @@ -922,6 +923,10 @@ def run_mulit_request_test( def write_github_step_summary(content): + if not os.environ.get("GITHUB_STEP_SUMMARY"): + logging.warning("GITHUB_STEP_SUMMARY environment variable not set") + return + with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f: f.write(content) diff --git a/test/srt/test_mla_int8_deepseek_v3.py b/test/srt/test_mla_int8_deepseek_v3.py index 27b7e0af4..643700020 100644 --- a/test/srt/test_mla_int8_deepseek_v3.py +++ b/test/srt/test_mla_int8_deepseek_v3.py @@ -46,7 +46,7 @@ class TestMLADeepseekV3ChannelInt8(CustomTestCase): metrics = run_eval_few_shot_gsm8k(args) print(metrics) - self.assertGreater(metrics["accuracy"], 0.62) + self.assertGreaterEqual(metrics["accuracy"], 0.61) class TestDeepseekV3MTPChannelInt8(CustomTestCase): diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index 4ca61d448..2e866d24e 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -624,7 +624,6 @@ class TestMinicpmoServer(TestOpenAIVisionServer): "minicpmo", "--mem-fraction-static", "0.7", - "--tp=2", ], ) cls.base_url += "/v1"