Use more general heuristics to set the default value of --mem-fraction-static (#10975)

Co-authored-by: sglang-bot <sglangbot@gmail.com>
2025-09-29 10:11:03 -07:00
parent 816b3a433a
commit a17e70f5cc
9 changed files with 167 additions and 151 deletions
--- a/test/srt/lora/test_lora_llama4.py
+++ b/test/srt/lora/test_lora_llama4.py
@@ -38,7 +38,7 @@ class TestLlama4LoRA(CustomTestCase):
                        "--tp-size",
                        str(model.tp_size),
                        "--context-length",
-                        "1048576",
+                        "262144",
                        "--attention-backend",
                        "fa3",
                    ],
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -13,6 +13,7 @@ class TestFile:

 suites = {
    "per-commit": [
+        TestFile("function_call/test_json_schema_constraint.py", 30),
        TestFile("hicache/test_hicache.py", 116),
        TestFile("hicache/test_hicache_mla.py", 127),
        TestFile("hicache/test_hicache_storage.py", 127),
@@ -20,11 +21,9 @@ suites = {
        TestFile("lora/test_lora_eviction.py", 200),
        TestFile("lora/test_lora_backend.py", 99),
        TestFile("lora/test_multi_lora_backend.py", 60),
-        TestFile("lora/test_lora_cuda_graph.py", 250),
        TestFile("lora/test_lora_update.py", 400),
        TestFile("lora/test_lora_qwen3.py", 97),
        TestFile("lora/test_lora_radix_cache.py", 100),
-        TestFile("lora/test_chunked_sgmv_backend.py", 30),
        TestFile("models/test_embedding_models.py", 73),
        # TestFile("models/test_clip_models.py", 52),
        TestFile("models/test_encoder_embedding_models.py", 100),
@@ -51,7 +50,6 @@ suites = {
        TestFile("openai_server/features/test_reasoning_content.py", 89),
        TestFile("openai_server/function_call/test_openai_function_calling.py", 60),
        TestFile("openai_server/function_call/test_tool_choice.py", 226),
-        TestFile("function_call/test_json_schema_constraint.py", 30),
        TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
        TestFile("openai_server/validation/test_matched_stop.py", 60),
        TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85),
@@ -144,8 +142,6 @@ suites = {
        TestFile("test_multi_instance_release_memory_occupation.py", 64),
    ],
    "per-commit-8-gpu": [
-        # Disabled because it hangs on the CI.
-        # TestFile("ep/test_moe_ep.py", 181),
        TestFile("hicache/test_hicache_storage_mooncake_backend.py", 800),
        TestFile("lora/test_lora_llama4.py", 600),
        TestFile("test_disaggregation.py", 499),
--- a/test/srt/test_mla_deepseek_v3.py
+++ b/test/srt/test_mla_deepseek_v3.py
@@ -3,7 +3,6 @@ import unittest
 from types import SimpleNamespace

 import requests
-import torch

 from sglang.srt.utils import is_cuda, is_hip, kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
@@ -11,6 +10,7 @@ from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
+    is_in_ci,
    popen_launch_server,
 )

@@ -50,6 +50,7 @@ class TestMLADeepseekV3(CustomTestCase):
        self.assertGreater(metrics["accuracy"], 0.62)


+@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
 class TestMLADeepseekV3DisableFusedFunc(CustomTestCase):
    @classmethod
    def setUpClass(cls):
--- a/test/srt/test_multi_instance_release_memory_occupation.py
+++ b/test/srt/test_multi_instance_release_memory_occupation.py
@@ -1,6 +1,6 @@
 import multiprocessing
 import os
-import subprocess
+import time
 import traceback
 import unittest
 from multiprocessing import Process
@@ -21,7 +21,7 @@ from sglang.test.test_utils import (

 TEST_SUITE = dict(
    model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
-    mem_fraction_static=0.85,
+    mem_fraction_static=0.83,
    dp_size=2,
    tp_size=2,
 )
@@ -214,6 +214,9 @@ def _run_sglang_subprocess(
        _mem_usage = get_gpu_memory_gb(rank)
        print(f"GPU{rank} Memory usage after resuming Sgl weights: {_mem_usage}")
        del hf_model
+        hf_model = None
+        torch.cuda.empty_cache()
+        time.sleep(5)
        torch.cuda.empty_cache()
        _curr_usage = get_gpu_memory_gb(rank)
        assert (