Use more general heuristics to set the default value of --mem-fraction-static (#10975)

Co-authored-by: sglang-bot <sglangbot@gmail.com>
This commit is contained in:
Lianmin Zheng
2025-09-29 10:11:03 -07:00
committed by GitHub
parent 816b3a433a
commit a17e70f5cc
9 changed files with 167 additions and 151 deletions

View File

@@ -38,7 +38,7 @@ class TestLlama4LoRA(CustomTestCase):
"--tp-size",
str(model.tp_size),
"--context-length",
"1048576",
"262144",
"--attention-backend",
"fa3",
],

View File

@@ -13,6 +13,7 @@ class TestFile:
suites = {
"per-commit": [
TestFile("function_call/test_json_schema_constraint.py", 30),
TestFile("hicache/test_hicache.py", 116),
TestFile("hicache/test_hicache_mla.py", 127),
TestFile("hicache/test_hicache_storage.py", 127),
@@ -20,11 +21,9 @@ suites = {
TestFile("lora/test_lora_eviction.py", 200),
TestFile("lora/test_lora_backend.py", 99),
TestFile("lora/test_multi_lora_backend.py", 60),
TestFile("lora/test_lora_cuda_graph.py", 250),
TestFile("lora/test_lora_update.py", 400),
TestFile("lora/test_lora_qwen3.py", 97),
TestFile("lora/test_lora_radix_cache.py", 100),
TestFile("lora/test_chunked_sgmv_backend.py", 30),
TestFile("models/test_embedding_models.py", 73),
# TestFile("models/test_clip_models.py", 52),
TestFile("models/test_encoder_embedding_models.py", 100),
@@ -51,7 +50,6 @@ suites = {
TestFile("openai_server/features/test_reasoning_content.py", 89),
TestFile("openai_server/function_call/test_openai_function_calling.py", 60),
TestFile("openai_server/function_call/test_tool_choice.py", 226),
TestFile("function_call/test_json_schema_constraint.py", 30),
TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
TestFile("openai_server/validation/test_matched_stop.py", 60),
TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85),
@@ -144,8 +142,6 @@ suites = {
TestFile("test_multi_instance_release_memory_occupation.py", 64),
],
"per-commit-8-gpu": [
# Disabled because it hangs on the CI.
# TestFile("ep/test_moe_ep.py", 181),
TestFile("hicache/test_hicache_storage_mooncake_backend.py", 800),
TestFile("lora/test_lora_llama4.py", 600),
TestFile("test_disaggregation.py", 499),

View File

@@ -3,7 +3,6 @@ import unittest
from types import SimpleNamespace
import requests
import torch
from sglang.srt.utils import is_cuda, is_hip, kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
@@ -11,6 +10,7 @@ from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
)
@@ -50,6 +50,7 @@ class TestMLADeepseekV3(CustomTestCase):
self.assertGreater(metrics["accuracy"], 0.62)
@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
class TestMLADeepseekV3DisableFusedFunc(CustomTestCase):
@classmethod
def setUpClass(cls):

View File

@@ -1,6 +1,6 @@
import multiprocessing
import os
import subprocess
import time
import traceback
import unittest
from multiprocessing import Process
@@ -21,7 +21,7 @@ from sglang.test.test_utils import (
TEST_SUITE = dict(
model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
mem_fraction_static=0.85,
mem_fraction_static=0.83,
dp_size=2,
tp_size=2,
)
@@ -214,6 +214,9 @@ def _run_sglang_subprocess(
_mem_usage = get_gpu_memory_gb(rank)
print(f"GPU{rank} Memory usage after resuming Sgl weights: {_mem_usage}")
del hf_model
hf_model = None
torch.cuda.empty_cache()
time.sleep(5)
torch.cuda.empty_cache()
_curr_usage = get_gpu_memory_gb(rank)
assert (