From 3964b352c3613b06b0f10fa5d7a8b2630fa80d61 Mon Sep 17 00:00:00 2001
From: Mick
Date: Sat, 19 Jul 2025 08:19:27 +0800
Subject: [PATCH] chore: tune mem fraction static for vlm (#6881)

---
 .../sglang/srt/model_executor/model_runner.py |  4 +-
 python/sglang/srt/server_args.py              | 48 ++++++++++++++++++-
 test/srt/test_vision_openai_server_a.py       | 10 ++--
 test/srt/test_vision_openai_server_b.py       |  8 ++--
 4 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 923b4d02b..bbd5b0000 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -411,7 +411,7 @@ class ModelRunner:
             else:
                 server_args.attention_backend = "triton"
             logger.info(
-                f"Attention backend not set. Use {server_args.attention_backend} backend by default."
+                f"Attention backend not explicitly specified. Use {server_args.attention_backend} backend by default."
             )
         elif self.use_mla_backend:
             if server_args.device != "cpu":
@@ -463,7 +463,7 @@ class ModelRunner:
         if not self.is_multimodal_chunked_prefill_supported:
             server_args.chunked_prefill_size = -1
             logger.info(
-                f"Automatically turn of --chunked-prefill-size as it is not supported for "
+                f"Automatically turn off --chunked-prefill-size as it is not supported for "
                 f"{self.model_config.hf_config.model_type}"
             )
 
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index cb8038d33..20db0b4b9 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -337,8 +337,52 @@ class ServerArgs:
 
         # Multimodal models need more memory for the image processor
         model_config = ModelConfig.from_server_args(self)
-        if model_config.is_multimodal:
-            self.mem_fraction_static *= 0.90
+
+        vision_config = getattr(model_config.hf_config, "vision_config", None)
+
+        if model_config.is_multimodal and vision_config:
+            # roughly reduce the mem_fraction_static based on params of ViT
+            original_server_arg_mem_fraction = self.mem_fraction_static
+            # a base mem_fraction_static factor for regular ViT
+            base_mem_fraction_reduction_ratio = 0.95
+
+            vit_num_layers = getattr(vision_config, "num_hidden_layers", 24)
+            vit_hidden_size = getattr(vision_config, "hidden_size", 1024)
+
+            # baseline ViT params (ViT-L/14)
+            baseline_vit_layers = 24
+            baseline_vit_hidden_size = 1024
+
+            # weight params count
+            current_complexity_score = vit_num_layers * (vit_hidden_size**2)
+            baseline_complexity_score = baseline_vit_layers * (
+                baseline_vit_hidden_size**2
+            )
+            complexity_ratio = (
+                current_complexity_score / baseline_complexity_score
+                if baseline_complexity_score > 0
+                else 1.0
+            )
+
+            # every time the complexity grows 100%, adjust final factor by 10%
+            sensitivity_scale = 0.1
+            dynamic_adjustment_factor = 1.0 - sensitivity_scale * (
+                complexity_ratio - 1.0
+            )
+            dynamic_adjustment_factor = max(
+                0.8, min(1.05, dynamic_adjustment_factor)
+            )
+
+            final_overall_factor = (
+                base_mem_fraction_reduction_ratio * dynamic_adjustment_factor
+            )
+            self.mem_fraction_static = (
+                original_server_arg_mem_fraction * final_overall_factor
+            )
+            logger.warning(
+                f"Multimodal model: Dynamically adjusted --mem-fraction-static "
+                f"from: {original_server_arg_mem_fraction:.3f} to: {self.mem_fraction_static:.3f}."
+            )
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
diff --git a/test/srt/test_vision_openai_server_a.py b/test/srt/test_vision_openai_server_a.py
index 90b91578f..f252c4884 100644
--- a/test/srt/test_vision_openai_server_a.py
+++ b/test/srt/test_vision_openai_server_a.py
@@ -30,7 +30,7 @@ class TestQwen2VLServer(TestOpenAIVisionServer):
             api_key=cls.api_key,
             other_args=[
                 "--mem-fraction-static",
-                "0.4",
+                "0.35",
             ],
         )
         cls.base_url += "/v1"
@@ -52,7 +52,7 @@ class TestQwen2_5_VLServer(TestOpenAIVisionServer):
             api_key=cls.api_key,
             other_args=[
                 "--mem-fraction-static",
-                "0.4",
+                "0.35",
             ],
         )
         cls.base_url += "/v1"
@@ -75,7 +75,7 @@ class TestVLMContextLengthIssue(CustomTestCase):
            other_args=[
                 "--context-length",
                 "300",
-                "--mem-fraction-static=0.80",
+                "--mem-fraction-static=0.75",
             ],
         )
         cls.base_url += "/v1"
@@ -147,7 +147,7 @@ class TestMinicpmvServer(TestOpenAIVisionServer):
             other_args=[
                 "--trust-remote-code",
                 "--mem-fraction-static",
-                "0.4",
+                "0.35",
             ],
         )
         cls.base_url += "/v1"
@@ -181,7 +181,7 @@ class TestMinicpmoServer(TestOpenAIVisionServer):
             other_args=[
                 "--trust-remote-code",
                 "--mem-fraction-static",
-                "0.7",
+                "0.65",
             ],
         )
         cls.base_url += "/v1"
diff --git a/test/srt/test_vision_openai_server_b.py b/test/srt/test_vision_openai_server_b.py
index 7a5716cb1..f6152ea76 100644
--- a/test/srt/test_vision_openai_server_b.py
+++ b/test/srt/test_vision_openai_server_b.py
@@ -22,7 +22,7 @@ class TestPixtralServer(TestOpenAIVisionServer):
             other_args=[
                 "--trust-remote-code",
                 "--mem-fraction-static",
-                "0.73",
+                "0.70",
             ],
         )
         cls.base_url += "/v1"
@@ -44,7 +44,7 @@ class TestMistral3_1Server(TestOpenAIVisionServer):
             other_args=[
                 "--trust-remote-code",
                 "--mem-fraction-static",
-                "0.8",
+                "0.75",
             ],
         )
         cls.base_url += "/v1"
@@ -88,7 +88,7 @@ class TestJanusProServer(TestOpenAIVisionServer):
             other_args=[
                 "--trust-remote-code",
                 "--mem-fraction-static",
-                "0.4",
+                "0.35",
             ],
         )
         cls.base_url += "/v1"
@@ -197,7 +197,7 @@ class TestPhi4MMServer(TestOpenAIVisionServer):
             other_args=[
                 "--trust-remote-code",
                 "--mem-fraction-static",
-                "0.75",
+                "0.70",
                 "--disable-radix-cache",
                 "--max-loras-per-batch",
                 "1",