From 3964b352c3613b06b0f10fa5d7a8b2630fa80d61 Mon Sep 17 00:00:00 2001
From: Mick
Date: Sat, 19 Jul 2025 08:19:27 +0800
Subject: [PATCH] chore: tune mem fraction static for vlm (#6881)

---
 .../sglang/srt/model_executor/model_runner.py |  4 +-
 python/sglang/srt/server_args.py              | 48 ++++++++++++++++++-
 test/srt/test_vision_openai_server_a.py       | 10 ++--
 test/srt/test_vision_openai_server_b.py       |  8 ++--
 4 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 923b4d02b..bbd5b0000 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -411,7 +411,7 @@ class ModelRunner:
             else:
                 server_args.attention_backend = "triton"
             logger.info(
-                f"Attention backend not set. Use {server_args.attention_backend} backend by default."
+                f"Attention backend not explicitly specified. Use {server_args.attention_backend} backend by default."
             )
         elif self.use_mla_backend:
             if server_args.device != "cpu":
@@ -463,7 +463,7 @@ class ModelRunner:
         if not self.is_multimodal_chunked_prefill_supported:
             server_args.chunked_prefill_size = -1
             logger.info(
-                f"Automatically turn of --chunked-prefill-size as it is not supported for "
+                f"Automatically turn off --chunked-prefill-size as it is not supported for "
                 f"{self.model_config.hf_config.model_type}"
             )
 
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index cb8038d33..20db0b4b9 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -337,8 +337,52 @@ class ServerArgs:
 
         # Multimodal models need more memory for the image processor
         model_config = ModelConfig.from_server_args(self)
-        if model_config.is_multimodal:
-            self.mem_fraction_static *= 0.90
+
+        vision_config = getattr(model_config.hf_config, "vision_config", None)
+
+        if model_config.is_multimodal and vision_config:
+            # roughly reduce the mem_fraction_static based on params of ViT
+            original_server_arg_mem_fraction = self.mem_fraction_static
+            # a base mem_fraction_static factor for regular ViT
+            base_mem_fraction_reduction_ratio = 0.95
+
+            vit_num_layers = getattr(vision_config, "num_hidden_layers", 24)
+            vit_hidden_size = getattr(vision_config, "hidden_size", 1024)
+
+            # baseline ViT params (ViT-L/14)
+            baseline_vit_layers = 24
+            baseline_vit_hidden_size = 1024
+
+            # weight params count
+            current_complexity_score = vit_num_layers * (vit_hidden_size**2)
+            baseline_complexity_score = baseline_vit_layers * (
+                baseline_vit_hidden_size**2
+            )
+            complexity_ratio = (
+                current_complexity_score / baseline_complexity_score
+                if baseline_complexity_score > 0
+                else 1.0
+            )
+
+            # every time the complexity grows 100%, adjust final factor by 10%
+            sensitivity_scale = 0.1
+            dynamic_adjustment_factor = 1.0 - sensitivity_scale * (
+                complexity_ratio - 1.0
+            )
+            dynamic_adjustment_factor = max(
+                0.8, min(1.05, dynamic_adjustment_factor)
+            )
+
+            final_overall_factor = (
+                base_mem_fraction_reduction_ratio * dynamic_adjustment_factor
+            )
+            self.mem_fraction_static = (
+                original_server_arg_mem_fraction * final_overall_factor
+            )
+            logger.warning(
+                f"Multimodal model: Dynamically adjusted --mem-fraction-static "
+                f"from: {original_server_arg_mem_fraction:.3f} to: {self.mem_fraction_static:.3f}."
+            )
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
diff --git a/test/srt/test_vision_openai_server_a.py b/test/srt/test_vision_openai_server_a.py
index 90b91578f..f252c4884 100644
--- a/test/srt/test_vision_openai_server_a.py
+++ b/test/srt/test_vision_openai_server_a.py
@@ -30,7 +30,7 @@ class TestQwen2VLServer(TestOpenAIVisionServer):
             api_key=cls.api_key,
             other_args=[
                 "--mem-fraction-static",
-                "0.4",
+                "0.35",
             ],
         )
         cls.base_url += "/v1"
@@ -52,7 +52,7 @@ class TestQwen2_5_VLServer(TestOpenAIVisionServer):
             api_key=cls.api_key,
             other_args=[
                 "--mem-fraction-static",
-                "0.4",
+                "0.35",
             ],
         )
         cls.base_url += "/v1"
@@ -75,7 +75,7 @@ class TestVLMContextLengthIssue(CustomTestCase):
            other_args=[
                 "--context-length",
                 "300",
-                "--mem-fraction-static=0.80",
+                "--mem-fraction-static=0.75",
             ],
         )
         cls.base_url += "/v1"
@@ -147,7 +147,7 @@ class TestMinicpmvServer(TestOpenAIVisionServer):
             other_args=[
                 "--trust-remote-code",
                 "--mem-fraction-static",
-                "0.4",
+                "0.35",
             ],
         )
         cls.base_url += "/v1"
@@ -181,7 +181,7 @@ class TestMinicpmoServer(TestOpenAIVisionServer):
             other_args=[
                 "--trust-remote-code",
                 "--mem-fraction-static",
-                "0.7",
+                "0.65",
             ],
         )
         cls.base_url += "/v1"
diff --git a/test/srt/test_vision_openai_server_b.py b/test/srt/test_vision_openai_server_b.py
index 7a5716cb1..f6152ea76 100644
--- a/test/srt/test_vision_openai_server_b.py
+++ b/test/srt/test_vision_openai_server_b.py
@@ -22,7 +22,7 @@ class TestPixtralServer(TestOpenAIVisionServer):
             other_args=[
                 "--trust-remote-code",
                 "--mem-fraction-static",
-                "0.73",
+                "0.70",
             ],
         )
         cls.base_url += "/v1"
@@ -44,7 +44,7 @@ class TestMistral3_1Server(TestOpenAIVisionServer):
             other_args=[
                 "--trust-remote-code",
                 "--mem-fraction-static",
-                "0.8",
+                "0.75",
             ],
         )
         cls.base_url += "/v1"
@@ -88,7 +88,7 @@ class TestJanusProServer(TestOpenAIVisionServer):
             other_args=[
                 "--trust-remote-code",
                 "--mem-fraction-static",
-                "0.4",
+                "0.35",
             ],
         )
         cls.base_url += "/v1"
@@ -197,7 +197,7 @@ class TestPhi4MMServer(TestOpenAIVisionServer):
             other_args=[
                 "--trust-remote-code",
                 "--mem-fraction-static",
-                "0.75",
+                "0.70",
                 "--disable-radix-cache",
                 "--max-loras-per-batch",
                 "1",