ci: reduce and refactor VLM unit tests and combine test files (#11062)

2025-10-17 23:24:50 +08:00
parent d88ac9bc9a
commit 3e4c7da2f5
6 changed files with 85 additions and 555 deletions
--- a/test/srt/test_vision_openai_server_common.py
+++ b/test/srt/test_vision_openai_server_common.py
@@ -9,7 +9,12 @@ import requests
 from PIL import Image

 from sglang.srt.utils import kill_process_tree
-from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, CustomTestCase
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)

 # image
 IMAGE_MAN_IRONING_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/man_ironing_on_back_of_suv.png"
@@ -24,12 +29,21 @@ AUDIO_BIRD_SONG_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-fi


 class TestOpenAIMLLMServerBase(CustomTestCase):
+    model: str
+    extra_args: list = []
+    fixed_args: list = ["--trust-remote-code", "--enable-multimodal"]
+
    @classmethod
    def setUpClass(cls):
-        cls.model = ""
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
-        cls.process = None
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=cls.extra_args + cls.fixed_args,
+        )
        cls.base_url += "/v1"

    @classmethod
@@ -421,7 +435,7 @@ class ImageOpenAITestMixin(TestOpenAIMLLMServerBase):
            or "device" in video_response
            or "microphone" in video_response
        ), f"""
-        ====================== video_response =====================
+        ====================== video_images response =====================
        {video_response}
        ===========================================================
        should contain 'iPod' or 'device' or 'microphone'
@@ -435,7 +449,7 @@ class ImageOpenAITestMixin(TestOpenAIMLLMServerBase):
            or "Steve" in video_response
            or "hand" in video_response
        ), f"""
-        ====================== video_response =====================
+        ====================== video_images response =====================
        {video_response}
        ===========================================================
        should contain 'man' or 'person' or 'individual' or 'speaker' or 'presenter' or 'Steve' or 'hand'
@@ -446,7 +460,7 @@ class ImageOpenAITestMixin(TestOpenAIMLLMServerBase):
            or "display" in video_response
            or "hold" in video_response
        ), f"""
-        ====================== video_response =====================
+        ====================== video_images response =====================
        {video_response}
        ===========================================================
        should contain 'present' or 'examine' or 'display' or 'hold'