model: qwen3-omni (thinker-only) (#10911)

Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
2025-10-17 04:20:38 +08:00
parent 85ebeecf06
commit 86b04d25b3
16 changed files with 1954 additions and 335 deletions
--- a/test/srt/test_vision_openai_server_a.py
+++ b/test/srt/test_vision_openai_server_a.py
@@ -355,9 +355,10 @@ class TestPhi4MMServer(ImageOpenAITestMixin, AudioOpenAITestMixin):

 if __name__ == "__main__":
    del (
-        TestOpenAIOmniServerBase,
+        TestOpenAIMLLMServerBase,
        ImageOpenAITestMixin,
        VideoOpenAITestMixin,
        AudioOpenAITestMixin,
+        OmniOpenAITestMixin,
    )
    unittest.main()
--- a/test/srt/test_vision_openai_server_b.py
+++ b/test/srt/test_vision_openai_server_b.py
@@ -241,11 +241,35 @@ class TestGLM41VServer(ImageOpenAITestMixin, VideoOpenAITestMixin):
        cls.base_url += "/v1"


+class TestQwen3OmniServer(OmniOpenAITestMixin):  # inherits the image, video, audio and mixed-modality tests
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(  # handle kept on the class; NOTE(review): presumably terminated by the base class teardown - confirm
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[  # workaround to fit into H100
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.90",
+                "--disable-cuda-graph",
+                "--disable-fast-image-processor",
+                "--grammar-backend",
+                "none",
+            ],
+        )
+        cls.base_url += "/v1"  # appended after launch: the server gets the bare URL, clients built from base_url get the /v1 prefix
+
+
 if __name__ == "__main__":
    del (
-        TestOpenAIOmniServerBase,
+        TestOpenAIMLLMServerBase,
        ImageOpenAITestMixin,
        VideoOpenAITestMixin,
        AudioOpenAITestMixin,
+        OmniOpenAITestMixin,
    )
    unittest.main()
--- a/test/srt/test_vision_openai_server_common.py
+++ b/test/srt/test_vision_openai_server_common.py
@@ -1,6 +1,7 @@
 import base64
 import io
 import os
+from concurrent.futures import ThreadPoolExecutor

 import numpy as np
 import openai
@@ -22,7 +23,7 @@ AUDIO_TRUMP_SPEECH_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test
 AUDIO_BIRD_SONG_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/bird_song.mp3"


-class TestOpenAIOmniServerBase(CustomTestCase):
+class TestOpenAIMLLMServerBase(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = ""
@@ -58,7 +59,20 @@ class TestOpenAIOmniServerBase(CustomTestCase):
        return file_path


-class AudioOpenAITestMixin(TestOpenAIOmniServerBase):
+class AudioOpenAITestMixin(TestOpenAIMLLMServerBase):
+    def verify_speech_recognition_response(self, text):  # asserts that `text` contains every expected phrase
+        check_list = [  # all phrases are lowercase; they are matched against the lower-cased text below
+            "thank you",
+            "it's a privilege to be here",
+            "leader",
+            "science",
+            "art",
+        ]
+        for check_word in check_list:
+            assert (
+                check_word in text.lower()  # case-insensitive substring match
+            ), f"audio_response: ｜{text}｜ should contain ｜{check_word}｜"
+
    def prepare_audio_messages(self, prompt, audio_file_name):
        messages = [
            {
@@ -116,17 +130,7 @@ class AudioOpenAITestMixin(TestOpenAIOmniServerBase):
            "Listen to this audio and write down the audio transcription in English.",
            category="speech",
        )
-        check_list = [
-            "thank you",
-            "it's a privilege to be here",
-            "leader",
-            "science",
-            "art",
-        ]
-        for check_word in check_list:
-            assert (
-                check_word in audio_response
-            ), f"audio_response: ｜{audio_response}｜ should contain ｜{check_word}｜"
+        self.verify_speech_recognition_response(audio_response)

    def test_audio_ambient_completion(self):
        # bird song
@@ -138,7 +142,79 @@ class AudioOpenAITestMixin(TestOpenAIOmniServerBase):
        assert "bird" in audio_response


-class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
+class ImageOpenAITestMixin(TestOpenAIMLLMServerBase):
+    def run_decode_with_image(self, image_id):  # image_id: 0 -> man-ironing image, 1 -> SGL logo, anything else -> no image
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        content = []  # message parts: optional image first, then the text prompt
+        if image_id == 0:
+            content.append(
+                {
+                    "type": "image_url",
+                    "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                }
+            )
+        elif image_id == 1:
+            content.append(
+                {
+                    "type": "image_url",
+                    "image_url": {"url": IMAGE_SGL_LOGO_URL},
+                }
+            )
+        else:
+            pass  # no image part is added: the request is text-only
+
+        content.append(
+            {
+                "type": "text",
+                "text": "Describe this image in a sentence.",
+            }
+        )
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=[
+                {"role": "user", "content": content},
+            ],
+            temperature=0,
+            **(self.get_vision_request_kwargs()),
+        )
+
+        assert response.choices[0].message.role == "assistant"  # only the response shape is checked, not what the text says
+        text = response.choices[0].message.content
+        assert isinstance(text, str)
+
+    def test_mixed_batch(self):  # sends image and text-only requests concurrently
+        image_ids = [0, 1, 2] * 4  # 12 requests; id 2 matches no image branch in run_decode_with_image, so it is text-only
+        with ThreadPoolExecutor(4) as executor:  # 4 worker threads
+            list(executor.map(self.run_decode_with_image, image_ids))  # consuming the results re-raises any failure from a worker
+
+    def verify_single_image_response(self, response):
+        """Check that a chat completion describes a person, a vehicle and ironing/clothing, and carries id/created/usage."""
+        assert response.choices[0].message.role == "assistant"
+        text = response.choices[0].message.content
+        assert isinstance(text, str)
+        # `driver` is for gemma-3-it; every alternative needs its own `in text` (a bare string is always truthy)
+        assert (
+            "man" in text or "person" in text or "driver" in text
+        ), f"text: {text}, should contain man, person or driver"
+        assert (
+            "cab" in text
+            or "taxi" in text
+            or "SUV" in text
+            or "vehicle" in text
+            or "car" in text
+        ), f"text: {text}, should contain cab, taxi, SUV, vehicle or car"
+        # MiniCPMO fails to recognize `iron`, but `hanging`
+        assert (
+            "iron" in text or "hang" in text or "cloth" in text or "holding" in text
+        ), f"text: {text}, should contain iron, hang, cloth or holding"
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
    def test_single_image_chat_completion(self):
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)

@@ -163,34 +239,11 @@ class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
            **(self.get_vision_request_kwargs()),
        )

-        assert response.choices[0].message.role == "assistant"
-        text = response.choices[0].message.content
-        assert isinstance(text, str)
-        # `driver` is for gemma-3-it
-        assert (
-            "man" in text or "person" or "driver" in text
-        ), f"text: {text}, should contain man, person or driver"
-        assert (
-            "cab" in text
-            or "taxi" in text
-            or "SUV" in text
-            or "vehicle" in text
-            or "car" in text
-        ), f"text: {text}, should contain cab, taxi, SUV, vehicle or car"
-        # MiniCPMO fails to recognize `iron`, but `hanging`
-        assert (
-            "iron" in text
-            or "hang" in text
-            or "cloth" in text
-            or "coat" in text
-            or "holding" in text
-            or "outfit" in text
-        ), f"text: {text}, should contain iron, hang, cloth, coat or holding or outfit"
-        assert response.id
-        assert response.created
-        assert response.usage.prompt_tokens > 0
-        assert response.usage.completion_tokens > 0
-        assert response.usage.total_tokens > 0
+        print("-" * 30)
+        print(f"Single image response:\n{response.choices[0].message.content}")
+        print("-" * 30)
+
+        self.verify_single_image_response(response)

    def test_multi_turn_chat_completion(self):
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
@@ -264,8 +317,7 @@ class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
                        },
                        {
                            "type": "text",
-                            "text": "I have two very different images. They are not related at all. "
-                            "Please describe the first image in one sentence, and then describe the second image in another sentence.",
+                            "text": "I have two very different images. Please describe them.",
                        },
                    ],
                },
@@ -296,64 +348,6 @@ class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
        assert response.usage.completion_tokens > 0
        assert response.usage.total_tokens > 0

-    def _test_mixed_image_audio_chat_completion(self):
-        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
-
-        response = client.chat.completions.create(
-            model="default",
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": IMAGE_MAN_IRONING_URL},
-                        },
-                        {
-                            "type": "audio_url",
-                            "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
-                        },
-                        {
-                            "type": "text",
-                            "text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
-                        },
-                    ],
-                },
-            ],
-            temperature=0,
-            **(self.get_vision_request_kwargs()),
-        )
-
-        assert response.choices[0].message.role == "assistant"
-        text = response.choices[0].message.content
-        assert isinstance(text, str)
-        print("-" * 30)
-        print(f"Mixed image & audio response:\n{text}")
-        print("-" * 30)
-        assert (
-            "man" in text
-            or "cab" in text
-            or "SUV" in text
-            or "taxi" in text
-            or "car" in text
-        ), f"text: {text}, should contain man, cab, SUV, taxi or car"
-        check_list = [
-            "thank you",
-            "it's a privilege to be here",
-            "leader",
-            "science",
-            "art",
-        ]
-        for check_word in check_list:
-            assert (
-                check_word in text
-            ), f"text: ｜{text}｜ should contain ｜{check_word}｜"
-        assert response.id
-        assert response.created
-        assert response.usage.prompt_tokens > 0
-        assert response.usage.completion_tokens > 0
-        assert response.usage.total_tokens > 0
-
    def prepare_video_images_messages(self, video_path):
        # the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
        # the size of the video embeds differs from the `modality` argument when preprocessed
@@ -461,7 +455,7 @@ class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
        self.assertGreater(len(video_response), 0)


-class VideoOpenAITestMixin(TestOpenAIOmniServerBase):
+class VideoOpenAITestMixin(TestOpenAIMLLMServerBase):
    def prepare_video_messages(self, video_path):
        messages = [
            {
@@ -526,3 +520,45 @@ class VideoOpenAITestMixin(TestOpenAIOmniServerBase):
        ), f"video_response: {video_response}, should contain 'black' or 'dark'"
        self.assertIsNotNone(video_response)
        self.assertGreater(len(video_response), 0)
+
+
+class OmniOpenAITestMixin(  # combines the three single-modality mixins and adds one image+audio test
+    ImageOpenAITestMixin, VideoOpenAITestMixin, AudioOpenAITestMixin
+):
+    def test_mixed_modality_chat_completion(self):  # one user turn carrying an image, an audio clip and a text prompt
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                    },
+                    {
+                        "type": "audio_url",
+                        "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
+                    },
+                    {
+                        "type": "text",
+                        "text": "I have an image and audio, which are not related at all. Please:  1. Describe the image in a sentence, 2. Repeat the exact words from the audio I provided. Be exact",
+                    },
+                ],
+            },
+        ]
+        response = client.chat.completions.create(
+            model="default",
+            messages=messages,
+            temperature=0,
+            max_tokens=128,
+            stream=False,
+        )
+
+        text = response.choices[0].message.content
+
+        print("-" * 30)
+        print(f"Mixed modality response:\n{text}")
+        print("-" * 30)
+
+        self.verify_single_image_response(response=response)  # image keywords plus id/created/usage checks
+        self.verify_speech_recognition_response(text=text)  # the same reply must also contain the expected speech phrases