model: qwen3-omni (thinker-only) (#10911)

Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Author: Mick
Date: 2025-10-17 04:20:38 +08:00
Committed by: GitHub
Parent: 85ebeecf06
Commit: 86b04d25b3
16 changed files with 1954 additions and 335 deletions


@@ -355,9 +355,10 @@ class TestPhi4MMServer(ImageOpenAITestMixin, AudioOpenAITestMixin):
if __name__ == "__main__":
    del (
        TestOpenAIOmniServerBase,
        TestOpenAIMLLMServerBase,
        ImageOpenAITestMixin,
        VideoOpenAITestMixin,
        AudioOpenAITestMixin,
        OmniOpenAITestMixin,
    )
    unittest.main()
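Deleting the shared base classes and mixins from the module namespace before calling unittest.main() keeps unittest discovery from collecting and running them on their own; the concrete per-model test classes still inherit their methods. A minimal sketch of the same pattern, with hypothetical class names:

import unittest

class _SharedMixin(unittest.TestCase):
    # checks shared by several concrete server tests
    def test_common(self):
        self.assertTrue(True)

class TestConcrete(_SharedMixin):
    # inherits test_common; the only class discovery should run directly
    pass

if __name__ == "__main__":
    del _SharedMixin  # removed from module globals, so discovery skips it
    unittest.main()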


@@ -241,11 +241,35 @@ class TestGLM41VServer(ImageOpenAITestMixin, VideoOpenAITestMixin):
        cls.base_url += "/v1"

class TestQwen3OmniServer(OmniOpenAITestMixin):
    @classmethod
    def setUpClass(cls):
        cls.model = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[  # workaround to fit into H100
                "--trust-remote-code",
                "--mem-fraction-static",
                "0.90",
                "--disable-cuda-graph",
                "--disable-fast-image-processor",
                "--grammar-backend",
                "none",
            ],
        )
        cls.base_url += "/v1"

if __name__ == "__main__":
    del (
        TestOpenAIOmniServerBase,
        TestOpenAIMLLMServerBase,
        ImageOpenAITestMixin,
        VideoOpenAITestMixin,
        AudioOpenAITestMixin,
        OmniOpenAITestMixin,
    )
    unittest.main()
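The setUpClass above launches the OpenAI-compatible server with the listed flags before the omni tests run. A rough subprocess sketch of the equivalent launch is below; the sglang.launch_server module path and the --model-path flag are assumptions about how popen_launch_server forwards its arguments, since the helper itself is not shown in this diff.

import subprocess

# Approximate equivalent of the server process started by the test helper
# (module path, --model-path, and host/port handling are assumptions).
proc = subprocess.Popen(
    [
        "python", "-m", "sglang.launch_server",
        "--model-path", "Qwen/Qwen3-Omni-30B-A3B-Instruct",
        "--trust-remote-code",
        "--mem-fraction-static", "0.90",  # leave memory headroom so the model fits on an H100
        "--disable-cuda-graph",
        "--disable-fast-image-processor",
        "--grammar-backend", "none",
    ]
)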


@@ -1,6 +1,7 @@
import base64
import io
import os
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import openai
@@ -22,7 +23,7 @@ AUDIO_TRUMP_SPEECH_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test
AUDIO_BIRD_SONG_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/bird_song.mp3"
class TestOpenAIOmniServerBase(CustomTestCase):
class TestOpenAIMLLMServerBase(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = ""
@@ -58,7 +59,20 @@ class TestOpenAIOmniServerBase(CustomTestCase):
        return file_path
class AudioOpenAITestMixin(TestOpenAIOmniServerBase):
class AudioOpenAITestMixin(TestOpenAIMLLMServerBase):
    def verify_speech_recognition_response(self, text):
        check_list = [
            "thank you",
            "it's a privilege to be here",
            "leader",
            "science",
            "art",
        ]
        for check_word in check_list:
            assert (
                check_word in text.lower()
            ), f"audio_response: {text} should contain {check_word}"

    def prepare_audio_messages(self, prompt, audio_file_name):
        messages = [
            {
@@ -116,17 +130,7 @@ class AudioOpenAITestMixin(TestOpenAIOmniServerBase):
"Listen to this audio and write down the audio transcription in English.",
category="speech",
)
check_list = [
"thank you",
"it's a privilege to be here",
"leader",
"science",
"art",
]
for check_word in check_list:
assert (
check_word in audio_response
), f"audio_response: {audio_response} should contain {check_word}"
self.verify_speech_recognition_response(audio_response)
    def test_audio_ambient_completion(self):
        # bird song
@@ -138,7 +142,79 @@ class AudioOpenAITestMixin(TestOpenAIOmniServerBase):
assert "bird" in audio_response
class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
class ImageOpenAITestMixin(TestOpenAIMLLMServerBase):
    def run_decode_with_image(self, image_id):
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
        content = []
        if image_id == 0:
            content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": IMAGE_MAN_IRONING_URL},
                }
            )
        elif image_id == 1:
            content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": IMAGE_SGL_LOGO_URL},
                }
            )
        else:
            pass
        content.append(
            {
                "type": "text",
                "text": "Describe this image in a sentence.",
            }
        )
        response = client.chat.completions.create(
            model="default",
            messages=[
                {"role": "user", "content": content},
            ],
            temperature=0,
            **(self.get_vision_request_kwargs()),
        )
        assert response.choices[0].message.role == "assistant"
        text = response.choices[0].message.content
        assert isinstance(text, str)

    def test_mixed_batch(self):
        image_ids = [0, 1, 2] * 4
        with ThreadPoolExecutor(4) as executor:
            list(executor.map(self.run_decode_with_image, image_ids))

    def verify_single_image_response(self, response):
        assert response.choices[0].message.role == "assistant"
        text = response.choices[0].message.content
        assert isinstance(text, str)
        # `driver` is for gemma-3-it
        assert (
            "man" in text or "person" in text or "driver" in text
        ), f"text: {text}, should contain man, person or driver"
        assert (
            "cab" in text
            or "taxi" in text
            or "SUV" in text
            or "vehicle" in text
            or "car" in text
        ), f"text: {text}, should contain cab, taxi, SUV, vehicle or car"
        # MiniCPMO fails to recognize `iron`, but does recognize `hanging`
        assert (
            "iron" in text or "hang" in text or "cloth" in text or "holding" in text
        ), f"text: {text}, should contain iron, hang, cloth or holding"
        assert response.id
        assert response.created
        assert response.usage.prompt_tokens > 0
        assert response.usage.completion_tokens > 0
        assert response.usage.total_tokens > 0

    def test_single_image_chat_completion(self):
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
@@ -163,34 +239,11 @@ class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
            **(self.get_vision_request_kwargs()),
        )
        assert response.choices[0].message.role == "assistant"
        text = response.choices[0].message.content
        assert isinstance(text, str)
        # `driver` is for gemma-3-it
        assert (
            "man" in text or "person" or "driver" in text
        ), f"text: {text}, should contain man, person or driver"
        assert (
            "cab" in text
            or "taxi" in text
            or "SUV" in text
            or "vehicle" in text
            or "car" in text
        ), f"text: {text}, should contain cab, taxi, SUV, vehicle or car"
        # MiniCPMO fails to recognize `iron`, but `hanging`
        assert (
            "iron" in text
            or "hang" in text
            or "cloth" in text
            or "coat" in text
            or "holding" in text
            or "outfit" in text
        ), f"text: {text}, should contain iron, hang, cloth, coat or holding or outfit"
        assert response.id
        assert response.created
        assert response.usage.prompt_tokens > 0
        assert response.usage.completion_tokens > 0
        assert response.usage.total_tokens > 0
        print("-" * 30)
        print(f"Single image response:\n{response.choices[0].message.content}")
        print("-" * 30)
        self.verify_single_image_response(response)
    def test_multi_turn_chat_completion(self):
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
@@ -264,8 +317,7 @@ class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
                        },
                        {
                            "type": "text",
                            "text": "I have two very different images. They are not related at all. "
                            "Please describe the first image in one sentence, and then describe the second image in another sentence.",
                            "text": "I have two very different images. Please describe them.",
                        },
                    ],
                },
@@ -296,64 +348,6 @@ class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
        assert response.usage.completion_tokens > 0
        assert response.usage.total_tokens > 0
    def _test_mixed_image_audio_chat_completion(self):
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
        response = client.chat.completions.create(
            model="default",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": IMAGE_MAN_IRONING_URL},
                        },
                        {
                            "type": "audio_url",
                            "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
                        },
                        {
                            "type": "text",
                            "text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
                        },
                    ],
                },
            ],
            temperature=0,
            **(self.get_vision_request_kwargs()),
        )
        assert response.choices[0].message.role == "assistant"
        text = response.choices[0].message.content
        assert isinstance(text, str)
        print("-" * 30)
        print(f"Mixed image & audio response:\n{text}")
        print("-" * 30)
        assert (
            "man" in text
            or "cab" in text
            or "SUV" in text
            or "taxi" in text
            or "car" in text
        ), f"text: {text}, should contain man, cab, SUV, taxi or car"
        check_list = [
            "thank you",
            "it's a privilege to be here",
            "leader",
            "science",
            "art",
        ]
        for check_word in check_list:
            assert (
                check_word in text
            ), f"text: {text} should contain {check_word}"
        assert response.id
        assert response.created
        assert response.usage.prompt_tokens > 0
        assert response.usage.completion_tokens > 0
        assert response.usage.total_tokens > 0
    def prepare_video_images_messages(self, video_path):
        # the memory consumed by Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
        # the size of the video embeds differs depending on the `modality` argument used at preprocessing
@@ -461,7 +455,7 @@ class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
        self.assertGreater(len(video_response), 0)
class VideoOpenAITestMixin(TestOpenAIOmniServerBase):
class VideoOpenAITestMixin(TestOpenAIMLLMServerBase):
    def prepare_video_messages(self, video_path):
        messages = [
            {
@@ -526,3 +520,45 @@ class VideoOpenAITestMixin(TestOpenAIOmniServerBase):
), f"video_response: {video_response}, should contain 'black' or 'dark'"
self.assertIsNotNone(video_response)
self.assertGreater(len(video_response), 0)
class OmniOpenAITestMixin(
    ImageOpenAITestMixin, VideoOpenAITestMixin, AudioOpenAITestMixin
):
    def test_mixed_modality_chat_completion(self):
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": IMAGE_MAN_IRONING_URL},
                    },
                    {
                        "type": "audio_url",
                        "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
                    },
                    {
                        "type": "text",
                        "text": "I have an image and audio, which are not related at all. Please: 1. Describe the image in a sentence, 2. Repeat the exact words from the audio I provided. Be exact",
                    },
                ],
            },
        ]
        response = client.chat.completions.create(
            model="default",
            messages=messages,
            temperature=0,
            max_tokens=128,
            stream=False,
        )
        text = response.choices[0].message.content
        print("-" * 30)
        print(f"Mixed modality response:\n{text}")
        print("-" * 30)
        self.verify_single_image_response(response=response)
        self.verify_speech_recognition_response(text=text)
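Because OmniOpenAITestMixin simply composes the image, video, and audio mixins, covering another omni-capable model only needs a setUpClass override mirroring TestQwen3OmniServer above. A hypothetical sketch (the model id and extra launch args below are placeholders, not part of this change):

class TestAnotherOmniServer(OmniOpenAITestMixin):
    @classmethod
    def setUpClass(cls):
        cls.model = "org/another-omni-model"  # placeholder model id
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--trust-remote-code"],  # placeholder args
        )
        cls.base_url += "/v1"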