model: qwen3-omni (thinker-only) (#10911)
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
This commit is contained in:
@@ -355,9 +355,10 @@ class TestPhi4MMServer(ImageOpenAITestMixin, AudioOpenAITestMixin):
|
||||
|
||||
if __name__ == "__main__":
|
||||
del (
|
||||
TestOpenAIOmniServerBase,
|
||||
TestOpenAIMLLMServerBase,
|
||||
ImageOpenAITestMixin,
|
||||
VideoOpenAITestMixin,
|
||||
AudioOpenAITestMixin,
|
||||
OmniOpenAITestMixin,
|
||||
)
|
||||
unittest.main()
|
||||
|
||||
@@ -241,11 +241,35 @@ class TestGLM41VServer(ImageOpenAITestMixin, VideoOpenAITestMixin):
|
||||
cls.base_url += "/v1"
|
||||
|
||||
|
||||
class TestQwen3OmniServer(OmniOpenAITestMixin):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
cls.model = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
|
||||
cls.base_url = DEFAULT_URL_FOR_TEST
|
||||
cls.api_key = "sk-123456"
|
||||
cls.process = popen_launch_server(
|
||||
cls.model,
|
||||
cls.base_url,
|
||||
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
other_args=[ # workaround to fit into H100
|
||||
"--trust-remote-code",
|
||||
"--mem-fraction-static",
|
||||
"0.90",
|
||||
"--disable-cuda-graph",
|
||||
"--disable-fast-image-processor",
|
||||
"--grammar-backend",
|
||||
"none",
|
||||
],
|
||||
)
|
||||
cls.base_url += "/v1"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
del (
|
||||
TestOpenAIOmniServerBase,
|
||||
TestOpenAIMLLMServerBase,
|
||||
ImageOpenAITestMixin,
|
||||
VideoOpenAITestMixin,
|
||||
AudioOpenAITestMixin,
|
||||
OmniOpenAITestMixin,
|
||||
)
|
||||
unittest.main()
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import base64
|
||||
import io
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import numpy as np
|
||||
import openai
|
||||
@@ -22,7 +23,7 @@ AUDIO_TRUMP_SPEECH_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test
|
||||
AUDIO_BIRD_SONG_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/bird_song.mp3"
|
||||
|
||||
|
||||
class TestOpenAIOmniServerBase(CustomTestCase):
|
||||
class TestOpenAIMLLMServerBase(CustomTestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
cls.model = ""
|
||||
@@ -58,7 +59,20 @@ class TestOpenAIOmniServerBase(CustomTestCase):
|
||||
return file_path
|
||||
|
||||
|
||||
class AudioOpenAITestMixin(TestOpenAIOmniServerBase):
|
||||
class AudioOpenAITestMixin(TestOpenAIMLLMServerBase):
|
||||
def verify_speech_recognition_response(self, text):
|
||||
check_list = [
|
||||
"thank you",
|
||||
"it's a privilege to be here",
|
||||
"leader",
|
||||
"science",
|
||||
"art",
|
||||
]
|
||||
for check_word in check_list:
|
||||
assert (
|
||||
check_word in text.lower()
|
||||
), f"audio_response: |{text}| should contain |{check_word}|"
|
||||
|
||||
def prepare_audio_messages(self, prompt, audio_file_name):
|
||||
messages = [
|
||||
{
|
||||
@@ -116,17 +130,7 @@ class AudioOpenAITestMixin(TestOpenAIOmniServerBase):
|
||||
"Listen to this audio and write down the audio transcription in English.",
|
||||
category="speech",
|
||||
)
|
||||
check_list = [
|
||||
"thank you",
|
||||
"it's a privilege to be here",
|
||||
"leader",
|
||||
"science",
|
||||
"art",
|
||||
]
|
||||
for check_word in check_list:
|
||||
assert (
|
||||
check_word in audio_response
|
||||
), f"audio_response: |{audio_response}| should contain |{check_word}|"
|
||||
self.verify_speech_recognition_response(audio_response)
|
||||
|
||||
def test_audio_ambient_completion(self):
|
||||
# bird song
|
||||
@@ -138,7 +142,79 @@ class AudioOpenAITestMixin(TestOpenAIOmniServerBase):
|
||||
assert "bird" in audio_response
|
||||
|
||||
|
||||
class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
|
||||
class ImageOpenAITestMixin(TestOpenAIMLLMServerBase):
|
||||
def run_decode_with_image(self, image_id):
|
||||
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
|
||||
|
||||
content = []
|
||||
if image_id == 0:
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": IMAGE_MAN_IRONING_URL},
|
||||
}
|
||||
)
|
||||
elif image_id == 1:
|
||||
content.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": IMAGE_SGL_LOGO_URL},
|
||||
}
|
||||
)
|
||||
else:
|
||||
pass
|
||||
|
||||
content.append(
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe this image in a sentence.",
|
||||
}
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="default",
|
||||
messages=[
|
||||
{"role": "user", "content": content},
|
||||
],
|
||||
temperature=0,
|
||||
**(self.get_vision_request_kwargs()),
|
||||
)
|
||||
|
||||
assert response.choices[0].message.role == "assistant"
|
||||
text = response.choices[0].message.content
|
||||
assert isinstance(text, str)
|
||||
|
||||
def test_mixed_batch(self):
|
||||
image_ids = [0, 1, 2] * 4
|
||||
with ThreadPoolExecutor(4) as executor:
|
||||
list(executor.map(self.run_decode_with_image, image_ids))
|
||||
|
||||
def verify_single_image_response(self, response):
|
||||
assert response.choices[0].message.role == "assistant"
|
||||
text = response.choices[0].message.content
|
||||
assert isinstance(text, str)
|
||||
|
||||
# `driver` is for gemma-3-it
|
||||
assert (
|
||||
"man" in text or "person" or "driver" in text
|
||||
), f"text: {text}, should contain man, person or driver"
|
||||
assert (
|
||||
"cab" in text
|
||||
or "taxi" in text
|
||||
or "SUV" in text
|
||||
or "vehicle" in text
|
||||
or "car" in text
|
||||
), f"text: {text}, should contain cab, taxi, SUV, vehicle or car"
|
||||
# MiniCPMO fails to recognize `iron`, but `hanging`
|
||||
assert (
|
||||
"iron" in text or "hang" in text or "cloth" in text or "holding" in text
|
||||
), f"text: {text}, should contain iron, hang, cloth or holding"
|
||||
assert response.id
|
||||
assert response.created
|
||||
assert response.usage.prompt_tokens > 0
|
||||
assert response.usage.completion_tokens > 0
|
||||
assert response.usage.total_tokens > 0
|
||||
|
||||
def test_single_image_chat_completion(self):
|
||||
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
|
||||
|
||||
@@ -163,34 +239,11 @@ class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
|
||||
**(self.get_vision_request_kwargs()),
|
||||
)
|
||||
|
||||
assert response.choices[0].message.role == "assistant"
|
||||
text = response.choices[0].message.content
|
||||
assert isinstance(text, str)
|
||||
# `driver` is for gemma-3-it
|
||||
assert (
|
||||
"man" in text or "person" or "driver" in text
|
||||
), f"text: {text}, should contain man, person or driver"
|
||||
assert (
|
||||
"cab" in text
|
||||
or "taxi" in text
|
||||
or "SUV" in text
|
||||
or "vehicle" in text
|
||||
or "car" in text
|
||||
), f"text: {text}, should contain cab, taxi, SUV, vehicle or car"
|
||||
# MiniCPMO fails to recognize `iron`, but `hanging`
|
||||
assert (
|
||||
"iron" in text
|
||||
or "hang" in text
|
||||
or "cloth" in text
|
||||
or "coat" in text
|
||||
or "holding" in text
|
||||
or "outfit" in text
|
||||
), f"text: {text}, should contain iron, hang, cloth, coat or holding or outfit"
|
||||
assert response.id
|
||||
assert response.created
|
||||
assert response.usage.prompt_tokens > 0
|
||||
assert response.usage.completion_tokens > 0
|
||||
assert response.usage.total_tokens > 0
|
||||
print("-" * 30)
|
||||
print(f"Single image response:\n{response.choices[0].message.content}")
|
||||
print("-" * 30)
|
||||
|
||||
self.verify_single_image_response(response)
|
||||
|
||||
def test_multi_turn_chat_completion(self):
|
||||
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
|
||||
@@ -264,8 +317,7 @@ class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "I have two very different images. They are not related at all. "
|
||||
"Please describe the first image in one sentence, and then describe the second image in another sentence.",
|
||||
"text": "I have two very different images. Please describe them.",
|
||||
},
|
||||
],
|
||||
},
|
||||
@@ -296,64 +348,6 @@ class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
|
||||
assert response.usage.completion_tokens > 0
|
||||
assert response.usage.total_tokens > 0
|
||||
|
||||
def _test_mixed_image_audio_chat_completion(self):
|
||||
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="default",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": IMAGE_MAN_IRONING_URL},
|
||||
},
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
temperature=0,
|
||||
**(self.get_vision_request_kwargs()),
|
||||
)
|
||||
|
||||
assert response.choices[0].message.role == "assistant"
|
||||
text = response.choices[0].message.content
|
||||
assert isinstance(text, str)
|
||||
print("-" * 30)
|
||||
print(f"Mixed image & audio response:\n{text}")
|
||||
print("-" * 30)
|
||||
assert (
|
||||
"man" in text
|
||||
or "cab" in text
|
||||
or "SUV" in text
|
||||
or "taxi" in text
|
||||
or "car" in text
|
||||
), f"text: {text}, should contain man, cab, SUV, taxi or car"
|
||||
check_list = [
|
||||
"thank you",
|
||||
"it's a privilege to be here",
|
||||
"leader",
|
||||
"science",
|
||||
"art",
|
||||
]
|
||||
for check_word in check_list:
|
||||
assert (
|
||||
check_word in text
|
||||
), f"text: |{text}| should contain |{check_word}|"
|
||||
assert response.id
|
||||
assert response.created
|
||||
assert response.usage.prompt_tokens > 0
|
||||
assert response.usage.completion_tokens > 0
|
||||
assert response.usage.total_tokens > 0
|
||||
|
||||
def prepare_video_images_messages(self, video_path):
|
||||
# the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
|
||||
# the size of the video embeds differs from the `modality` argument when preprocessed
|
||||
@@ -461,7 +455,7 @@ class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
|
||||
self.assertGreater(len(video_response), 0)
|
||||
|
||||
|
||||
class VideoOpenAITestMixin(TestOpenAIOmniServerBase):
|
||||
class VideoOpenAITestMixin(TestOpenAIMLLMServerBase):
|
||||
def prepare_video_messages(self, video_path):
|
||||
messages = [
|
||||
{
|
||||
@@ -526,3 +520,45 @@ class VideoOpenAITestMixin(TestOpenAIOmniServerBase):
|
||||
), f"video_response: {video_response}, should contain 'black' or 'dark'"
|
||||
self.assertIsNotNone(video_response)
|
||||
self.assertGreater(len(video_response), 0)
|
||||
|
||||
|
||||
class OmniOpenAITestMixin(
|
||||
ImageOpenAITestMixin, VideoOpenAITestMixin, AudioOpenAITestMixin
|
||||
):
|
||||
def test_mixed_modality_chat_completion(self):
|
||||
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": IMAGE_MAN_IRONING_URL},
|
||||
},
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "I have an image and audio, which are not related at all. Please: 1. Describe the image in a sentence, 2. Repeat the exact words from the audio I provided. Be exact",
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
response = client.chat.completions.create(
|
||||
model="default",
|
||||
messages=messages,
|
||||
temperature=0,
|
||||
max_tokens=128,
|
||||
stream=False,
|
||||
)
|
||||
|
||||
text = response.choices[0].message.content
|
||||
|
||||
print("-" * 30)
|
||||
print(f"Mixed modality response:\n{text}")
|
||||
print("-" * 30)
|
||||
|
||||
self.verify_single_image_response(response=response)
|
||||
self.verify_speech_recognition_response(text=text)
|
||||
|
||||
Reference in New Issue
Block a user