diff --git a/python/sglang/srt/managers/mm_utils.py b/python/sglang/srt/managers/mm_utils.py
index ceef4c332..7d4ae186a 100644
--- a/python/sglang/srt/managers/mm_utils.py
+++ b/python/sglang/srt/managers/mm_utils.py
@@ -560,7 +560,7 @@ def embed_mm_inputs(
                 ]
                 items_size[i + 1] = len(mm_items)
                 items_offsets.append(
-                    flatten_nested_list([item.offsets for item in mm_inputs.mm_items])
+                    flatten_nested_list([item.offsets for item in mm_items])
                 )
 
             items_size = torch.cumsum(items_size, dim=0).tolist()
diff --git a/test/srt/test_vision_openai_server_b.py b/test/srt/test_vision_openai_server_b.py
index 95941149d..c420c0ad8 100644
--- a/test/srt/test_vision_openai_server_b.py
+++ b/test/srt/test_vision_openai_server_b.py
@@ -189,6 +189,9 @@ class TestGemma3nServer(TestOpenAIVisionServer):
         # This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
         # self._test_audio_ambient_completion()
 
+    def test_mixed_image_audio_chat_completion(self):
+        self._test_mixed_image_audio_chat_completion()
+
 
 class TestQwen2AudioServer(TestOpenAIVisionServer):
     @classmethod
diff --git a/test/srt/test_vision_openai_server_common.py b/test/srt/test_vision_openai_server_common.py
index e43ba5cfc..6a46a0610 100644
--- a/test/srt/test_vision_openai_server_common.py
+++ b/test/srt/test_vision_openai_server_common.py
@@ -213,6 +213,65 @@ class TestOpenAIVisionServer(CustomTestCase):
         assert response.usage.completion_tokens > 0
         assert response.usage.total_tokens > 0
 
+    def _test_mixed_image_audio_chat_completion(self):
+        """Check one chat request that mixes image, audio and text inputs."""
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                        },
+                        {
+                            "type": "audio_url",
+                            "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
+                        },
+                        {
+                            "type": "text",
+                            "text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
+                        },
+                    ],
+                },
+            ],
+            temperature=0,
+            **(self.get_vision_request_kwargs()),
+        )
+
+        assert response.choices[0].message.role == "assistant"
+        text = response.choices[0].message.content
+        assert isinstance(text, str)
+        print("-" * 30)
+        print(f"Mixed image & audio response:\n{text}")
+        print("-" * 30)
+        assert (
+            "man" in text
+            or "cab" in text
+            or "SUV" in text
+            or "taxi" in text
+            or "car" in text
+        ), f"text: {text}, should contain man, cab, SUV, taxi or car"
+        check_list = [
+            "thank you",
+            "it's a privilege to be here",
+            "leader",
+            "science",
+            "art",
+        ]
+        for check_word in check_list:
+            assert (
+                check_word in text
+            ), f"text: |{text}| should contain |{check_word}|"
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
     def prepare_video_images_messages(self, video_path):
         # the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
         # the size of the video embeds differs from the `modality` argument when preprocessed