Bug fix: use correct mm_items in embed_mm_inputs (#8893)
This commit is contained in:
@@ -560,7 +560,7 @@ def embed_mm_inputs(
|
||||
]
|
||||
items_size[i + 1] = len(mm_items)
|
||||
items_offsets.append(
|
||||
flatten_nested_list([item.offsets for item in mm_inputs.mm_items])
|
||||
flatten_nested_list([item.offsets for item in mm_items])
|
||||
)
|
||||
items_size = torch.cumsum(items_size, dim=0).tolist()
|
||||
|
||||
|
||||
@@ -189,6 +189,9 @@ class TestGemma3nServer(TestOpenAIVisionServer):
|
||||
# This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
|
||||
# self._test_audio_ambient_completion()
|
||||
|
||||
def test_mixed_image_audio_chat_completion(self):
    """Pytest entry point for the mixed image + audio chat-completion check.

    Delegates to the shared ``_test_mixed_image_audio_chat_completion``
    helper defined on the base vision-server test class.

    NOTE(review): as written in SOURCE the wrapper was named
    ``_test_mixed_image_audio_chat_completion`` and called
    ``self._test_mixed_image_audio_chat_completion()`` — i.e. it called
    itself (infinite recursion) and, with the leading underscore, pytest
    would never collect it. Renaming to the public ``test_`` form matches
    the file's wrapper convention and makes the delegation terminate.
    """
    self._test_mixed_image_audio_chat_completion()
|
||||
|
||||
|
||||
class TestQwen2AudioServer(TestOpenAIVisionServer):
|
||||
@classmethod
|
||||
|
||||
@@ -213,6 +213,64 @@ class TestOpenAIVisionServer(CustomTestCase):
|
||||
assert response.usage.completion_tokens > 0
|
||||
assert response.usage.total_tokens > 0
|
||||
|
||||
def _test_mixed_image_audio_chat_completion(self):
    """Exercise a single chat completion that mixes modalities.

    Sends one user message containing an image URL, an audio URL, and a
    text instruction, then checks that the reply both describes the image
    (mentions one of the expected scene words) and transcribes the audio
    (contains every expected phrase). Also sanity-checks the response
    metadata and token-usage accounting.
    """
    client = openai.Client(api_key=self.api_key, base_url=self.base_url)

    # One message carrying all three content parts: image, audio, prompt.
    user_content = [
        {
            "type": "image_url",
            "image_url": {"url": IMAGE_MAN_IRONING_URL},
        },
        {
            "type": "audio_url",
            "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
        },
        {
            "type": "text",
            "text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
        },
    ]

    response = client.chat.completions.create(
        model="default",
        messages=[{"role": "user", "content": user_content}],
        temperature=0,
        **(self.get_vision_request_kwargs()),
    )

    choice = response.choices[0]
    assert choice.message.role == "assistant"
    text = choice.message.content
    assert isinstance(text, str)

    separator = "-" * 30
    print(separator)
    print(f"Mixed image & audio response:\n{text}")
    print(separator)

    # Image grounding: at least one scene word must appear.
    scene_words = ("man", "cab", "SUV", "taxi", "car")
    assert any(
        word in text for word in scene_words
    ), f"text: {text}, should contain man, cab, SUV, taxi or car"

    # Audio grounding: every expected transcription phrase must appear.
    expected_phrases = [
        "thank you",
        "it's a privilege to be here",
        "leader",
        "science",
        "art",
    ]
    for check_word in expected_phrases:
        assert (
            check_word in text
        ), f"text: |{text}| should contain |{check_word}|"

    # Response metadata and usage accounting must be populated.
    assert response.id
    assert response.created
    assert response.usage.prompt_tokens > 0
    assert response.usage.completion_tokens > 0
    assert response.usage.total_tokens > 0
|
||||
|
||||
def prepare_video_images_messages(self, video_path):
|
||||
# the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
|
||||
# the size of the video embeds differs from the `modality` argument when preprocessed
|
||||
|
||||
Reference in New Issue
Block a user