Bug fix: use correct mm_items in embed_mm_inputs (#8893)

2025-08-16 19:55:56 -07:00
parent 1c1f8a118e
commit 66d6be0874
3 changed files with 62 additions and 1 deletions
--- a/python/sglang/srt/managers/mm_utils.py
+++ b/python/sglang/srt/managers/mm_utils.py
@@ -560,7 +560,7 @@ def embed_mm_inputs(
                ]
                items_size[i + 1] = len(mm_items)
                items_offsets.append(
-                    flatten_nested_list([item.offsets for item in mm_inputs.mm_items])
+                    flatten_nested_list([item.offsets for item in mm_items])
                )
            items_size = torch.cumsum(items_size, dim=0).tolist()
--- a/test/srt/test_vision_openai_server_b.py
+++ b/test/srt/test_vision_openai_server_b.py
@@ -189,6 +189,9 @@ class TestGemma3nServer(TestOpenAIVisionServer):
        # This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
        # self._test_audio_ambient_completion()
    def test_mixed_image_audio_chat_completion(self):
        """Run the shared mixed image + audio chat-completion check for this server."""
        # The method name must start with ``test`` (no leading underscore):
        #   * the unittest loader only collects methods whose names start with
        #     ``test``;
        #   * using the same ``_test_*`` name as the helper would override the
        #     inherited helper, and the call below would then recurse into
        #     this method until RecursionError.
        # With this name, the call resolves to the helper inherited from
        # TestOpenAIVisionServer.
        self._test_mixed_image_audio_chat_completion()
 class TestQwen2AudioServer(TestOpenAIVisionServer):
    @classmethod
--- a/test/srt/test_vision_openai_server_common.py
+++ b/test/srt/test_vision_openai_server_common.py
@@ -213,6 +213,64 @@ class TestOpenAIVisionServer(CustomTestCase):
        assert response.usage.completion_tokens > 0
        assert response.usage.total_tokens > 0
    def _test_mixed_image_audio_chat_completion(self):
        """Check a chat completion whose single user turn mixes image, audio and text.

        The user message carries an ``image_url`` part, an ``audio_url`` part
        and a text instruction asking for a one-sentence image description
        followed by an English transcription of the audio. The reply must:

        * come from the ``assistant`` role and be a ``str``;
        * contain at least one of the image keywords;
        * contain every expected transcription phrase;
        * have a truthy ``id`` and ``created`` and strictly positive token
          usage counters.

        All keyword and phrase checks are case-sensitive substring matches.
        """
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)

        # One user turn: image first, then audio, then the instruction text.
        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": IMAGE_MAN_IRONING_URL},
                },
                {
                    "type": "audio_url",
                    "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
                },
                {
                    "type": "text",
                    "text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
                },
            ],
        }
        response = client.chat.completions.create(
            model="default",
            messages=[user_message],
            temperature=0,
            **(self.get_vision_request_kwargs()),
        )

        assert response.choices[0].message.role == "assistant"
        reply_text = response.choices[0].message.content
        assert isinstance(reply_text, str)

        separator = "-" * 30
        print(separator)
        print(f"Mixed image & audio response:\n{reply_text}")
        print(separator)

        # Image part: any single keyword is enough.
        image_keywords = ("man", "cab", "SUV", "taxi", "car")
        assert any(
            keyword in reply_text for keyword in image_keywords
        ), f"text: {reply_text}, should contain man, cab, SUV, taxi or car"

        # Audio part: every phrase has to appear; stop at the first one missing.
        expected_phrases = (
            "thank you",
            "it's a privilege to be here",
            "leader",
            "science",
            "art",
        )
        for phrase in expected_phrases:
            assert (
                phrase in reply_text
            ), f"text: ｜{reply_text}｜ should contain ｜{phrase}｜"

        # Response metadata and token accounting.
        assert response.id
        assert response.created
        usage = response.usage
        assert usage.prompt_tokens > 0
        assert usage.completion_tokens > 0
        assert usage.total_tokens > 0
    def prepare_video_images_messages(self, video_path):
        # the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
        # the size of the video embeds differs from the `modality` argument when preprocessed