Bug fix: use correct mm_items in embed_mm_inputs (#8893)
This commit is contained in:
@@ -560,7 +560,7 @@ def embed_mm_inputs(
|
||||
]
|
||||
items_size[i + 1] = len(mm_items)
|
||||
items_offsets.append(
|
||||
flatten_nested_list([item.offsets for item in mm_inputs.mm_items])
|
||||
flatten_nested_list([item.offsets for item in mm_items])
|
||||
)
|
||||
items_size = torch.cumsum(items_size, dim=0).tolist()
|
||||
|
||||
|
||||
@@ -189,6 +189,9 @@ class TestGemma3nServer(TestOpenAIVisionServer):
|
||||
# This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
|
||||
# self._test_audio_ambient_completion()
|
||||
|
||||
def test_mixed_image_audio_chat_completion(self):
    """Pytest entry point for the mixed image + audio chat-completion check.

    Delegates to the shared ``_test_mixed_image_audio_chat_completion``
    helper defined on the base vision-server test class.

    NOTE(review): as written in SOURCE the wrapper was named
    ``_test_mixed_image_audio_chat_completion`` and called
    ``self._test_mixed_image_audio_chat_completion()`` — i.e. it called
    itself (infinite recursion) and, with the leading underscore, pytest
    would never collect it. Renaming to the public ``test_`` form matches
    the file's wrapper convention and makes the delegation terminate.
    """
    self._test_mixed_image_audio_chat_completion()
|
||||
|
||||
|
||||
class TestQwen2AudioServer(TestOpenAIVisionServer):
|
||||
@classmethod
|
||||
|
||||
@@ -213,6 +213,64 @@ class TestOpenAIVisionServer(CustomTestCase):
|
||||
assert response.usage.completion_tokens > 0
|
||||
assert response.usage.total_tokens > 0
|
||||
|
||||
def _test_mixed_image_audio_chat_completion(self):
    """Exercise a single chat completion that mixes modalities.

    Sends one user message containing an image URL, an audio URL, and a
    text instruction, then checks that the reply both describes the image
    (mentions one of the expected scene words) and transcribes the audio
    (contains every expected phrase). Also sanity-checks the response
    metadata and token-usage accounting.
    """
    client = openai.Client(api_key=self.api_key, base_url=self.base_url)

    # One message carrying all three content parts: image, audio, prompt.
    user_content = [
        {
            "type": "image_url",
            "image_url": {"url": IMAGE_MAN_IRONING_URL},
        },
        {
            "type": "audio_url",
            "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
        },
        {
            "type": "text",
            "text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
        },
    ]

    response = client.chat.completions.create(
        model="default",
        messages=[{"role": "user", "content": user_content}],
        temperature=0,
        **(self.get_vision_request_kwargs()),
    )

    choice = response.choices[0]
    assert choice.message.role == "assistant"
    text = choice.message.content
    assert isinstance(text, str)

    separator = "-" * 30
    print(separator)
    print(f"Mixed image & audio response:\n{text}")
    print(separator)

    # Image grounding: at least one scene word must appear.
    scene_words = ("man", "cab", "SUV", "taxi", "car")
    assert any(
        word in text for word in scene_words
    ), f"text: {text}, should contain man, cab, SUV, taxi or car"

    # Audio grounding: every expected transcription phrase must appear.
    expected_phrases = [
        "thank you",
        "it's a privilege to be here",
        "leader",
        "science",
        "art",
    ]
    for check_word in expected_phrases:
        assert (
            check_word in text
        ), f"text: |{text}| should contain |{check_word}|"

    # Response metadata and usage accounting must be populated.
    assert response.id
    assert response.created
    assert response.usage.prompt_tokens > 0
    assert response.usage.completion_tokens > 0
    assert response.usage.total_tokens > 0
|
||||
|
||||
def prepare_video_images_messages(self, video_path):
|
||||
# the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
|
||||
# the size of the video embeds differs from the `modality` argument when preprocessed
|
||||
|
||||
Reference in New Issue
Block a user