Bug fix: use correct mm_items in embed_mm_inputs (#8893)
This commit is contained in:
@@ -560,7 +560,7 @@ def embed_mm_inputs(
|
|||||||
]
|
]
|
||||||
items_size[i + 1] = len(mm_items)
|
items_size[i + 1] = len(mm_items)
|
||||||
items_offsets.append(
|
items_offsets.append(
|
||||||
flatten_nested_list([item.offsets for item in mm_inputs.mm_items])
|
flatten_nested_list([item.offsets for item in mm_items])
|
||||||
)
|
)
|
||||||
items_size = torch.cumsum(items_size, dim=0).tolist()
|
items_size = torch.cumsum(items_size, dim=0).tolist()
|
||||||
|
|
||||||
|
|||||||
@@ -189,6 +189,9 @@ class TestGemma3nServer(TestOpenAIVisionServer):
|
|||||||
# This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
|
# This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
|
||||||
# self._test_audio_ambient_completion()
|
# self._test_audio_ambient_completion()
|
||||||
|
|
||||||
|
def test_mixed_image_audio_chat_completion(self):
    """Run the shared mixed image + audio chat-completion check for this server.

    The public ``test_`` name matters: with the same ``_test_`` name as the
    helper it calls, this method would shadow the inherited helper and call
    itself recursively, and it would also not match the ``test*`` naming that
    unittest-style runners use for collection.
    """
    # Resolves to the helper inherited from the base test class, since this
    # class no longer overrides it.
    self._test_mixed_image_audio_chat_completion()
|
||||||
|
|
||||||
|
|
||||||
class TestQwen2AudioServer(TestOpenAIVisionServer):
|
class TestQwen2AudioServer(TestOpenAIVisionServer):
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
@@ -213,6 +213,64 @@ class TestOpenAIVisionServer(CustomTestCase):
|
|||||||
assert response.usage.completion_tokens > 0
|
assert response.usage.completion_tokens > 0
|
||||||
assert response.usage.total_tokens > 0
|
assert response.usage.total_tokens > 0
|
||||||
|
|
||||||
|
def _test_mixed_image_audio_chat_completion(self):
    """Check a chat completion whose single user turn mixes image, audio and text.

    Sends one request with an image URL, an audio URL and a text instruction,
    then asserts that the reply comes from the assistant, is a string that
    mentions one of the expected image keywords and every expected
    transcription phrase, and that the response carries an id, a creation
    timestamp and positive token-usage counters.
    """
    client = openai.Client(api_key=self.api_key, base_url=self.base_url)

    # Build the three content parts separately so the request reads top-down.
    image_part = {
        "type": "image_url",
        "image_url": {"url": IMAGE_MAN_IRONING_URL},
    }
    audio_part = {
        "type": "audio_url",
        "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
    }
    instruction_part = {
        "type": "text",
        "text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
    }
    user_turn = {
        "role": "user",
        "content": [image_part, audio_part, instruction_part],
    }

    response = client.chat.completions.create(
        model="default",
        messages=[user_turn],
        temperature=0,
        **(self.get_vision_request_kwargs()),
    )

    assert response.choices[0].message.role == "assistant"
    text = response.choices[0].message.content
    assert isinstance(text, str)

    separator = "-" * 30
    print(separator)
    print(f"Mixed image & audio response:\n{text}")
    print(separator)

    # Image part: any one of these keywords is enough (case-sensitive).
    image_keywords = ("man", "cab", "SUV", "taxi", "car")
    assert any(
        keyword in text for keyword in image_keywords
    ), f"text: {text}, should contain man, cab, SUV, taxi or car"

    # Audio part: every phrase must appear in the reply (case-sensitive).
    expected_phrases = (
        "thank you",
        "it's a privilege to be here",
        "leader",
        "science",
        "art",
    )
    for check_word in expected_phrases:
        assert (
            check_word in text
        ), f"text: |{text}| should contain |{check_word}|"

    # Response metadata and token accounting must be populated.
    assert response.id
    assert response.created
    token_usage = response.usage
    assert token_usage.prompt_tokens > 0
    assert token_usage.completion_tokens > 0
    assert token_usage.total_tokens > 0
|
||||||
|
|
||||||
def prepare_video_images_messages(self, video_path):
|
def prepare_video_images_messages(self, video_path):
|
||||||
# the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
|
# the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
|
||||||
# the size of the video embeds differs from the `modality` argument when preprocessed
|
# the size of the video embeds differs from the `modality` argument when preprocessed
|
||||||
|
|||||||
Reference in New Issue
Block a user