[Refactor] simplify multimodal data processing (#8107)

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
This commit is contained in:
Xinyuan Tong
2025-07-20 21:43:09 -07:00
committed by GitHub
parent c9e8613c97
commit 8430bfe3e9
30 changed files with 297 additions and 421 deletions

View File

@@ -308,19 +308,35 @@ class TestOpenAIVisionServer(CustomTestCase):
"iPod" in video_response
or "device" in video_response
or "microphone" in video_response
), video_response
), f"""
====================== video_response =====================
{video_response}
===========================================================
should contain 'iPod' or 'device' or 'microphone'
"""
assert (
"man" in video_response
or "person" in video_response
or "individual" in video_response
or "speaker" in video_response
), video_response
or "Steve" in video_response
), f"""
====================== video_response =====================
{video_response}
===========================================================
should contain 'man' or 'person' or 'individual' or 'speaker'
"""
assert (
"present" in video_response
or "examine" in video_response
or "display" in video_response
or "hold" in video_response
)
), f"""
====================== video_response =====================
{video_response}
===========================================================
should contain 'present' or 'examine' or 'display' or 'hold'
"""
assert "black" in video_response or "dark" in video_response
self.assertIsNotNone(video_response)
self.assertGreater(len(video_response), 0)