Support Phi-4 Multi-Modal (text + vision only) (#6494)

This commit is contained in:
Lifu Huang
2025-05-24 21:43:38 -07:00
committed by GitHub
parent 681e7af32b
commit 022012aae8
8 changed files with 650 additions and 6 deletions

View File

@@ -196,5 +196,31 @@ class TestKimiVLServer(TestOpenAIVisionServer):
pass
class TestPhi4MMServer(TestOpenAIVisionServer):
    """Vision-server test suite for microsoft/Phi-4-multimodal-instruct.

    Inherits the shared image chat-completion tests from
    TestOpenAIVisionServer; video and multi-image cases are disabled
    below because the model integration does not support them yet.
    """

    @classmethod
    def setUpClass(cls):
        cls.model = "microsoft/Phi-4-multimodal-instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        # Build the launch flags up front; the reduced static memory
        # fraction leaves headroom for the multimodal components.
        launch_flags = [
            "--trust-remote-code",
            "--mem-fraction-static",
            "0.75",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )
        cls.base_url += "/v1"

    def test_video_chat_completion(self):
        # Video input is not supported for this model; skip the inherited test.
        pass

    def test_multi_images_chat_completion(self):
        # TODO (lifuhuang): support LoRA to enable Phi4MM multi-image understanding capability.
        pass
# Run the suite only when this file is executed directly, not on import.
if __name__ == "__main__":
    unittest.main()