Support precomputed_embeddings for Llama 4 (#8156)

Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: Xiang (Kevin) Li <lik@nvidia.com>
Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
This commit is contained in:
Kevin Xiang Li
2025-07-27 01:14:49 -07:00
committed by GitHub
parent 5c9c275bc8
commit 44d600cd67
6 changed files with 449 additions and 123 deletions

View File

@@ -216,5 +216,43 @@ class TestKimiVLImageUnderstandsImage(
)
# not for CI: too large
# class TestLlama4ImageUnderstandsImage(
# VLMInputTestBase, unittest.IsolatedAsyncioTestCase
# ):
# model_path = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
# chat_template = "llama_4_vision"
# def setUp(self):
# self.engine = Engine(
# model_path=self.model_path,
# trust_remote_code=True,
# chat_template=self.chat_template,
# enable_multimodal=True,
# mem_fraction_static=0.8,
# tp_size=4,
# attention_backend="fa3",
# context_length=65536,
# )
# @classmethod
# def _init_visual(cls):
# model = AutoModel.from_pretrained(cls.model_path, trust_remote_code=True, torch_dtype="auto")
# cls.vision_tower = model.vision_model.eval().to(cls.device)
# cls.mm_projector = model.multi_modal_projector.eval().to(cls.device)
# cls.visual = lambda tokenizer_output: cls.mm_projector(
# cls.vision_tower(
# pixel_values=tokenizer_output["pixel_values"],
# ).last_hidden_state.flatten(0, -2)
# )
# def _pixel_values_image_data(self, processor_output):
# return dict(
# modality="IMAGE",
# pixel_values=processor_output["pixel_values"],
# )
# Run the full test suite (including the commented-out Llama 4 case above if
# re-enabled) when this module is executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()