Support precomputed_embeddings for Llama 4 (#8156)

Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: Xiang (Kevin) Li <lik@nvidia.com>
Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
This commit is contained in:
Kevin Xiang Li
2025-07-27 01:14:49 -07:00
committed by GitHub
parent 5c9c275bc8
commit 44d600cd67
6 changed files with 449 additions and 123 deletions

View File

@@ -216,5 +216,43 @@ class TestKimiVLImageUnderstandsImage(
)
# not for CI: too large
# class TestLlama4ImageUnderstandsImage(
# VLMInputTestBase, unittest.IsolatedAsyncioTestCase
# ):
# model_path = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
# chat_template = "llama_4_vision"
# def setUp(self):
# self.engine = Engine(
# model_path=self.model_path,
# trust_remote_code=True,
# chat_template=self.chat_template,
# enable_multimodal=True,
# mem_fraction_static=0.8,
# tp_size=4,
# attention_backend="fa3",
# context_length=65536,
# )
# @classmethod
# def _init_visual(cls):
# model = AutoModel.from_pretrained(cls.model_path, trust_remote_code=True, torch_dtype="auto")
# cls.vision_tower = model.vision_model.eval().to(cls.device)
# cls.mm_projector = model.multi_modal_projector.eval().to(cls.device)
# cls.visual = lambda tokenizer_output: cls.mm_projector(
# cls.vision_tower(
# pixel_values=tokenizer_output["pixel_values"],
# ).last_hidden_state.flatten(0, -2)
# )
# def _pixel_values_image_data(self, processor_output):
# return dict(
# modality="IMAGE",
# pixel_values=processor_output["pixel_values"],
# )
# Run the full test suite (including the commented-out Llama 4 case above if
# re-enabled) when this module is executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()