Support precomputed_embeddings for Llama 4 (#8156)
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: Xiang (Kevin) Li <lik@nvidia.com>
Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
This commit is contained in:
@@ -216,5 +216,43 @@ class TestKimiVLImageUnderstandsImage(
|
||||
)
|
||||
|
||||
|
||||
# Not run in CI: the model is too large.
|
||||
# class TestLlama4ImageUnderstandsImage(
|
||||
# VLMInputTestBase, unittest.IsolatedAsyncioTestCase
|
||||
# ):
|
||||
# model_path = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
# chat_template = "llama_4_vision"
|
||||
|
||||
# def setUp(self):
|
||||
# self.engine = Engine(
|
||||
# model_path=self.model_path,
|
||||
# trust_remote_code=True,
|
||||
# chat_template=self.chat_template,
|
||||
# enable_multimodal=True,
|
||||
# mem_fraction_static=0.8,
|
||||
# tp_size=4,
|
||||
# attention_backend="fa3",
|
||||
# context_length=65536,
|
||||
# )
|
||||
|
||||
# @classmethod
|
||||
# def _init_visual(cls):
|
||||
# model = AutoModel.from_pretrained(cls.model_path, trust_remote_code=True, torch_dtype="auto")
|
||||
# cls.vision_tower = model.vision_model.eval().to(cls.device)
|
||||
# cls.mm_projector = model.multi_modal_projector.eval().to(cls.device)
|
||||
|
||||
# cls.visual = lambda tokenizer_output: cls.mm_projector(
|
||||
# cls.vision_tower(
|
||||
# pixel_values=tokenizer_output["pixel_values"],
|
||||
# ).last_hidden_state.flatten(0, -2)
|
||||
# )
|
||||
|
||||
# def _pixel_values_image_data(self, processor_output):
|
||||
# return dict(
|
||||
# modality="IMAGE",
|
||||
# pixel_values=processor_output["pixel_values"],
|
||||
# )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user