[Refactor] Multimodal data processing for VLM (#6659)

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
This commit is contained in:
Xinyuan Tong
2025-06-04 11:22:33 -07:00
committed by GitHub
parent bd75690f4e
commit cf9815ba69
11 changed files with 248 additions and 167 deletions

View File

@@ -156,7 +156,7 @@ class TestQwenVLUnderstandsImage(VLMInputTestBase, unittest.IsolatedAsyncioTestC
def _pixel_values_image_data(self, processor_output):
return dict(
modality="IMAGE",
-image_grid_thws=processor_output["image_grid_thw"],
+image_grid_thw=processor_output["image_grid_thw"],
pixel_values=processor_output["pixel_values"],
)
@@ -207,8 +207,8 @@ class TestKimiVLImageUnderstandsImage(
def _pixel_values_image_data(self, processor_output):
return dict(
modality="IMAGE",
-image_grid_thws=processor_output["image_grid_hws"],
pixel_values=processor_output["pixel_values"],
+image_grid_hws=processor_output["image_grid_hws"],
)