fix: InternS1 doesn't recognize images; update image token for InternVL processor (#9381)

Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
This commit is contained in:
Xinyuan Tong
2025-08-21 07:43:03 +08:00
committed by GitHub
parent e99729c9f3
commit 84719b527a
2 changed files with 9 additions and 17 deletions

View File

@@ -44,7 +44,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
self.img_start_token_id = tokenizer.convert_tokens_to_ids(self.IMG_START_TOKEN)
self.img_end_token_id = tokenizer.convert_tokens_to_ids(self.IMG_END_TOKEN)
self.mm_tokens = MultimodalSpecialTokens(
image_token="<image>",
image_token="<IMG_CONTEXT>",
image_token_id=tokenizer.convert_tokens_to_ids(self.IMG_CONTEXT_TOKEN),
).build(_image_processor)
@@ -218,13 +218,18 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
pixel_values = torch.cat(pixel_values, dim=0)
original_placeholder = "<<<__IMG_CONTEXT_PLACEHOLDER__>>>"
input_text = input_text.replace(self.IMG_CONTEXT_TOKEN, original_placeholder)
for idx, num_patches in enumerate(num_patches_list):
image_tokens = (
self.IMG_START_TOKEN
+ self.IMG_CONTEXT_TOKEN * self.num_image_token * num_patches
+ self.IMG_END_TOKEN
)
input_text = input_text.replace("<image>", image_tokens, 1)
input_text = input_text.replace(original_placeholder, image_tokens, 1)
input_text = input_text.replace(original_placeholder, self.IMG_CONTEXT_TOKEN)
input_ids = self.tokenizer(input_text, return_tensors="pt")[
"input_ids"