Fix hash collision for multi modal models (#2256)
This commit is contained in:
@@ -147,6 +147,11 @@ class LlavaBaseForCausalLM(nn.Module):
|
||||
else:
|
||||
max_image_offset.append(-1)
|
||||
|
||||
# Clamp input ids. This is because the input_ids for the image tokens are
|
||||
# filled with the hash values of the image for the prefix matching in the radix attention.
|
||||
# Their values are useless because their embeddings will be replaced by vision embeddings anyway.
|
||||
input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
|
||||
|
||||
# Embed text inputs
|
||||
input_embeds = self.language_model.model.embed_tokens(input_ids)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user