Fix hash collision for multi modal models (#2256)

This commit is contained in:
Lianmin Zheng
2024-11-29 03:15:58 -08:00
committed by GitHub
parent fe97a2d40f
commit f50a6cf443
6 changed files with 42 additions and 39 deletions

View File

@@ -147,6 +147,11 @@ class LlavaBaseForCausalLM(nn.Module):
else:
max_image_offset.append(-1)
# Clamp input ids. This is because the input_ids for the image tokens are
# filled with the hash values of the image for the prefix matching in the radix attention.
# These values are useless because their embeddings will be replaced by vision embeddings anyway.
input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
# Embed text inputs
input_embeds = self.language_model.model.embed_tokens(input_ids)