Fix hash collision for multi modal models (#2256)
This commit is contained in:
@@ -147,6 +147,11 @@ class LlavaBaseForCausalLM(nn.Module):
|
||||
else:
|
||||
max_image_offset.append(-1)
|
||||
|
||||
# Clamp input ids. This is because the input_ids for the image tokens are
|
||||
# filled with the hash values of the image for the prefix matching in the radix attention.
|
||||
# Their values are useless because their embeddings will be replaced by vision embeddings anyway.
|
||||
input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
|
||||
|
||||
# Embed text inputs
|
||||
input_embeds = self.language_model.model.embed_tokens(input_ids)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user