Improve streaming control and reduce first-token latency in streaming (#117)

2024-01-29 17:05:42 -08:00
parent cd6872334e
commit 6f560c761b
12 changed files with 46 additions and 23 deletions
--- a/python/sglang/srt/models/llava.py
+++ b/python/sglang/srt/models/llava.py
@@ -158,7 +158,7 @@ class LlavaLlamaForCausalLM(nn.Module):
                                    num_patch_height, num_patch_width, height, width, -1
                                )
                            else:
-                                raise NotImplementedError
+                                raise NotImplementedError()
                            if "unpad" in self.mm_patch_merge_type:
                                image_feature = image_feature.permute(
                                    4, 0, 2, 1, 3