Improve the control of streaming and improve the first token latency in streaming (#117)

2024-01-29 17:05:42 -08:00
parent cd6872334e
commit 6f560c761b
12 changed files with 46 additions and 23 deletions
--- a/test/srt/model/test_llava_low_api.py
+++ b/test/srt/model/test_llava_low_api.py
@@ -112,6 +112,7 @@ def test_generate_worker(
    prefill_params = (
        torch.tensor(np.array(input_ids)).cuda(),
        np.array(pixel_values),
+        [None],
        [offset],
        *params,
    )