Improve the control of streaming and improve the first token latency in streaming (#117)

This commit is contained in:
Lianmin Zheng
2024-01-29 17:05:42 -08:00
committed by GitHub
parent cd6872334e
commit 6f560c761b
12 changed files with 46 additions and 23 deletions

View File

@@ -28,7 +28,7 @@ def test_generate_worker(model_path, tp_rank, tp_size):
reqs = []
for i in range(len(prompts)):
-req = Req(i)
+req = Req(i, None, None)
req.input_ids = tokenizer.encode(prompts[i])[:cut_num]
req.sampling_params = sampling_params
reqs.append(req)

View File

@@ -112,6 +112,7 @@ def test_generate_worker(
prefill_params = (
torch.tensor(np.array(input_ids)).cuda(),
np.array(pixel_values),
+[None],
[offset],
*params,
)