Improve streaming control and first-token latency in streaming (#117)
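The test changes below accompany a change that trims first-token latency by handing tokens to the caller as soon as they are produced. As a rough illustration of that idea (a minimal sketch of the general technique with hypothetical helper names, not this repo's actual code), a generator can yield the first token right after the prefill pass instead of buffering a batch of decode steps:

    from typing import Iterator, List


    def fake_prefill(input_ids: List[int]) -> int:
        # Stand-in for the prefill forward pass over the whole prompt.
        return input_ids[-1] + 1


    def fake_decode(prev_token: int) -> int:
        # Stand-in for one incremental decode step.
        return prev_token + 1


    def generate_stream(input_ids: List[int], max_new_tokens: int) -> Iterator[int]:
        # Yield the first token the moment prefill finishes, then stream
        # every decode-step token as soon as it is computed, rather than
        # buffering tokens before handing them to the caller.
        token = fake_prefill(input_ids)
        yield token
        for _ in range(max_new_tokens - 1):
            token = fake_decode(token)
            yield token


    for tok in generate_stream([1, 2, 3], max_new_tokens=4):
        print(tok)  # 4, 5, 6, 7 -- each token available immediately

Because the caller consumes tokens one at a time, the client sees output after a single forward pass instead of after the full generation loop.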
@@ -28,7 +28,7 @@ def test_generate_worker(model_path, tp_rank, tp_size):
     reqs = []
     for i in range(len(prompts)):
-        req = Req(i)
+        req = Req(i, None, None)
         req.input_ids = tokenizer.encode(prompts[i])[:cut_num]
         req.sampling_params = sampling_params
         reqs.append(req)
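The changed call site suggests Req now takes the prompt ids and sampling params as positional arguments, which the test supplies as None placeholders and fills in afterwards. A minimal sketch of a class shape consistent with the new call (the real Req definition is not part of this diff; everything beyond what the test shows is an assumption):

    class Req:
        # Assumed shape: request id plus prompt ids and sampling params.
        def __init__(self, request_id, input_ids, sampling_params):
            self.request_id = request_id
            self.input_ids = input_ids
            self.sampling_params = sampling_params


    # Mirrors the updated test: construct with placeholders, then assign.
    req = Req(0, None, None)
    req.input_ids = [101, 7592, 102]  # hypothetical token ids
    req.sampling_params = {"temperature": 1.0}  # hypothetical params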
@@ -112,6 +112,7 @@ def test_generate_worker(
     prefill_params = (
         torch.tensor(np.array(input_ids)).cuda(),
         np.array(pixel_values),
+        [None],
         [offset],
         *params,
     )
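For context, a tuple like prefill_params is typically star-unpacked into the worker's prefill call, so adding the new list element keeps the test aligned with a widened function signature. A self-contained sketch under those assumptions (model_prefill, its parameters, and the sample values are all hypothetical; the receiving function is not shown in this diff):

    import numpy as np
    import torch


    def model_prefill(input_ids, pixel_values, extra_inputs, offsets, *flags):
        # Hypothetical prefill entry point; the diff only shows the
        # call-site tuple, not the function it feeds.
        print(input_ids.shape, pixel_values.shape, extra_inputs, offsets, flags)


    input_ids = [[1, 2, 3]]
    pixel_values = np.zeros((1, 3, 224, 224), dtype=np.float32)
    offset = 0
    params = (True,)  # hypothetical trailing flags

    device = "cuda" if torch.cuda.is_available() else "cpu"  # the diff calls .cuda() directly
    prefill_params = (
        torch.tensor(np.array(input_ids)).to(device),
        np.array(pixel_values),
        [None],     # the element this commit appears to add
        [offset],
        *params,
    )

    # Star-unpacking keeps the call site in sync with the tuple layout.
    model_prefill(*prefill_params)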