Fix MTP error when enabling two-batch overlap (#7569)
This commit is contained in:
@@ -168,7 +168,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
         capture_bs += [model_runner.req_to_token_pool.size]
 
     if server_args.enable_two_batch_overlap:
-        capture_bs = [bs for bs in capture_bs if bs >= 2]
+        capture_bs = [bs for bs in capture_bs if bs % 2 == 0]
 
     if server_args.cuda_graph_max_bs:
         capture_bs = [bs for bs in capture_bs if bs <= server_args.cuda_graph_max_bs]
Reference in New Issue
Block a user