[Feature] Hybrid EP and TP (#8590)
This commit is contained in:
@@ -138,6 +138,7 @@ class BenchArgs:
|
||||
def load_model(server_args, port_args, tp_rank):
|
||||
suppress_other_loggers()
|
||||
rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
|
||||
moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
|
||||
|
||||
model_config = ModelConfig.from_server_args(server_args)
|
||||
model_runner = ModelRunner(
|
||||
@@ -146,6 +147,8 @@ def load_model(server_args, port_args, tp_rank):
|
||||
gpu_id=tp_rank,
|
||||
tp_rank=tp_rank,
|
||||
tp_size=server_args.tp_size,
|
||||
moe_ep_rank=moe_ep_rank,
|
||||
moe_ep_size=server_args.ep_size,
|
||||
pp_rank=0,
|
||||
pp_size=1,
|
||||
nccl_port=port_args.nccl_port,
|
||||
|
||||
Reference in New Issue
Block a user