[Nightly] Remove gen_ranktable logic (#4941)
### What this PR does / why we need it?
Since `llmdatadist` has been sunset, the `gen_ranktable` logic should also
be removed.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -18,8 +18,6 @@ disaggregated_prefill:
|
|||||||
enabled: true
|
enabled: true
|
||||||
prefiller_host_index: [0, 1]
|
prefiller_host_index: [0, 1]
|
||||||
decoder_host_index: [2]
|
decoder_host_index: [2]
|
||||||
ranktable_gen_path: "examples/disaggregated_prefill_v1/gen_ranktable.py"
|
|
||||||
ranktable_path: "/tmp/ranktable.json"
|
|
||||||
|
|
||||||
deployment:
|
deployment:
|
||||||
-
|
-
|
||||||
|
|||||||
@@ -17,8 +17,6 @@ disaggregated_prefill:
|
|||||||
enabled: true
|
enabled: true
|
||||||
prefiller_host_index: [0, 1]
|
prefiller_host_index: [0, 1]
|
||||||
decoder_host_index: [2]
|
decoder_host_index: [2]
|
||||||
ranktable_gen_path: "examples/disaggregated_prefill_v1/gen_ranktable.py"
|
|
||||||
ranktable_path: "/tmp/ranktable.json"
|
|
||||||
|
|
||||||
deployment:
|
deployment:
|
||||||
-
|
-
|
||||||
|
|||||||
@@ -113,9 +113,6 @@ class MultiNodeConfig:
|
|||||||
self.decode_start_index: int = decode_host_index[0]
|
self.decode_start_index: int = decode_host_index[0]
|
||||||
self.num_prefillers = self.decode_start_index
|
self.num_prefillers = self.decode_start_index
|
||||||
self.num_decoders = self.num_nodes - self.num_prefillers
|
self.num_decoders = self.num_nodes - self.num_prefillers
|
||||||
if self.disaggregated_prefill.get(
|
|
||||||
"ranktable_gen_path") is not None:
|
|
||||||
self._gen_ranktable()
|
|
||||||
|
|
||||||
def _init_dist_env(self):
|
def _init_dist_env(self):
|
||||||
self.envs["HCCL_IF_IP"] = self.cur_ip
|
self.envs["HCCL_IF_IP"] = self.cur_ip
|
||||||
@@ -286,54 +283,3 @@ class MultiNodeConfig:
|
|||||||
@property
|
@property
|
||||||
def is_master(self):
|
def is_master(self):
|
||||||
return self.cur_index == 0
|
return self.cur_index == 0
|
||||||
|
|
||||||
def _gen_ranktable(self):
|
|
||||||
cluster_ip = [nodes.ip for nodes in self.nodes_info]
|
|
||||||
assert len(cluster_ip) > 0
|
|
||||||
nnodes = self.num_nodes
|
|
||||||
node_rank = self.cur_index
|
|
||||||
master_addr = cluster_ip[0]
|
|
||||||
master_port = DISAGGEGATED_PREFILL_PORT
|
|
||||||
assert self.disaggregated_prefill is not None
|
|
||||||
ranktable_gen_path = self.disaggregated_prefill.get(
|
|
||||||
"ranktable_gen_path")
|
|
||||||
ranktable_path = self.disaggregated_prefill.get("ranktable_path")
|
|
||||||
assert ranktable_gen_path is not None and ranktable_path is not None
|
|
||||||
if os.path.exists(str(ranktable_path)):
|
|
||||||
logger.info("ranktable has already generated")
|
|
||||||
return
|
|
||||||
|
|
||||||
local_host = self.cur_ip
|
|
||||||
|
|
||||||
cmd = [
|
|
||||||
"torchrun",
|
|
||||||
"--nproc_per_node",
|
|
||||||
"1",
|
|
||||||
"--nnodes",
|
|
||||||
str(nnodes),
|
|
||||||
"--node_rank",
|
|
||||||
str(node_rank),
|
|
||||||
"--master_addr",
|
|
||||||
master_addr,
|
|
||||||
"--master_port",
|
|
||||||
str(master_port),
|
|
||||||
ranktable_gen_path,
|
|
||||||
"--ranktable-path",
|
|
||||||
str(ranktable_path),
|
|
||||||
"--local-host",
|
|
||||||
local_host,
|
|
||||||
"--prefill-device-cnt",
|
|
||||||
str(self.npu_per_node * self.num_prefillers),
|
|
||||||
"--decode-device-cnt",
|
|
||||||
str(self.npu_per_node * self.num_decoders),
|
|
||||||
]
|
|
||||||
|
|
||||||
env = os.environ.copy()
|
|
||||||
assert self.nic_name is not None
|
|
||||||
env["GLOO_SOCKET_IFNAME"] = self.nic_name
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
f"Generating ranktable from command: {' '.join(map(str, cmd))}")
|
|
||||||
subprocess.run(cmd, env=env, check=True)
|
|
||||||
assert os.path.exists(
|
|
||||||
str(ranktable_path)), "failed generate ranktable.json"
|
|
||||||
|
|||||||
Reference in New Issue
Block a user