### What this PR does / why we need it?
Currently, there are two code paths for determining the chip type:
`get_ascend_soc_version` uses the `get_soc_version` API in torch_npu, while
`is_310p` uses `_build_info.__soc_version__`, which is generated at install
time. We need to unify the two paths.
We need to unify these codes based on the following points:
1. We need to ensure consistency in chip type judgment between compiling
and running states;
2. At compile time, we need the chip type to compile the ops,
but at run time, we only need the device
type (910B/910_93/310P/910_95/etc.) to choose a code branch;
3. In compiling state, torch_npu may not have been installed yet, so we
can't use torch_npu's api.
Based on the above points, we have made the following changes:
1. When the user sets the env variable `SOC_VERSION`, use it; when it is not set, query
soc_version via `npu-smi`;
2. Generate device_type from soc_version at compile time, and write
`__device_type__` instead of `__soc_version__` to `_build_info.py`;
3. At run time, use `__device_type__` to choose the code branch.
### Does this PR introduce _any_ user-facing change?
When the env variable `SOC_VERSION` is not set, it no longer defaults to `ASCEND910B1`;
instead, soc_version is queried via `npu-smi`. When `SOC_VERSION` is set, it must be one of
the entries of `soc_to_device` in `setup.py`.
- vLLM version: v0.11.0
- vLLM main:
2918c1b49c
Signed-off-by: zzzzwwjj <1183291235@qq.com>
145 lines
5.1 KiB
Python
145 lines
5.1 KiB
Python
import argparse
|
|
import json
|
|
import os
|
|
|
|
import torch.distributed as dist
|
|
|
|
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
|
|
|
|
# Command-line interface of the rank table generator.
parser = argparse.ArgumentParser(
    description="Arguments of rank table generator",
)

# Mandatory arguments: the local host IP and the size of each device group.
parser.add_argument(
    "--local-host",
    type=str,
    required=True,
    help="local ip",
)
parser.add_argument(
    "--prefill-device-cnt",
    type=int,
    required=True,
    help="number of prefill devices",
)
parser.add_argument(
    "--decode-device-cnt",
    type=int,
    required=True,
    help="number of decode devices",
)

# Optional arguments: an explicit device id list and the output location.
parser.add_argument(
    "--local-device-ids",
    type=str,
    required=False,
    help="local device ids",
)
parser.add_argument(
    "--ranktable-path",
    type=str,
    default="./ranktable.json",
    help="output rank table path",
)
|
|
# Parse the command line once and keep the frequently used values at hand.
args = parser.parse_args()
local_host, prefill_device_cnt, decode_device_cnt = (
    args.local_host,
    args.prefill_device_cnt,
    args.decode_device_cnt,
)

print("enter py")

# Path of the hccn_tool binary; HCCN_TOOL_PATH overrides the default location.
hccn_tool_path = os.getenv("HCCN_TOOL_PATH",
                           "/usr/local/Ascend/driver/tools/hccn_tool")

# Launcher-related environment variables; each is None when unset.
master_addr = os.getenv("MASTER_ADDR")
master_port = os.getenv("MASTER_PORT")
rank = os.getenv("RANK")
local_rank = os.getenv("LOCAL_RANK")
# This variable is set by torchrun,
# and is different from WORLD_SIZE in gen_rank_table.sh.
world_size = os.getenv("WORLD_SIZE")

# Device type of this host; later compared against AscendDeviceType._910_93.
device_type = get_ascend_device_type()
|
|
|
|
|
|
def get_cmd_stdout(cmd):
    """Run ``cmd`` through the shell and return its stdout as stripped text.

    Args:
        cmd: Command line passed to the shell, so pipelines such as
            ``... | grep ...`` are supported.

    Returns:
        The captured standard output decoded as UTF-8, with leading and
        trailing whitespace removed. Standard error is captured but not
        returned, and the exit status is not checked.
    """
    import subprocess

    completed = subprocess.run(cmd, shell=True, capture_output=True)
    raw_stdout = completed.stdout
    return raw_stdout.decode("utf-8").strip()
|
|
|
|
|
|
print(f"local_host: {local_host}")
print("gen ranktable.json")

# Card count: the value after the colon on the "Total Count" line of npu-smi.
num_cards = int(
    get_cmd_stdout('npu-smi info -l | grep "Total Count"')
    .split(":")[1]
    .strip()
)

# Chips per card: the value after the colon on the first "Chip Count" line.
chips_per_card = int(
    get_cmd_stdout('npu-smi info -l | grep "Chip Count"')
    .split("\n")[0]
    .split(":")[1]
    .strip()
)
|
|
|
|
# Resolve the device ids handled by this host: either the explicit
# comma-separated list from --local-device-ids, or every chip on every card.
if args.local_device_ids:
    try:
        local_device_ids = [
            int(id_str) for id_str in args.local_device_ids.split(',')
        ]
    except ValueError:
        print(f"Error: --local-device-ids must be a comma-separated list of integers. Received: '{args.local_device_ids}'")
        # Raise SystemExit directly instead of calling exit(): the exit()
        # helper is injected by the site module for interactive use and is
        # not available when site is disabled (e.g. `python -S`).
        raise SystemExit(1) from None
else:
    # A device id is card_id * chips_per_card + chip_id, so enumerating all
    # cards and chips yields the contiguous range below.
    local_device_ids = list(range(num_cards * chips_per_card))
|
|
|
|
# Local rank 0 builds the device list for this host; every other rank keeps
# an empty list. The lists of all ranks are gathered afterwards.
local_device_list: list[dict[str, str]] = []
if local_rank == "0":
    super_pod_id = "0"
    for device_id in local_device_ids:
        # Split the flat device id into its card index and chip index.
        card_id, chip_id = divmod(device_id, chips_per_card)

        # Fields that only exist on 910_93 devices; empty otherwise.
        super_pod_fields = {}
        if device_type == AscendDeviceType._910_93:
            # 910_93: query the vnic address plus the super pod identifiers.
            device_ip = get_cmd_stdout(
                f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr"
            ).split(":")[1].strip()
            super_device_id = get_cmd_stdout(
                f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID"
            ).split(":")[1].strip()
            super_pod_id = get_cmd_stdout(
                f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\""
            ).split(":")[1].strip()
            super_pod_fields = {
                "super_pod_id": str(super_pod_id),
                "super_device_id": str(super_device_id),
            }
        else:
            # Other device types: only the device IP is queried.
            device_ip = get_cmd_stdout(
                f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr"
            ).split(":")[1].strip()

        device_info = {
            "server_id": local_host,
            "device_id": str(device_id),
            "device_ip": str(device_ip),
            **super_pod_fields,
        }
        local_device_list.append(device_info)
|
|
|
|
# Join a process group with the GLOO backend and collect the per-rank device
# lists so that every rank ends up with the device list of the whole cluster.
dist.init_process_group(backend=dist.Backend.GLOO)
global_device_list = [None] * dist.get_world_size()
dist.all_gather_object(global_device_list, local_device_list)

# Flatten the gathered list of per-rank lists into one list of device entries.
global_device_list = [
    device_info for device_list in global_device_list
    for device_info in device_list  # type: ignore[attr-defined]
]

# Number the devices consecutively, starting at 1, in gathered order.
for cluster_id, device_info in enumerate(global_device_list, start=1):  # type: ignore[assignment]
    device_info["cluster_id"] = str(cluster_id)

# Validate the requested split explicitly: an `assert` would be removed when
# Python runs with -O, letting an oversized request silently produce
# truncated device lists.
if prefill_device_cnt + decode_device_cnt > len(global_device_list):
    raise ValueError(
        "prefill_device_cnt + decode_device_cnt must be less than or equal to number of all devices in cluster"
    )

# The first `prefill_device_cnt` devices are used for prefill and the next
# `decode_device_cnt` devices for decode.
ranktable = {
    "version": "1.2",
    "server_count": str(world_size),
    "prefill_device_list": global_device_list[:prefill_device_cnt],
    "decode_device_list": global_device_list[
        prefill_device_cnt:prefill_device_cnt + decode_device_cnt
    ],
    "status": "completed",
}
|
|
|
|
# Only local rank 0 writes the rank table file.
if local_rank == '0':
    # os.path.dirname() returns "" for a bare file name such as
    # "ranktable.json", and os.makedirs("") raises FileNotFoundError, so the
    # directory is created only when the path actually contains one.
    output_dir = os.path.dirname(args.ranktable_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(args.ranktable_path, "w") as f:
        json.dump(ranktable, f, indent=4)

print("gen ranktable.json done")
|