[releases/v0.18.0][Build][BugFix] support ascend950 npu-smi info interface changes and make SOC_VERSION actually take effect (#8061)
### What this PR does / why we need it? Cherry-picked from #8062. This PR adds support for the Ascend950 NPU by updating the `npu-smi info` parsing logic to handle interface changes. It also ensures that `SOC_VERSION` actually takes effect: when this environment variable is set, the `get_chip_type` auto-detection is skipped instead of being run unconditionally. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? CI passed. Signed-off-by: linfeng-yuan <1102311262@qq.com>
This commit is contained in:
23
setup.py
23
setup.py
@@ -72,14 +72,31 @@ def get_value_from_lines(lines: list[str], key: str) -> str:
|
|||||||
|
|
||||||
def get_chip_type() -> str:
|
def get_chip_type() -> str:
|
||||||
try:
|
try:
|
||||||
|
# Get NPU ID
|
||||||
npu_info_lines = subprocess.check_output(["npu-smi", "info", "-l"]).decode().strip().split("\n")
|
npu_info_lines = subprocess.check_output(["npu-smi", "info", "-l"]).decode().strip().split("\n")
|
||||||
npu_id = int(get_value_from_lines(npu_info_lines, "NPU ID"))
|
npu_id = int(get_value_from_lines(npu_info_lines, "NPU ID"))
|
||||||
|
|
||||||
|
# Stage 1: query board info without -c flag
|
||||||
|
board_info_lines = (
|
||||||
|
subprocess.check_output(["npu-smi", "info", "-t", "board", "-i", str(npu_id)]).decode().strip().split("\n")
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if Chip Name exists (Ascend950 includes it directly)
|
||||||
|
chip_name = get_value_from_lines(board_info_lines, "Chip Name")
|
||||||
|
|
||||||
|
# Stage 2: query with -c flag only if Chip Name not found (A2/A3/310P)
|
||||||
|
if not chip_name:
|
||||||
chip_info_lines = (
|
chip_info_lines = (
|
||||||
subprocess.check_output(["npu-smi", "info", "-t", "board", "-i", str(npu_id), "-c", "0"])
|
subprocess.check_output(["npu-smi", "info", "-t", "board", "-i", str(npu_id), "-c", "0"])
|
||||||
.decode()
|
.decode()
|
||||||
.strip()
|
.strip()
|
||||||
.split("\n")
|
.split("\n")
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
# Ascend950 already has complete info
|
||||||
|
chip_info_lines = board_info_lines
|
||||||
|
|
||||||
|
# Extract required fields
|
||||||
chip_name = get_value_from_lines(chip_info_lines, "Chip Name")
|
chip_name = get_value_from_lines(chip_info_lines, "Chip Name")
|
||||||
chip_type = get_value_from_lines(chip_info_lines, "Chip Type")
|
chip_type = get_value_from_lines(chip_info_lines, "Chip Type")
|
||||||
npu_name = get_value_from_lines(chip_info_lines, "NPU Name")
|
npu_name = get_value_from_lines(chip_info_lines, "NPU Name")
|
||||||
@@ -113,9 +130,8 @@ def get_chip_type() -> str:
|
|||||||
|
|
||||||
envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "vllm_ascend", "envs.py"))
|
envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "vllm_ascend", "envs.py"))
|
||||||
|
|
||||||
soc_version = get_chip_type()
|
|
||||||
|
|
||||||
if not envs.SOC_VERSION:
|
if not envs.SOC_VERSION:
|
||||||
|
soc_version = get_chip_type()
|
||||||
if not soc_version:
|
if not soc_version:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"Could not determine chip type automatically via 'npu-smi'. "
|
"Could not determine chip type automatically via 'npu-smi'. "
|
||||||
@@ -128,9 +144,6 @@ if not envs.SOC_VERSION:
|
|||||||
"You can also refer to the SOC_VERSION defaults in Dockerfile*."
|
"You can also refer to the SOC_VERSION defaults in Dockerfile*."
|
||||||
)
|
)
|
||||||
envs.SOC_VERSION = soc_version
|
envs.SOC_VERSION = soc_version
|
||||||
else:
|
|
||||||
if soc_version and soc_version != envs.SOC_VERSION:
|
|
||||||
logging.warning(f"env SOC_VERSION: {envs.SOC_VERSION} is not equal to soc_version from npu-smi: {soc_version}")
|
|
||||||
|
|
||||||
|
|
||||||
def gen_build_info():
|
def gen_build_info():
|
||||||
|
|||||||
Reference in New Issue
Block a user