[refact] unified soc_version code (#4359)

### What this PR does / why we need it?

Currently, there are two paths for determining the chip type in the code:
`get_ascend_soc_version` uses the `get_soc_version` API in torch_npu, while
`is_310p` uses `_build_info.__soc_version__`, which is generated at install
time. We need to unify these two paths.
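
For orientation, here is a minimal sketch of the two legacy paths; the bodies are simplified, and the `"310P"` substring check is illustrative rather than the exact `vllm_ascend/utils.py` logic:

```python
# Minimal sketch of the two legacy paths; bodies are simplified and the
# "310P" substring check is illustrative, not the exact utils.py logic.

def get_ascend_soc_version():
    # Path 1: query torch_npu at runtime (unusable before torch_npu is installed).
    import torch_npu
    return torch_npu.npu.get_soc_version()


def is_310p() -> bool:
    # Path 2: read the constant baked into _build_info.py at install time.
    from vllm_ascend import _build_info
    return "310P" in _build_info.__soc_version__
```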

We need to unify these code paths based on the following points:

1. Chip type judgment must be consistent between the compiling and running
states;
2. In the compiling state, we need the exact chip type to compile ops, but in
the running state we only need the device
type (910B/910_93/310P/910_95/etc.) to make code branch decisions;
3. In the compiling state, torch_npu may not have been installed yet, so we
can't use torch_npu's API.

Based on the above points, we have made the following changes (a sketch
follows the list):

1. When the user sets the env `SOC_VERSION`, use it; when it is not set, query
the soc_version via `npu-smi`;
2. Generate the device type from the soc_version when compiling, and write
`__device_type__` instead of `__soc_version__` into `_build_info.py`;
3. In the running state, use `__device_type__` to select code branches.
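
A minimal sketch of the new build-time flow, assuming illustrative helper names (`resolve_device_type`, `write_build_info`) and an abbreviated `soc_to_device` mapping; the exact `npu-smi` parsing in `setup.py` may differ:

```python
# Hypothetical sketch of the build-time resolution; the helper names and the
# npu-smi parsing are illustrative, not the exact setup.py implementation.
import os
import re
import subprocess

# Illustrative subset of the soc_to_device mapping kept in setup.py.
soc_to_device = {
    "ASCEND910B1": "910B",
    "ASCEND310P3": "310P",
}


def resolve_device_type() -> str:
    # 1. Prefer the user-provided SOC_VERSION env var.
    soc_version = os.getenv("SOC_VERSION")
    if soc_version is None:
        # 2. Fall back to npu-smi; the output format parsed here is assumed.
        output = subprocess.check_output(["npu-smi", "info"], text=True)
        match = re.search(r"\b(910\w*|310\w*)\b", output)
        if match is None:
            raise RuntimeError("could not detect soc_version via npu-smi")
        soc_version = "ASCEND" + match.group(1)
    if soc_version not in soc_to_device:
        raise ValueError(f"unsupported SOC_VERSION: {soc_version}")
    return soc_to_device[soc_version]


def write_build_info(device_type: str) -> None:
    # 3. Record __device_type__ (not __soc_version__) for runtime use.
    with open("vllm_ascend/_build_info.py", "w") as f:
        f.write(f'__device_type__ = "{device_type}"\n')
```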

### Does this PR introduce _any_ user-facing change?

When the env `SOC_VERSION` is not set, it no longer defaults to
`ASCEND910B1`; instead, the soc_version is queried via `npu-smi`. When it is
set, `SOC_VERSION` must be present in `soc_to_device` in `setup.py`.
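
For reference, the runtime side (used throughout the diff below) can be pictured as this sketch; only `get_ascend_device_type` and `AscendDeviceType._310P` appear verbatim in the diff, so the other enum members and stored string values are assumptions based on the device types listed above:

```python
# Minimal sketch of the runtime lookup, assuming AscendDeviceType is a plain
# Enum whose values are the device-type strings written to _build_info.py.
from enum import Enum


class AscendDeviceType(Enum):
    _910B = "910B"      # assumed member
    _910_93 = "910_93"  # assumed member
    _310P = "310P"      # appears verbatim in this PR's diff
    _910_95 = "910_95"  # assumed member


def get_ascend_device_type() -> AscendDeviceType:
    # _build_info.py is generated at install time, so no torch_npu call
    # is needed to decide code branches at runtime.
    from vllm_ascend import _build_info
    return AscendDeviceType(_build_info.__device_type__)


# Usage, mirroring the diff below:
# if get_ascend_device_type() == AscendDeviceType._310P:
#     ...
```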

- vLLM version: v0.11.0
- vLLM main: 2918c1b49c

Signed-off-by: zzzzwwjj <1183291235@qq.com>

```diff
@@ -25,7 +25,8 @@ from vllm.forward_context import get_forward_context
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, is_enable_nz
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType,
+                               get_ascend_device_type, is_enable_nz)
 
 
 def quant_per_tensor(in_tensor: torch.Tensor,
@@ -45,7 +46,8 @@ class AscendW8A8LinearMethod:
 
     def __init__(self) -> None:
         # aclnn quant matmul requires to transpose matrix B, set to true by default.
-        self.transpose_weight = not is_310p()
+        self.transpose_weight = get_ascend_device_type(
+        ) != AscendDeviceType._310P
 
     @staticmethod
     def get_weight(
@@ -147,7 +149,7 @@ class AscendW8A8LinearMethod:
             )
         quant_bias = layer.quant_bias if tp_rank == 0 else None
-        if is_310p():
+        if get_ascend_device_type() == AscendDeviceType._310P:
             # On 300I Duo platform, we need transpose again if
             # using nz. This transpose can be skipped in torchair.
             output = torch_npu.npu_quant_matmul(
@@ -299,7 +301,7 @@ class AscendW8A8FusedMoEMethod:
             e_score_correction_bias=e_score_correction_bias,
             global_num_experts=global_num_experts)
 
-        if is_310p():
+        if get_ascend_device_type() == AscendDeviceType._310P:
             return fused_experts_310p(hidden_states=x,
                                       w1=layer.w13_weight,
                                       w1_scale=layer.w13_weight_scale,
@@ -328,7 +330,7 @@ class AscendW8A8FusedMoEMethod:
                                  expert_map=expert_map)
 
     def process_weights_after_loading(self, layer):
-        if not is_310p():
+        if get_ascend_device_type() != AscendDeviceType._310P:
             layer.w13_weight.data = layer.w13_weight.data.transpose(
                 1, 2).contiguous()
             layer.w2_weight.data = layer.w2_weight.data.transpose(
@@ -345,7 +347,7 @@ class AscendW8A8FusedMoEMethod:
         expanding_factor_w13 = layer.w13_weight.data.shape[1]
         expanding_factor_w2 = layer.w2_weight.data.shape[1]
-        if is_310p():
+        if get_ascend_device_type() == AscendDeviceType._310P:
             layer.w13_input_scale.data = torch.nn.Parameter(
                 layer.w13_input_scale.data.max())
             layer.w2_input_scale.data = torch.nn.Parameter(
@@ -365,7 +367,8 @@ class AscendW8A8FusedMoEMethod:
         # converting ACL_FORMAT_FRACTAL_NZ.
         # npu_quant_grouped_matmul_dequant in eager mode does not accept
         # ACL_FORMAT_FRACTAL_NZ.
-        if not is_310p() and is_enable_nz():
+        if get_ascend_device_type() != AscendDeviceType._310P and is_enable_nz(
+        ):
             layer.w13_weight.data = torch_npu.npu_format_cast(
                 layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous()
             layer.w2_weight.data = torch_npu.npu_format_cast(
```