[refact] unified soc_version code (#4359)

### What this PR does / why we need it?

Currently, there are two paths for determining the chip type in the code:
`get_ascend_soc_version` uses the `get_soc_version` API in torch_npu, while
`is_310p` uses `_build_info.__soc_version__`, which is generated at install
time. We need to unify these two paths.
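
For orientation, here is a minimal sketch of the two legacy paths; the bodies are simplified, and the `"310P"` substring check is illustrative rather than the exact `vllm_ascend/utils.py` logic:

```python
# Minimal sketch of the two legacy paths; bodies are simplified and the
# "310P" substring check is illustrative, not the exact utils.py logic.

def get_ascend_soc_version():
    # Path 1: query torch_npu at runtime (unusable before torch_npu is installed).
    import torch_npu
    return torch_npu.npu.get_soc_version()


def is_310p() -> bool:
    # Path 2: read the constant baked into _build_info.py at install time.
    from vllm_ascend import _build_info
    return "310P" in _build_info.__soc_version__
```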

We need to unify these code paths based on the following points:

1. Chip type judgment must be consistent between the compiling and running
states;
2. In the compiling state, we need the exact chip type to compile ops, but in
the running state we only need the device
type (910B/910_93/310P/910_95/etc.) to make code branch decisions;
3. In the compiling state, torch_npu may not have been installed yet, so we
can't use torch_npu's API.

Based on the above points, we have made the following changes (a sketch
follows the list):

1. When the user sets the env `SOC_VERSION`, use it; when it is not set, query
the soc_version via `npu-smi`;
2. Generate the device type from the soc_version when compiling, and write
`__device_type__` instead of `__soc_version__` into `_build_info.py`;
3. In the running state, use `__device_type__` to select code branches.
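
A minimal sketch of the new build-time flow, assuming illustrative helper names (`resolve_device_type`, `write_build_info`) and an abbreviated `soc_to_device` mapping; the exact `npu-smi` parsing in `setup.py` may differ:

```python
# Hypothetical sketch of the build-time resolution; the helper names and the
# npu-smi parsing are illustrative, not the exact setup.py implementation.
import os
import re
import subprocess

# Illustrative subset of the soc_to_device mapping kept in setup.py.
soc_to_device = {
    "ASCEND910B1": "910B",
    "ASCEND310P3": "310P",
}


def resolve_device_type() -> str:
    # 1. Prefer the user-provided SOC_VERSION env var.
    soc_version = os.getenv("SOC_VERSION")
    if soc_version is None:
        # 2. Fall back to npu-smi; the output format parsed here is assumed.
        output = subprocess.check_output(["npu-smi", "info"], text=True)
        match = re.search(r"\b(910\w*|310\w*)\b", output)
        if match is None:
            raise RuntimeError("could not detect soc_version via npu-smi")
        soc_version = "ASCEND" + match.group(1)
    if soc_version not in soc_to_device:
        raise ValueError(f"unsupported SOC_VERSION: {soc_version}")
    return soc_to_device[soc_version]


def write_build_info(device_type: str) -> None:
    # 3. Record __device_type__ (not __soc_version__) for runtime use.
    with open("vllm_ascend/_build_info.py", "w") as f:
        f.write(f'__device_type__ = "{device_type}"\n')
```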

### Does this PR introduce _any_ user-facing change?

When the env `SOC_VERSION` is not set, it no longer defaults to
`ASCEND910B1`; instead, the soc_version is queried via `npu-smi`. When it is
set, `SOC_VERSION` must be present in `soc_to_device` in `setup.py`.
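
For reference, the runtime side (used throughout the diff below) can be pictured as this sketch; only `get_ascend_device_type` and `AscendDeviceType._310P` appear verbatim in the diff, so the other enum members and stored string values are assumptions based on the device types listed above:

```python
# Minimal sketch of the runtime lookup, assuming AscendDeviceType is a plain
# Enum whose values are the device-type strings written to _build_info.py.
from enum import Enum


class AscendDeviceType(Enum):
    _910B = "910B"      # assumed member
    _910_93 = "910_93"  # assumed member
    _310P = "310P"      # appears verbatim in this PR's diff
    _910_95 = "910_95"  # assumed member


def get_ascend_device_type() -> AscendDeviceType:
    # _build_info.py is generated at install time, so no torch_npu call
    # is needed to decide code branches at runtime.
    from vllm_ascend import _build_info
    return AscendDeviceType(_build_info.__device_type__)


# Usage, mirroring the diff below:
# if get_ascend_device_type() == AscendDeviceType._310P:
#     ...
```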

- vLLM version: v0.11.0
- vLLM main: 2918c1b49c

Signed-off-by: zzzzwwjj <1183291235@qq.com>

```diff
@@ -25,7 +25,8 @@ from vllm.forward_context import get_forward_context
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, is_enable_nz
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType,
+                               get_ascend_device_type, is_enable_nz)
 
 
 def quant_per_tensor(in_tensor: torch.Tensor,
@@ -45,7 +46,8 @@ class AscendW8A8LinearMethod:
 
     def __init__(self) -> None:
         # aclnn quant matmul requires to transpose matrix B, set to true by default.
-        self.transpose_weight = not is_310p()
+        self.transpose_weight = get_ascend_device_type(
+        ) != AscendDeviceType._310P
 
     @staticmethod
     def get_weight(
@@ -147,7 +149,7 @@ class AscendW8A8LinearMethod:
             )
         quant_bias = layer.quant_bias if tp_rank == 0 else None
-        if is_310p():
+        if get_ascend_device_type() == AscendDeviceType._310P:
             # On 300I Duo platform, we need transpose again if
             # using nz. This transpose can be skipped in torchair.
             output = torch_npu.npu_quant_matmul(
@@ -299,7 +301,7 @@ class AscendW8A8FusedMoEMethod:
             e_score_correction_bias=e_score_correction_bias,
             global_num_experts=global_num_experts)
 
-        if is_310p():
+        if get_ascend_device_type() == AscendDeviceType._310P:
             return fused_experts_310p(hidden_states=x,
                                       w1=layer.w13_weight,
                                       w1_scale=layer.w13_weight_scale,
@@ -328,7 +330,7 @@ class AscendW8A8FusedMoEMethod:
                                  expert_map=expert_map)
 
     def process_weights_after_loading(self, layer):
-        if not is_310p():
+        if get_ascend_device_type() != AscendDeviceType._310P:
             layer.w13_weight.data = layer.w13_weight.data.transpose(
                 1, 2).contiguous()
             layer.w2_weight.data = layer.w2_weight.data.transpose(
@@ -345,7 +347,7 @@ class AscendW8A8FusedMoEMethod:
         expanding_factor_w13 = layer.w13_weight.data.shape[1]
         expanding_factor_w2 = layer.w2_weight.data.shape[1]
-        if is_310p():
+        if get_ascend_device_type() == AscendDeviceType._310P:
             layer.w13_input_scale.data = torch.nn.Parameter(
                 layer.w13_input_scale.data.max())
             layer.w2_input_scale.data = torch.nn.Parameter(
@@ -365,7 +367,8 @@ class AscendW8A8FusedMoEMethod:
         # converting ACL_FORMAT_FRACTAL_NZ.
         # npu_quant_grouped_matmul_dequant in eager mode does not accept
         # ACL_FORMAT_FRACTAL_NZ.
-        if not is_310p() and is_enable_nz():
+        if get_ascend_device_type() != AscendDeviceType._310P and is_enable_nz(
+        ):
             layer.w13_weight.data = torch_npu.npu_format_cast(
                 layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous()
             layer.w2_weight.data = torch_npu.npu_format_cast(
```