[Feat] Unquantized linear nz support (#2619)

### What this PR does / why we need it? Currently, when executing to the Linear layer of the model in vLLM-Ascend, the weights input format is ND in unquantized case and skipped ascend case, which is slower than FRACTAL_NZ. This PR supplements the execution logic for Linear layer. When VLLM_ASCEND_ENABLE_MLP_OPTIMIZE=1 and CANN version is 8.3, the weights of the Linear layer will be converted to FRACTAL_NZ, in both unquantized case and skipped ascend case. - vLLM version: main - vLLM main: 267c80d31f Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
2025-09-11 11:40:00 +08:00
parent 5691104249
commit 7b2ecc1e9a
4 changed files with 111 additions and 10 deletions
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -23,8 +23,7 @@ from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                  FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
-                                               RowParallelLinear,
-                                               UnquantizedLinearMethod)
+                                               RowParallelLinear)
 from vllm.model_executor.layers.quantization import \
    register_quantization_config
 from vllm.model_executor.layers.quantization.base_config import (
@@ -38,6 +37,7 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm_ascend.distributed.parallel_state import (get_mlp_tp_group,
                                                    get_otp_group)
 from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
+from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, mlp_tp_enable,
                               oproj_tp_enable)

@@ -92,7 +92,7 @@ class AscendQuantConfig(QuantizationConfig):
        if isinstance(layer, LinearBase):
            if self.is_layer_skipped_ascend(prefix,
                                            self.packed_modules_mapping):
-                return UnquantizedLinearMethod()
+                return AscendUnquantizedLinearMethod()
            return AscendLinearMethod(self, prefix,
                                      self.packed_modules_mapping)
        elif isinstance(layer, Attention) and \