init v0.11.0rc0

2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -19,6 +19,7 @@ from types import MappingProxyType
 from typing import Any, Callable, Dict, List, Mapping, Optional

 import torch
+from vllm.config import get_current_vllm_config
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                  FusedMoeWeightScaleSupported)
@@ -32,13 +33,15 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    UnquantizedEmbeddingMethod, VocabParallelEmbedding)
-from vllm.model_executor.parameter import PerTensorScaleParameter
 from vllm.model_executor.utils import set_weight_attrs

+from vllm_ascend.distributed.parallel_state import (get_mlp_tp_group,
+                                                    get_otp_group)
 from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
-from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
+from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, mlp_tp_enable,
+                               oproj_tp_enable)

-from .quantizer import AscendQuantizer
+from .utils import get_quant_method


@register_quantization_config(ASCEND_QUANTIZATION_METHOD)
@@ -50,6 +53,7 @@ class AscendQuantConfig(QuantizationConfig):
    """

    def __init__(self, quant_config: Dict[str, Any]):
+        super().__init__()
        self.quant_description = quant_config

    def __repr__(self) -> str:
@@ -85,7 +89,14 @@ class AscendQuantConfig(QuantizationConfig):

    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional["QuantizeMethodBase"]:
+        vllm_config = get_current_vllm_config()
+        model_type = vllm_config.model_config.hf_config.model_type
+        if model_type in packed_modules_model_mapping:
+            self.packed_modules_mapping = packed_modules_model_mapping[
+                model_type]
        from vllm.attention.layer import Attention
+        if prefix.startswith("language_model"):
+            prefix = prefix.split('.', 1)[-1]
        if isinstance(layer, LinearBase):
            if self.is_layer_skipped_ascend(prefix,
                                            self.packed_modules_mapping):
@@ -147,21 +158,86 @@ class AscendQuantConfig(QuantizationConfig):
        return []


+packed_modules_model_mapping = {
+    "qwen3_moe": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+    },
+    "deepseek_v2": {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    },
+    "deepseek_v3": {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    },
+    # NOTE 1.The quantized MTP layer of deepseek on the NPU is not quantized;
+    # NOTE 2.The description file generated by the current msmodelslim tool does not have
+    # MTP layer info. Please manually add it and set the value to FLOAT.
+    "deepseek_mtp": {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    },
+    "qwen3_next": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "in_proj": ["in_proj_qkvz", "in_proj_ba"],
+    },
+    "qwen2_5_vl": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    },
+    "glm4_moe": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    },
+}
+
+
 class AscendLinearMethod(LinearMethodBase):
    """Linear method for Ascend quantization.

-    This class calls AscendQuantizer to search a specific quantization
-    implementations supported on ascend hardware for linear methods.
-
    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_linear_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "linear",
+                                             packed_modules_mapping)

    def create_weights(
        self,
@@ -174,7 +250,6 @@ class AscendLinearMethod(LinearMethodBase):
        **extra_weight_attrs,
    ) -> None:
        output_size_per_partition = sum(output_partition_sizes)
-        weight_loader = extra_weight_attrs.get("weight_loader")

        weight_dict = self.quant_method.get_weight(input_size_per_partition,
                                                   output_size_per_partition,
@@ -187,8 +262,7 @@ class AscendLinearMethod(LinearMethodBase):

        pertensor_dict = self.quant_method.get_pertensor_param(params_dtype)
        for pertensor_name, pertensor_param in pertensor_dict.items():
-            param = PerTensorScaleParameter(data=pertensor_param,
-                                            weight_loader=weight_loader)
+            param = torch.nn.Parameter(pertensor_param, requires_grad=False)
            # disable warning
            param.ignore_warning = True
            layer.register_parameter(pertensor_name, param)
@@ -223,25 +297,27 @@ class AscendLinearMethod(LinearMethodBase):
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if isinstance(layer, RowParallelLinear):
-            tp_rank = get_tensor_model_parallel_rank()
-            return self.quant_method.apply(layer, x, bias, tp_rank)
-        return self.quant_method.apply(layer, x, bias)
+            if layer.prefix.find("o_proj") != -1 and oproj_tp_enable():
+                tp_rank = get_otp_group().rank_in_group
+            elif layer.prefix.find("down_proj") != -1 and mlp_tp_enable():
+                tp_rank = get_mlp_tp_group().rank_in_group
+            else:
+                tp_rank = get_tensor_model_parallel_rank()
+        else:
+            tp_rank = 0
+        return self.quant_method.apply(layer, x, bias, tp_rank)


 class AscendKVCacheMethod(BaseKVCacheMethod):
    """KVCache method for Ascend quantization.

-    This class calls AscendQuantizer to search a specific quantization
-    implementations supported on ascend hardware for kvcache methods.
-
    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix)
-        self.quant_method = self.quantizer.build_attention_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "attention")

    def create_weights(self, layer: torch.nn.Module) -> None:
        # Different from linear method, there are no weight processing/slicing
@@ -263,18 +339,15 @@ class AscendKVCacheMethod(BaseKVCacheMethod):
 class AscendFusedMoEMethod(FusedMoEMethodBase):
    """FusedMoE method for Ascend quantization.

-    This class calls AscendQuantizer to search a specific quantization
-    implementations supported on ascend hardware for kvcache methods.
-
    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]):
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_moe_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "moe",
+                                             packed_modules_mapping)

    def create_weights(
        self,
@@ -341,17 +414,20 @@ class AscendFusedMoEMethod(FusedMoEMethodBase):
        if hasattr(self.quant_method, "process_weights_after_loading"):
            self.quant_method.process_weights_after_loading(layer)

+    def get_fused_moe_quant_config(self, layer: torch.nn.Module):
+        # TODO: implement this function
+        pass
+

 class AscendEmbeddingMethod(AscendLinearMethod):
    """Embedding method for Ascend quantization.
-      This class calls AscendQuantizer to search a specific quantization
-      implementations supported on ascend hardware for Embedding methods.
+    
      Args:
          quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_linear_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "linear",
+                                             packed_modules_mapping)