[1/N][Refactor][Quantization] remove redundant quantizer class (#2680)

### What this PR does / why we need it? The AscendQuantizer/VLLMAscendQuantizer classes were used to select the quant method based on the quant config and some other arguments, but replacing these classes with a map is simpler and cleaner, so this PR removes them. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? UT and e2e tests - vLLM version: v0.10.1.1 - vLLM main: 6997a25ac6 Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-09-04 11:35:14 +08:00
parent d4370ebc42
commit 37f5a29cd4
10 changed files with 321 additions and 554 deletions
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -38,7 +38,7 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
 from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD

-from .quantizer import AscendQuantizer
+from .utils import get_quant_method


@register_quantization_config(ASCEND_QUANTIZATION_METHOD)
@@ -150,18 +150,15 @@ class AscendQuantConfig(QuantizationConfig):
 class AscendLinearMethod(LinearMethodBase):
    """Linear method for Ascend quantization.

-    This class calls AscendQuantizer to search a specific quantization
-    implementations supported on ascend hardware for linear methods.
-
    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_linear_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "linear",
+                                             packed_modules_mapping)

    def create_weights(
        self,
@@ -231,17 +228,13 @@ class AscendLinearMethod(LinearMethodBase):
 class AscendKVCacheMethod(BaseKVCacheMethod):
    """KVCache method for Ascend quantization.

-    This class calls AscendQuantizer to search a specific quantization
-    implementations supported on ascend hardware for kvcache methods.
-
    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix)
-        self.quant_method = self.quantizer.build_attention_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "attention")

    def create_weights(self, layer: torch.nn.Module) -> None:
        # Different from linear method, there are no weight processing/slicing
@@ -263,18 +256,15 @@ class AscendKVCacheMethod(BaseKVCacheMethod):
 class AscendFusedMoEMethod(FusedMoEMethodBase):
    """FusedMoE method for Ascend quantization.

-    This class calls AscendQuantizer to search a specific quantization
-    implementations supported on ascend hardware for kvcache methods.
-
    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]):
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_moe_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "moe",
+                                             packed_modules_mapping)

    def create_weights(
        self,
@@ -344,14 +334,13 @@ class AscendFusedMoEMethod(FusedMoEMethodBase):

 class AscendEmbeddingMethod(AscendLinearMethod):
    """Embedding method for Ascend quantization.
-      This class calls AscendQuantizer to search a specific quantization
-      implementations supported on ascend hardware for Embedding methods.
+    
      Args:
          quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]) -> None:
-        self.quantizer = AscendQuantizer.get_quantizer(
-            quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_linear_method()
+        self.quant_method = get_quant_method(quant_config.quant_description,
+                                             prefix, "linear",
+                                             packed_modules_mapping)
--- a/vllm_ascend/quantization/quantizer.py
+++ b/vllm_ascend/quantization/quantizer.py
@@ -1,311 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import importlib
-import sys
-import types
-from typing import Any, Dict, List, Optional
-
-from vllm.logger import logger
-
-from .func_wrapper import (wrapper_rmsnorm_forward_oot, wrapper_rmsnorm_init,
-                           wrapper_vocab_parallel_embedding_init)
-from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
-                           AscendW4A8DynamicLinearMethod)
-from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod,
-                   AscendW8A8LinearMethod)
-from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
-                           AscendW8A8DynamicLinearMethod)
-
-CUSTOMIZED_QUANTIZER_TYPE: List[str] = []
-
-
-class AscendQuantizer:
-    """An interface to different quantization implementations for ascend hardwares."""
-
-    @classmethod
-    def get_quantizer(cls,
-                      quant_config: Dict[str, Any],
-                      prefix: str,
-                      packed_modules_mapping: Optional[Dict[str,
-                                                            Any]] = dict()):
-        # TODO: Need a param to choose quantization algorithms.
-        quantization_algorithm = ''
-
-        if quantization_algorithm in CUSTOMIZED_QUANTIZER_TYPE:
-            return
-
-        return VLLMAscendQuantizer.get_quantizer(quant_config, prefix,
-                                                 packed_modules_mapping)
-
-    def build_linear_method(self):
-        raise NotImplementedError
-
-    def build_moe_method(self):
-        raise NotImplementedError
-
-    def build_attention_method(self):
-        raise NotImplementedError
-
-
-class VLLMAscendQuantizer:
-    _instance: Optional[object] = None
-    patched = False
-
-    def __init__(self, quant_description):
-        if VLLMAscendQuantizer.patched:
-            return
-        for name in quant_description.keys():
-            if "norm.bias" in name:
-                VLLMAscendQuantizer.apply_patch(
-                    "vllm.model_executor.layers.layernorm.RMSNorm", "__init__",
-                    [wrapper_rmsnorm_init])
-                VLLMAscendQuantizer.apply_patch(
-                    "vllm_ascend.ops.layernorm.AscendRMSNorm", "forward_oot",
-                    [wrapper_rmsnorm_forward_oot])
-                VLLMAscendQuantizer.apply_patch(
-                    "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding",
-                    "__init__", [wrapper_vocab_parallel_embedding_init])
-                break
-        VLLMAscendQuantizer.patched = True
-        logger.info("Using the vLLM Ascend Quantizer version now!")
-
-    @staticmethod
-    def apply_patch(target_module, target_function, wrappers):
-
-        original_module, original_function = VLLMAscendQuantizer.parse_path(
-            target_module, target_function, False)
-
-        original_function_id = id(original_function)
-
-        candidate = original_function
-        for wrapper in wrappers:
-            candidate = wrapper(candidate)
-        if target_function is not None:
-            setattr(original_module, target_function, candidate)
-
-        for _, value in sys.modules.copy().items():
-            if target_function is None:
-                continue
-            try:
-                attr = getattr(value, target_function, None)
-                if attr is not None and id(attr) == original_function_id:
-                    setattr(value, target_function, candidate)
-            except ImportError:
-                continue
-
-    @staticmethod
-    def parse_path(module_path, function_name, create_dummy):
-        """
-        Parse module path and resolve/create modules as needed.
-
-        Args:
-            module_path: Dot-separated module path
-            function_name: Target function name (None for module only)
-            create_dummy: Create dummy modules/functions when missing
-
-        Returns:
-            Tuple of (resolved module, target function/none)
-
-        Raises:
-            ModuleNotFoundError: If module path is invalid and create_dummy=False
-            AttributeError: If function is missing and create_dummy=False
-        """
-        from importlib.machinery import ModuleSpec
-
-        def create_dummy_module(full_path, parent=None):
-            """Create and register a placeholder module"""
-            dummy = types.ModuleType(full_path)
-            dummy.__file__ = "vllm_ascend.dummy_module.py"
-            dummy.__spec__ = ModuleSpec(full_path, None)
-            sys.modules[full_path] = dummy
-            if parent:
-                setattr(parent, full_path.split(".")[-1], dummy)
-            return dummy
-
-        def create_placeholder_function(func_name):
-            """Create dummy function that raises when called"""
-
-            def placeholder(*args, **kwargs):
-                raise NotImplementedError(
-                    f"Function {func_name} is a placeholder")
-
-            placeholder.__name__ = func_name
-            return placeholder
-
-        modules = module_path.split(".")
-        current_module = None
-        processed_path = []
-
-        for idx, part in enumerate(modules):
-            current_path = ".".join(modules[:idx + 1])
-            parent_path = ".".join(modules[:idx]) if idx > 0 else None
-
-            try:
-                current_module = importlib.import_module(current_path)
-            except ModuleNotFoundError:
-                # Handle missing module
-                parent = importlib.import_module(
-                    parent_path) if parent_path else None
-                if parent and hasattr(parent, part):
-                    # Use existing attribute from parent
-                    current_module = getattr(parent, part)
-                    # Check for early function resolution
-                    if function_name and hasattr(current_module,
-                                                 function_name):
-                        return current_module, getattr(current_module,
-                                                       function_name)
-                    if function_name and create_dummy:
-                        ph_func = create_placeholder_function(function_name)
-                        setattr(current_module, function_name, ph_func)
-                        return current_module, ph_func
-                    if function_name:
-                        raise AttributeError(
-                            f"Function {function_name} missing in {current_path}"
-                        )
-                else:
-                    if not create_dummy:
-                        raise
-                    # Create and register dummy module
-                    current_module = create_dummy_module(
-                        current_path,
-                        parent=importlib.import_module(parent_path)
-                        if parent_path else None)
-
-            processed_path.append(part)
-
-        # Final function handling
-        final_module = sys.modules[module_path]
-        if function_name is not None:
-            if not hasattr(final_module, function_name):
-                if create_dummy:
-                    ph_func = create_placeholder_function(function_name)
-                    setattr(final_module, function_name, ph_func)
-                else:
-                    setattr(final_module, function_name, None)
-            return final_module, getattr(final_module, function_name)
-
-        return final_module, None
-
-    @staticmethod
-    def build_linear_method():
-        raise NotImplementedError(
-            "Linear method is not implemented for the current quant type.")
-
-    @staticmethod
-    def build_moe_method():
-        raise NotImplementedError(
-            "MoE method is not implemented for the current quant type.")
-
-    @staticmethod
-    def build_attention_method():
-        raise NotImplementedError(
-            "Attention method is not implemented for the current quant type.")
-
-    @staticmethod
-    def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str,
-                              packed_modules_mapping: Dict[str, Any]):
-        proj_name = prefix.split(".")[-1]
-        if proj_name in packed_modules_mapping:
-            quant_type = None
-            shard_prefixes = [
-                prefix.replace(proj_name, shard_proj_name)
-                for shard_proj_name in packed_modules_mapping[proj_name]
-            ]
-            for shard_prefix in shard_prefixes:
-                shard_quant_type = quant_description[shard_prefix + '.weight']
-
-                if quant_type is None:
-                    quant_type = shard_quant_type
-                elif shard_quant_type != quant_type:
-                    raise ValueError(
-                        f"Not all shards of {prefix} are quantized with same quant type."
-                        f"Shard {proj_name} uses {shard_quant_type}, but another shard"
-                        f"use {quant_type}. Please check quantization config.")
-        else:
-            quant_type = quant_description[prefix + '.weight']
-        return quant_type
-
-    @classmethod
-    def get_quantizer(cls,
-                      quant_description: Dict[str, Any],
-                      prefix: str,
-                      packed_modules_mapping: Optional[Dict[str, Any]] = None):
-        if packed_modules_mapping is None:
-            packed_modules_mapping = dict()
-        # Attention
-        if '.attn' in prefix and 'fa_quant_type' in quant_description.keys():
-            quant_type = quant_description['fa_quant_type']
-        # Use KVCache int8
-        elif '.attn' in prefix and 'kv_quant_type' in quant_description.keys():
-            quant_type = quant_description['kv_quant_type']
-        # Linear
-        else:
-            quant_type = cls.get_linear_quant_type(quant_description, prefix,
-                                                   packed_modules_mapping)
-        if quant_type in SUPPORT_ASCEND_QUANTIZER_TYPE.keys():
-            cls = SUPPORT_ASCEND_QUANTIZER_TYPE[quant_type]
-            if not cls._instance:
-                cls._instance = cls(quant_description)
-            return cls._instance
-        raise NotImplementedError("Currently, vLLM Ascend only supports following quant types:" \
-                                  f"{list(SUPPORT_ASCEND_QUANTIZER_TYPE.keys())}")
-
-
-class W4A8DYNAMICQuantizer(VLLMAscendQuantizer):
-
-    @staticmethod
-    def build_linear_method():
-        return AscendW4A8DynamicLinearMethod()
-
-    @staticmethod
-    def build_moe_method():
-        return AscendW4A8DynamicFusedMoEMethod()
-
-
-class W8A8Quantizer(VLLMAscendQuantizer):
-
-    @staticmethod
-    def build_linear_method():
-        return AscendW8A8LinearMethod()
-
-    @staticmethod
-    def build_moe_method():
-        return AscendW8A8FusedMoEMethod()
-
-    @staticmethod
-    def build_attention_method():
-        return AscendC8KVCacheMethod()
-
-
-class W8A8DYNAMICQuantizer(VLLMAscendQuantizer):
-
-    @staticmethod
-    def build_linear_method():
-        return AscendW8A8DynamicLinearMethod()
-
-    @staticmethod
-    def build_moe_method():
-        return AscendW8A8DynamicFusedMoEMethod()
-
-
-SUPPORT_ASCEND_QUANTIZER_TYPE = {
-    "W4A8_DYNAMIC": W4A8DYNAMICQuantizer,
-    "W8A8": W8A8Quantizer,
-    "W8A8_DYNAMIC": W8A8DYNAMICQuantizer,
-    "C8": W8A8Quantizer,
-}
--- a/vllm_ascend/quantization/utils.py
+++ b/vllm_ascend/quantization/utils.py
@@ -0,0 +1,222 @@
+import importlib
+import sys
+import types
+from typing import Any, Dict, Optional, Type
+
+from vllm.logger import logger
+
+from .func_wrapper import (wrapper_rmsnorm_forward_oot, wrapper_rmsnorm_init,
+                           wrapper_vocab_parallel_embedding_init)
+from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
+                           AscendW4A8DynamicLinearMethod)
+from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod,
+                   AscendW8A8LinearMethod)
+from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
+                           AscendW8A8DynamicLinearMethod)
+
+patched = False  # set to True by apply_quantization_patch after its first run
+# Quant type -> layer type ("linear" / "moe" / "attention") -> method class.
+ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
+    "W4A8_DYNAMIC": {
+        "linear": AscendW4A8DynamicLinearMethod,
+        "moe": AscendW4A8DynamicFusedMoEMethod,
+    },
+    "W8A8": {
+        "linear": AscendW8A8LinearMethod,
+        "moe": AscendW8A8FusedMoEMethod,
+        "attention": AscendC8KVCacheMethod,
+    },
+    "W8A8_DYNAMIC": {
+        "linear": AscendW8A8DynamicLinearMethod,
+        "moe": AscendW8A8DynamicFusedMoEMethod,
+    },
+    "C8": {
+        "attention": AscendC8KVCacheMethod,
+    },
+}
+
+
+def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str,
+                          packed_modules_mapping: Dict[str, Any]):
+    """Return the quant type stored under "<prefix>.weight".
+
+    If the last component of `prefix` is a key of `packed_modules_mapping`,
+    each mapped shard is looked up instead and all must agree (ValueError).
+    """
+    proj_name = prefix.split(".")[-1]
+    if proj_name not in packed_modules_mapping:
+        return quant_description[prefix + '.weight']
+    stem = prefix[:len(prefix) - len(proj_name)]  # swap the last part only
+    quant_type = None
+    for shard_proj_name in packed_modules_mapping[proj_name]:
+        shard_quant_type = quant_description[stem + shard_proj_name + '.weight']
+        if quant_type is None:
+            quant_type = shard_quant_type
+        elif shard_quant_type != quant_type:
+            raise ValueError(
+                f"Not all shards of {prefix} are quantized with same quant type. "
+                f"Shard {shard_proj_name} uses {shard_quant_type}, but another "
+                f"shard uses {quant_type}. Please check quantization config.")
+    return quant_type
+
+
+def get_quant_method(quant_description: Dict[str, Any],
+                     prefix: str,
+                     layer_type: str,
+                     packed_modules_mapping: Optional[Dict[str, Any]] = None):
+    """Instantiate the Ascend quant method class for one layer.
+
+    The quant type comes from 'fa_quant_type' / 'kv_quant_type' for '.attn'
+    prefixes when present, otherwise from the per-weight description entry.
+    """
+    apply_quantization_patch(quant_description)
+    is_attn = '.attn' in prefix
+    if is_attn and 'fa_quant_type' in quant_description:
+        quant_type = quant_description['fa_quant_type']  # attention
+    elif is_attn and 'kv_quant_type' in quant_description:
+        quant_type = quant_description['kv_quant_type']  # KVCache int8
+    else:
+        quant_type = get_linear_quant_type(quant_description, prefix,
+                                           packed_modules_mapping or dict())
+    if quant_type not in ASCEND_QUANTIZATION_METHOD_MAP:
+        raise NotImplementedError(
+            "Currently, vLLM Ascend only supports following quant types:"
+            f"{list(ASCEND_QUANTIZATION_METHOD_MAP.keys())}")
+    method_map = ASCEND_QUANTIZATION_METHOD_MAP[quant_type]
+    if layer_type not in method_map:
+        raise NotImplementedError(
+            f"Currently, vLLM Ascend doesn't support {quant_type} for {layer_type}."
+        )
+    return method_map[layer_type]()
+
+
+def apply_quantization_patch(quant_description):
+    """Install the quantization wrappers once, guarded by `patched`."""
+    global patched
+    if patched:
+        return
+    # Patches are applied only if some description key contains "norm.bias".
+    if any("norm.bias" in name for name in quant_description):
+        apply_patch("vllm.model_executor.layers.layernorm.RMSNorm",
+                    "__init__", [wrapper_rmsnorm_init])
+        apply_patch("vllm_ascend.ops.layernorm.AscendRMSNorm",
+                    "forward_oot", [wrapper_rmsnorm_forward_oot])
+        apply_patch(
+            "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding",
+            "__init__", [wrapper_vocab_parallel_embedding_init])
+    patched = True
+    logger.info("Using the vLLM Ascend Quantization now!")
+
+
+def apply_patch(target_module, target_function, wrappers):
+    """Wrap `target_module.target_function` and rebind every alias of it.
+
+    The wrappers are applied in order to the resolved attribute. Each entry
+    of `sys.modules` exposing the original object under the same attribute
+    name is pointed at the wrapped result as well.
+    """
+    owner, original = parse_path(target_module, target_function, False)
+    patched_function = original
+    for wrap in wrappers:
+        patched_function = wrap(patched_function)
+    if target_function is None:
+        return  # nothing to bind or propagate without an attribute name
+    setattr(owner, target_function, patched_function)
+    # Snapshot sys.modules so the loop is unaffected if it changes meanwhile.
+    for module in sys.modules.copy().values():
+        try:
+            found = getattr(module, target_function, None)
+            if found is not None and found is original:
+                setattr(module, target_function, patched_function)
+        except ImportError:
+            continue
+
+
+def parse_path(module_path, function_name, create_dummy):
+    """
+    Resolve a dotted path part by part and look up a function on the result.
+
+    Args:
+        module_path: Dot-separated path; a part may be an attribute of its parent
+        function_name: Target function name (None for module only)
+        create_dummy: Create dummy modules/functions when missing
+
+    Returns:
+        Tuple of (resolved module or attribute, target function or None)
+
+    Raises:
+        ModuleNotFoundError: If a part cannot be resolved and create_dummy=False
+        AttributeError: If a parent attribute lacks the function (no dummy)
+    """
+    from importlib.machinery import ModuleSpec
+
+    def create_dummy_module(full_path, parent=None):
+        """Create and register a placeholder module"""
+        dummy = types.ModuleType(full_path)
+        dummy.__file__ = "vllm_ascend.dummy_module.py"
+        dummy.__spec__ = ModuleSpec(full_path, None)
+        sys.modules[full_path] = dummy
+        if parent:
+            setattr(parent, full_path.split(".")[-1], dummy)
+        return dummy
+
+    def create_placeholder_function(func_name):
+        """Create dummy function that raises when called"""
+
+        def placeholder(*args, **kwargs):
+            raise NotImplementedError(f"Function {func_name} is a placeholder")
+
+        placeholder.__name__ = func_name
+        return placeholder
+
+    modules = module_path.split(".")
+    current_module = None
+    processed_path = []
+    # Walk the path one component at a time, trying to import each prefix.
+    for idx, part in enumerate(modules):
+        current_path = ".".join(modules[:idx + 1])
+        parent_path = ".".join(modules[:idx]) if idx > 0 else None
+
+        try:
+            current_module = importlib.import_module(current_path)
+        except ModuleNotFoundError:
+            # Not importable: the part may be an attribute of its parent
+            parent = importlib.import_module(
+                parent_path) if parent_path else None
+            if parent and hasattr(parent, part):
+                # Use existing attribute from parent
+                current_module = getattr(parent, part)
+                # Check for early function resolution
+                if function_name and hasattr(current_module, function_name):
+                    return current_module, getattr(current_module,
+                                                   function_name)
+                if function_name and create_dummy:
+                    ph_func = create_placeholder_function(function_name)
+                    setattr(current_module, function_name, ph_func)
+                    return current_module, ph_func
+                if function_name:
+                    raise AttributeError(
+                        f"Function {function_name} missing in {current_path}")
+            else:
+                if not create_dummy:
+                    raise
+                # Create and register dummy module
+                current_module = create_dummy_module(
+                    current_path,
+                    parent=importlib.import_module(parent_path)
+                    if parent_path else None)
+
+        processed_path.append(part)  # NOTE(review): never read in this function
+
+    # Final function handling; KeyError if module_path is not in sys.modules
+    final_module = sys.modules[module_path]
+    if function_name is not None:
+        if not hasattr(final_module, function_name):
+            if create_dummy:
+                ph_func = create_placeholder_function(function_name)
+                setattr(final_module, function_name, ph_func)
+            else:
+                setattr(final_module, function_name, None)  # bind None, no raise
+        return final_module, getattr(final_module, function_name)
+
+    return final_module, None