Co-authored-by: kunpengW-code <1289706727@qq.com>
Co-authored-by: linsheng1 <1950916997@qq.com>
### What this PR does / why we need it?
Currently, chunked prefill is forcibly enabled. DeepSeek V3.1 W8A8C8
supports only the PD separation scenario. C8 refers to quantizing the KV
cache to int8, which aims to reduce the GPU memory usage of the KV cache
and improve the inference throughput.
Constraints:
1. Only the PD (prefill/decode) separation mode is supported, and the
MooncakeLayerwiseConnector must be used to run the model.
2. Currently, only the activation value supports dynamic quantization,
and the KV cache supports static quantization. C8 quantization with MTP
is not supported. You can use ModelSlim for quantization. The
quantization procedure is as follows:
pip install transformers==4.48.2
git clone https://gitcode.com/Ascend/msmodelslim.git
cd msmodelslim
bash install.sh
cd example/DeepSeek/
python3 quant_deepseek_w8a8.py --model_path <path/weight> --save_path
<path/quant_weight>
--anti_dataset ../common/deepseek_anti_prompt_50_v3_1.json
--calib_dataset ../common/deepseek_calib_prompt_50_v3_1.json --rot
--trust_remote_code True --fa_quant --dynamic --anti_method m6
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM version: v0.17.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: pichangping <1337510399@qq.com>
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
Co-authored-by: Wang Kunpeng <1289706727@qq.com>
83 lines
2.9 KiB
Python
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
|
"""Ascend quantization scheme implementations.

This module provides all quantization scheme implementations for Ascend NPU.
Schemes are automatically registered via the @register_scheme decorator.

Usage:
    from vllm_ascend.quantization.methods import get_scheme_class

    # Get a scheme class by quant_type and layer_type
    scheme_cls = get_scheme_class("W8A8_DYNAMIC", "linear")
    scheme = scheme_cls()
"""
|
|
|
|
from typing import Any
|
|
|
|
# Import base classes
|
|
from .base import AscendAttentionScheme, AscendLinearScheme, AscendMoEScheme, QuantType
|
|
|
|
# Import all scheme classes for external access
|
|
from .kv_c8 import AscendFAQuantAttentionMethod
|
|
|
|
# Import registry functions
|
|
from .registry import get_scheme_class, register_scheme
|
|
from .w4a4_flatquant import AscendW4A4FlatQuantDynamicLinearMethod
|
|
from .w4a4_laos_dynamic import AscendW4A4LaosDynamicLinearMethod
|
|
from .w4a8 import AscendW4A8DynamicFusedMoEMethod, AscendW4A8DynamicLinearMethod
|
|
from .w4a16 import AscendW4A16FusedMoEMethod
|
|
from .w8a8_dynamic import AscendW8A8DynamicFusedMoEMethod, AscendW8A8DynamicLinearMethod
|
|
from .w8a8_mxfp8 import AscendW8A8MXFP8DynamicLinearMethod
|
|
from .w8a8_pdmix import AscendW8A8PDMixFusedMoeMethod, AscendW8A8PDMixLinearMethod
|
|
from .w8a8_static import AscendW8A8LinearMethod
|
|
from .w8a16 import AscendW8A16LinearMethod
|
|
|
|
|
|
def is_mx_quant_type(instance: Any) -> bool:
    """Return True if *instance* is a microscaling (MX) quantization method.

    Currently the only MX-format scheme is the W8A8 MXFP8 dynamic linear
    method; extend the tuple below when new MX schemes are added.
    """
    mx_method_classes = (AscendW8A8MXFP8DynamicLinearMethod,)
    return isinstance(instance, mx_method_classes)
|
|
|
|
|
|
# Public API of this package: names exported by `from ... import *` and the
# canonical list of supported quantization scheme classes.
__all__ = [
    # Base classes
    "AscendAttentionScheme",
    "AscendLinearScheme",
    "AscendMoEScheme",
    "QuantType",
    # Registry functions
    "register_scheme",
    "get_scheme_class",
    # Utility functions
    "is_mx_quant_type",
    # Scheme classes
    "AscendW8A8LinearMethod",
    "AscendW8A8DynamicLinearMethod",
    "AscendW8A8DynamicFusedMoEMethod",
    "AscendW8A8MXFP8DynamicLinearMethod",
    "AscendW8A8PDMixLinearMethod",
    "AscendW8A8PDMixFusedMoeMethod",
    "AscendW8A16LinearMethod",
    "AscendW4A8DynamicLinearMethod",
    "AscendW4A8DynamicFusedMoEMethod",
    "AscendW4A16FusedMoEMethod",
    "AscendW4A4FlatQuantDynamicLinearMethod",
    "AscendW4A4LaosDynamicLinearMethod",
    "AscendFAQuantAttentionMethod",
]
|