### What this PR does / why we need it?
**Scope of Changes**:
| File Path |
| :--- |
| `vllm_ascend/quantization/compressed_tensors/compressed_tensors.py` |
| `vllm_ascend/quantization/quant_config.py` |
| `vllm_ascend/quantization/utils.py` |
| `vllm_ascend/quantization/w4a16.py` |
| `vllm_ascend/quantization/w4a4_flatquant_dynamic.py` |
| `vllm_ascend/quantization/w4a8_dynamic.py` |
| `vllm_ascend/quantization/w8a16.py` |
| `vllm_ascend/quantization/w8a8.py` |
| `vllm_ascend/quantization/w8a8_dynamic.py` |
| `vllm_ascend/quantization/w8a8_pdmix.py` |
| `vllm_ascend/quantization/w8a8mxfp8.py` |
| `vllm_ascend/sample/rejection_sampler.py` |
| `vllm_ascend/sample/sampler.py` |
| `vllm_ascend/worker/block_table.py` |
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main: 2c24bc6996
Signed-off-by: MrZ20 <2609716663@qq.com>
Representative diff hunks (import reflow and tuple-spacing cleanup in the quantization `methods` package):

```diff
@@ -21,7 +21,7 @@ Schemes are automatically registered via the @register_scheme decorator.
 Usage:
     from vllm_ascend.quantization.methods import get_scheme_class
 
     # Get a scheme class by quant_type and layer_type
     scheme_cls = get_scheme_class("W8A8_DYNAMIC", "linear")
     scheme = scheme_cls()
@@ -30,28 +30,26 @@ Usage:
 from typing import Any
 
 # Import base classes
-from .base import (AscendAttentionScheme, AscendLinearScheme, AscendMoEScheme,
-                   QuantType)
+from .base import AscendAttentionScheme, AscendLinearScheme, AscendMoEScheme, QuantType
 
 # Import registry functions
 from .registry import get_scheme_class, register_scheme
 
 # Import all scheme classes for external access
 from .w4a4_flatquant import AscendW4A4FlatQuantDynamicLinearMethod
 from .w4a4_laos_dynamic import AscendW4A4LaosDynamicLinearMethod
-from .w4a8 import (AscendW4A8DynamicFusedMoEMethod,
-                   AscendW4A8DynamicLinearMethod)
+from .w4a8 import AscendW4A8DynamicFusedMoEMethod, AscendW4A8DynamicLinearMethod
 from .w4a16 import AscendW4A16FusedMoEMethod
-from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
-                           AscendW8A8DynamicLinearMethod)
+from .w8a8_dynamic import AscendW8A8DynamicFusedMoEMethod, AscendW8A8DynamicLinearMethod
 from .w8a8_mxfp8 import AscendW8A8MXFP8DynamicLinearMethod
-from .w8a8_pdmix import (AscendW8A8PDMixFusedMoeMethod,
-                         AscendW8A8PDMixLinearMethod)
+from .w8a8_pdmix import AscendW8A8PDMixFusedMoeMethod, AscendW8A8PDMixLinearMethod
 from .w8a8_static import AscendW8A8LinearMethod
 from .w8a16 import AscendW8A16LinearMethod
 
 
 def is_mx_quant_type(instance: Any) -> bool:
     """Checks if the quantization method is a microscaling (MX) type."""
-    MX_QUANT_TYPES = (AscendW8A8MXFP8DynamicLinearMethod, )
+    MX_QUANT_TYPES = (AscendW8A8MXFP8DynamicLinearMethod,)
     return isinstance(instance, MX_QUANT_TYPES)
```
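For readers unfamiliar with the pattern the docstring describes, here is a minimal, hypothetical sketch of a decorator-based scheme registry. The names `register_scheme` and `get_scheme_class` come from the diff above; the `_SCHEME_REGISTRY` dict, the function bodies, and the placeholder class body are illustrative assumptions, not the actual `vllm_ascend` implementation:

```python
from typing import Dict, Tuple, Type

# Hypothetical backing store: (quant_type, layer_type) -> scheme class.
_SCHEME_REGISTRY: Dict[Tuple[str, str], Type] = {}


def register_scheme(quant_type: str, layer_type: str):
    """Class decorator: record the class under the (quant_type, layer_type) key."""
    def decorator(cls: Type) -> Type:
        _SCHEME_REGISTRY[(quant_type, layer_type)] = cls
        return cls
    return decorator


def get_scheme_class(quant_type: str, layer_type: str) -> Type:
    """Look up a registered scheme class, failing loudly on a miss."""
    try:
        return _SCHEME_REGISTRY[(quant_type, layer_type)]
    except KeyError:
        raise ValueError(
            f"No scheme registered for ({quant_type!r}, {layer_type!r})")


# The class name appears in the diff above, but this empty body is a
# stand-in for illustration only.
@register_scheme("W8A8_DYNAMIC", "linear")
class AscendW8A8DynamicLinearMethod:
    pass


# Usage, mirroring the docstring in the diff:
scheme_cls = get_scheme_class("W8A8_DYNAMIC", "linear")
scheme = scheme_cls()
assert isinstance(scheme, AscendW8A8DynamicLinearMethod)
```

If the registry does work this way, schemes self-register as a side effect of being imported, which would explain why the `__init__.py` in the diff has to import every scheme module explicitly.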